代码运行 499-500 次后出现递归错误……为什么？感谢所有帮助。小说网络爬虫答案

【问题标题】：Recursion after 499-500 times of code running... why? All help appreciated. Novel web-scrape（代码运行 499-500 次后出现递归错误……为什么？感谢所有帮助。小说网络爬虫）
【发布时间】：2021-04-21 21:26:03
【问题描述】：

使用 chrome 90 和 python 3.9。我刚安装时，所有导入都已完全更新。

因为我的 ISP 很糟糕，所以我写了这个脚本，把网上的小说复制到文本文件中，以便断网时离线阅读。脚本基本可以正常工作，直到弹出递归错误；之后我必须手动修改脚本中设置的章节再重新运行。我期望的结果是：无论小说有多少章，代码都能一直运行，直到整部小说（从第 1 章到第 ###### 章）完全复制到文本文件中。

在我复制了 499 或 500 章后总是出现递归错误。我不知道为什么它这么低，也不知道它是如何得到这个错误的。我读过递归错误通常发生在 999 次迭代之后。

错误 ::（前两行重复了很长一段时间）

  File "C:\Users\james\Documents\Novels\PEERLESS MARTIAL GOD\novel.py", line 42, in CopyChapter
    NextChapter()
  File "C:\Users\james\Documents\Novels\PEERLESS MARTIAL GOD\novel.py", line 49, in NextChapter
    link = driver.find_element_by_link_text(cLink)
  File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 428, in find_element_by_link_text
    return self.find_element(by=By.LINK_TEXT, value=link_text)
  File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 976, in find_element
    return self.execute(Command.FIND_ELEMENT, {
  File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 319, in execute
    response = self.command_executor.execute(driver_command, params)
  File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 374, in execute
    return self._request(command_info[0], url, body=data)
  File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 397, in _request
    resp = self._conn.request(method, url, body=body, headers=headers)
  File "C:\Program Files\Python39\lib\site-packages\urllib3\request.py", line 78, in request
    return self.request_encode_body(
  File "C:\Program Files\Python39\lib\site-packages\urllib3\request.py", line 170, in request_encode_body
    return self.urlopen(method, url, **extra_kw)
  File "C:\Program Files\Python39\lib\site-packages\urllib3\poolmanager.py", line 375, in urlopen
    response = conn.urlopen(method, u.request_uri, **kw)
  File "C:\Program Files\Python39\lib\site-packages\urllib3\connectionpool.py", line 699, in urlopen
    httplib_response = self._make_request(
  File "C:\Program Files\Python39\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Program Files\Python39\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Program Files\Python39\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Program Files\Python39\lib\http\client.py", line 331, in begin
    self.headers = self.msg = parse_headers(self.fp)
  File "C:\Program Files\Python39\lib\http\client.py", line 225, in parse_headers
    return email.parser.Parser(_class=_class).parsestr(hstring)
  File "C:\Program Files\Python39\lib\email\parser.py", line 67, in parsestr
    return self.parse(StringIO(text), headersonly=headersonly)
  File "C:\Program Files\Python39\lib\email\parser.py", line 56, in parse
    feedparser.feed(data)
  File "C:\Program Files\Python39\lib\email\feedparser.py", line 176, in feed
    self._call_parse()
  File "C:\Program Files\Python39\lib\email\feedparser.py", line 180, in _call_parse
    self._parse()
  File "C:\Program Files\Python39\lib\email\feedparser.py", line 295, in _parsegen
    if self._cur.get_content_maintype() == 'message':
  File "C:\Program Files\Python39\lib\email\message.py", line 594, in get_content_maintype
    ctype = self.get_content_type()
  File "C:\Program Files\Python39\lib\email\message.py", line 578, in get_content_type
    value = self.get('content-type', missing)
  File "C:\Program Files\Python39\lib\email\message.py", line 471, in get
    return self.policy.header_fetch_parse(k, v)
  File "C:\Program Files\Python39\lib\email\_policybase.py", line 316, in header_fetch_parse
    return self._sanitize_header(name, value)
  File "C:\Program Files\Python39\lib\email\_policybase.py", line 287, in _sanitize_header
    if _has_surrogates(value):
  File "C:\Program Files\Python39\lib\email\utils.py", line 57, in _has_surrogates
    s.encode()
RecursionError: maximum recursion depth exceeded while calling a Python object

代码::

#! python3
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from unidecode import unidecode

CHROMEDRIVER_PATH = 'C:\Program Files\Python39\chromedriver.exe'  # chromedriver executable handed to webdriver.Chrome below

NovelChapter = 'peerless-martial-god/chapter-1-spirit-awakening.html'  # first chapter's path, relative to BaseURL
BaseURL = 'https://novelfull.com'
url = '%(U)s/%(N)s' % {'U': BaseURL, "N": NovelChapter}  # BaseURL and NovelChapter joined with '/'

options = Options()
options.add_argument("--headless") # Runs Chrome in headless mode.
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)  # module-level driver used by Close/CopyChapter/NextChapter
driver.get(url)  # load the first chapter before CopyChapter() is called

def Close():  # Shut down the module-level driver: stop_client(), close(), quit(), in that order.
    driver.stop_client()
    driver.close()
    driver.quit()

# start copy of chapter and add to a file
def CopyChapter():  # Append the text of every <p> on the current page to '<cleaned title>.txt', then call NextChapter().
    global soup  # published as a global because NextChapter() reads it
    soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
    readables = soup.find(id='chapter-content')  # NOTE: never used below; the loop writes every <p> on the page
    name = driver.title
    filename = name.replace('<',' ').replace('"',' ').replace('>',' ').replace('/',' ').replace("|",' ').replace("?",' ').replace("*",' ').replace(":", ' -').replace('Read ',"").replace(' online free from your Mobile, Table, PC... Novel Updates Daily ',"").replace(' online free - Novel Full',"")  # replace < " > / | ? * : and strip the site's title boilerplate
    file_name = (filename + '.txt')
    print(file_name)
    data = ''  # immediately rebound by the loop variable below
    for data in soup.find_all("p"):
        myfile = open(file_name, 'a+')  # file is reopened in append mode for each paragraph
        myfile.write(unidecode(data.get_text())+'\n'+'\n')  # paragraph text passed through unidecode(), then a blank line
        myfile.close()
    global lastURL
    lastURL = driver.current_url  # NextChapter() compares against this to see whether its click changed the page
    print('**********Chapter Copied!**********')
    NextChapter()  # NextChapter() calls CopyChapter() again before returning, so every chapter adds two stack frames
# end copy of chapter and add to a file

# start goto next chapter if exists then return to copy chapter else Close()
def NextChapter():  # Click the 'Next Chapter' link; copy the new page if the URL changed, otherwise print and Close().
    bLink = soup.find(id = "next_chap")  # NOTE: never used below
    cLink = 'Next Chapter'
    link = driver.find_element_by_link_text(cLink)
    link.click()
    global currentURL
    currentURL = driver.current_url
    if currentURL != lastURL:  # URL differs from the one CopyChapter() recorded, so the click navigated
        CopyChapter()  # recurses back into CopyChapter(); neither function returns until the last chapter
    else:
        print('Finished!!!')
        Close()
# end goto next chapter if exists then return to copy chapter else Close()

CopyChapter()  # entry point: copy the page loaded by driver.get(url); chaining to later chapters happens inside
#EOF

【问题讨论】：

NextChapter 调用 CopyChapter 反之亦然，因此每个新章节都会向堆栈添加两个新函数调用。为了解决这个问题，您可以将递归循环转换为 while 循环，从而完全避免最大递归深度。
这确实有效，谢谢。我花了一些时间才让这一切正常工作，使用几个 while 循环来做各种事情。以及一些 for 循环。

标签： python-3.x recursion web-scraping beautifulsoup selenium-chromedriver

【解决方案1】：

看起来不如 defs 好，但可以完美地满足我的需要。添加了一些内容，例如为文本文件创建文件夹以及从章节列表页面开始。可能有很多东西可以优化，但它对我来说很重要。

#! python3
import os
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from unidecode import unidecode

# Raw string: in a normal literal '\P' and '\c' are invalid escape sequences
# (DeprecationWarning, SyntaxWarning since Python 3.12). The value is unchanged.
CHROMEDRIVER_PATH = r'C:\Program Files\Python39\chromedriver.exe'

def Close():
    """Tear down the module-level Selenium ``driver``.

    Invokes ``stop_client()``, ``close()`` and ``quit()`` on it, in that order.
    """
    for teardown_step in ("stop_client", "close", "quit"):
        getattr(driver, teardown_step)()

# Novel slugs to download, i.e. the part of the URL after BaseURL + '/'.
# NOTE(review): '' looks like a placeholder to be replaced before running
# (presumably with something like 'peerless-martial-god.html') - confirm.
NovelName = ['']
baseDIR = "C:/Users/james/Documents/Novels"
BaseURL = 'https://novelfull.com'

# Characters replaced when turning a page title into a file name. The backslash
# is included as well because it would otherwise be read as a path separator.
_TITLE_CHAR_MAP = str.maketrans({
    '<': ' ', '"': ' ', '>': ' ', '/': ' ', '\\': ' ',
    '|': ' ', '?': ' ', '*': ' ', ':': ' -',
})
# Site boilerplate removed from the page title, applied in this order.
_TITLE_BOILERPLATE = (
    'Read ',
    ' online free from your Mobile, Table, PC... Novel Updates Daily ',
    ' online free - Novel Full',
)


def _chapter_file_name(title):
    """Return the '<cleaned title>.txt' file name for a chapter page title."""
    cleaned = title.translate(_TITLE_CHAR_MAP)
    for fragment in _TITLE_BOILERPLATE:
        cleaned = cleaned.replace(fragment, "")
    return cleaned + '.txt'


def _first_chapter_link_text(list_page):
    """Return the link text of the first chapter on a parsed chapter-list page.

    The text is taken from the <span> elements inside the <a> elements of the
    first <li> under ``id='list-chapter'``. Returns '' when the page has no
    such list or the list is empty.
    """
    chapter_list = list_page.find(id='list-chapter')
    first_item = chapter_list.find("li") if chapter_list is not None else None
    if first_item is None:
        return ''
    # get_text() yields the text itself, so no markup has to be stripped out
    # of str(tag) by hand.
    return ' '.join(
        span.get_text()
        for anchor in first_item.find_all("a")
        for span in anchor.find_all("span")
    ).strip()


while NovelName:
    NN = NovelName.pop(-1)
    NNx = NN.replace('.html', '').replace('-', ' ').upper()
    DIR = '%(B)s/%(N)s' % {'B': baseDIR, "N": NNx}
    # exist_ok=True: a re-run must not crash just because the folder is
    # already there. Chapter files are opened in append mode, so re-running
    # appends to files that already exist.
    os.makedirs(DIR, exist_ok=True)

    url = '%(U)s/%(N)s' % {'U': BaseURL, "N": NN}
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
    try:
        driver.get(url)
        print(url)

        soupx = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
        ChapterText = _first_chapter_link_text(soupx)
        if not ChapterText:
            print('No chapter list found at ' + url)
            continue  # the finally clause below still shuts the driver down

        # Open the first chapter by clicking its link on the chapter-list page.
        driver.find_element_by_link_text(ChapterText).click()
        currentURL = driver.current_url
        lastURL = ''

        # Copy chapters until clicking 'Next Chapter' no longer changes the URL.
        while currentURL != lastURL:
            soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
            file_name = _chapter_file_name(driver.title)
            print(file_name)
            chapter_text = ''.join(
                unidecode(paragraph.get_text()) + '\n' + '\n'
                for paragraph in soup.find_all("p")
            )
            if chapter_text:  # as before, no file is created for a page without <p>
                with open(DIR + '/' + file_name, 'a') as myfile:
                    myfile.write(chapter_text)
            lastURL = driver.current_url
            print('**********Chapter Copied!**********')
            driver.find_element_by_link_text('Next Chapter').click()
            currentURL = driver.current_url

        print('Finished!!!')
    finally:
        # Always release the browser, even when scraping raised. This replaces
        # the extra Close() that used to follow the loop and acted on a driver
        # that had already been quit.
        Close()
print('Finished!!!')

#EOF

【讨论】：

完整的小说爬虫（scraper）：github.com/hanziwolf33/Multiple-Novel-Scapper，除了我的网络不稳定之外，运行时没有出现任何重大问题。