【发布时间】:2021-04-21 21:26:03
【问题描述】:
使用 chrome 90 和 python 3.9。我刚安装时,所有导入都已完全更新。
因为我有一个糟糕的 ISP,所以我制作了这个脚本来将互联网上的小说复制到文本文件中,以便在我的互联网中断时离线查看。该脚本几乎可以正常工作,直到弹出递归错误,然后我必须在设置后手动进入并更改章节。我对代码的预期结果是运行直到小说完全复制(从第 1 章到######)到文本文件,无论有多少章。
在我复制了 499 或 500 章后总是出现递归错误。我不知道为什么它这么低,也不知道它是如何得到这个错误的。我读过递归错误通常发生在 999 次迭代之后。
错误 ::(前两行重复了很长一段时间)
File "C:\Users\james\Documents\Novels\PEERLESS MARTIAL GOD\novel.py", line 42, in CopyChapter
NextChapter()
File "C:\Users\james\Documents\Novels\PEERLESS MARTIAL GOD\novel.py", line 49, in NextChapter
link = driver.find_element_by_link_text(cLink)
File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 428, in find_element_by_link_text
return self.find_element(by=By.LINK_TEXT, value=link_text)
File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 976, in find_element
return self.execute(Command.FIND_ELEMENT, {
File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 319, in execute
response = self.command_executor.execute(driver_command, params)
File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 374, in execute
return self._request(command_info[0], url, body=data)
File "C:\Program Files\Python39\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 397, in _request
resp = self._conn.request(method, url, body=body, headers=headers)
File "C:\Program Files\Python39\lib\site-packages\urllib3\request.py", line 78, in request
return self.request_encode_body(
File "C:\Program Files\Python39\lib\site-packages\urllib3\request.py", line 170, in request_encode_body
return self.urlopen(method, url, **extra_kw)
File "C:\Program Files\Python39\lib\site-packages\urllib3\poolmanager.py", line 375, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
File "C:\Program Files\Python39\lib\site-packages\urllib3\connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "C:\Program Files\Python39\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Program Files\Python39\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
httplib_response = conn.getresponse()
File "C:\Program Files\Python39\lib\http\client.py", line 1347, in getresponse
response.begin()
File "C:\Program Files\Python39\lib\http\client.py", line 331, in begin
self.headers = self.msg = parse_headers(self.fp)
File "C:\Program Files\Python39\lib\http\client.py", line 225, in parse_headers
return email.parser.Parser(_class=_class).parsestr(hstring)
File "C:\Program Files\Python39\lib\email\parser.py", line 67, in parsestr
return self.parse(StringIO(text), headersonly=headersonly)
File "C:\Program Files\Python39\lib\email\parser.py", line 56, in parse
feedparser.feed(data)
File "C:\Program Files\Python39\lib\email\feedparser.py", line 176, in feed
self._call_parse()
File "C:\Program Files\Python39\lib\email\feedparser.py", line 180, in _call_parse
self._parse()
File "C:\Program Files\Python39\lib\email\feedparser.py", line 295, in _parsegen
if self._cur.get_content_maintype() == 'message':
File "C:\Program Files\Python39\lib\email\message.py", line 594, in get_content_maintype
ctype = self.get_content_type()
File "C:\Program Files\Python39\lib\email\message.py", line 578, in get_content_type
value = self.get('content-type', missing)
File "C:\Program Files\Python39\lib\email\message.py", line 471, in get
return self.policy.header_fetch_parse(k, v)
File "C:\Program Files\Python39\lib\email\_policybase.py", line 316, in header_fetch_parse
return self._sanitize_header(name, value)
File "C:\Program Files\Python39\lib\email\_policybase.py", line 287, in _sanitize_header
if _has_surrogates(value):
File "C:\Program Files\Python39\lib\email\utils.py", line 57, in _has_surrogates
s.encode()
RecursionError: maximum recursion depth exceeded while calling a Python object
代码::
#! python3
import requests
import bs4 as BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from unidecode import unidecode
CHROMEDRIVER_PATH = 'C:\Program Files\Python39\chromedriver.exe'
NovelChapter = 'peerless-martial-god/chapter-1-spirit-awakening.html'
BaseURL = 'https://novelfull.com'
url = '%(U)s/%(N)s' % {'U': BaseURL, "N": NovelChapter}
options = Options()
options.add_argument("--headless") # Runs Chrome in headless mode.
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
driver.get(url)
def Close():
driver.stop_client()
driver.close()
driver.quit()
# start copy of chapter and add to a file
def CopyChapter():
global soup
soup = BeautifulSoup.BeautifulSoup(driver.page_source, 'html.parser')
readables = soup.find(id='chapter-content')
name = driver.title
filename = name.replace('<',' ').replace('"',' ').replace('>',' ').replace('/',' ').replace("|",' ').replace("?",' ').replace("*",' ').replace(":", ' -').replace('Read ',"").replace(' online free from your Mobile, Table, PC... Novel Updates Daily ',"").replace(' online free - Novel Full',"")
file_name = (filename + '.txt')
print(file_name)
data = ''
for data in soup.find_all("p"):
myfile = open(file_name, 'a+')
myfile.write(unidecode(data.get_text())+'\n'+'\n')
myfile.close()
global lastURL
lastURL = driver.current_url
print('**********Chapter Copied!**********')
NextChapter()
# end copy of chapter and add to a file
# start goto next chapter if exists then return to copy chapter else Close()
def NextChapter():
bLink = soup.find(id = "next_chap")
cLink = 'Next Chapter'
link = driver.find_element_by_link_text(cLink)
link.click()
global currentURL
currentURL = driver.current_url
if currentURL != lastURL:
CopyChapter()
else:
print('Finished!!!')
Close()
# end goto next chapter if exists then return to copy chapter else Close()
CopyChapter()
#EOF
【问题讨论】:
-
NextChapter调用CopyChapter反之亦然,因此每个新章节都会向堆栈添加两个新函数调用。为了解决这个问题,您可以将递归循环转换为 while 循环,从而完全避免最大递归深度。 -
这确实有效,谢谢。我花了一些时间才让这一切正常工作,使用几个 while 循环来做各种事情。以及一些 for 循环。
标签: python-3.x recursion web-scraping beautifulsoup selenium-chromedriver