【发布时间】:2018-03-15 20:24:14
【问题描述】:
我想抓取大约 20 页 Google 搜索结果中的链接标题。前一天我试过这段代码,它可以正常运行!但是今天,它却返回了 503 错误。
我搜索了解决这个问题的方法。以下是我尝试过的。
- 增加延迟时间(在第 25 行之后插入 'time.sleep(60)' 代码)。
- 使用 “fake-useragent”(伪造 User-Agent)库。
但是,仍然出现 503 错误。以下是代码文件。
import requests
from bs4 import BeautifulSoup
from collections import Counter
# Google search for '소프트웨어 교육' (Korean: "software education").
# The URL is split around the `start=` result offset: base + offset + extra.
base_google1_url = "https://www.google.co.kr/search?q=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&safe=active&ei=rv_RWYyaKcmW0gTqsa_IDg&start="
extra_google1_url="&sa=N&biw=958&bih=954"
# Google search for 'sw교육' (Korean: "SW education"); same base + offset + extra layout.
base_google2_url="https://www.google.co.kr/search?q=sw%EA%B5%90%EC%9C%A1&safe=active&ei=kLzUWYONLYa30QS4r5KACA&start="
extra_google2_url="&sa=N&biw=887&bih=950"
# Naver Books search for '소프트웨어 교육' ("software education"); the page number is appended after `page=`.
base_naver_url = "http://book.naver.com/search/search_in.nhn?query=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&&pattern=0&orderType=rel.desc&viewType=list&searchType=bookSearch&serviceSm=service.basic&title=&author=&publisher=&isbn=&toc=&subject=&publishStartDay=&publishEndDay=&categoryId=&qdt=1&filterType=0&filterValue=&serviceIc=service.author&buyAllow=0&ebook=0&page="
# Counter usage follows: https://docs.python.org/2/library/collections.html
# Shared word-frequency counter that the functions below read and update.
cnt = Counter()
#bring search info
def get_html(site_name, content_num):
    """Download one search-result page and return its HTML.

    Args:
        site_name: Which search to query: 'google1', 'google2' or 'naver'.
        content_num: Value appended to the URL (a result offset for the
            Google searches, a page number for Naver).

    Returns:
        The response body as text when the server answers 200, otherwise
        an empty string.

    Raises:
        ValueError: If ``site_name`` is not one of the supported names.
    """
    if site_name == 'google1':
        url = base_google1_url + str(content_num) + extra_google1_url
    elif site_name == 'google2':
        url = base_google2_url + str(content_num) + extra_google2_url
    elif site_name == 'naver':
        url = base_naver_url + str(content_num)
    else:
        # Previously this fell through to an unbound `resp` and crashed
        # with an unrelated-looking UnboundLocalError.
        raise ValueError("unknown site_name: %r" % (site_name,))

    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        return ""
    return resp.text
def word_count(name):
    """Add the whitespace-separated words of a parsed tag to the shared counter.

    Args:
        name: An object exposing a ``contents`` sequence (in this file, a
            BeautifulSoup tag). Only its string children are counted.

    Returns:
        The module-level ``cnt`` counter, after it has been updated.
    """
    for content in name.contents:
        # Children that are still tags have no usable ``split`` method, so
        # only text nodes (which are ``str`` instances) are tokenised.
        if isinstance(content, str):
            cnt.update(content.split())
    return cnt
def _count_page_titles(site_name, content_num, tag_name, unwrap_tags,
                       anchors_only=False, echo=False):
    """Fetch one result page and feed its title text into the word counter.

    Args:
        site_name: Site key passed through to ``get_html``.
        content_num: Offset or page number passed through to ``get_html``.
        tag_name: Name of the tag that holds each result title.
        unwrap_tags: Tag names to strip from inside each title while
            keeping their text.
        anchors_only: When true, count only the ``<a>`` elements inside
            each title instead of the title element itself.
        echo: When true, print each title element after it is counted.
    """
    soup = BeautifulSoup(get_html(site_name, content_num), 'html.parser')
    for title in soup.find_all(tag_name):
        # Replace formatting tags by their children so the text is exposed
        # directly in the parent's contents.
        for nested in title.find_all(unwrap_tags):
            nested.unwrap()
        if anchors_only:
            for anchor in title.find_all('a'):
                word_count(anchor)
        else:
            word_count(title)
        if echo:
            print(title)


def main():
    """Scrape the three searches and print the 20 most common title words."""
    cnt.clear()
    counting = cnt

    # Google results for '소프트웨어 교육' ("software education"): 20 pages,
    # 10 results per page.
    for page_num in range(20):
        _count_page_titles("google1", page_num * 10, 'h3', ['b'],
                           anchors_only=True)

    # Google results for 'sw교육' ("SW education"): 20 pages.
    for page_num in range(20):
        _count_page_titles("google2", page_num * 10, 'h3', ['b', 'a'],
                           echo=True)

    # Naver book search results: pages 1 through 39.
    for page_num in range(1, 40):
        _count_page_titles("naver", page_num, 'dt',
                           ['a', 'strong', 'span', 'img'])

    # Drop useless keywords. To keep some single-character keys, replace the
    # `len(k) == 1` test with
    # `(len(k) == 1 and ord(k) >= 33 and ord(k) < 65)`.
    # https://stackoverflow.com/questions/8448202/remove-more-than-one-key-from-python-dict
    # The search terms themselves would dominate the ranking, so remove them.
    del counting['소프트웨어'], counting['교육']
    # The type check comes first so `len` is never called on an int key.
    for key in [k for k in counting if isinstance(k, int) or len(k) == 1]:
        del counting[key]

    count_20 = counting.most_common(20)
    print(count_20)
# Run the scraper only when this file is executed as a script.
if "__main__" == __name__:
    main()
请帮帮我! 先感谢您。
【问题讨论】:
-
我自己运行时得到的是 200 状态码。您是否自己在浏览器中打开过这些 URL?您的 IP 可能被 Google 屏蔽了,您可能需要输入验证码或进行类似的验证。
标签: python web-scraping beautifulsoup http-status-code-503