【发布时间】:2022-08-17 05:00:55
【问题描述】:
from urllib import response
import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
def get_source(url):
    """Fetch a URL with a fresh requests_html session.

    Args:
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html, or
            None when the request raised a requests exception (the error
            is printed rather than propagated).
    """
    try:
        page = HTMLSession().get(url)
    except requests.exceptions.RequestException as err:
        # Best-effort fetch: report the failure and let the caller see None.
        print(err)
        return None
    return page
def scrape_google(query):
    """Return the non-Google links found on a Google results page.

    Args:
        query (string): Search terms; URL-encoded before being sent.

    Returns:
        list: Absolute URLs from the results page that do not start with
            one of the Google-owned prefixes listed below. Empty when the
            page could not be fetched.
    """
    encoded_query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.com/search?q=" + encoded_query)
    # get_source() prints the error and returns None when the request fails;
    # without this guard the attribute access below raises AttributeError.
    if response is None:
        return []

    google_domains = (
        'https://www.google.',
        'https://google.',
        'https://webcache.googleusercontent.',
        'http://webcache.googleusercontent.',
        'https://policies.google.',
        'https://support.google.',
        'https://maps.google.',
    )
    # str.startswith accepts a tuple, so one call tests every prefix. A single
    # filtering pass replaces the copy-and-remove loop (list.remove is O(n)).
    return [
        url
        for url in response.html.absolute_links
        if not url.startswith(google_domains)
    ]
def get_results(query):
    """Fetch the google.co.uk results page for a search query.

    Args:
        query (string): Search terms; URL-encoded before being sent.

    Returns:
        response (object): Whatever get_source() returns for the search URL.
    """
    search_url = "https://www.google.co.uk/search?q=" + urllib.parse.quote_plus(query)
    return get_source(search_url)
def parse_results(response):
    """Extract title, link and snippet text for each result block on a page.

    Args:
        response (object): Object whose ``html`` attribute supports
            ``find(selector)``, such as a requests_html response. ``None``
            is accepted and yields an empty list.

    Returns:
        list: One dict per matched result block with the keys ``title``,
            ``link`` and ``text``. A value is ``None`` when the matching
            sub-element (or the anchor's ``href`` attribute) is missing.
    """
    if response is None:
        return []

    # NOTE(review): these class names are copied from Google's result markup;
    # confirm they still match the live page.
    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".VwiC3b"

    output = []
    for result in response.html.find(css_identifier_result):
        # find(..., first=True) yields None when nothing matches, so each
        # piece is checked before its attributes are read.
        title = result.find(css_identifier_title, first=True)
        link = result.find(css_identifier_link, first=True)
        text = result.find(css_identifier_text, first=True)
        output.append({
            'title': title.text if title is not None else None,
            'link': link.attrs.get('href') if link is not None else None,
            'text': text.text if text is not None else None,
        })
    return output
def google_search(query):
    """Fetch the results page for ``query`` and return its parsed results.

    Args:
        query (string): Search terms, passed unchanged to get_results().

    Returns:
        list: Whatever parse_results() produces for the fetched page.
    """
    return parse_results(get_results(query))
我想在代码中添加翻页功能(即继续抓取后面几页的搜索结果),但找不到方法!有人可以帮忙吗?
-
不要抓取谷歌的页面,请使用他们的 API
-
是的,但我不想使用 google api
-
我并没有把这当作可选项:请使用 Google 的搜索引擎 API。这样也会更简单,您不需要解析任何内容,只需从字典中取值即可
-
这回答了你的问题吗?《Searching in Google with Python》。不过,请阅读该问题下的第二条评论——再说一遍,您应该使用他们的 API
-
我最近遇到了一个和你类似的问题。我附上我的答案的链接:stackoverflow.com/a/72938742/18597245
标签: python web-scraping