# Python爬虫基础代码
#
# 主要记录 requests 请求和网页解析。
# 导入 requests 库（用于发起 HTTP 请求）
import requests


# Issue a plain GET request for the page.
# NOTE(review): URL is not defined anywhere in this snippet; assign the target
# address to it before running.
response = requests.get(URL)
# Same request again, this time with custom headers and query-string parameters.
myheaders = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.34"}
p = {"param1": "p1"}
response = requests.get(URL, params=p, headers=myheaders)


# Read the HTML body of the response.
strData = response.text  # decoded text; this is the form normally used
strData = response.content  # raw bytes; note this rebinds strData, replacing the text above
# If the text comes out garbled, adjust the encoding to suit the page.
response.encoding  # character encoding taken from the response headers
response.status_code  # HTTP status code of the response

# HTML parsing, option 1: use bs4 (BeautifulSoup).
import bs4
soup = bs4.BeautifulSoup(strData, "html.parser")  # parse the document into a tree of Python objects
# Further operations: https://geek-docs.com/python/python-tutorial/python-beautifulsoup.html