刚接触 Python,试一下爬虫,拿自己的 Blog 开刀。

import requests
from bs4 import BeautifulSoup
import pprint
# Blog home page address. Nothing in the code visible here reads this variable.
url = "https://www.cnblogs.com/zyqgold/"

def download_all_htmls(page_count=7, timeout=10):
    """Download the HTML of the blog's paginated listing pages.

    Requests ``default.html?page=1`` through ``page=<page_count>`` in order.

    Args:
        page_count: Number of listing pages to fetch, starting at page 1.
            Defaults to 7, the count that used to be hard-coded here.
        timeout: Seconds to wait for each response. ``requests`` never
            times out on its own, so without this a stalled connection
            would block the script indefinitely.

    Returns:
        A list with one HTML string per page, in page order.

    Raises:
        RuntimeError: If any page answers with a status other than 200.
        requests.RequestException: If a request fails or times out.
    """
    htmls = []
    for page in range(1, page_count + 1):
        url = f"https://www.cnblogs.com/zyqgold/default.html?page={page}"
        r = requests.get(url, timeout=timeout)
        if r.status_code != 200:
            # Name the failing page and status so the error is actionable.
            raise RuntimeError(f"GET {url} returned HTTP {r.status_code}")
        htmls.append(r.text)
    return htmls
def parse_single_html(html):
    """Extract the article titles and links from one listing page.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of ``{"name": <title>, "link": <href>}`` dicts, one per
        matching anchor, in document order. Titles are whitespace-stripped.
    """
    soup = BeautifulSoup(html, "html.parser")
    # A multi-word ``class_`` string matches the class attribute's exact value.
    # ``href=True`` skips anchors without a link instead of raising KeyError.
    articles = soup.find_all(
        "a", class_="postTitle2 vertical-middle", href=True
    )
    nodes = []
    for article in articles:
        # Fall back to the anchor itself when it has no <span>, and use
        # get_text() because .string is None once the span has child tags.
        span = article.span
        title_tag = span if span is not None else article
        nodes.append(
            {"name": title_tag.get_text(strip=True), "link": article["href"]}
        )
    return nodes

# Fetch every listing page, flatten the per-page article entries into a
# single list, and pretty-print the result.
htmls = download_all_htmls()

all_html = [entry for html in htmls for entry in parse_single_html(html)]

pprint.pprint(all_html)

相关文章:

  • 2021-06-25
  • 2021-12-18
  • 2021-07-23
  • 2022-01-27
  • 2022-02-09
  • 2021-05-18
  • 2021-11-30
  • 2022-02-07
猜你喜欢
  • 2022-12-23
  • 2022-12-23
  • 2021-08-11
  • 2021-07-01
  • 2022-01-14
  • 2021-10-28
  • 2021-05-04
相关资源
相似解决方案