win0211

1、古诗文网爬虫

import requests,re


# HTTP headers sent with every request made by get_poetics(); the
# user-agent value is a desktop Chrome browser identification string.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}

# Module-level accumulator: get_poetics() appends one dict per poem here.
my_poetic_list = []


def _parse_poetics(text):
    """Extract poem records from the HTML of one listing page.

    Args:
        text: HTML source of the page as a string.

    Returns:
        A list of dicts with the keys "title", "year", "pote" and
        "poetics", one per poem for which all four fields were matched.
    """
    titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # First link inside the "source" paragraph.
    years = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Second link inside the "source" paragraph.
    potes = re.findall(
        r'<p class="source">.*?<a.*?>.*?</a>.*?<a.*?>(.*?)</a>',
        text,
        re.DOTALL,
    )
    bodies = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)

    # Strip inline tags, newlines and full-width (ideographic) spaces.
    cleaned = [
        re.sub(r"<.*?>", '', body).replace("\n", '').replace("\u3000", '')
        for body in bodies
    ]

    # The four patterns are matched independently, so their result lists may
    # differ in length; zip() stops at the shortest instead of raising
    # IndexError the way positional indexing would.
    return [
        {"title": title, "year": year, "pote": pote, "poetics": poem}
        for title, year, pote, poem in zip(titles, years, potes, cleaned)
    ]


def get_poetics(my_url):
    """Download one listing page and append its poems to my_poetic_list.

    Args:
        my_url: URL of the page to fetch.

    Returns:
        None. Parsed records are appended to the module-level
        ``my_poetic_list``.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    # A timeout keeps an unresponsive server from hanging the crawl forever.
    text = requests.get(url=my_url, headers=headers, timeout=10).text
    my_poetic_list.extend(_parse_poetics(text))

if __name__ == '__main__':
    # Crawl default_0.aspx through default_10.aspx; get_poetics() collects
    # the parsed poems in my_poetic_list.
    for page in range(0, 11):
        url = "https://www.gushiwen.org/default_{}.aspx".format(page)
        get_poetics(url)
    # Print every collected poem record, one per line.
    for poem in my_poetic_list:
        print(poem)

2、糗事百科案例

import re,requests


# HTTP headers sent with every request made by get_acticles(); the
# User-Agent value is a desktop Chrome browser identification string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}
# Module-level accumulator: get_acticles() appends one dict per article here.
my_lovehhy = []
def _parse_articles(text):
    """Extract article records from the HTML of one page.

    Args:
        text: HTML source of the page as a string.

    Returns:
        A list of dicts with the keys "title" and "content", one per
        article for which both fields were matched.
    """
    titles = re.findall(r'<h3.*?><a.*?>(.*?)</a>', text, re.DOTALL)
    articles = re.findall(r'<div id="endtext">(.*?)</div>', text, re.DOTALL)

    # Titles and bodies are matched independently; zip() stops at the shorter
    # list instead of raising IndexError when a page has fewer titles than
    # article bodies.
    return [
        {
            "title": title,
            # Strip inline tags and full-width (ideographic) spaces.
            "content": re.sub(r"<.*?>", '', article).replace("\u3000", ''),
        }
        for title, article in zip(titles, articles)
    ]


def get_acticles(url):
    """Download one page and append its articles to my_lovehhy.

    Args:
        url: URL of the page to fetch.

    Returns:
        None. Parsed records are appended to the module-level
        ``my_lovehhy``.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            times out.
    """
    # A timeout keeps an unresponsive server from hanging the crawl forever.
    text = requests.get(url=url, headers=headers, timeout=10).text
    my_lovehhy.extend(_parse_articles(text))

if __name__ == '__main__':
    # Crawl detail pages 0 through 9; get_acticles() collects the parsed
    # articles in my_lovehhy.
    for page in range(10):
        url = "http://www.lovehhy.net/Joke/Detail/QSBK/{}".format(page)
        get_acticles(url)
    # Print every collected article record, one per line.
    for joke in my_lovehhy:
        print(joke)

分类:

技术点:

相关文章:

  • 2021-12-22
  • 2021-10-01
  • 2021-04-23
  • 2021-07-15
  • 2021-07-08
  • 2021-09-23
猜你喜欢
  • 2021-10-23
  • 2022-12-23
  • 2021-04-05
  • 2021-08-11
  • 2022-12-23
  • 2021-07-07
相关资源
相似解决方案