he-qing-qing

爬虫 实践 小例子

import os
from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree

# Directory that receives one text file per downloaded book.
dirName = './books'

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}

# Index page that lists every book.
url = 'http://www.shicimingju.com/book'

# Seconds to wait for a response; without a timeout a stalled
# connection would block the crawl forever.
REQUEST_TIMEOUT = 10

# Characters that are path separators or otherwise rejected in file names
# on common file systems; each one is replaced by an underscore.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def safe_filename(name):
    """Return *name* stripped and with file-system-unsafe characters replaced.

    Falls back to ``'untitled'`` when nothing usable is left, so the
    result can always be joined onto a directory and opened.
    """
    cleaned = name.strip().translate(_UNSAFE_FILENAME_CHARS)
    return cleaned or 'untitled'


def first_match(node, expression):
    """Return the first result of XPath *expression* on *node*, or None.

    ``xpath`` returns a list (e.g. ``['三国演义']``), so indexing it
    blindly raises IndexError when an anchor has no text or no href.
    """
    matches = node.xpath(expression)
    return matches[0] if matches else None


def fetch_tree(session, page_url):
    """Download *page_url* and return it parsed by ``etree.HTML``.

    Raises ``requests.RequestException`` on connection problems, on a
    timeout, or when the server answers with an HTTP error status.
    """
    response = session.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # When the server declares no charset, requests falls back to
    # ISO-8859-1 for text responses, which garbles Chinese pages.
    # Use the encoding detected from the body instead.
    if 'charset' not in response.headers.get('Content-Type', '').lower():
        response.encoding = response.apparent_encoding
    return etree.HTML(response.text)


def download_book(session, book_name, book_url):
    """Write every chapter linked from *book_url* into one file under dirName.

    Each chapter is written as ``title:content`` followed by a newline.
    A chapter that cannot be fetched is reported and skipped so one bad
    page does not abort the rest of the book.
    """
    contents_page = fetch_tree(session, book_url)
    chapter_links = contents_page.xpath('//div[@class="book-mulu"]//a')
    target_path = os.path.join(dirName, safe_filename(book_name))
    with open(target_path, 'w', encoding='utf-8') as book_file:
        for chapter_link in chapter_links:
            title = first_match(chapter_link, './text()')
            href = first_match(chapter_link, './@href')
            if title is None or href is None:
                continue
            try:
                chapter_page = fetch_tree(session, urljoin(book_url, href))
            except requests.RequestException as exc:
                print(title, "下载失败:", exc)
                continue
            text_nodes = chapter_page.xpath('//div[@class="chapter_content"]//text()')
            book_file.write(title + ':' + ''.join(text_nodes) + '\n')
            print(title, "下载成功")


def main():
    """Crawl the book index and download every listed book."""
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(dirName, exist_ok=True)
    # One session reuses the connection across the many requests
    # made to the same host.
    with requests.Session() as session:
        index_page = fetch_tree(session, url)
        for book_link in index_page.xpath('//div[@class="bookmark-list"]//a'):
            book_name = first_match(book_link, './text()')
            href = first_match(book_link, './@href')
            if book_name is None or href is None:
                continue
            try:
                download_book(session, book_name, urljoin(url, href))
            except requests.RequestException as exc:
                print(book_name, "下载失败:", exc)


if __name__ == '__main__':
    main()
           

分类:

技术点:

相关文章:

  • 2022-12-23
  • 2022-12-23
  • 2021-11-26
  • 2021-04-07
  • 2021-12-09
  • 2021-11-23
  • 2022-12-23
  • 2021-09-25
猜你喜欢
  • 2021-12-30
  • 2021-12-09
  • 2021-07-16
  • 2022-12-23
  • 2022-12-23
  • 2021-11-20
  • 2022-12-23
相关资源
相似解决方案