Page analysis

First, let's take a look at the page of the site we want to scrape.

If you view the page source, you'll find that the content is loaded and rendered dynamically by JavaScript.

So we'll use Selenium with headless Chrome to scrape it.
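A quick way to confirm this is to fetch the page without a browser and check whether the news items are present. This is just a minimal sketch: the news_title class comes from the XPath used later in this post, the rest is an assumption.

import requests

# Fetch the raw HTML without executing any JavaScript
resp = requests.get('https://news.163.com/domestic/',
                    headers={'User-Agent': 'Mozilla/5.0'})
# The news entries parsed later live under div.news_title; since they are
# filled in by JS, they are expected to be missing from this plain response
print('news_title' in resp.text)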

1. Load the site and scroll to the bottom; you'll notice there is still a "load more" button.

2. Simulate a click on it, then scroll to the bottom again, and the entire page will be loaded.

Sample code
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
from lxml import etree
import os
import requests

# Use headless Chrome to render the dynamically loaded JS
def main():
    # Create a headless browser object
    chrome_options = Options()
    # Run Chrome in headless mode
    chrome_options.add_argument('--headless')
    # This flag is needed when running on Windows
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(options=chrome_options)
    # Set an implicit wait of 10 seconds
    browser.implicitly_wait(10)
    browser.get(url)
    sleep(1)
    # Scroll to the bottom of the page
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Click the "load more" button
    browser.find_element(By.CSS_SELECTOR, '.load_more_btn').click()
    sleep(1)
    # Scroll to the bottom again
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the fully rendered page source
    source = browser.page_source
    browser.quit()
    with open('xinwen.html', 'w', encoding='utf-8') as f:
        f.write(source)
    parse_page(source)

# Parse the news list page
def parse_page(html):
    # Create the etree object
    tree = etree.HTML(html)
    new_lst = tree.xpath('//div[@class="ndi_main"]/div')
    for one_new in new_lst:
        title = one_new.xpath('.//div[@class="news_title"]/h3/a/text()')[0]
        link = one_new.xpath('.//div[@class="news_title"]/h3/a/@href')[0]
        write_in(title, link)

# Write each article to a file
def write_in(title, link):
    print('Writing article: {}'.format(title))
    response = requests.get(url=link, headers=headers)
    tree = etree.HTML(response.text)
    content_lst = tree.xpath('//div[@class="post_text"]//p')
    # Strip characters that are not allowed in file names
    title = title.replace('?', '')
    with open('new/' + title + '.txt', 'a+', encoding='utf-8') as f:
        for one_content in content_lst:
            if one_content.text:
                con = one_content.text.strip()
                f.write(con + '\n')


if __name__ == '__main__':
    url = 'https://news.163.com/domestic/'
    headers = {"User-Agent": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'}
    if not os.path.exists('new'):
        os.mkdir('new')
    main()

The result:

Opening one of the .txt files at random:

Scrapy version

wangyi.py

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from happy1.items import Happy1Item

class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['https://news.163.com/domestic/']
    start_urls = ['http://news.163.com/domestic/']

    def __init__(self):
        # Create a headless browser object
        chrome_options = Options()
        # Run Chrome in headless mode
        chrome_options.add_argument('--headless')
        # This flag is needed when running on Windows
        chrome_options.add_argument('--disable-gpu')
        # Instantiate the browser object once for the whole spider
        self.bro = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        new_lst = response.xpath('//div[@class="ndi_main"]/div')
        for one_new in new_lst:
            item = Happy1Item()
            title = one_new.xpath('.//div[@class="news_title"]/h3/a/text()')[0].extract()
            link = one_new.xpath('.//div[@class="news_title"]/h3/a/@href')[0].extract()
            item['title'] = title
            yield scrapy.Request(url=link, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        content_list = response.xpath('//div[@class="post_text"]//p/text()').extract()
        item['content'] = content_list
        yield item

    # Close the browser when the spider finishes
    def closed(self, reason):
        print('Spider finished')
        self.bro.quit()
pipelines.py
import os


class Happy1Pipeline(object):
    def __init__(self):
        self.fp = None

    def open_spider(self, spider):
        print('Spider started')
        # Make sure the output directory exists before any item is written
        os.makedirs('news', exist_ok=True)

    def process_item(self, item, spider):
        # Strip characters that are not allowed in file names
        title = item['title'].replace('?', '')
        self.fp = open('news/' + title + '.txt', 'a+', encoding='utf-8')
        for one in item['content']:
            self.fp.write(one.strip() + '\n')
        self.fp.close()
        return item
items.py
import scrapy


class Happy1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
middlewares.py
import time

from scrapy.http import HtmlResponse
from selenium.webdriver.common.by import By


class Happy1DownloaderMiddleware(object):
    # Only process_response needs to change in the generated middleware class
    def process_response(self, request, response, spider):
        # Intercept only the news list page, which needs JS rendering
        if request.url in ['http://news.163.com/domestic/']:
            spider.bro.get(url=request.url)
            time.sleep(1)
            spider.bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            # Click "load more", then scroll to the bottom again
            spider.bro.find_element(By.CSS_SELECTOR, '.load_more_btn').click()
            time.sleep(1)
            spider.bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            page_text = spider.bro.page_source
            # Hand the rendered source back to the engine as an HtmlResponse
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            return response
settings.py
DOWNLOADER_MIDDLEWARES = {
   'happy1.middlewares.Happy1DownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
   'happy1.pipelines.Happy1Pipeline': 300,
}
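Depending on the project template, one more setting may be needed: Scrapy projects ship with ROBOTSTXT_OBEY = True by default, and if the target pages are disallowed by the site's robots.txt the requests are silently dropped. This addition is an assumption, not part of the original project:

ROBOTSTXT_OBEY = False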

The result:

Summary:

1. The bulk of the work is really just simulating browser interactions.

2. There are other ways to handle dynamically loaded JS as well; see the sketch after this list.

3. There are many ways to write a crawler; the main thing is picking the one that suits you.

4. My own code is still pretty rough.
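On point 2, one such alternative is to skip the browser entirely: open the browser's network panel, find the XHR/JSONP request that fills in the news list, and call that endpoint directly with requests. A minimal sketch follows; the endpoint URL and the JSON field names below are purely hypothetical and have to be read from the network panel for the real site.

import json
import requests

# Hypothetical list endpoint discovered in the browser's network panel;
# the real URL and response format must be checked by hand
api_url = 'https://news.163.com/some/list/api?page=1'

resp = requests.get(api_url, headers={'User-Agent': 'Mozilla/5.0'})
# Many such endpoints return JSON (or JSONP whose callback wrapper
# needs to be stripped before parsing)
data = json.loads(resp.text)
for entry in data.get('data', []):
    # 'title' and 'docurl' are assumed field names for illustration
    print(entry.get('title'), entry.get('docurl'))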

 
