2.1 Three Methods for Scraping a Web Page

1. Regular expressions

Regular expressions can match elements directly in the raw HTML of a page.
The drawback of this approach is that if the page layout changes, the patterns are likely to break. Regular expressions are also hard to construct and hard to read.
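As a minimal sketch (the HTML fragment below is invented for illustration), a regular expression can extract a value, but the pattern is tightly coupled to the exact markup:

```python
import re

# Invented HTML fragment standing in for a downloaded page.
html = '<tr id="places_area__row"><td class="w2p_fw">244,820 square kilometres</td></tr>'

# The pattern hard-codes the class attribute; any change to the markup
# (attribute order, extra whitespace, a wrapper tag) would break it.
match = re.search(r'<td class="w2p_fw">(.*?)</td>', html)
print(match.group(1))  # → 244,820 square kilometres
```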

2. Beautiful Soup

Beautiful Soup is a Python module that parses web pages and provides an interface for locating their elements.
It first parses the page into a soup document, then locates elements with methods such as find() and find_all().
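A minimal sketch of those two steps, again on an invented fragment:

```python
from bs4 import BeautifulSoup

# Invented fragment; a real scraper would parse the downloaded page.
html = '<table><tr id="places_area__row"><td class="w2p_fw">244,820</td></tr></table>'
soup = BeautifulSoup(html, 'html.parser')    # step 1: parse into a soup document

# step 2: locate elements; find() returns the first match, find_all() every match
row = soup.find('tr', id='places_area__row')
print(row.find('td', class_='w2p_fw').text)  # → 244,820
```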

3. lxml

lxml is built on top of C libraries, so it parses pages faster than Beautiful Soup.
Like Beautiful Soup, its first step is to parse potentially invalid HTML into a consistent format. For selecting elements, lxml offers two kinds of selectors, CSS and XPath; their syntax for common selections compares as follows:

    Selection              CSS selector    XPath selector
    all elements           *               //*
    all <a> tags           a               //a
    id "myid"              #myid           //*[@id="myid"]
    class "myclass"        .myclass        //*[contains(@class, "myclass")]
    <a> child of <div>     div > a         //div/a
    <a> inside <div>       div a           //div//a

XPath selectors are more accurate and specific, while CSS selectors are also useful when parsing dynamic web pages.

4. Implementing the three scraping methods

import re
from bs4 import BeautifulSoup
from lxml.html import fromstring

FIELDS = ('area', 'population', 'iso', 'country', 'capital',
          'continent', 'tld', 'currency_code', 'currency_name',
          'phone', 'postal_code_format', 'postal_code_regex',
          'languages', 'neighbours')


def re_scraper(html):
    """ Using regex to extract data from country pages. """
    results = {}
    for field in FIELDS:
        results[field] = re.search(
            '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>'
            % field, html).groups()[0]
    return results


def bs_scraper(html):
    """ Using beautifulsoup to extract data from country pages. """
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        results[field] = soup.find('table').find(
            'tr', id='places_%s__row' % field).find(
                'td', class_='w2p_fw').text
    return results


def lxml_scraper(html):
    """ Using lxml and cssselect to extract data from country pages. """
    tree = fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect(
            'table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
    return results


def lxml_xpath_scraper(html):
    """ Using lxml and xpath to extract data from country pages. """
    tree = fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.xpath(
            '//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
    return results

5. Performance comparison

Run each of the methods above 1,000 times, check that the results are correct, and print the total time taken.

import time
import re
from ch1_link_crawler import download
NUM_ITERATIONS = 1000 # number of times to test each scraper
html = download('http://example.webscraping.com/places/default/view/United-Kingdom-239')
scrapers = [
    ('Regular expressions', re_scraper),
    ('BeautifulSoup', bs_scraper),
    ('Lxml', lxml_scraper),
    ('Xpath', lxml_xpath_scraper)]
for name, scraper in scrapers:
    # record start time of scrape
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            re.purge()  # the re module caches compiled patterns by default; clear that cache so the comparison is fair
        result = scraper(html)
        # check scraped result is as expected
        assert result['area'] == '244,820 square kilometres'
    # record end time of scrape and output the total
    end = time.time()
    print('%s: %.2f seconds' % (name, end - start))

Beautiful Soup is more than six times slower than the other two methods. lxml and the regular expression engine are written in C, while Beautiful Soup is pure Python, so this result is expected. lxml performs about as well as regular expressions, even though it must first parse the input into its internal format before it can search for elements, which adds overhead. When scraping multiple fields from the same page, however, that one-off parsing cost is amortized and lxml becomes even more competitive.
In general, lxml is the best choice for scraping data: it is not only fast but also more feature-rich, while regular expressions and Beautiful Soup are useful only in certain specific scenarios.

2.2 Adding a scrape callback

We add a callback parameter to handle the scraping behavior. A callback is a function that is invoked after a particular event occurs; here it is called after each page finishes downloading. The final results are saved to a CSV file.

def scrape_callback(url, html):
    """ Scrape each row from the country data using XPath and lxml """
    fields = ('area', 'population', 'iso', 'country', 'capital',
              'continent', 'tld', 'currency_code', 'currency_name',
              'phone', 'postal_code_format', 'postal_code_regex',
              'languages', 'neighbours')
    if re.search('/view/', url):
        tree = fromstring(html)
        all_rows = [
            tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
            for field in fields]
        print(url, all_rows)


def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxy=None, delay=3, max_depth=4, scrape_callback=None):
    """ Crawl from the given start URL following links matched by link_regex. In the current
        implementation, we do not actually scrape any information.
        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxy (str): proxy url, ex 'http://IP' (default: None)
            delay (int): seconds to throttle between requests to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
            scrape_callback (function): function to call after each download (default: None)
    """
    crawl_queue = [start_url]
    # keep track which URL's have seen before
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxy=proxy)
            if not html:
                continue
            if scrape_callback:
                data.extend(scrape_callback(url, html) or [])
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)


import csv
import re
from lxml.html import fromstring


class CsvCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = fromstring(html)
            all_rows = [
                tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
                for field in self.fields]
            self.writer.writerow(all_rows)


link_crawler('http://example.webscraping.com', '/places/default' + '/(index|view)',
             max_depth=2, scrape_callback=CsvCallback())

__call__ is a special method that is invoked when an object is "called" as if it were a function, and this is how scrape_callback is used in the link crawler: scrape_callback(url, html) is equivalent to calling scrape_callback.__call__(url, html).
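A toy class (unrelated to the crawler, purely for illustration) makes the equivalence concrete:

```python
class Greeter:
    def __call__(self, name):
        # invoked whenever an instance is called like a function
        return 'hello, %s' % name

greet = Greeter()
# Calling the instance dispatches to __call__:
print(greet('world'))                             # → hello, world
print(greet('world') == greet.__call__('world'))  # → True
```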
