这篇博客主要是写两个爬虫,一个抓取静态网站的文字和图片,一个抓取动态网站的电影及相关信息。
1.每日一文(http://voice.meiriyiwen.com/)
#coding=utf-8 #爬取每日一文前10页内容 from lxml import etree import requests import urllib2,urllib import sys import os import time tmpt_url = 'http://voice.meiriyiwen.com/voice/past?page=%d' urllist = [tmpt_url%i for i in range(1,11)] def get_url(): for url in urllist: try: headers = { 'Host':'voice.meiriyiwen.com', 'Upgrade-Insecure-Requests':'1', 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36' , 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding':'gzip, deflate, sdch, br', 'Accept-Language':'zh-CN,zh;q=0.8', 'Cache-Control':'max-age=0', 'Connection':'keep-alive'} #proxies = { "http": "dev-proxy.oa.com:8080","https": "dev-proxy.oa.com:8080",} time.sleep(0.5) response = requests.get(url,headers = headers) print response.status_code get_info(response) except urllib2.URLError, e: print e.reason def get_info(response): global count html = response.content #print html tree = etree.HTML(html) rez = tree.xpath('//*[@class="img_list"]') for i in rez: title = i.xpath('//*[@class="list_author"]/a/text()') author = i.xpath('//*[@class="author_name"]/text()') for x,y in zip(title,author): count += 1 print count,'|',x.replace(u'\xa0','').strip(),'|',y.replace(u'\xa0','').strip() if __name__ == '__main__': count = 0 get_url()