这篇博客主要是写两个爬虫,一个抓取静态网站的文字和图片,一个抓取动态网站的电影及相关信息。
1.每日一文(http://voice.meiriyiwen.com/)
#coding=utf-8 #爬取每日一文前10页内容 from lxml import etree import requests import urllib2,urllib import sys import os import time tmpt_url = 'http://voice.meiriyiwen.com/voice/past?page=%d' urllist = [tmpt_url%i for i in range(1,11)] def get_url(): for url in urllist: try: headers = { 'Host':'voice.meiriyiwen.com', 'Upgrade-Insecure-Requests':'1', 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36' , 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding':'gzip, deflate, sdch, br', 'Accept-Language':'zh-CN,zh;q=0.8', 'Cache-Control':'max-age=0', 'Connection':'keep-alive'} #proxies = { "http": "dev-proxy.oa.com:8080","https": "dev-proxy.oa.com:8080",} time.sleep(0.5) response = requests.get(url,headers = headers) print response.status_code get_info(response) except urllib2.URLError, e: print e.reason def get_info(response): global count html = response.content #print html tree = etree.HTML(html) rez = tree.xpath('//*[@class="img_list"]') for i in rez: title = i.xpath('//*[@class="list_author"]/a/text()') author = i.xpath('//*[@class="author_name"]/text()') for x,y in zip(title,author): count += 1 print count,'|',x.replace(u'\xa0','').strip(),'|',y.replace(u'\xa0','').strip() if __name__ == '__main__': count = 0 get_url()