1、创建工程
scrapy startproject gosuncn
2、创建爬虫
cd gosuncn
scrapy genspider gaoxinxing gosuncn.zhiye.com
3、运行项目
scrapy crawl gaoxinxing
4、gaoxinxing.py代码
# -*- coding: utf-8 -*-
"""Scrapy spider for the Gosuncn (gosuncn.zhiye.com) social-recruitment job list.

Starts at http://gosuncn.zhiye.com/Social and walks the paginated listing,
yielding one dict per job-posting table row.
"""
import logging

import scrapy

# Module-level logger, named after the module (standard logging practice).
logger = logging.getLogger(__name__)


class GaoxinxingSpider(scrapy.Spider):
    name = 'gaoxinxing'
    allowed_domains = ['gosuncn.zhiye.com']
    start_urls = ['http://gosuncn.zhiye.com/Social']

    # Pagination state. `max_page` was a magic literal (4) buried inside
    # parse(); as a class attribute the crawl depth is tunable without
    # touching the parsing logic. Behavior with the default is unchanged.
    next_page_num = 1
    max_page = 4

    def parse(self, response):
        """Extract job rows from the listing table and schedule the next page.

        :param response: scrapy Response for a listing page.
        :yields: dicts with keys ``position``, ``platform``, ``num``, ``time``,
                 then a ``scrapy.Request`` for the next page while
                 ``next_page_num`` <= ``max_page``.
        """
        # [1:] skips the table's header row.
        tr_list = response.xpath("//table[@class='jobsTable']/tr")[1:]
        for tr in tr_list:
            item = {}
            item["position"] = tr.xpath(".//td[1]/a/text()").extract_first()
            item["platform"] = tr.xpath(".//td[3]/text()").extract_first()
            item["num"] = tr.xpath(".//td[4]/text()").extract_first()
            item["time"] = tr.xpath(".//td[6]/text()").extract_first()
            logger.warning(item)  # log each scraped item
            yield item

        # Pagination follows a known URL pattern instead of scraping the
        # "next" link (the link-based variant was commented out upstream:
        # //div[@class='pager2']//a[@class='next']/@href).
        self.next_page_num = self.next_page_num + 1
        if self.next_page_num <= self.max_page:
            next_url = "http://gosuncn.zhiye.com/social/?PageIndex=" + str(self.next_page_num)
            print(next_url)
            yield scrapy.Request(
                next_url,
                callback=self.parse,
            )