1、创建工程

scrapy startproject gosuncn

2、创建爬虫

cd gosuncn
 scrapy genspider gaoxinxing gosuncn.zhiye.com

3、运行项目

scrapy crawl gaoxinxing

4、gaoxinxing.py代码

# -*- coding: utf-8 -*-
import scrapy
import logging

logger = logging.getLogger(__name__)
#引入日志
class GaoxinxingSpider(scrapy.Spider):
    """Spider for job postings on gosuncn.zhiye.com.

    Parses the jobs table on each listing page, yields one dict per row,
    and follows pagination up to page 4 (hard-coded limit).
    """
    name = 'gaoxinxing'
    allowed_domains = ['gosuncn.zhiye.com']
    start_urls = ['http://gosuncn.zhiye.com/Social']
    next_page_num = 1  # page number of the most recently requested listing page

    def parse(self, response):
        """Yield one item per job row, then schedule the next page request."""
        # First <tr> is the table header — skip it.
        rows = response.xpath("//table[@class='jobsTable']/tr")[1:]
        for row in rows:
            item = {
                "position": row.xpath(".//td[1]/a/text()").extract_first(),
                "platform": row.xpath(".//td[3]/text()").extract_first(),
                "num": row.xpath(".//td[4]/text()").extract_first(),
                "time": row.xpath(".//td[6]/text()").extract_first(),
            }
            logger.warning(item)  # log each scraped item
            yield item

        # Pagination: bump the counter and request the next page, capped at 4.
        self.next_page_num += 1
        if self.next_page_num <= 4:
            next_url = "http://gosuncn.zhiye.com/social/?PageIndex=" + str(self.next_page_num)
            print(next_url)
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
View Code

相关文章: