该程序用于抓取人人车(renrenche.com)上的二手车出售信息,包括车系、车型、购车日期、售价、行驶里程、首付金额等。话不多说,直接上代码。
数据入库之后,可以用下面的命令把 MongoDB 里的数据导出为 CSV 文件(可用 Excel 打开):
mongoexport -d myDB -c user -f _id,name,password,adress --csv -o ./user.csv
-d 指定数据库
-c 指定集合(数据表)
-f 指定需要导出的字段(field),多个字段用逗号分隔;--csv 表示以 CSV 格式导出(较新版本的 mongoexport 改用 --type=csv)
-o 指定输出文件路径
第一个 py 文件:抓取品牌和车系
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from scrapy.http import Request
# from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy.selector import HtmlXPathSelector
# MongoDB handle: documents are written to the "Carclass" collection of the
# "renrenche" database on the local server.
client = pymongo.MongoClient(host="127.0.0.1")
db = client.renrenche
collection = db.Carclass  # collection name (classification)
import redis  # Redis client library
# Redis handle on the local server, database 0.
# Fixed: the quotes around the host were backslash-escaped outside of a
# string literal, which is a syntax error.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class renrencheSpider(scrapy.Spider):
    """Collect brand and series links from renrenche.com.

    ``parse`` stores every brand link found on the start page in MongoDB and
    follows it. ``parse_subclass`` stores every series link of that brand
    (with the brand's id as ``pid``) and pushes ``"<id>,<url>,<pid>"`` onto
    the Redis list ``carurl``.
    """

    name = "Carinfo1"
    allowed_domains = ["renrenche.com"]  # domains the spider may visit
    start_urls = [
        "https://www.renrenche.com/bj/ershouche/"
    ]

    def parse(self, response):
        """Store each brand on the start page and request its page.

        Called by Scrapy for every downloaded start URL.

        Yields:
            One ``Request`` per brand link, handled by ``parse_subclass``.
        """
        # response.xpath() replaces the deprecated HtmlXPathSelector/.select().
        brand_links = response.xpath(
            '//div[@class="brand-more-content"]'
            '/div[@class="brand-section brand-section-1"]'
            '/p[@class="bl"]/span[@class="bn"]/a'
        )
        for link in brand_links:
            hrefs = link.xpath("@href").extract()
            if not hrefs:
                # An anchor without href cannot be followed; skip it instead
                # of failing on hrefs[0].
                continue
            brand_url = "https://www.renrenche.com" + hrefs[0]
            # The list of text nodes is stored as-is, so the document layout
            # written to MongoDB stays the same as before.
            brand_name = link.xpath("text()").extract()
            brand_id = self.insertMongo(brand_name, None)
            self.logger.debug("brand %s -> %s", brand_name, brand_url)
            # pid is bound as a default argument so every callback keeps the
            # id of its own brand rather than the last one of the loop.
            yield Request(
                brand_url,
                callback=lambda response, pid=str(brand_id):
                    self.parse_subclass(response, pid),
            )

    def parse_subclass(self, response, pid):
        """Store each series of one brand and queue its listing URL.

        Args:
            response: The downloaded brand page.
            pid: String id of the brand document the series belong to.
        """
        series_links = response.xpath(
            '//ul[@id="filter_series"]/li[@class=""]/a'
        )
        for link in series_links:
            hrefs = link.xpath("@href").extract()
            if not hrefs:
                continue
            series_url = "https://www.renrenche.com" + hrefs[0]
            series_name = link.xpath("text()").extract()
            self.logger.debug("series %s -> %s", series_name, series_url)
            series_id = self.insertMongo(series_name, pid)
            self.pushRedis(series_id, series_url, pid)

    def insertMongo(self, classname, pid):
        """Insert one category document and return its ``_id``.

        Args:
            classname: Value stored under ``classname``.
            pid: Id of the parent category, or ``None`` for a brand.
        """
        # Collection.insert() is deprecated and was removed in PyMongo 4;
        # insert_one() returns the same id through ``inserted_id``.
        return collection.insert_one(
            {'classname': classname, 'pid': pid}
        ).inserted_id

    def pushRedis(self, classid, url, pid,):
        """Push ``"<classid>,<url>,<pid>"`` onto the Redis list ``carurl``."""
        carurl = '%s,%s,%s' % (classid, url, pid)
        r.lpush('carurl', carurl)
第二个 py 文件:抓取各车系下的车辆出售信息
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from scrapy.http import Request
import pymongo
import scrapy
from time import sleep
from scrapy.selector import HtmlXPathSelector
# MongoDB handle: documents are written to the "Carinfo" collection of the
# "renrenche" database on the local server.
client = pymongo.MongoClient(host="127.0.0.1")
db = client.renrenche
collection = db.Carinfo
import redis  # Redis client library
# Redis handle on the local server, database 0.
# Fixed: the quotes around the host were backslash-escaped outside of a
# string literal, which is a syntax error.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class renrencheSpider(scrapy.Spider):
    """Scrape the cars listed on every series page queued in Redis.

    The start URLs are read from the Redis list ``carurl``, whose entries
    are ``"<id>,<url>,..."`` strings. For every car found on a page, one
    document with model, year/mileage, price and down payment is written
    to MongoDB.
    """

    name = "Carinfo2"
    allowed_domains = ["renrenche.com"]
    # Class-level defaults are kept so the attributes still exist on the
    # class; __init__ replaces both with per-instance objects.
    dict = {}
    start_urls = []

    def __init__(self, *args, **kwargs):
        """Load the queued URLs from Redis into ``start_urls`` and ``dict``.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        # Per-instance containers: the previous version appended to the
        # class-level list/dict, so every new instance duplicated the URLs.
        self.dict = {}
        self.start_urls = []
        for raw in r.lrange('carurl', 0, -1):
            fields = raw.decode('utf-8').split(',')
            if len(fields) < 2:
                self.logger.warning("skipping malformed carurl entry: %r", raw)
                continue
            pid, url = fields[0], fields[1]
            if url in self.dict:
                # The same URL may have been queued more than once.
                continue
            self.start_urls.append(url)
            self.dict[url] = {"pid": pid, "num": 0}

    def parse(self, response):
        """Store one MongoDB document per car listed on the page.

        Each field is read relative to the car's own ``<li>`` element. The
        previous version used absolute ``//`` XPaths inside the loop and only
        kept index 0, so at most the first car of a page was stored.
        """
        class_info = self.dict.get(response.url)
        if class_info is None:
            # E.g. after a redirect the final URL is not one that was queued.
            self.logger.warning("no category known for %s", response.url)
            return None
        pid = class_info['pid']
        # NOTE(review): nothing in this class increments "num", so this
        # guard never triggers as written.
        if class_info['num'] > 3:
            return None

        cars = response.xpath(
            '//ul[@class="row-fluid list-row js-car-list"]'
            '//li[@class="span6 list-item car-item"]'
        )
        for car in cars:
            titles = car.xpath(
                './a[@rrc-event-param="search"]/h3/text()'
            ).extract()
            if not titles:
                # Without a model name the entry is not a usable car record.
                continue
            mileage = car.xpath(
                './/div[@class="mileage"]/span[@class="basic"]/text()'
            ).extract()
            prices = car.xpath(
                './/div[@class="tags-box"]/div[@class="price"]/text()'
            ).extract()
            down_payments = car.xpath(
                './/div[@class="down-payment"]/div[@class="m-l"]/text()'
            ).extract()

            model = "型号:" + titles[0]
            # A missing field is stored as an empty string instead of
            # aborting the whole page with an IndexError.
            year_km = (
                "购车年份/公里数:" + "/".join(mileage[:2]) if mileage else ""
            )
            price = (
                "卖车价格:" + str(prices[0]).strip() + "万" if prices else ""
            )
            down_payment = (
                "首付:" + down_payments[0] + "万" if down_payments else ""
            )
            self.logger.debug(model)
            self.insertMongo(model, year_km, price, down_payment, pid)
        return None

    def insertMongo(self, classname, classname1, classname2, classname3, pid):
        """Insert one car document and return its ``_id``.

        Args:
            classname: Model text.
            classname1: Purchase year / mileage text.
            classname2: Selling price text.
            classname3: Down payment text.
            pid: Id taken from the Redis entry of the page's URL.
        """
        # Collection.insert() is deprecated and was removed in PyMongo 4;
        # insert_one() returns the same id through ``inserted_id``.
        return collection.insert_one({
            'classname': classname,
            'classname1': classname1,
            'classname2': classname2,
            'classname3': classname3,
            'pid': pid,
        }).inserted_id
# r.lpush(\'novelnameurl\', novelnameurl)