Code:
import requests
from pyquery import PyQuery as pq
import csv
from threading import Thread,Lock
import time
import os
def get_one_url(url):
    try:
        headers = {
            'Referer': 'https://s1.ljcdn.com/matrix_pc/dist/pc/src/common/css/common.css?_v=20191213202326259',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        }
        # Fetch the page
        response = requests.get(url, headers=headers)
        # Check the response status
        if response.status_code == 200:
            return response.text
        return None
    except requests.exceptions.ConnectionError as e:
        print('error', e.args)
def parse_one_url(html):
    doc = pq(html)
    items = doc('.content__list--item--main').items()
    for item in items:
        # Acquire the lock so each thread finishes parsing an item before another thread runs
        mutes.acquire()
        title = item.find('a').text().split()
        info = {
            'content': title[0],
            'room': title[1],
            'direction': title[2],
            'size': item.find('p.content__list--item--des').text().split('/')[1],
            'price': item.find('.content__list--item-price').text(),
            'location': item.find('.content__list--item--des a').text().replace(' ', '-')
        }
        # Release the mutex
        mutes.release()
        print(info)
        # Generator: yield each record so the caller can consume them one by one
        yield info

# Create the mutex
mutes = Lock()
def man(page, city):
    url = 'https://sz.lianjia.com/zufang/%s/pg%d/#contentList' % (city, page)
    html = get_one_url(url)
    # Skip this page if the request failed
    if not html:
        return
    infos = parse_one_url(html)
    save(infos, city)
def save(infos, city):
    '''
    Check whether the CSV file already exists: if it does, the header row has
    already been written, so only append rows; otherwise write the header first.
    :param infos: records to save
    :return:
    '''
    if os.path.exists('%s.csv' % (city)):
        with open('%s.csv' % (city), 'a', encoding='utf-8', newline='') as csvfile:
            fieldname = ['content', 'room', 'direction', 'size', 'location', 'price']
            writer = csv.DictWriter(csvfile, fieldname)
            for info in infos:
                writer.writerow(info)
    else:
        with open('%s.csv' % (city), 'a', encoding='utf-8', newline='') as csvfile:
            fieldname = ['content', 'room', 'direction', 'size', 'location', 'price']
            # DictWriter lets dicts be written straight into the CSV file
            writer = csv.DictWriter(csvfile, fieldname)
            writer.writeheader()
            for info in infos:
                writer.writerow(info)
if __name__ == '__main__':
    cities = ['luohuqu', 'longhuaqu', 'futianqu']
    # Loop over the abbreviated names of Shenzhen districts
    for city in cities:
        ts = []
        # Create one thread per page so the pages are crawled concurrently,
        # which greatly speeds up the crawl
        for i in range(1, 50):
            t = Thread(target=man, args=(i, city))
            ts.append(t)
        for t in ts:
            t.start()
        # Wait for every thread to finish before moving on to the next city
        for t in ts:
            t.join()
        print('%s rental listings saved' % (city))
    print('All districts saved')
Improvements: the crawl could be made dynamic, letting the user type in the Shenzhen district they want and having the program fetch and save it automatically; multiple processes could crawl different districts at the same time and save each one to its own CSV file; and if you are worried about your IP being banned, you could connect to a proxy API and crawl through proxies.
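As a minimal sketch of the multiprocessing and proxy ideas: the code below reuses the man() function from the script above, while crawl_city(), the input() prompt, and the PROXIES address are assumptions added purely for illustration (a real proxy address would come from whatever proxy API you use). To actually route traffic through the proxy, get_one_url() would pass proxies=PROXIES to requests.get().

from multiprocessing import Process

# Hypothetical proxy address -- in practice this would be fetched from a proxy API
PROXIES = {'http': 'http://127.0.0.1:1080', 'https': 'http://127.0.0.1:1080'}

def crawl_city(city, pages=49):
    # Crawl every page of one district inside its own process
    for page in range(1, pages + 1):
        man(page, city)

if __name__ == '__main__':
    # Let the user choose the districts, e.g. "luohuqu futianqu"
    cities = input('Districts to crawl (space separated): ').split()
    processes = [Process(target=crawl_city, args=(c,)) for c in cities]
    for p in processes:
        p.start()
    for p in processes:
        p.join()

Because each district runs in its own process and writes to its own '%s.csv' file, the processes never contend for the same output file.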