Python 抓取链家房源信息
闲着没事就抓取了下链家网的房源信息,抓取的是北京二手房的信息,然后通过网址进行分析,一共有100页,并且每页的url都是类似的
url = 'https://bj.lianjia.com/ershoufang/pg' + 页数,然后请求是 get 请求,所以是静态页面,然后依次来进行分析,并且存储在 mongodb 中,每次插入的时候还是要把字符串转换成 json 格式再进行插入,页面的解析用的是 bs,解析很方便,代码用的是单进程,耗时大致 66s,因为怕 ip 被封,所以在每次页面请求之后都要 sleep 1 秒。
# -*- coding: utf-8 -*-
"""Scrape Beijing second-hand housing listings from bj.lianjia.com into MongoDB.

There are 100 static listing pages (https://bj.lianjia.com/ershoufang/pgN,
N = 1..100).  Each page is fetched with requests, parsed with BeautifulSoup,
and every listing becomes one document in the local MongoDB collection
``test.House``.  A 1-second sleep between pages keeps the request rate polite.

Fixes versus the original script:
- ``download`` caught ``urllib2.URLError``, which ``requests`` never raises,
  so every network error escaped uncaught; it now catches
  ``requests.exceptions.RequestException``.
- ``get_message`` no longer passes a possible ``None`` into BeautifulSoup.
- Documents are built as dicts directly instead of %%-formatting a JSON
  string and re-parsing it (the old approach broke whenever a scraped field
  contained a double quote).
- Deprecated ``Collection.insert`` replaced by ``insert_one``.
- Unused imports (urllib, urllib2, re, json, lxml) removed; print is used
  in its one-argument form so the script runs under Python 2 and 3.
"""
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.test
House = db.House

# Browser-like request headers.  The Cookie value is a placeholder and must
# be replaced with a real session cookie, otherwise Lianjia may reject us.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '......',
    'Host': 'bj.lianjia.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/62.0.3202.94 Safari/537.36',
}

URL = 'https://bj.lianjia.com/ershoufang/pg'


def download(url, num_try=2):
    """Fetch *url* and return its HTML text, or None after num_try failures.

    RequestException is the base class of every error requests can raise
    (connection errors, timeouts, invalid URLs), so a transient failure is
    logged and retried instead of crashing the whole run.
    """
    for _ in range(num_try):
        try:
            response = requests.get(url, headers=headers)
            return response.text
        except requests.exceptions.RequestException as e:
            print('Download error: %s' % e)
    return None


def get_message(url):
    """Parse one listing page and insert each listing into MongoDB.

    Skips the page when the download failed (the original passed None
    straight into BeautifulSoup and crashed).
    """
    html = download(url)
    if html is None:
        return
    soup = BeautifulSoup(html, 'html.parser')

    # One total price per listing, taken from the <span> inside each
    # div.priceInfo block.
    total_prices = [tag.span.string
                    for tag in soup.find_all('div', 'priceInfo')]

    for info, price in zip(soup.find_all('div', attrs={'class': 'houseInfo'}),
                           total_prices):
        fields = [part.strip() for part in info.get_text().split('|')]
        # Listings without the elevator field have only 5 parts.
        if len(fields) == 5:
            fields.append('None')
        # Build the document directly instead of formatting a JSON string
        # and json.loads-ing it back: the old round-trip broke on any field
        # that contained a double quote.
        doc = {
            'Address': fields[0],
            'House_type': fields[1],
            'Area': fields[2],
            'Price': price,
            'Toward': fields[3],
            'Decorate': fields[4],
            # NOTE(review): 'Elevete' is a typo, but the key is kept so new
            # documents stay consistent with any already stored under it.
            'Elevete': fields[5],
        }
        print(doc)
        House.insert_one(doc)  # insert() is deprecated in pymongo 3.x


if __name__ == '__main__':
    start = time.time()
    print(start)
    for page in range(1, 101):
        page_url = URL + str(page)
        print(page_url)
        get_message(page_url)
        time.sleep(1)  # be polite so the IP does not get banned
    # Subtract the 100 deliberate 1-second sleeps to report pure work time.
    print('Total time:')
    print(time.time() - start - 100)