chenyang920

python抓取链家房源信息

闲着没事就抓取了下链家网的房源信息,抓取的是北京二手房的信息情况,然后通过网址进行分析,有100页,并且每页的url都是类似的

url = 'https://bj.lianjia.com/ershoufang/pg' + 页数,然后请求是get 请求,所以静态页面,然后依次来进行分析,并且存储在mongodb中,每次插入的时候还是要把字符串转换成json格式再进行插入,页面的解析用的是bs,解析很方便,代码用的是单进程,耗时大致是66s,因为怕ip被封,所以在每次页面请求之后都要sleep 1秒。

#-*-coding:utf-8-*-
import urllib
import urllib2
import re
import requests
import json
import lxml
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient

from lxml import etree
# MongoDB connection: scraped listings are stored in the `House`
# collection of the local `test` database.
client = MongoClient('localhost', 27017)
db = client.test
House = db.House

# Browser-like request headers so bj.lianjia.com serves the normal HTML page.
# NOTE: the Cookie value is a placeholder — fill in a real session cookie
# before running, or the site may refuse or redirect the request.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '......',
    'Host': 'bj.lianjia.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
# Listing pages are URL + page number, e.g. .../ershoufang/pg1 ... pg100.
URL = 'https://bj.lianjia.com/ershoufang/pg'
def download(url):
    """Fetch *url* with the shared browser headers.

    Makes up to two attempts, then gives up.

    Returns:
        The response body as text, or None if both attempts failed.
    """
    num_try = 2
    while num_try > 0:
        num_try -= 1
        try:
            response = requests.get(url, headers=headers)
            # Treat HTTP error statuses (4xx/5xx) as failures worth retrying
            # instead of silently returning an error page's body.
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            # BUG FIX: the original caught urllib2.URLError, which `requests`
            # never raises, so every network error escaped the retry loop.
            print('Download error:', e)
    return None


def get_message(url):
    """Scrape one listing page: parse price + house info, insert into MongoDB.

    Each stored document has the keys Address, House_type, Area, Price,
    Toward, Decorate and Elevete (key spelling kept for schema compatibility).
    """
    html = download(url)
    if html is None:
        # Page could not be fetched; the original passed None straight into
        # BeautifulSoup and crashed with a TypeError.
        return

    soup = BeautifulSoup(html, 'html.parser')

    # Total price: the first <span> inside each <div class="priceInfo">.
    total_price = [div.span.string for div in soup.find_all('div', 'priceInfo')]

    records = []
    for info in soup.find_all('div', attrs={'class': 'houseInfo'}):
        fields = [part.strip() for part in info.get_text().split('|')]
        if len(fields) < 5:
            # Malformed listing row — the original indexed fields[4]
            # unconditionally and raised IndexError here.
            continue
        records.append({
            'Address': fields[0],
            'House_type': fields[1],
            'Area': fields[2],
            'Toward': fields[3],
            'Decorate': fields[4],
            # The sixth field (elevator) is optional on the site.
            'Elevete': fields[5] if len(fields) > 5 else 'None',
        })

    for record, price in zip(records, total_price):
        record['Price'] = price
        print(record)
        # BUG FIX: build the document as a dict instead of %-formatting a JSON
        # string and re-parsing it — the old approach produced invalid JSON
        # whenever any field contained a quote character.
        # NOTE(review): Collection.insert is deprecated in pymongo 3;
        # prefer House.insert_one(record) if the driver allows.
        House.insert(record)

if __name__ == '__main__':
    # Crawl listing pages 1..100, sleeping 1 s between requests to avoid
    # getting the IP banned. Print statements use the () form so the script
    # runs under both Python 2 and Python 3 (the original used Py2-only
    # `print x` and `xrange`).
    start = time.time()
    print(start)
    for page in range(1, 101):
        url = URL + str(page)
        print(url)
        get_message(url)
        time.sleep(1)  # throttle: be polite to the server
    elapsed = time.time() - start
    print('Total time:')
    # Subtract the 100 seconds spent sleeping (1 s per page) to report
    # only the time spent fetching and parsing.
    print(elapsed - 100)

 

分类:

技术点:

相关文章: