Tsukasa
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Tsukasa



import requests
from bs4 import BeautifulSoup
import pandas
import time


# Listing index page URLs to crawl; filled in once the page count is known.
url_all = []
# City abbreviation typed by the user (the prompt gives "zs" and "gz" as
# examples). It is spliced unvalidated into the fang.com host name.
url_in = input('输入你所需要城市的字母简写:\n如:中山 zs , 广州 gz\n!!!不要乱输入,不然运行不了')
# Requested page count plus one, so it can be used directly as the exclusive
# upper bound of range(1, url_number). Non-numeric input raises ValueError.
url_number = 1 + int(input('输入爬取页数:'))

okl = []  # not referenced by any code visible in this file
def open(nobe):
    """Return the absolute detail-page URLs linked from one listing page.

    NOTE: this name shadows the builtin ``open``. It is kept because the
    rest of the script calls the function by this name.

    Args:
        nobe: URL of a listing index page.

    Returns:
        A list of URLs, one per ``.houseList dl`` entry that has a
        ``.title a`` link with an ``href``. Each href is prefixed with the
        city host built from the module-level ``url_in``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including a timeout).
    """
    # Without a timeout a stalled connection would hang the crawl forever.
    res = requests.get(nobe, timeout=30)
    soup = BeautifulSoup(res.text, 'html5lib')
    url_start = 'http://esf.' + url_in + '.fang.com'
    http_start = []
    for entry in soup.select('.houseList dl'):
        links = entry.select('.title a ')
        # Skip entries without a usable title link instead of aborting the
        # whole run with IndexError/KeyError.
        if not links:
            continue
        url_end = links[0].get('href')
        if not url_end:
            continue
        http_start.append(url_start + url_end)
    return http_start


#获取详细信息
def content(url):
    """Scrape one listing detail page into a flat dict of fields.

    Args:
        url: URL of the detail page.

    Returns:
        A dict containing:
          * '网页': the URL itself,
          * '标题': stripped text of the first ``<h1>``,
          * '总价': text of the first ``.red20b`` element with '万' appended,
          * '联系电话': text of the first ``#mobilecode`` element,
          * one entry per "key:value" text found in a ``<span>`` mentioning
            '发布时间' (value suffixed with '*' and the ``#Time`` text),
          * one entry per "key:value" text found in a ``<dd>``.

    Raises:
        IndexError: if the page lacks an ``<h1>``, ``.red20b`` or
            ``#mobilecode`` element, or lacks ``#Time`` when a publish-time
            span is present.
        requests.exceptions.RequestException: if the page cannot be fetched
            (including a timeout).
    """
    info = {'网页': url}
    # Without a timeout a stalled connection would hang the crawl forever.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'html5lib')
    info['标题'] = soup.select('h1')[0].text.strip()
    info['总价'] = soup.select('.red20b')[0].text + '万'
    info['联系电话'] = soup.select('#mobilecode')[0].text

    # Publish time: the label and value are separated by the first colon.
    for span in soup.select('span'):
        if '发布时间' not in span.text:
            continue
        # partition() splits on the first colon only, so a value that itself
        # contains colons (e.g. a clock time) no longer breaks the unpacking.
        key, sep, value = span.text.strip().rstrip('(').partition(':')
        if not sep:
            continue
        info[key] = value + '*' + soup.select('#Time')[0].text

    # Detail fields: every <dd> whose text looks like "key:value".
    for dd in soup.select('dd'):
        key, sep, value = dd.text.strip().partition(':')
        if sep:
            info[key] = value
    return info




print('----------正在运行,请不要关闭----------')
# Listing index pages follow the pattern .../house/i3<page>/, with pages
# numbered from 1 up to (but not including) url_number.
url_home = ('http://esf.' + url_in + '.fang.com/house/i3{}/')
url_all.extend(url_home.format(page) for page in range(1, url_number))

home = []  # one dict of scraped fields per detail page
for index_url in url_all:
    # Announce the fetch before it starts so the progress output matches
    # what the script is actually doing.
    print('正在获取 -----> ', index_url, ' <-----')
    detail_urls = open(index_url)
    time.sleep(1)  # pause between requests
    for detail_url in detail_urls:
        print('\t正在获取详细信息 -> ', detail_url, ' <-----')
        home.append(content(detail_url))
        time.sleep(2)  # pause between requests

# Export everything collected to a single-sheet workbook in the working
# directory.
last = pandas.DataFrame(home)
last.to_excel('temp.xlsx', sheet_name='房源信息')
print('----------运行结束----------\n\n----------查看根目录---------')


# Wait for a key press before exiting; the typed value is not used.
abcdefg = input('完成运行')

  源码先奉上,以后再填坑

分类:

技术点:

相关文章: