
Most crawler scripts follow nearly the same template.
Install the dependencies up front: pandas to store the data, requests to fetch URL resources, BeautifulSoup4 (bs4) to extract data, and lxml to parse the HTML.
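A typical one-line install covering all of them (openpyxl is added here as an assumption, since pandas relies on an Excel engine such as openpyxl to write .xlsx files):

pip install pandas requests beautifulsoup4 lxml openpyxl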

Scraping Lianjia second-hand housing listings

# -*- coding: utf-8 -*-
# @Author  : LEHOSO
# @FileName: Lianjia2.py
# @Time    : 2021/10/11 16:55

import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Request headers: send a browser User-Agent so the site does not reject the request
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# Accumulates one dict per listing
results = []


def get_info(url):
    wb_data = requests.get(url, headers=header)
    # Parse the whole page
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Select the individual page elements
    prices = soup.select('div.totalPrice.totalPrice2 > span')
    titles = soup.select('div.title > a')
    locations = soup.select('div.flood > div > a')
    areas = soup.select('div.flood > div > a:nth-child(3)')
    follow_infos = soup.select('div.followInfo')
    # Collect each listing into the results list
    # (keys, in Chinese: price / title / location / follows / days since listing)
    for price, title, location, area, follow_info in zip(prices, titles, locations, areas, follow_infos):
        data = {
            '价格': price.get_text().strip(),
            '标题': title.get_text().strip(),
            '位置': location.get_text().strip() + '-' + area.get_text().strip(),
            '关注': follow_info.get_text().strip().split('/')[0],
            '距今发布日期': follow_info.get_text().strip().split('/')[1]
        }
        results.append(data)
        print(data)


if __name__ == '__main__':
    # Page URLs to scrape
    urls = [
        'https://cq.lianjia.com/ershoufang/'
    ]
    for url in urls:
        get_info(url)
        time.sleep(2)
    # Collect the scraped records into a pandas DataFrame
    df_out = pd.DataFrame(results, columns=['价格', '标题', '位置', '关注', '距今发布日期'])
    # Export as an .xlsx file
    df_out.to_excel('aaa.xlsx')
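The urls list above holds only the first results page. As a minimal sketch of how it could be extended, Lianjia paginates listings under /pgN/ paths, so the list can be generated instead; the pg URL pattern and the page count of 3 are assumptions for illustration, not part of the original script:

# Minimal pagination sketch, assuming Lianjia's /pgN/ URL scheme
# (e.g. https://cq.lianjia.com/ershoufang/pg2/); the page count is arbitrary.
urls = [f'https://cq.lianjia.com/ershoufang/pg{n}/' for n in range(1, 4)]
for url in urls:
    get_info(url)
    time.sleep(2)  # pause between requests to stay polite to the server

The time.sleep(2) call between requests is kept from the original loop; without some delay, rapid repeated requests are more likely to get the scraper blocked.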
