爬虫模板几乎一个样儿
请提前安装以下依赖:pandas(存储数据)、requests(请求URL资源)、bs4(BeautifulSoup4,提取数据)、lxml(解析网页);导出xlsx文件还需要安装openpyxl。
爬取链家二手房信息
# -*- coding: utf-8 -*-
# @Author : LEHOSO
# @FileName: Lianjia2.py
# @Time : 2021/10/11 16:55
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
# HTTP request headers: send a desktop-browser User-Agent with every request.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
    ),
}
# Accumulates one dict per scraped listing; get_info() appends to it.
a = []
def get_info(url):
    """Scrape one second-hand-housing listing page and record its rows.

    Downloads ``url`` using the module-level ``header`` dict, extracts the
    price, title, position and follow-info elements with CSS selectors, and
    for every listing appends a dict to the module-level list ``a`` and
    prints it.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    wb_data = requests.get(url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    wb_data.raise_for_status()

    # Parse the whole page.
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # Per-listing elements, one list per field.
    prices = soup.select('div.totalPrice.totalPrice2 > span')
    titles = soup.select('div.title > a')
    areas = soup.select('div.flood > div > a:nth-child(3)')
    follow_infos = soup.select('div.followInfo')
    # 'div.flood > div > a' also matches every area anchor, so zipping that
    # list against `areas` pairs anchors from different listings. Take the
    # first direct <a> of each area's own parent block instead, which keeps
    # location and area aligned one-to-one.
    locations = [area.parent.find('a', recursive=False) for area in areas]

    # Store each listing in the shared result list.
    for price, title, location, area, follow_info in zip(
            prices, titles, locations, areas, follow_infos):
        follow_parts = follow_info.get_text().strip().split('/')
        data = {
            '价格': price.get_text().strip(),
            '标题': title.get_text().strip(),
            '位置': location.get_text().strip() + '-' + area.get_text().strip(),
            '关注': follow_parts[0],
            # Tolerate follow-info text that has no '/' separator.
            '距今发布日期': follow_parts[1] if len(follow_parts) > 1 else '',
        }
        a.append(data)
        print(data)
if __name__ == '__main__':
    # Listing pages to crawl.
    urls = [
        'https://cq.lianjia.com/ershoufang/'
    ]
    for url in urls:
        get_info(url)
        # Pause after each page (presumably to avoid hitting the site too fast).
        time.sleep(2)
    # Build the table with pandas once every page has been collected.
    df_out = pd.DataFrame(a, columns=['价格', '标题', '位置', '关注', '距今发布日期'])
    # Export as an xlsx workbook.
    df_out.to_excel('aaa.xlsx')