ken-yu

爬取安居客-苏州吴江所有二手楼盘并导出为 CSV 文件(可直接用 Excel 打开)

爬取后保留的信息有:"标题"、"楼盘名称"、"地址"。

https://suzhou.anjuke.com/sale/p{}

import requests
from lxml import etree
import csv
 
class Anjuke:
    """Scrape second-hand housing listings from Anjuke and export them to CSV.

    The crawler walks a contiguous range of listing pages built from
    ``url_temp``, extracts three fields per listing ("标题", "楼盘名称",
    "地址") and writes all of them to ``信息.csv`` in the working directory.
    """

    def __init__(self, first_page=1, last_page=3):
        """Configure the crawler.

        Args:
            first_page: Number of the first listing page to fetch (inclusive).
            last_page: Number of the last listing page to fetch (inclusive).
        """
        self.url_temp = "https://suzhou.anjuke.com/sale/p{}"
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/74.0.3729.169 Safari/537.36"
            ),
        }
        self.first_page = first_page
        self.last_page = last_page
        # Seconds to wait for the server before giving up on a request.
        self.timeout = 10

    def get_url_list(self):
        """Return the page URLs for ``first_page``..``last_page`` inclusive."""
        # ``range`` excludes its stop value, hence the ``+ 1``.
        return [
            self.url_temp.format(page)
            for page in range(self.first_page, self.last_page + 1)
        ]

    def pase_url(self, url):
        """Fetch ``url`` and return the response body decoded as text.

        Raises:
            requests.RequestException: On connection failure, timeout, or a
                4xx/5xx HTTP status.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Fail loudly on an error status instead of parsing an error page
        # and silently producing an empty result.
        response.raise_for_status()
        return response.content.decode()

    @staticmethod
    def _first_text(node, path):
        """Return the first XPath result of ``path`` under ``node``, or ""."""
        matches = node.xpath(path)
        return matches[0] if matches else ""

    def get_content_list(self, html_str):
        """Extract one dict per listing from a listing page's HTML.

        Args:
            html_str: HTML source of one listing page.

        Returns:
            A list of dicts with the keys "标题", "楼盘名称" and "地址".
            A field whose node is missing is returned as an empty string.
        """
        html = etree.HTML(html_str)
        if html is None:
            # Defensive: nothing parsable in the document, so no listings.
            return []

        content_list = []
        for li in html.xpath('//ul[@id="houselist-mod-new"]/li'):
            title = self._first_text(
                li, './/div[@class="house-title"]/a/text()')
            # The complex name and the address share one text node,
            # separated by non-breaking spaces: name first, address last.
            location = self._first_text(
                li,
                './/div[@class="details-item"]'
                '/span[@class="comm-address"]/text()')
            parts = location.split("\xa0")
            content_list.append({
                "标题": title.strip(),
                "楼盘名称": parts[0].strip(),
                "地址": parts[-1].strip(),
            })
        return content_list

    def save_content_list(self, content_list):
        """Write ``content_list`` to ``信息.csv``, replacing any existing file.

        Args:
            content_list: Iterable of dicts keyed by "标题", "楼盘名称", "地址".
        """
        headers = ["标题", "楼盘名称", "地址"]
        # "utf-8-sig" writes a BOM at the start of the file;
        # newline="" lets the csv module control line endings.
        with open("信息.csv", "w", encoding="utf-8-sig", newline="") as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()
            writer.writerows(content_list)

    def run(self):
        """Crawl every configured page and save all listings in one file."""
        all_rows = []
        for url in self.get_url_list():
            html_str = self.pase_url(url)
            all_rows.extend(self.get_content_list(html_str))
        # Save once, after the loop: save_content_list truncates the file,
        # so saving per page would keep only the last page's rows.
        self.save_content_list(all_rows)
 
if __name__ == '__main__':
    # Use a distinct variable name so the Anjuke class is not rebound
    # to one of its own instances.
    spider = Anjuke()
    spider.run()

  

分类:

技术点:

相关文章:

  • 2021-11-12
  • 2021-10-18
  • 2021-11-12
  • 2021-11-12
  • 2021-11-12
  • 2021-11-12
  • 2021-11-12
  • 2021-11-22
猜你喜欢
  • 2021-11-22
  • 2021-11-12
  • 2021-11-12
  • 2021-08-14
  • 2021-11-12
  • 2021-11-12
  • 2021-11-12
相关资源
相似解决方案