gxy-9977

爬虫-链家二手房

import ssl, re,json
from urllib.request import urlopen
# Disable TLS certificate verification for every HTTPS request made through
# urllib in this process (monkeypatches the default context factory).
# NOTE(review): this accepts self-signed/invalid certificates globally --
# tolerable for a one-off scrape, but a security hole in shared code.
ssl._create_default_https_context = ssl._create_unverified_context
def getPage(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: any URL scheme urlopen supports (http/https/file/...).

    Returns:
        The full response body as a str.

    The response is used as a context manager so the underlying connection
    is closed even if read() or decode() raises (the original left the
    response object unclosed).
    """
    with urlopen(url) as response:
        return response.read().decode("utf-8")

def parsePage(d):
    """Extract listing records from one Lianjia second-hand-house HTML page.

    Args:
        d: raw HTML text of a listing-index page.

    Yields:
        One dict per listing (title, compound, layout, area, orientation,
        decoration, position, follow info, prices, ...).  Optional groups
        (dianti, subway, fangben, haskey) are None when absent from the HTML.
    """
    com = re.compile(
        r'<!-- 热推标签、埋点 -->.*?data-is_focus="(?:1)?" data-sl="">(?P<title>.*?)</a>'
        r'.*?data-el="region">(?P<xiaoqu>.*?)</a>'
        r'.*?</span>(?P<huxing>.*?)<span'
        r'.*?/</span>(?P<mianji>.*?)<span'
        r'.*?/</span>(?P<chaoxiang>.*?)<span'
        r'.*?/</span>(?P<zhuangxiu>.*?)<'
        # (...)? matches 0 or 1 times; (?:...) keeps the wrapper non-capturing
        # so finditer/findall do not promote the wrapper over the named group.
        r'(?:span class="divide">/</span>(?P<dianti>.*?)<)?'
        r'.*?div class="positionInfo">(?P<flood>.*?)<span'
        r'.*?/</span>(?P<floodtime>.*?)<span'
        r'.*?target="_blank">(?P<diqu>.*?)</a>'
        r'.*?class="followInfo">(?P<followInfo>.*?)<span'
        r'.*?/</span>(?P<daikancishu>.*?)<div class="tag">'
        r'(?:<span class="subway">(?P<subway>.*?)</span>)?'  # optional tag
        r'(?:<span class=".*?">(?P<fangben>.*?)</span>)?'  # optional tag
        r'(?:<span class="haskey">(?P<haskey>.*?)</span>)?'  # optional tag
        r'.*?<div class="totalPrice"><span>(?P<totalPrice>.*?)</div>'
        r'.*?data-price=".*?"><span>(?P<unitPrice>.*?)</span>',
        re.S,
    )
    # Fixed typo: the original spelled this local "retsult".
    for m in com.finditer(d):
        yield {
            "title": m.group("title"),
            "xiaoqu": m.group("xiaoqu"),
            "huxing": m.group("huxing"),
            "mianji": m.group("mianji"),
            "chaoxiang": m.group("chaoxiang"),
            "zhuangxiu": m.group("zhuangxiu"),
            "dianti": m.group("dianti"),
            "flood": m.group("flood"),
            "floodtime": m.group("floodtime"),
            "diqu": m.group("diqu"),
            "followInfo": m.group("followInfo"),
            "daikancishu": m.group("daikancishu"),
            "subway": m.group("subway"),
            "fangben": m.group("fangben"),
            "haskey": m.group("haskey"),
            # totalPrice captures up to </div>, so it drags a stray </span>
            # along -- strip it here.
            "totalPrice": re.sub("</span>", "", m.group("totalPrice")),
            "unitPrice": m.group("unitPrice"),
        }


def _crawl():
    """Fetch the first 100 listing pages and append one JSON line per listing.

    The output file is opened with a context manager so it is closed even if
    a request fails mid-crawl (the original leaked the handle on error).
    """
    with open("lianjia_Second-hand house_info", mode="a", encoding="utf-8") as f:
        for i in range(100):
            if i == 0:
                url = "https://bj.lianjia.com/ershoufang/"
            else:
                url = "https://bj.lianjia.com/ershoufang/" + "pg%s" % (i + 1)
            print(url)
            for obj in parsePage(getPage(url)):
                data = json.dumps(obj, ensure_ascii=False)
                print(data)
                f.write(data + "\n")
                f.flush()  # persist progress after every record


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers a 100-page crawl.
    _crawl()

 

发表于 2018-10-16 15:59  爱吃魚的小姐姐  阅读(249)  评论(0)  编辑  收藏  举报
 

分类:

技术点:

相关文章:

  • 2021-10-05
  • 2021-11-05
  • 2018-02-04
  • 2021-06-24
  • 2021-10-05
  • 2021-12-18
  • 2021-11-15
  • 2021-07-31
猜你喜欢
  • 2021-11-23
  • 2021-12-19
  • 2021-06-17
  • 2021-11-15
  • 2021-09-28
  • 2019-06-02
  • 2021-11-15
相关资源
相似解决方案