链家二手房案例(xpath)
实现步骤
1.确定是否为静态
打开二手房页面 -> 查看网页源码 -> 搜索关键字
2.xpath表达式
1、基准xpath表达式(匹配每个房源信息节点列表)
   //ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]
2、依次遍历后每个房源信息xpath表达式
   * 名称: './/a[@data-el="region"]/text()'
   # 户型+面积+方位+是否精装
   info_list = './/div[@class="houseInfo"]/text()' [0].strip().split('|')
   * 户型(model): info_list[1]
   * 面积(area): info_list[2]
   * 方位(direction): info_list[3]
   * 精装(perfect): info_list[4]
   * 楼层(floor): './/div[@class="positionInfo"]/text()'
   * 区域(address): './/div[@class="positionInfo"]/a/text()'
   * 总价(total_price): './/div[@class="totalPrice"]/span/text()'
   * 单价(unit_price): './/div[@class="unitPrice"]/span/text()'
3.实现代码
import random
import time

import requests
from lxml import etree


class LianjiaSpider(object):
    """Crawl second-hand housing listings from bj.lianjia.com using XPath.

    Each listing found on a result page is collected into a dict and printed.
    """

    # Dict keys for the '|'-separated houseInfo text, taken from indexes 1-4.
    # Index 0 is skipped (presumably the community name -- TODO confirm
    # against the live page).
    INFO_KEYS = ('house_model', 'area', 'direction', 'hardcover')

    def __init__(self):
        # Result pages are addressed as pg1/, pg2/, ...
        self.url = 'https://bj.lianjia.com/ershoufang/pg{}/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    @staticmethod
    def _first(results):
        """Return the first XPath result, or None when nothing matched."""
        return results[0] if results else None

    def get_page(self, url, max_retries=3):
        """Fetch one result page and hand its HTML to parse_page.

        Args:
            url: Full URL of the result page.
            max_retries: Maximum number of request attempts before giving up.

        Returns:
            True when the page was fetched and parsed, False when every
            attempt failed with a network error.
        """
        for attempt in range(1, max_retries + 1):
            try:
                # The timeout turns a hung connection into an exception so the
                # request can be retried instead of blocking forever.
                res = requests.get(url, headers=self.headers, timeout=5)
            except requests.RequestException as e:
                print('请求失败(%d/%d): %s' % (attempt, max_retries, e))
                continue
            res.encoding = 'utf-8'
            # Parsing happens outside the try block: a parsing bug must not be
            # mistaken for a network failure and trigger endless re-requests.
            self.parse_page(res.text)
            return True
        return False

    def parse_page(self, html):
        """Extract every listing from a result page and print it.

        Args:
            html: HTML text of a result page.

        Returns:
            A list with one dict per listing (empty when nothing matched).
        """
        houses = []
        if not html:
            return houses
        parse_html = etree.HTML(html)
        if parse_html is None:
            return houses

        # Base XPath: one node per listing
        li_list = parse_html.xpath(
            '//ul[@class="sellListContent"]'
            '/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]'
        )
        for li in li_list:
            # A fresh dict per listing so that missing fields never inherit
            # values from the previous listing.
            house_dict = {}

            # Listing name
            house_dict['house_name'] = self._first(
                li.xpath('.//a[@data-el="region"]/text()')
            )

            # Layout + area + orientation + decoration, '|'-separated
            info = self._first(li.xpath('.//div[@class="houseInfo"]/text()'))
            parts = []
            if info is not None:
                parts = [p.strip() for p in info.strip().split('|')]
            for idx, key in enumerate(self.INFO_KEYS, start=1):
                # Short or missing houseInfo yields None instead of IndexError
                house_dict[key] = parts[idx] if idx < len(parts) else None

            # Floor (the last two characters of the text are dropped)
            floor = self._first(
                li.xpath('.//div[@class="positionInfo"]/text()')
            )
            house_dict['floor'] = (
                floor.strip()[:-2] if floor is not None else None
            )

            # District
            address = self._first(
                li.xpath('.//div[@class="positionInfo"]/a/text()')
            )
            house_dict['address'] = (
                address.strip() if address is not None else None
            )

            # Total price
            total = self._first(
                li.xpath('.//div[@class="totalPrice"]/span/text()')
            )
            house_dict['total_price'] = (
                total.strip() if total is not None else None
            )

            # Unit price
            unit = self._first(
                li.xpath('.//div[@class="unitPrice"]/span/text()')
            )
            house_dict['unit_price'] = (
                unit.strip() if unit is not None else None
            )

            print(house_dict)
            houses.append(house_dict)
        return houses

    def main(self):
        """Crawl result pages 1-10, pausing 1-3 seconds between pages."""
        for pg in range(1, 11):
            url = self.url.format(str(pg))
            if self.get_page(url):
                print('第%d页爬取成功' % pg)
            else:
                print('第%d页爬取失败' % pg)
            # Random delay between pages
            time.sleep(random.randint(1, 3))


if __name__ == '__main__':
    start = time.time()
    spider = LianjiaSpider()
    spider.main()
    end = time.time()
    print('执行时间:%.2f' % (end - start))
目标:抓取指定贴吧所有图片
思路:
1、获取贴吧主页URL,下一页,找到不同页的URL规律
2、获取1页中所有帖子URL地址: [帖子链接1,帖子链接2,...]
3、对每个帖子链接发请求,获取图片URL
4、向图片的URL发请求,以wb方式写入本地文件
实现步骤:
1.贴吧url规律
http://tieba.baidu.com/f?kw=??&pn=50
2.xpath表达式
1、帖子链接xpath
   //div[@class="t_con cleafix"]/div/div/div/a/@href
2、图片链接xpath
   //div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src
3、视频链接xpath
   //div[@class="video_src_wrapper"]/embed/@data-video
   # 注意: 此处视频链接前端对响应内容做了处理,需要查看网页源代码来查看,复制HTML代码在线格式化
3.代码实现
import random
import time
from urllib import parse

import requests
from lxml import etree


class BaiduImgSpider(object):
    """Download the images posted in the threads of a Baidu Tieba forum.

    Flow: forum list page -> thread links -> image links -> files written to
    the current working directory.
    """

    def __init__(self):
        # The placeholder receives the url-encoded query string (kw, pn)
        self.url = 'http://tieba.baidu.com/f?{}'
        self.headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

    def get_html(self, url, max_retries=3, timeout=10):
        """Fetch a page and return its HTML text.

        Args:
            url: Page URL.
            max_retries: Maximum number of request attempts before giving up.
            timeout: Per-request timeout in seconds.

        Returns:
            The response text decoded as UTF-8, or None when every attempt
            failed with a network error.
        """
        for attempt in range(1, max_retries + 1):
            try:
                res = requests.get(
                    url=url, headers=self.headers, timeout=timeout
                )
            except requests.RequestException as e:
                print('请求失败(%d/%d): %s' % (attempt, max_retries, e))
                continue
            res.encoding = 'utf-8'
            return res.text
        return None

    def xpath_func(self, xpath_bds, html):
        """Apply an XPath expression to HTML text.

        Args:
            xpath_bds: XPath expression.
            html: HTML text; may be None or empty after a failed fetch.

        Returns:
            The list of XPath matches, or an empty list when there is no
            document to query.
        """
        if not html:
            return []
        parse_html = etree.HTML(html)
        if parse_html is None:
            return []
        return parse_html.xpath(xpath_bds)

    def get_tlink(self, url):
        """Process one forum list page: download images of every thread on it.

        Args:
            url: URL of the forum list page.
        """
        html = self.get_html(url)
        xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        # tlink_list: ['/p/23234', '/p/9032323']
        tlink_list = self.xpath_func(xpath_bds, html)
        if not tlink_list:
            print('No Data')
            return
        for tlink in tlink_list:
            t_url = 'http://tieba.baidu.com' + tlink
            # Extract the image links of this thread and save the images
            self.get_image(t_url)
            # Random delay between threads
            time.sleep(random.randint(1, 3))

    def get_image(self, t_url):
        """Download every image found in one thread page.

        Args:
            t_url: URL of the thread page.
        """
        html = self.get_html(t_url)
        xpath_bds = '//*[@class="d_post_content j_d_post_content clearfix"]/img/@src'
        imglink_list = self.xpath_func(xpath_bds, html)
        for imglink in imglink_list:
            try:
                img_bytes = requests.get(
                    imglink, headers=self.headers, timeout=10
                ).content
            except requests.RequestException as e:
                # One broken image must not abort the whole crawl
                print('%s下载失败: %s' % (imglink, e))
                continue
            # The file name is the last 10 characters of the URL; a '/' in
            # that tail would be read as a directory separator by open().
            filename = imglink[-10:].replace('/', '_')
            with open(filename, 'wb') as f:
                f.write(img_bytes)
            print('%s下载成功' % filename)

    def main(self):
        """Ask for the forum name and page range, then crawl those pages."""
        name = input('请输入贴吧名:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入终止页:'))
        for page in range(begin, end + 1):
            # Page N is requested with pn = (N - 1) * 50
            params = parse.urlencode({
                'kw': name,
                'pn': str((page - 1) * 50),
            })
            url = self.url.format(params)
            self.get_tlink(url)


if __name__ == '__main__':
    spider = BaiduImgSpider()
    spider.main()