Obtain the request headers from the browser's developer tools and use them to disguise the crawler as a browser, so pages can be scraped more reliably.
#!/usr/bin/env python
# -*- encoding:UTF-8 -*-
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
try:
    # pass the request headers to the get() method
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)
    print(res.text)
except requests.exceptions.ConnectionError:
    print('Connection refused')

# Parsing the response with the BeautifulSoup library gives a structured Soup document,
# which is easier to work with than the raw text above
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
try:
    # pass the request headers to the get() method
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    print(soup.prettify())
except requests.exceptions.ConnectionError:
    print('Connection refused')
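Beyond prettify(), the parsed Soup object can be queried directly; a minimal sketch (the <title> and <a> lookups are generic examples, not selectors taken from xiaozhu's page):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # shortened UA, just for this sketch
res = requests.get('http://bj.xiaozhu.com/', headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')

title_tag = soup.find('title')            # first <title> tag, or None if the page has none
if title_tag is not None:
    print(title_tag.get_text().strip())   # the page title text
print(len(soup.select('a')))              # how many <a> tags the page contains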
Updated selector:
price = soup.select('#page_list > ul > li:nth-child(1) > div.result_btm_con.lodgeunitname > div:nth-child(1) > span.result_price > i')
Full code (the li:nth-child(1) in the copied selector is relaxed to li so that every listing is matched, not just the first):
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
# pass the request headers to the get() method
res = requests.get('http://bj.xiaozhu.com/', headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
# locate the elements and extract them with the select() method
prices = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > div:nth-child(1) > span.result_price > i')
for price in prices:
    print(price.get_text())
# print(price) would print the tag with its markup, not just the text
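For illustration, the difference between printing the tag object and printing its text (the 488 value is only an assumed example, matching the HTML snippet in the note at the end):

price = prices[0]        # first matched <i> tag from the select() above
print(price)             # the whole tag, e.g. <i>488</i>
print(price.get_text())  # only the text, e.g. 488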
Scraping short-term rental listings in Beijing:
import random
import time

import requests
from bs4 import BeautifulSoup

# request headers copied from Chrome, used to disguise the crawler as a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}


def judgment_sex(class_name):
    """Judge the host's gender from the icon's CSS class."""
    if class_name == ['member_ico1']:
        return 'female'
    else:
        return 'male'


def get_links(url):
    """Collect the detail-page URLs from one listing page."""
    try:
        wb_data = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError:
        print('Connection refused')
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#page_list > ul > li > a')
    for link in links:
        href = link.get('href')
        get_info(href)


def get_info(url):
    """Scrape the fields of one detail page."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # selectors obtained via the browser's "Copy selector"
    titles = soup.select('div.pho_info > h4')
    addresses = soup.select('span.pr5')
    prices = soup.select('#pricePart > div.day_l > span')
    images = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexes = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    for title, address, price, image, name, sex in zip(titles, addresses, prices, images, names, sexes):
        data = {
            'title': title.get_text().strip(),
            'address': address.get_text().strip(),
            'price': price.get_text(),
            'image': image.get('src'),
            'name': name.get_text(),
            'sex': judgment_sex(sex.get('class'))
        }
        print(data)


if __name__ == '__main__':
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 14)]
    for single_url in urls:
        get_links(single_url)
        # sleep 10-13 seconds to avoid getting the IP banned
        time.sleep(random.randint(10, 13))
# Drawback: no IP/proxy management; throttling with sleep is inefficient
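One way to ease the drawback noted above is to spread the requests over a pool of proxies instead of relying only on sleeping. A minimal sketch, assuming you already have working proxy addresses (the ones below are placeholders):

import random

import requests

# placeholder proxy addresses - replace with real, working proxies
PROXY_POOL = [
    'http://10.0.0.1:8080',
    'http://10.0.0.2:8080',
]


def get_with_proxy(url, headers):
    """Fetch a URL through a randomly chosen proxy from the pool."""
    proxy = random.choice(PROXY_POOL)
    # requests takes a proxies dict mapping scheme to proxy URL
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy}, timeout=10)

get_links() and get_info() could then call get_with_proxy() instead of requests.get().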
Scraping the KuGou top chart, version 1.0:
#!/usr/bin/env python
# -*- encoding:UTF-8 -*-
import random
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}


def get_info(url):
    """Scrape one chart page."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, song_time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': song_time.get_text().strip()
        }
        print(data)


if __name__ == '__main__':
    # program entry point
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
    for url in urls:
        get_info(url)
        time.sleep(random.randint(3, 5))
Scraping the KuGou top chart, version 1.1 (simpler selectors):
#!/usr/bin/env python
# -*- encoding:UTF-8 -*-
import random
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}


def get_info(url):
    """Scrape one chart page."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('a.pc_temp_songname')
    times = soup.select('span.pc_temp_time')
    for rank, title, song_time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': song_time.get_text().strip()
        }
        print(data)


if __name__ == '__main__':
    # program entry point
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
    for url in urls:
        get_info(url)
        time.sleep(random.randint(3, 5))
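Both versions assume every chart entry reads "singer - song" and take split('-')[1] as the song name, which truncates titles that themselves contain a '-'. A small defensive variant (my own sketch, not part of the original):

def split_song(raw_title):
    """Split 'singer - song', tolerating extra '-' inside the song name."""
    parts = raw_title.split('-', 1)           # split at most once
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return '', raw_title.strip()              # no '-' at all: keep the whole string as the song


# inside get_info() one could then write:
#   singer, song = split_song(title.get_text())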
Scraping prices with a regular expression:
import re

import requests

res = requests.get('http://bj.xiaozhu.com/')
# the page source writes the yen sign as the HTML entity &#165;, so the pattern matches the entity
prices = re.findall('<span class="result_price">&#165;<i>(.*?)</i>起/晚</span>', res.text)
for price in prices:
    print(price)
Note: in the page source the price appears as

<span class="result_price">&#165;<i>488</i>起/晚</span>

The HTML entity &#165; and the character ¥ render as the same yen sign, but the regular expression has to match what is literally in the source, so ¥ must not appear in the pattern.
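If you would rather write the ¥ character in the pattern itself, one option (my own addition, not from the original) is to decode the HTML entities before matching, using the standard library's html.unescape; a minimal sketch:

import html
import re

import requests

res = requests.get('http://bj.xiaozhu.com/')
page = html.unescape(res.text)  # turns entities such as &#165; into the characters they represent
prices = re.findall('<span class="result_price">¥<i>(.*?)</i>起/晚</span>', page)
for price in prices:
    print(price)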