Obtain the request headers from the browser's developer tools and use them to disguise the crawler as a browser, so pages can be scraped more reliably.
#!/usr/bin/env python
# -*- encoding:UTF-8 -*-
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
try:
    # pass the request headers to the get() method
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)
    print(res.text)
except requests.exceptions.ConnectionError:
    print('Connection refused')

# Parsing the response with the BeautifulSoup library gives a structured Soup document,
# which is easier to work with than the raw text above
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
try:
    # pass the request headers to the get() method
    res = requests.get('http://bj.xiaozhu.com/', headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    print(soup.prettify())
except requests.exceptions.ConnectionError:
    print('Connection refused')
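Beyond prettify(), the parsed Soup object can be queried directly; a minimal sketch (the <title> and <a> lookups are generic examples, not selectors taken from xiaozhu's page):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # shortened UA, just for this sketch
res = requests.get('http://bj.xiaozhu.com/', headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')

title_tag = soup.find('title')            # first <title> tag, or None if the page has none
if title_tag is not None:
    print(title_tag.get_text().strip())   # the page title text
print(len(soup.select('a')))              # how many <a> tags the page contains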
Updated selector:
price = soup.select('#page_list > ul > li:nth-child(1) > div.result_btm_con.lodgeunitname > div:nth-child(1) > span.result_price > i')
Full code (the li:nth-child(1) in the copied selector is relaxed to li so that every listing is matched, not just the first):
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
# pass the request headers to the get() method
res = requests.get('http://bj.xiaozhu.com/', headers=headers)
soup = BeautifulSoup(res.text, 'html.parser')
# locate the elements and extract them with the select() method
prices = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > div:nth-child(1) > span.result_price > i')
for price in prices:
    print(price.get_text())
# print(price) would print the tag with its markup, not just the text
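For illustration, the difference between printing the tag object and printing its text (the 488 value is only an assumed example, matching the HTML snippet in the note at the end):

price = prices[0]        # first matched <i> tag from the select() above
print(price)             # the whole tag, e.g. <i>488</i>
print(price.get_text())  # only the text, e.g. 488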
Scraping short-term rental listings in Beijing:
import random
import time

import requests
from bs4 import BeautifulSoup

# request headers copied from Chrome, used to disguise the crawler as a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}


def judgment_sex(class_name):
    """Judge the host's gender from the icon's CSS class."""
    if class_name == ['member_ico1']:
        return 'female'
    else:
        return 'male'


def get_links(url):
    """Collect the detail-page URLs from one listing page."""
    try:
        wb_data = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError:
        print('Connection refused')
        return
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('#page_list > ul > li > a')
    for link in links:
        href = link.get('href')
        get_info(href)


def get_info(url):
    """Scrape the fields of one detail page."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # selectors obtained via the browser's "Copy selector"
    titles = soup.select('div.pho_info > h4')
    addresses = soup.select('span.pr5')
    prices = soup.select('#pricePart > div.day_l > span')
    images = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexes = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    for title, address, price, image, name, sex in zip(titles, addresses, prices, images, names, sexes):
        data = {
            'title': title.get_text().strip(),
            'address': address.get_text().strip(),
            'price': price.get_text(),
            'image': image.get('src'),
            'name': name.get_text(),
            'sex': judgment_sex(sex.get('class'))
        }
        print(data)


if __name__ == '__main__':
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(number) for number in range(1, 14)]
    for single_url in urls:
        get_links(single_url)
        # sleep 10-13 seconds to avoid getting the IP banned
        time.sleep(random.randint(10, 13))
# Drawback: no IP/proxy management; throttling with sleep is inefficient
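One way to ease the drawback noted above is to spread the requests over a pool of proxies instead of relying only on sleeping. A minimal sketch, assuming you already have working proxy addresses (the ones below are placeholders):

import random

import requests

# placeholder proxy addresses - replace with real, working proxies
PROXY_POOL = [
    'http://10.0.0.1:8080',
    'http://10.0.0.2:8080',
]


def get_with_proxy(url, headers):
    """Fetch a URL through a randomly chosen proxy from the pool."""
    proxy = random.choice(PROXY_POOL)
    # requests takes a proxies dict mapping scheme to proxy URL
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy}, timeout=10)

get_links() and get_info() could then call get_with_proxy() instead of requests.get().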
Scraping the KuGou top chart, version 1.0:
#!/usr/bin/env python
# -*- encoding:UTF-8 -*-
import random
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}


def get_info(url):
    """Scrape one chart page."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, song_time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': song_time.get_text().strip()
        }
        print(data)


if __name__ == '__main__':
    # program entry point
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
    for url in urls:
        get_info(url)
        time.sleep(random.randint(3, 5))
Scraping the KuGou top chart, version 1.1 (simpler selectors):
#!/usr/bin/env python
# -*- encoding:UTF-8 -*-
import random
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'
}


def get_info(url):
    """Scrape one chart page."""
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('a.pc_temp_songname')
    times = soup.select('span.pc_temp_time')
    for rank, title, song_time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': song_time.get_text().strip()
        }
        print(data)


if __name__ == '__main__':
    # program entry point
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i) for i in range(1, 25)]
    for url in urls:
        get_info(url)
        time.sleep(random.randint(3, 5))
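Both versions assume every chart entry reads "singer - song" and take split('-')[1] as the song name, which truncates titles that themselves contain a '-'. A small defensive variant (my own sketch, not part of the original):

def split_song(raw_title):
    """Split 'singer - song', tolerating extra '-' inside the song name."""
    parts = raw_title.split('-', 1)           # split at most once
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return '', raw_title.strip()              # no '-' at all: keep the whole string as the song


# inside get_info() one could then write:
#   singer, song = split_song(title.get_text())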
Scraping prices with a regular expression:
import re

import requests

res = requests.get('http://bj.xiaozhu.com/')
# the page source writes the yen sign as the HTML entity &#165;, so the pattern matches the entity
prices = re.findall('<span class="result_price">&#165;<i>(.*?)</i>起/晚</span>', res.text)
for price in prices:
    print(price)
Note: in the page source the price appears as

<span class="result_price">&#165;<i>488</i>起/晚</span>

The HTML entity &#165; and the character ¥ render as the same yen sign, but the regular expression has to match what is literally in the source, so ¥ must not appear in the pattern.
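If you would rather write the ¥ character in the pattern itself, one option (my own addition, not from the original) is to decode the HTML entities before matching, using the standard library's html.unescape; a minimal sketch:

import html
import re

import requests

res = requests.get('http://bj.xiaozhu.com/')
page = html.unescape(res.text)  # turns entities such as &#165; into the characters they represent
prices = re.findall('<span class="result_price">¥<i>(.*?)</i>起/晚</span>', page)
for price in prices:
    print(price)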