november1943

title: python爬虫 爬取58同城二手平板电脑信息
tags: python,爬虫
grammar_cjkRuby: true

爬取http://bj.58.com/pbdn/0/pn2/中除转转、推广商品以外的产品信息

# coding:utf-8
# 爬取58同城二手电脑信息
# 进入http://bj.58.com/pbdn/0/pn2/页面
# 爬取列表中除转转、推广商品外的正常商品

from bs4 import BeautifulSoup
import requests
import time


def get_links_from(who_sells):
    """Collect detail-page URLs of regular listings from the 58.com list page.

    Fetches ``http://bj.58.com/pbdn/<who_sells>/pn2/`` and gathers the links
    found under ``tr td.t a.t``. Zhuanzhuan items match the same selector, so
    only links whose query-stripped form is exactly 53 characters long (the
    observed length of a regular listing URL) are kept.

    Args:
        who_sells: Value inserted into the list URL path to choose the seller
            category (the script treats 0 as individual sellers).

    Returns:
        A list of listing URLs with their query strings removed.
    """
    list_view = 'http://bj.58.com/pbdn/{}/pn2/'.format(str(who_sells))
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    wb_data = requests.get(list_view, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    urls = []
    for link in soup.select('tr td.t a.t'):
        href = link.get('href')
        if not href:
            # Anchors without an href cannot be listing links; skip them
            # instead of failing on None.split().
            continue
        clean_url = href.split('?')[0]
        if len(clean_url) == 53:
            urls.append(clean_url)
    return urls


def get_views(url):
    """Return the view counter text for a listing.

    The listing id is taken from the last path segment of ``url`` (for
    example ``<digits>x.shtml``) and passed to the counter endpoint. The
    response text is split on ``=`` and the last piece is returned.

    Args:
        url: Detail-page URL of a listing, without query string.

    Returns:
        The part of the counter response after the final ``=``, as a string.
    """
    info_id = url.split('/')[-1]
    # str.strip('x.shtml') removes any of those characters from BOTH ends,
    # not the literal suffix; remove the suffix explicitly instead.
    if info_id.endswith('.shtml'):
        info_id = info_id[:-len('.shtml')]
    if info_id.endswith('x'):
        info_id = info_id[:-1]

    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    js = requests.get(api, timeout=10)
    return js.text.split('=')[-1]


def get_item_info(who_sells=0):
    """Fetch every regular listing for a seller category and print its details.

    For each URL returned by ``get_links_from(who_sells)`` the detail page is
    downloaded and a dict with title, price, area, date, seller category and
    view count is printed. Fields whose element is missing from the page are
    reported as ``None`` rather than aborting the whole run.

    Args:
        who_sells: Seller category passed to ``get_links_from``; 0 is labelled
            as an individual seller, any other value as a merchant.

    Returns:
        None. Results are written to stdout, one dict per listing.
    """
    for url in get_links_from(who_sells):
        # Pause between detail-page requests (presumably to avoid hammering
        # the site or triggering rate limiting).
        time.sleep(2)
        web_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(web_data.text, 'lxml')

        price_nodes = soup.find_all('span', 'price c_f50')
        # Area is only read when a span.c_25d is present, as in the original
        # check; the value comes from the first element carrying that class.
        area_nodes = soup.select('.c_25d') if soup.find_all('span', 'c_25d') else []
        date_nodes = soup.select('.time')

        data = {
            'title': soup.title.text if soup.title else None,
            'price': price_nodes[0].text if price_nodes else None,
            'area': list(area_nodes[0].stripped_strings) if area_nodes else None,
            'date': date_nodes[0].text if date_nodes else None,
            'cate': '个人' if who_sells == 0 else '商家',
            'views': get_views(url),
        }
        print(data)

# Module-level call: runs the scraper with its default argument (who_sells=0).
get_item_info()

分类:

技术点:

相关文章: