# Module 1: scrape all channel links (channel_extact.py)
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):
    # Fetch the category index page and collect every channel link.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    page_urls = [url_host + link.get('href') for link in links]
    for page_url in page_urls:
        print(page_url)
    return page_urls

# Module 3 imports channel_list and calls channel_list.split(), so store
# the channels as a whitespace-separated string.
channel_list = '\n'.join(get_index_url(start_url))


# Module 2: scrape all item links and detail data (pages_parsing.py)
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
# The name on the left is the Python object; the string on the right is
# the collection name inside MongoDB.
url_list = ceshi['url_list4']
item_info = ceshi['item_info4']

# spider 1
def get_links_from(channel, pages):
    # A listing page with no td.t element means we have run past the
    # last page, so stop.
    list_view = '{}/pn{}/'.format(channel, str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            # Skip 58's ad-redirect links.
            if item_link != 'http://jump.zhineng.58.com/jump':
                url_list.insert_one({'url': item_link})
                print(item_link)
    else:
        # It's the last page!
        pass

# spider 2
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Second-hand items hosted on zhuanzhuan.58.com use a different layout.
    if url[:25] == 'http://zhuanzhuan.58.com/':
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price_now')[0].text,
            # 'palce_li' is the class name as it appears on the page.
            'area': soup.select('div.palce_li > span > i')[0].text,
            'url': url,
        }
        item_info.insert_one(data)
    else:
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price.c_f50')[0].text,
            'area': soup.select('div.su_con > a')[0].get_text(),
            'sale_man': soup.select('ul.vcard > li > a')[0].text,
            'url': url,
        }
        item_info.insert_one(data)


# Module 3: main file, run this to start crawling
from multiprocessing import Pool
from pages_parsing import get_item_info, url_list, item_info, get_links_from
from channel_extact import channel_list

# Resume support: only visit item URLs that spider 1 has collected but
# spider 2 has not yet saved.
item_url = (item['url'] for item in url_list.find())
index_urls0 = (item['url'] for item in item_info.find())
x = set(item_url)
y = set(index_urls0)
rest_of_urls = x - y

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=6)
    # Step 1: collect item links from every channel.
    # pool.map(get_all_links_from, channel_list.split())
    # Step 2: fetch the detail page of every remaining URL.
    pool.map(get_item_info, rest_of_urls)
    # count = 0
    # for url in rest_of_urls:
    #     print(url)
    #     count += 1
    # print(count)


# Module 4: watch the data flow (prints the URL count every 5 seconds)
import time
from pages_parsing import url_list

while True:
    print(url_list.count_documents({}))
    time.sleep(5)
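
# Optional sketch: make spider 1 idempotent. url_list.insert_one() above
# stores a new document every time a listing page is re-crawled, which
# inflates the set difference that Module 3 uses to resume. One way to
# avoid that (my own suggestion, not part of the original code) is a
# unique index on 'url' plus an upsert:
import pymongo

client = pymongo.MongoClient('localhost', 27017)
url_list = client['ceshi']['url_list4']

# One-time setup: MongoDB rejects a second document with the same 'url'.
url_list.create_index('url', unique=True)

def save_link(item_link):
    # Insert the URL if it is new, otherwise leave the stored document
    # untouched, so re-running spider 1 never creates duplicates.
    url_list.update_one({'url': item_link},
                        {'$set': {'url': item_link}},
                        upsert=True)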
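
# Optional sketch: a defensive variant of spider 2. Detail pages that have
# been taken down, or whose layout changed, make soup.select(...)[0] raise
# an IndexError and kill the worker process in the pool. This hypothetical
# wrapper is one way to guard against that; the selectors are the same
# assumptions as in Module 2, and fields that cannot be parsed are skipped.
import requests
from bs4 import BeautifulSoup

def get_item_info_safe(url):
    wb_data = requests.get(url, timeout=10)
    if wb_data.status_code != 200:
        return None  # page removed or blocked; skip it
    soup = BeautifulSoup(wb_data.text, 'lxml')
    price = soup.select_one('span.price_now') or soup.select_one('span.price.c_f50')
    if price is None:
        return None  # unrecognized layout; skip instead of crashing
    return {'title': soup.title.text,
            'price': price.get_text(strip=True),
            'url': url}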