MC-Curry

 

 1 from selenium import webdriver
 2 from selenium.webdriver.common.by import By
 3 from selenium.webdriver.support import expected_conditions as EC
 4 from selenium.webdriver.support.wait import WebDriverWait
 5 from selenium.common.exceptions import TimeoutException
 6 from pyquery import PyQuery as pq
 7 import re
 8 from config import *
 9 import pymongo
10 
# MongoDB handles: connect to the server named in config.py and select the
# database that scraped products are written to.
client = pymongo.MongoClient(host=MONGO_URL)
db = client[MONGO_DB]

# A single Chrome session is shared by every function in this script.
browser = webdriver.Chrome()

# Shared explicit-wait helper: each wait.until(...) call gives its condition
# up to 10 seconds before raising TimeoutException.
wait = WebDriverWait(browser, timeout=10)
16 
17 
def search(max_retries=3):
    """Search Taobao for 'xiaomi' and scrape the first page of results.

    Loads the Taobao home page, types the keyword into the search box,
    clicks the search button, waits for the pager's "total" element to
    appear and then calls get_products() for the first result page.

    Args:
        max_retries: Number of additional attempts made after an attempt
            fails with TimeoutException. Must be >= 0.

    Returns:
        The text of the pager's "total" element (main() parses the page
        count out of it).

    Raises:
        ValueError: If max_retries is negative.
        TimeoutException: If the last allowed attempt also timed out.
    """
    if max_retries < 0:
        raise ValueError('max_retries must be >= 0')

    # A bounded loop replaces the original self-recursion on timeout, which
    # retried without limit and could only end in RecursionError when the
    # page never became usable.
    for attempt in range(max_retries + 1):
        try:
            browser.get('https://www.taobao.com')
            input_ = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
                )
            )

            input_.send_keys('xiaomi')
            submit.click()

            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Each retry starts over from the home page; once the retry
            # budget is spent, let the caller see the timeout.
            if attempt == max_retries:
                raise
38 
def next_page(page_num, max_retries=3):
    """Jump to result page `page_num` via the pager form and scrape it.

    Types the page number into the pager's input box, clicks the pager's
    submit button, waits until the pager's active item shows that number
    and then calls get_products() for the newly loaded page.

    Args:
        page_num: Page number to jump to.
        max_retries: Number of additional attempts made after an attempt
            fails with TimeoutException. Must be >= 0.

    Raises:
        ValueError: If max_retries is negative.
        TimeoutException: If the last allowed attempt also timed out.
    """
    if max_retries < 0:
        raise ValueError('max_retries must be >= 0')

    # A bounded loop replaces the original self-recursion on timeout, which
    # retried without limit and could only end in RecursionError when the
    # pager never responded.
    for attempt in range(max_retries + 1):
        try:
            input_ = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
                )
            )
            input_.clear()
            input_.send_keys(page_num)
            submit.click()
            # The active pager item showing the requested number is the
            # signal that the requested page has been loaded.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_num)))
            get_products()
            return
        except TimeoutException:
            if attempt == max_retries:
                raise
55 
def get_products():
    """Parse every product on the current result page, print it and store it.

    Waits until at least one product element is present, parses the
    browser's page source with PyQuery, builds one dict per product and
    passes each dict to save_to_mongo().
    """
    item_selector = '#mainsrp-itemlist .items .item'
    # (dict key, CSS selector) pairs for the fields read as plain text.
    text_fields = (
        ('price', '.price'),
        ('deal', '.deal-cnt'),
        ('title', '.title'),
        ('shop', '.shop'),
        ('location', '.location'),
    )

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    page = pq(browser.page_source)

    for entry in page(item_selector).items():
        product = {'image': entry.find('.pic .img').attr('src')}
        for key, selector in text_fields:
            product[key] = entry.find(selector).text()
        # Drop the last three characters of the deal-count text
        # (presumably a fixed textual suffix after the number - confirm).
        product['deal'] = product['deal'][:-3]

        print(product)
        save_to_mongo(product)
73 
def save_to_mongo(result):
    """Insert one scraped product dict into the MONGO_TABLE collection.

    Saving is best-effort: any failure is reported on stdout and swallowed
    so that one bad record does not abort the crawl.

    Args:
        result: The product document (a dict) to store.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is its single-document replacement and
        # raises on failure instead of returning a falsy value.
        db[MONGO_TABLE].insert_one(result)
        print('success save to mongodb', result)
    except Exception as exc:
        # Include the exception so the cause of the failure is visible.
        print('error to mongo', exc)
80 
def main():
    """Run the whole crawl: search, walk every result page, close the browser.

    search() scrapes page 1 and returns the pager's "total" text; the first
    integer in that text is used as the number of result pages, and
    next_page() is called for each remaining page.

    Raises:
        ValueError: If no integer can be found in the pager's total text.
    """
    try:
        total_text = search()
        # Raw string so that \d is a regex digit class rather than an
        # invalid string escape.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'could not find a page count in pager text: %r' % (total_text,)
            )
        total = int(match.group(1))
        # Page 1 was already scraped by search(); the upper bound is
        # total + 1 so that the last page is visited too (range(2, total)
        # stopped one page short).
        for i in range(2, total + 1):
            next_page(i)
    finally:
        # Close the browser window even when the crawl fails part-way.
        browser.close()


if __name__ == '__main__':
    main()

 

 config.py

# Settings pulled into the scraper with `from config import *`.

# Host handed to pymongo.MongoClient().
MONGO_URL = 'localhost'

# Name of the database selected on that client.
MONGO_DB = 'taobao'

# Name of the collection the scraped product dicts are inserted into.
MONGO_TABLE = 'product'

 

 

 

运行结果:

 数据库:

 

 

 

分类:

技术点:

相关文章: