smbl

在学习了网易云课堂上崔庆才老师的《Python3爬虫三大案例实战分享》之后，我模仿着写了一段代码。PhantomJS 和 MongoDB 还没学，暂时没有放进去：浏览器直接使用 Chrome，数据存储则用 pandas 导出为 Excel 文件来代替。

 1 from selenium import webdriver
 2 from selenium.common.exceptions import TimeoutException
 3 from selenium.webdriver.common.by import By
 4 from selenium.webdriver.support.ui import WebDriverWait
 5 from selenium.webdriver.support import expected_conditions as EC
 6 import re
 7 from pyquery import PyQuery as pq
 8 import pandas as pd
 9 
# A single Chrome session is shared by every function in this script.
browser = webdriver.Chrome()
# Explicit waits on that session use a timeout of 10 (seconds, per Selenium).
wait = WebDriverWait(browser, timeout=10)
# Accumulates the product dicts returned by get_products() for every page.
totaldata = list()
def search(keyword='鸡蛋', max_retries=5):
    """Search Taobao for *keyword* and collect the first result page.

    Loads the Taobao home page, types the keyword into the ``#q`` search
    box, submits the form, waits for the pager's "total" element and then
    appends the products returned by ``get_products()`` to the module-level
    ``totaldata`` list.

    Args:
        keyword: Text typed into the search box. Defaults to the keyword
            the script originally hard-coded.
        max_retries: How many times the whole sequence is retried after a
            ``TimeoutException`` before giving up.

    Returns:
        The text of the pager's "total" element (the caller extracts the
        page count from it).

    Raises:
        TimeoutException: If every attempt timed out.
    """
    # A bounded loop replaces the original unbounded self-recursion, which
    # could only end in RecursionError when the page never became ready.
    for attempt in range(max_retries + 1):
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button"))
            )
            search_box.send_keys(keyword)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
            )
            # Products are appended only after the page was scraped
            # successfully, so a retried attempt never duplicates rows.
            totaldata.extend(get_products())
            return total.text
        except TimeoutException:
            if attempt == max_retries:
                raise
32 
def next_page(page_number, max_retries=5):
    """Jump to result page *page_number* and collect its products.

    Types the page number into the pager's input box, clicks the pager's
    submit button, waits until the active pager item shows that number and
    then appends the products returned by ``get_products()`` to the
    module-level ``totaldata`` list.

    Args:
        page_number: The result page to navigate to.
        max_retries: How many times the whole sequence is retried after a
            ``TimeoutException`` before giving up.

    Returns:
        None.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    page_text = str(page_number)
    # A bounded loop replaces the original unbounded self-recursion, which
    # could only end in RecursionError when the pager never responded.
    for attempt in range(max_retries + 1):
        try:
            page_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input"))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit"))
            )
            page_box.clear()
            page_box.send_keys(page_text)
            submit.click()
            # Proceed only once the highlighted pager item shows the
            # requested page number.
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), page_text))
            totaldata.extend(get_products())
            return
        except TimeoutException:
            if attempt == max_retries:
                raise
49 
def get_products():
    """Scrape the product entries from the page currently loaded in ``browser``.

    Waits until at least one element matching the item selector is present,
    parses the page source with pyquery and builds one dict per item.

    Returns:
        A list of dicts with the keys ``image``, ``price``, ``deal``,
        ``title``, ``shop`` and ``location``.
    """
    item_selector = '#mainsrp-itemlist .items .item'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    page = pq(browser.page_source)

    def to_record(node):
        # Text of the first descendant(s) matching a CSS selector.
        def text_of(css):
            return node.find(css).text()

        return {
            'image': node.find('.pic .img').attr('src'),
            'price': text_of('.price').replace('\n', ''),
            # The last three characters are dropped (presumably a trailing
            # unit suffix after the count -- confirm against the live page).
            'deal': text_of('.deal-cnt')[:-3],
            'title': text_of('.title').replace('\n', ''),
            'shop': text_of('.shop'),
            'location': text_of('.location'),
        }

    return [to_record(node) for node in page(item_selector).items()]
67 
def main():
    """Crawl every result page and export the collected products to Excel.

    Runs the search once, reads the page count from the pager text that
    ``search()`` returns, visits pages 2..N with ``next_page()`` and writes
    the accumulated ``totaldata`` to ``taobaoeggs.xlsx``.

    Raises:
        ValueError: If no number can be found in the pager text.
    """
    try:
        # search() must be called exactly once: each call re-runs the query
        # and appends the first page's products to totaldata, so the
        # original double call duplicated every first-page row.
        total_text = search()
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'could not find a page count in pager text: %r' % (total_text,)
            )
        total = int(match.group(1))
        for i in range(2, total + 1):
            next_page(i)
        df = pd.DataFrame(totaldata)
        df.to_excel('taobaoeggs.xlsx')
    finally:
        # Always end the browser session, even when the crawl fails partway.
        browser.quit()


if __name__ == '__main__':
    main()

 



分类:

技术点:

相关文章: