由于PhantomJS已经停止更新,所以使用chrome浏览器的headless模式代替,代码如下:
from selenium import webdriver

# Start Chrome in headless mode (no visible window) and print the URL that
# the browser ends up on after loading the page.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get('https://www.baidu.com/')
    print(browser.current_url)
finally:
    # Always shut the driver down; otherwise the headless Chrome process can
    # be left running in the background after the script finishes.
    browser.quit()
爬取淘宝的代码:
别人的代码:
import re

from pymongo import MongoClient
from pymongo.errors import PyMongoError
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Upper bound on attempts per page, so a page that never loads cannot make
# the script retry forever.
MAX_RETRIES = 3

client = MongoClient()
db = client['MONGO_DB']
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)


def search(retries=MAX_RETRIES):
    """Open Taobao, search for 'ipad' and scrape the first result page.

    :param retries: maximum number of attempts before giving up.
    :return: text of the pager's "total" element (contains the page count).
    :raises TimeoutException: if every attempt timed out.
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys('ipad')
            submit.click()
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            get_products()
            return total.text
        except TimeoutException:
            print('timeout!')
            if attempt >= retries:
                raise


def next_page(page_num, retries=MAX_RETRIES):
    """Jump to result page ``page_num`` and scrape it.

    The page number is typed into the pager form and submitted; the jump is
    considered successful once the highlighted pager item shows that number.

    :param page_num: 1-based page number to jump to.
    :param retries: maximum number of attempts before the page is skipped.
    :return: True if the page was scraped, False if every attempt timed out.
    """
    for _ in range(max(1, retries)):
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
            page_box.clear()
            page_box.send_keys(page_num)
            submit.click()
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_num)))
            get_products()
            return True
        except TimeoutException:
            print('timeout!')
    return False


def get_products():
    """Parse every product on the current result page and store each one."""
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text; presumably
            # the "人付款" suffix -- verify against the live page.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into MongoDB and report the outcome.

    :param result: product dict built by get_products().
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported way to store a single document.
        db['MONGO_DB'].insert_one(result)
    except PyMongoError:
        print('存储失败', result)
    else:
        print('存储成功', result)


def main():
    """Scrape the first page, then walk through all remaining result pages."""
    try:
        total_text = search()
        # r'\d' matches a digit; the first number in the text is the page count.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            print('timeout!')
            return
        total = int(match.group(1))
        for page_num in range(2, total + 1):
            next_page(page_num)
    finally:
        # Release the browser even when scraping fails part-way through.
        browser.quit()


if __name__ == '__main__':
    main()
崔老师的代码:
from urllib.parse import quote

from pymongo import MongoClient
from pymongo.errors import PyMongoError
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

KEYWORD = 'iPad'
MAX_PAGE = 100
# Upper bound on attempts per page, so a page that never loads cannot make
# the script retry forever.
MAX_RETRIES = 3

MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'

client = MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)


def index_page(page, retries=MAX_RETRIES):
    """Load result page ``page`` for KEYWORD and scrape its products.

    The search URL is (re)loaded on every call; for pages after the first the
    page number is then typed into the pager form and submitted.

    :param page: 1-based result page number.
    :param retries: maximum number of attempts before the page is skipped.
    :return: True if the page was scraped, False if every attempt timed out.
    """
    url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
    for _ in range(max(1, retries)):
        try:
            browser.get(url)
            if page > 1:
                page_box = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager div.form > span.btn.J_Submit')))
                page_box.clear()
                page_box.send_keys(page)
                submit.click()
            # The highlighted pager item must show the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
                str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            get_products()
            return True
        except TimeoutException:
            continue
    return False


def get_products():
    """Extract the product data from the current page and store each item."""
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """Save one product document to MongoDB and report the outcome.

    :param result: product dict built by get_products().
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported way to store a single document.
        db[MONGO_COLLECTION].insert_one(result)
    except PyMongoError:
        print('存储到MongoDB失败')
    else:
        print('存储到MongoDB成功')


if __name__ == '__main__':
    try:
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
    finally:
        # Release the browser even when scraping fails part-way through.
        browser.quit()
其他人帮助的代码:
import os
import random
import time
from urllib.parse import quote

import openpyxl
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
base_url = 'https://s.taobao.com/search?q='
keywords = 'ipad'
url = base_url + quote(keywords)
wait = WebDriverWait(browser, 15)
page_max = 100
# Upper bound on attempts per page, so a page that never loads cannot make
# the script retry forever.
MAX_RETRIES = 3


def log_out(browser):
    """Log in through the Weibo login form shown on the login page.

    Despite its name, this function switches the login panel, opens the Weibo
    login link, fills in the username/password fields and submits the form.

    :param browser: the WebDriver instance (the waits use the module-level
        ``wait`` object).
    """
    login_switch = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, 'div.login-switch i#J_Quick2Static')))
    login_switch.click()
    weibo_login = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, 'a.weibo-login')))
    weibo_login.click()
    username = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'div.inp.username input')))
    password = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'div.inp.password input')))
    # Placeholder credentials; replace with a real account before running.
    username.send_keys('xxx')
    password.send_keys('xxx')
    submit = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, 'span[node-type="submitStates"]')))
    submit.click()


def get_page(page, retries=MAX_RETRIES):
    """Navigate the browser to result page ``page``.

    Page 1 is opened through the search URL (logging in first when the login
    page is shown); later pages are reached by typing the page number into
    the pager form.

    :param page: 1-based result page number.
    :param retries: maximum number of attempts before giving up.
    :return: the module-level browser once the page is loaded, or None if
        every attempt timed out.
    """
    for _ in range(max(1, retries)):
        print('正在打印 %d 页' % page)
        try:
            if page == 1:
                browser.get(url)
                if '手机扫码,安全登录' in browser.page_source:
                    log_out(browser)
            else:
                page_box = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'input.J_Input')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, 'span.J_Submit')))
                page_box.clear()
                page_box.send_keys(page)
                submit.click()
            # The highlighted pager item must show the requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 'div#mainsrp-pager ul.items li.item.active span'),
                str(page)))
            wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.m-itemlist')))
            if browser.page_source is not None:
                # Return from the retry path too; the recursive version
                # dropped this value and handed None to the caller.
                return browser
        except TimeoutException:
            continue
    return None


def get_products(browser):
    """Yield one row per product found on the browser's current page.

    :param browser: WebDriver whose ``page_source`` is parsed.
    :return: generator of ``[image, price, deal, title, shop, location]``.
    """
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        image = item.find('.pic .img').attr('data-src')
        price = item.find('.price').text().replace('\n', '')
        deal = item.find('.deal-cnt').text()
        title = item.find('.title').text()
        shop = item.find('.shop').text()
        location = item.find('.location').text().replace(' ', '')
        yield [image, price, deal, title, shop, location]


def save(out):
    """Append product rows to the keyword's .xlsx file, creating it if needed.

    :param out: iterable of rows as produced by get_products().
    """
    filename = 'taobao_' + keywords + '.xlsx'
    if not os.path.exists(filename):
        # First run: create the workbook with a header row.
        workbook = openpyxl.Workbook()
        sheet = workbook.create_sheet(index=0, title=keywords)
        sheet.append(['图片', '价格', '成交人数', '商品', '店铺', '地点'])
        workbook.save(filename)
    workbook = openpyxl.load_workbook(filename)
    sheet = workbook[keywords]
    for row in out:
        print(row)
        sheet.append(row)
    workbook.save(filename)


def main():
    """Scrape every result page and append its products to the spreadsheet."""
    try:
        for page in range(1, page_max + 1):
            loaded = get_page(page)
            if loaded is None:
                # The page never finished loading; skip it instead of crashing.
                continue
            save(get_products(loaded))
            # Random pause between pages.
            time.sleep(random.randint(1, 5))
    finally:
        # Release the browser even when scraping fails part-way through.
        browser.quit()


if __name__ == '__main__':
    main()
自己的代码:
from pymongo import MongoClient
from pymongo.errors import PyMongoError
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Create the MongoDB client, database and collection objects.
client = MongoClient()
db = client['taobao']
collection = db['taobao']

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
max_page = 100
# Upper bound on attempts for the first page, so a page that never loads
# cannot make the script retry forever.
MAX_RETRIES = 3


def index_page(page):
    """Show result page ``page`` and scrape its products.

    Page 1 is reached by opening Taobao and searching for 'ipad'. Every later
    page is reached by clicking the "下一页" (next page) link on the page that
    is currently displayed, so pages must be requested in ascending order.

    :param page: 1-based result page number.
    :return: True if the page was scraped, False if it timed out.
    """
    # Only the first page can be retried safely (it reloads from scratch);
    # clicking "next" again after a timeout could skip a page.
    attempts = MAX_RETRIES if page == 1 else 1
    for _ in range(attempts):
        try:
            if page == 1:
                browser.get('https://www.taobao.com')
                search_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     '#J_TSearchForm > div.search-button > button')))
                search_box.send_keys('ipad')
                submit.click()
            else:
                # find_element_by_link_text() was removed in Selenium 4;
                # locate the link through By.LINK_TEXT instead.
                next_link = wait.until(
                    EC.element_to_be_clickable((By.LINK_TEXT, '下一页')))
                next_link.click()
                # The highlighted pager item must show the requested page.
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'),
                    str(page)))
            print('正在爬取第', page, '页')
            get_products()
            return True
        except TimeoutException:
            print('time out!')
    return False


def get_products():
    """Extract the product data from the current page and store each item."""
    # '#id .class .class' is a descendant selector; without the '#' and the
    # spaces the selector never matches and the wait always times out.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic a img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mogo(product)


def save_to_mogo(result):
    """Save one product document to MongoDB and report the outcome.

    :param result: product dict built by get_products().
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported way to store a single document.
        collection.insert_one(result)
    except PyMongoError:
        print('保存失败', result)
    else:
        print('保存成功', result)


if __name__ == '__main__':
    try:
        for page in range(1, max_page + 1):
            if not index_page(page):
                # Later pages depend on the current one being displayed.
                break
    finally:
        # Release the browser even when scraping fails part-way through.
        browser.quit()