hd-zg

最近项目有些需求,抓取淘宝的搜索商品,抓取的品类还多。直接用selenium+PhantomJS 抓取淘宝搜索商品,快速完成。

# -*- coding:utf-8 -*-
__author__ = ''

# Standard library
import logging
import re
import time

# Third-party
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

# Project-local
from config import retry_count
from dbutils import mysql_util
from tianmao.data_tmall import keywords

"""
Crawl Tmall search results.
"""
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] [%(filename)s] [%(threadName)s] '
           '[line:%(lineno)d] [%(funcName)s] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')


class tianmao_spider(object):
    """Crawl Tmall (tmall.com) search results with Selenium + PhantomJS.

    For every (category, keyword) pair in ``tianmao.data_tmall.keywords``
    it searches Tmall, pages through the results and stores each product
    row into MySQL via ``dbutils.mysql_util``.

    NOTE(review): PhantomJS support has been removed from recent Selenium
    releases — migrate to headless Chrome/Firefox when upgrading.
    """

    def __init__(self):
        # PhantomJS tuning: cache pages on disk, skip image downloads.
        self.SERVICE_ARGS = ['--disk-cache=true', '--load-images=false']
        self.target_url = 'https://www.tmall.com/'
        self.browser = webdriver.PhantomJS(service_args=self.SERVICE_ARGS)
        self.wait = WebDriverWait(self.browser, 10)  # 10-second explicit-wait timeout
        self.browser.set_window_size(1400, 900)
        self.mysql_util = mysql_util()

    def search(self, category, keyword, page=2):
        """Search ``keyword`` from the Tmall home page.

        When ``page == 2`` (i.e. we are starting fresh, not resuming) the
        first result page is scraped immediately.  Retries up to
        ``retry_count`` times on timeout; after the last failure the
        (category, keyword) pair is appended to ``tmall_retry_crawl.txt``
        for a later re-crawl.

        Returns the total number of result pages, or 0 if every attempt
        timed out.
        """
        print('正在搜索:{0}'.format(keyword))
        total = 0
        for attempt in range(retry_count):
            try:
                self.browser.get(self.target_url)
                # "input" renamed: it shadowed the builtin.
                search_box = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#mq"))
                )
                submit = self.wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mallSearch > form > fieldset > div > button')))
                search_box.send_keys(keyword)
                submit.click()
                pager = self.wait.until(
                    EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR,
                         '#content > div > div.ui-page > div > b.ui-page-skip > form')))
                if page == 2:  # starting from the beginning: scrape page 1 now
                    self.get_products(category, keyword)
                # The pager form's text contains the page count; grab the first number.
                total = int(re.compile(r'(\d+)').search(pager[0].text).group(1))
                break
            except TimeoutException as e:
                logging.info("正在重试第{0}次,出现:{1}".format(attempt + 1, e))
                if attempt == retry_count - 1:
                    # Record the failed pair; "with" guarantees the file is closed
                    # (the original leaked the handle).
                    with open('tmall_retry_crawl.txt', 'a', encoding='utf-8') as f:
                        f.write(category + ":" + keyword + '\n')
                time.sleep(1)
        return total

    def get_products(self, category, keyword):
        """Parse the currently loaded result page and persist every product."""
        self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#J_ItemList')))
        doc = pq(self.browser.page_source)
        for item in doc('#J_ItemList .product').items():
            # Lazy-loaded images keep the URL in data-src instead of src.
            img_url = item.find('img').attr('src')
            if img_url is None:
                img_url = item.find('img').attr('data-src')
            if img_url is not None and not str(img_url).startswith("http"):
                img_url = 'http:{0}'.format(img_url)  # protocol-relative URL
            img_save_path = ''  # image download currently disabled
            # Detail page URL (also protocol-relative on Tmall).
            item_url = item.find('a').attr('href')
            logging.info('详情页面url:{0}'.format(item_url))
            if item_url is None:
                item_detail = ''
            else:
                if not str(item_url).startswith('http'):
                    item_url = "https:" + item_url
                item_detail = ''  # detail-page crawling currently disabled
            product = {
                'target': 'tmall',
                'category': category,
                'keyword': keyword,
                'item_url': item_url,
                'image_url': img_url,
                'image_save_path': img_save_path,
                'title': item.find('div > div.productTitle').text(),
                'price': item.find('div > p.productPrice').text(),
                # Sales text carries an "阿里旺旺" contact label — strip it.
                'deal': item.find('div > p.productStatus').text().replace('阿里旺旺', '').strip(),
                'shop': item.find('div > div.productShop').text(),
                'location': '',
                'item_detail': item_detail,
                'create_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
            }
            # NOTE: the "sava" typo is part of the dbutils API; not renamed here.
            self.mysql_util.sava_to_mysql('t_tmall', product)

    def next_page(self, page_number, category, keyword):
        """Jump to ``page_number`` via the pager's skip-to form and scrape it.

        Retries up to ``retry_count`` times on timeout; a page that never
        loads is skipped rather than aborting the whole crawl.
        """
        skip_input_css = ("#content > div > div.ui-page > div > "
                          "b.ui-page-skip > form > input.ui-page-skipTo")
        skip_button_css = ("#content > div > div.ui-page > div > "
                           "b.ui-page-skip > form > button")
        for _ in range(retry_count):
            try:
                page_input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, skip_input_css)))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, skip_button_css)))
                page_input.clear()
                page_input.send_keys(page_number)
                submit.click()
                # The skip button becoming clickable again signals the new page loaded.
                self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, skip_button_css)))
                self.get_products(category, keyword)
                break
            except TimeoutException as e:
                logging.info(e)
                time.sleep(0.5)

    def start_crawler(self, page=2):
        """Crawl every (category, keyword) pair end to end.

        ``page`` allows resuming an interrupted run from a specific page;
        after the first keyword it is reset to 2 so subsequent keywords
        start from the beginning.
        """
        try:
            for category, keyword_list in keywords.items():
                for keyword in keyword_list:
                    total = self.search(category, keyword, page=page)
                    for i in tqdm(range(page, total + 1)):
                        print('总共{0}页,正在翻第{1}页,抓取类别:{2},搜索关键字:{3}'.format(
                            total, i, category, keyword))
                        self.next_page(i, category, keyword)
                        if page != 2:  # resumed mid-crawl: later keywords restart at page 2
                            page = 2
                        time.sleep(0.5)
        except Exception as e:
            # Top-level boundary: log and fall through to browser teardown.
            print(e)
        finally:
            # quit() (not close()) also terminates the PhantomJS process,
            # so no headless browser is left running.
            self.browser.quit()


if __name__ == '__main__':
    # Script entry point: launch the browser and crawl all configured keywords.
    tmall = tianmao_spider()
    tmall.start_crawler()

分类:

技术点:

相关文章: