1 # -*- coding: utf-8 -*- 2 # @Time : 2020/9/11 16:23 3 # @Author : Chunfang 4 # @Email : 3470959534@qq.com 5 # @File : amazon_bestseller_cate_url.py 6 # @Software: PyCharm 7 8 import random,requests 9 import re 10 11 def secend_cates_url(url):#正则匹配二级标题 12 # print(url) 13 page_data = get_data(url) 14 url_cates = re.findall("<li><a href=\'(https://www.amazon.com/Best.*?)\'>(.*?)</a></li>", page_data, re.S) 15 # print(\'二级标题有\',url_cates) 16 url_cate_all.append(url_cates) 17 # print(page_data) 18 19 def get_html_data(page_data):#正则匹配一级标题 20 url_cates = re.findall("<li><a href=\'(https://www.amazon.com/Best.*?)\'>(.*?)</a></li>",page_data,re.S) 21 # print(\'一级标题有\',url_cates) 22 url_cate_all.append(url_cates) 23 # secend_cates_url(url_cates[0][0]) 24 for i in range(len(url_cates)): 25 secend_cates_url(url_cates[i][0]) 26 27 def randHeader(): 28 head_connection = [\'Keep-Alive\', \'close\'] 29 head_accept = [\'text/html, application/xhtml+xml, */*\'] 30 head_accept_language = [\'zh-CN,fr-FR;q=0.5\', \'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3\'] 31 head_user_agent = [\'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko\', 32 \'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36\', 33 \'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36\', 34 \'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)\', 35 \'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1\', 36 \'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3\', 37 \'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12\', 38 \'Opera/9.27 (Windows NT 5.2; U; zh-cn)\', 39 \'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0\', 40 \'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)\', 41 \'Mozilla/5.0 (Windows; U; 
Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6\', 42 \'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)\', 43 \'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)\', 44 \'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)\', 45 \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 \', 46 \'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)\', 47 \'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 \', 48 \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER\', 49 \'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)\', 50 \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11\'] 51 52 header = { 53 \'Connection\': head_connection[0], 54 \'Accept\': head_accept[0], 55 \'Accept-Language\': head_accept_language[1], 56 \'User-Agent\': head_user_agent[random.randrange(0, len(head_user_agent))] 57 } 58 return header 59 60 def get_data(url):#获取页面数据 61 headers = randHeader() 62 page_data = requests.get(url, headers, timeout=20) 63 import html 64 page_data = html.unescape(page_data.text) 65 return page_data 66 67 def save_to_excel(url_cate_all): 68 url_cate_alls = [] 69 for i in range(len(url_cate_all)): 70 for j in range(len(url_cate_all[i])): 71 # print(\'所有的标题链接:\',url_cate_all[i][j]) 72 url_cate_alls.append(url_cate_all[i][j]) 73 url_cate_all_only = list(set(url_cate_alls)) 74 # print(\'唯一一个链接和分类:\',url_cate_all_only) 75 return 
url_cate_all_only 76 77 def url_cate_all_only(): 78 global url_cate_all 79 url_cate_all = [] 80 url = \'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_unv_3_9522931011_1\' 81 page_data = get_data(url) 82 # print(page_data) 83 get_html_data(page_data) 84 # print(url_cate_all) 85 url_cate_all_only = save_to_excel(url_cate_all) 86 return url_cate_all_only
# -*- coding: utf-8 -*-
# @Time : 2020/9/9 17:30
# @Author : Chunfang
# @Email : 3470959534@qq.com
# @File : amazon_best_sellers.py
# @Software: PyCharm

import html
import os
import random
import re

import requests
from openpyxl import load_workbook

from amazon_bestseller_cate_url2 import url_cate_all_only


def down_imgs(url_xuhao, url_img, pro_name):
    """Download each best-seller product image into <cwd>/images_amazon/.

    url_xuhao -- list of rank strings ('1', '2', ...); used in file names.
    url_img   -- parallel list of image URLs.
    pro_name  -- category name, prefixed to each file name.

    The images_amazon directory must be created by hand beforehand (same
    requirement as the original). A failed download skips that image.
    """
    for i, (rank, img_url) in enumerate(zip(url_xuhao, url_img), start=1):
        print('正在下载第' + str(i) + '张图片,图片地址:' + str(img_url))
        try:
            # Bug fix: headers must be passed by keyword; the original passed
            # the dict positionally, which binds it to `params` (the query
            # string) and sends default headers instead.
            pic = requests.get(img_url, headers=randHeader(), timeout=10)
        except requests.exceptions.RequestException:
            # Broadened from ConnectionError: a single timeout no longer
            # aborts the whole crawl.
            print('错误!当前图片无法下载')
            continue
        # os.path.join instead of hand-built '\\'-separated strings
        # (portable, and avoids fragile backslash escapes).
        img_path = os.path.join(cwd, 'images_amazon',
                                pro_name + '_' + rank + '.jpg')
        with open(img_path, 'wb') as file:
            file.write(pic.content)


def save_pro_to_excel(products_inf, pro_name):
    """Append one category's scraped product columns to the shared worksheet.

    products_inf is column-ordered: [ranks, titles, prices, links, image
    links]. Column 1 gets the category name; columns 2..6 one list each,
    starting below the current last row.
    NOTE(review): the regex result lists can differ in length, so rows may
    come out ragged — unchanged from the original behavior.
    """
    t = ws.max_row  # last occupied row before this category's block
    for col_offset, column in enumerate(products_inf):
        for row_offset, value in enumerate(column):
            ws.cell(t + 1 + row_offset, 1).value = pro_name
            ws.cell(t + 1 + row_offset, col_offset + 2).value = value
    wb.save(path)


def down_products(result, pro_name):
    """Scrape rank/title/price/link/image-URL lists from one best-seller
    page, persist them to Excel, then download the images."""
    url_title = re.findall('<div class=".*?" aria-hidden=".*?" data-rows=".*?">\n (.*?)\n </div>', result, re.S)
    url_pro = re.findall('<div class="a-row"><a class="a-link-normal a-text-normal" href="(.*?)"><span class="a-size-base a-color-price">', result, re.S)
    url_price = re.findall('<span class="a-size-base a-color-price"><span class=.*?>(.*?)</span>', result, re.S)
    url_xuhao = re.findall('<span class="zg-badge-text">#(.*?)</span></span>', result, re.S)
    url_img = re.findall('<div class="a-section a-spacing-small"><img alt=".*?src="(https.*?)" height="200" width="200"></div></span>', result, re.S)

    # Column order expected by save_pro_to_excel:
    # rank, title, price, product link, image link.
    products_inf = [url_xuhao, url_title, url_price, url_pro, url_img]
    print(products_inf)

    save_pro_to_excel(products_inf, pro_name)
    down_imgs(url_xuhao, url_img, pro_name)


def randHeader():
    """Return an HTTP header dict with a randomly chosen User-Agent.

    Connection / Accept / Accept-Language are the fixed values the original
    code always selected; only the User-Agent is randomized. (Duplicated in
    amazon_bestseller_cate_url — kept for drop-in compatibility.)
    """
    head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                       'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                       'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                       'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                       'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                       'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

    header = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': random.choice(head_user_agent),
    }
    return header


def start_url(pro_name, url):
    """Fetch one best-seller page and scrape it: rank, title, lowest price,
    product link, image link (via down_products)."""
    # Bug fix: headers by keyword — same params-vs-headers bug as down_imgs.
    result = requests.get(url, headers=randHeader(), timeout=20)
    down_products(html.unescape(result.text), pro_name)


if __name__ == '__main__':
    cwd = os.getcwd()
    # The workbook must already exist next to the script.
    path = os.path.join(cwd, 'AmazonBestsellers.xlsx')
    wb = load_workbook(path)
    ws = wb.worksheets[0]

    # Header row: category, rank, title, lowest price, link, image link.
    table_titles = ['产品类别', '序号', '产品标题', '产品最低价格', '产品链接', '产品图片链接']
    for i, table_title in enumerate(table_titles):
        ws.cell(1, i + 1).value = table_title
    wb.save(path)

    # Unique (url, category-name) pairs from the category crawler; keep URLs.
    amazon_urls = [pair[0] for pair in url_cate_all_only()]
    print(len(amazon_urls))
    print(amazon_urls)

    for amazon_url in amazon_urls:
        # Path segment 3 looks like 'Best-Sellers-<Category>'; the [13:]
        # slice drops the 'Best-Sellers-' prefix to get the category name.
        seg = amazon_url.split('/')[3]
        print(seg)
        print(seg[13:])
        start_url(seg[13:], amazon_url)