我不太确定自己是否完全理解了问题,但下面这段代码会返回 856 种葡萄酒:
import requests
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
# Scrape wine name / rating / price from the Vivino explore API for every
# country returned by the countries endpoint, collecting the results in `df`.

# Browser-like User-Agent sent with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"}
url = 'https://www.vivino.com/'
# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# Get Cache key to get country codes and type of wines.
# The key is read out of the inline <script> that declares `var vivinoCacheKey`.
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(response.text, 'html.parser')
# `string=` is the current name of the deprecated `text=` argument.
script = soup.find('script', string=re.compile('var vivinoCacheKey'))
# Take what follows "vivinoCacheKey = " up to the next ';' and drop the quotes.
vivinoCacheKey = str(script).split('vivinoCacheKey = ')[-1].split(';')[0].replace("'", '').strip()

# Get countries
api_url = 'https://www.vivino.com/api/countries'
payload = {'cache_key': vivinoCacheKey}
countryData = requests.get(
    api_url, headers=headers, params=payload, timeout=REQUEST_TIMEOUT
).json()['countries']

rows = []

# Iterate through countries and wine types
api_url = 'https://www.vivino.com/api/explore/explore'
for country in countryData:
    try:
        payload = {
            "country_code": country['code'].upper(),
            "currency_code": country['currency']['code'],
            # NOTE(review): the sample output suggests grape id 131 is Bobal - confirm.
            'grape_ids[]': '131',
            "grape_filter": "varietal",
            "min_rating": "1",
            "order_by": "ratings_count",
            "order": "desc",
            "page": '1',
            "price_range_max": "1000",
            "price_range_min": "1",
        }
        jsonData = requests.get(
            api_url, params=payload, headers=headers, timeout=REQUEST_TIMEOUT
        ).json()
        # NOTE(review): assumes the API serves 100 records per page - confirm.
        total_pages = math.ceil(jsonData['explore_vintage']['records_matched'] / 100)

        for page in range(1, total_pages + 1):
            # Page 1 was already fetched above to learn the record count.
            if page != 1:
                payload.update({'page': page})
                jsonData = requests.get(
                    api_url, params=payload, headers=headers, timeout=REQUEST_TIMEOUT
                ).json()

            for each in jsonData['explore_vintage']['records']:
                name = each['vintage']['name']
                rating = each['vintage']['statistics']['ratings_average']
                price = each['price']['amount']
                rows.append({'name': name, 'rating': rating, 'price': price})

            print('Aquired page: %s - %s ' % (country['code'].upper(), page))
    except (requests.RequestException, KeyError, TypeError, ValueError) as err:
        # Best effort: a country whose request fails or whose payload is
        # missing expected fields is skipped (rows gathered so far are kept).
        # Only the expected failure types are caught so Ctrl-C and genuine
        # programming errors are no longer silently swallowed.
        label = country.get('code') if isinstance(country, dict) else country
        print('Skipping country %s: %r' % (label, err))
        continue

df = pd.DataFrame(rows)
输出:
print(df)
name rating price
0 Mustiguillo Finca Terrerazo 2017 4.2 30.83
1 Beso de Rechenna Bobal Crianza 2016 3.6 10.16
2 Bruno Murciano Cambio de Tercio Bobal 2019 3.8 12.70
3 Mustiguillo Quincha Corral 2016 4.4 106.35
4 Finca Sandoval Signo Bobal de Manchuela 2008 3.7 48.91
.. ... ... ...
851 Mustiguillo Finca Terrerazo 2016 4.1 20.88
852 Pasión Bobal 2017 3.8 12.00
853 Chozas Carrascal Las 2 Ces Barrica Tinto 2012 3.3 8.00
854 Mustiguillo Finca Terrerazo 2017 4.2 20.66
855 De Moya Justina 2018 3.9 6.48
[856 rows x 3 columns]
另一种做法基于这样一个事实:每次在列表中选择一个国家时,网站都会创建一个新的会话 cookie。我可以拿到最初的那个 cookie,但要获得特定国家的 cookie,似乎只能用 Selenium 模拟这一选择操作,然后再读取该 cookie。另外需要注意:如果把最低价格设为 0,网站就不会返回价格,这似乎是有意设计的,我不清楚他们为什么这样做。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import time
import math
import pandas as pd
# Use a real browser session to select the United States on Vivino's explore
# page, copy the resulting cookies, then query the explore API with them and
# collect wine name / rating / price in `df`.

url = "https://www.vivino.com/explore"
# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
try:
    driver.maximize_window()
    driver.get(url)

    # If Cookie Notice pop up, then click on OK.
    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises, so this is the safe way to test for presence.
    cookie_buttons = driver.find_elements(
        By.XPATH, '//div[contains(@class, "cookieNotice")]//button'
    )
    if cookie_buttons:
        cookie_buttons[0].click()

    # Select Dropdown menu
    driver.find_element(
        By.XPATH, '//div[contains(@class, "simpleLabel__selectedKey")]'
    ).click()

    # Click on United States and wait for page to render
    driver.find_element(By.XPATH, "//a[@data-value='US']").click()
    time.sleep(5)

    cookies_list = driver.get_cookies()
finally:
    # The cookies have been copied out; close the browser so the driver
    # process is not left running, even if a step above failed.
    driver.quit()

# Serialise the browser cookies as "name=value;" pairs for the Cookie header.
cookieStr = ''.join('%s=%s;' % (each['name'], each['value']) for each in cookies_list)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    'cookie': cookieStr,
}

rows = []

# Query the explore API for the US using the session cookies gathered above.
api_url = 'https://www.vivino.com/api/explore/explore'
payload = {
    "country_code": 'US',
    "currency_code": 'USD',
    'grape_ids[]': '131',
    "grape_filter": "varietal",
    "min_rating": "1",
    "order_by": "ratings_count",
    "order": "desc",
    "page": '1',
    "price_range_max": "1000",
    "price_range_min": "1",
}

jsonData = requests.get(
    api_url, params=payload, headers=headers, timeout=REQUEST_TIMEOUT
).json()
# NOTE(review): assumes the API serves 100 records per page - confirm.
total_pages = math.ceil(jsonData['explore_vintage']['records_matched'] / 100)

for page in range(1, total_pages + 1):
    # Page 1 was already fetched above to learn the record count.
    if page != 1:
        payload.update({'page': page})
        jsonData = requests.get(
            api_url, params=payload, headers=headers, timeout=REQUEST_TIMEOUT
        ).json()

    for each in jsonData['explore_vintage']['records']:
        name = each['vintage']['name']
        rating = each['vintage']['statistics']['ratings_average']
        # A record may come without price information; store None then.
        try:
            price = each['price']['amount']
        except (KeyError, TypeError):
            price = None
        rows.append({'name': name, 'rating': rating, 'price': price})

    print('Aquired page %s of %s ' % (page, total_pages))

df = pd.DataFrame(rows)