tripadvisor 抓取餐厅 URL 和电子邮件答案

【问题标题】：tripadvisor scrape restaurant URL and e-mail（tripadvisor 抓取餐厅 URL 和电子邮件）
【发布时间】：2017-05-28 16:29:06
【问题描述】：

我正尝试通过编写一个 TripAdvisor 爬虫（scraper）来提高自己的 Python 技能。目前，这个爬虫可以抓取某个城市的餐厅，并将餐厅名称和对应的 TripAdvisor URL 保存到一个 Excel 文件中。不过，我想请教一下：该如何同时保存餐厅的电子邮件地址和餐厅自己的直接网址（官网 URL）？

有人能就此提供一些建议吗？谢谢！

import requests

from tkinter import *
from bs4 import BeautifulSoup as b
from bs4 import Comment as com
from openpyxl import Workbook
# city_name = 'London_England'
# geo_code = '186338'


def o_and_t():
    """START-button callback: capture the three form fields and close the window.

    Reads the Entry widgets ``e_1``, ``e_2`` and ``e_3`` (created by ``gui``)
    and publishes their contents as the module globals ``nameFile`` (with
    ``.xlsx`` appended), ``geo_code`` and ``city_name``. The Tk window
    ``root`` is then destroyed.
    """
    global nameFile, geo_code, city_name
    # Read the widgets before destroy(); they are unusable afterwards.
    file_stem, code, city = e_1.get(), e_2.get(), e_3.get()
    nameFile = file_stem + '.xlsx'
    geo_code = code
    city_name = city
    root.destroy()

def gui():
    """Show the input dialog and block until the window is closed.

    Builds a 500x230 black Tk window containing a banner, three labelled
    Entry fields (output file name, geo code taken from the URL, and the
    city/country slug taken from the URL) and a START button wired to
    ``o_and_t``.

    The window and the three entries are published as the module globals
    ``root``, ``e_1``, ``e_2`` and ``e_3`` because ``o_and_t`` reads them.
    """
    global root, e_1, e_2, e_3

    # The family was previously misspelled "Helevetica", so the requested
    # font could never be matched; use the real family name.
    font_family = "Helvetica"
    colours = {'bg': 'black', 'fg': 'white'}

    root = Tk()
    root.geometry('500x230')
    root.configure(bg='black')
    root.title('Enter Details')

    banner = Label(root, text='\t\tTripAdvisorScraper\n \t\t ~by a1b2t',
                   font=(font_family, 14), **colours)
    banner.place(x=0, y=0)

    # (prompt text, font size, y position) for each label/entry row.
    # Font sizes are kept exactly as they were, including 12 on the last row.
    rows = (
        ('Please Enter the FileName : ', 11, 60),
        ('Please enter the code from the url : ', 11, 90),
        ('Please enter the city and country as in url :', 12, 120),
    )
    entries = []
    for prompt, size, y_pos in rows:
        Label(root, text=prompt, font=(font_family, size),
              **colours).place(x=0, y=y_pos)
        field = Entry(root)
        field.place(x=320, y=y_pos)
        entries.append(field)
    e_1, e_2, e_3 = entries

    Button(root, text='START', command=o_and_t).place(x=220, y=170)

    # Blocks here until o_and_t() destroys the window (or the user closes it).
    root.mainloop()
    return None


# Ask the user for the output file name, geo code and city slug.
gui()

# o_and_t() only runs when START is pressed; closing the window any other
# way leaves the inputs undefined, so stop with a clear message instead of
# failing later with a NameError.
if 'geo_code' not in globals():
    raise SystemExit('No details were entered - nothing to scrape.')

print('\n\n\tStarting Scraper\t\n\n')

main_url = 'https://www.tripadvisor.co.uk/Restaurants-g{}-{}.html'.format(geo_code, city_name)

# Examples of the paginated AJAX endpoint used below (the number after
# "o=a" is the result offset, stepping by 30):
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a150&availSearchEnabled=false
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a60&availSearchEnabled=false
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a30&availSearchEnabled=false

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors (e.g. a blocked request)
# instead of parsing an error page as if it were the listing.
req_1 = requests.get(main_url, timeout=30)
req_1.raise_for_status()
soup = b(req_1.content, 'html.parser')

# Listings that fit on one page have no pagination links; treat that as a
# single page rather than crashing with IndexError on [-1].
page_links = soup.find_all('a', class_="pageNum taLnk")
last_page = int(page_links[-1]['data-page-number']) if page_links else 1
# NOTE(review): the "+ 1" is kept from the original. If data-page-number on
# the last link is already the page count, this makes the scraper request
# one offset past the end - confirm against the live markup.
total_pages = last_page + 1
print(total_pages)
RESULTS = []

# Walk the paginated AJAX listing 30 results at a time and collect
# [restaurant name, TripAdvisor URL] pairs into RESULTS.
for page_no in range(0, total_pages * 30, 30):
    url = 'https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo={}&ajax=1&itags=10591&sortOrder=relevance&o=a{}&availSearchEnabled=false'.format(geo_code, page_no)
    # Without a timeout a single stalled response would hang the whole run.
    req_2 = requests.get(url, timeout=30)
    soup_2 = b(req_2.content, 'html.parser')
    temp = soup_2.find_all('a', class_="property_title")
    for t in temp:
        # Anchor text is padded with newlines/tabs in the markup; strip them.
        r_name = t.text.replace('\n', '').replace('\t', '')
        # hrefs are site-relative, so prefix the host to get a full URL.
        r_url = 'https://www.tripadvisor.com' + t['href']
        # Printed as encoded bytes, as in the original - presumably to avoid
        # console encoding errors on non-ASCII names.
        print(str([r_name, r_url]).encode())
        RESULTS.append([r_name, r_url])


# Write one spreadsheet row per scraped restaurant; skip the file entirely
# when nothing was collected.
if RESULTS:
    wb = Workbook(write_only=True)
    ws = wb.create_sheet()
    for row in RESULTS:
        ws.append(row)
    wb.save(nameFile)
# Report how many restaurants were collected.
print(len(RESULTS))

【问题讨论】：

标签： python web-scraping

【解决方案1】：

您只需再逐个请求已经抓取到的每个餐厅 URL，并在返回的页面中查找 class="detail_section info" 的元素即可。

【讨论】：