【发布时间】:2021-01-13 11:27:15
【问题描述】:
我有以下代码
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import time
# url = 'https://www.pccomponentes.com/procesadores?page='
url_list = [
'https://www.pccomponentes.com/procesadores?page=',
'https://www.pccomponentes.com/discos-duros/500-gb/conexiones-m-2/disco-ssd/internos?page=',
'https://www.pccomponentes.com/discos-duros/1-tb/conexiones-m-2/disco-ssd/internos?page=',
'https://www.pccomponentes.com/placas-base/amd-b550/atx?page=',
'https://www.pccomponentes.com/placas-base/amd-x570/atx?page=',
'https://www.pccomponentes.com/memorias-ram/16-gb/kit-2x8gb?page=',
'https://www.pccomponentes.com/ventiladores-cpu?page=',
'https://www.pccomponentes.com/fuentes-alimentacion/850w/fuente-modular?page=',
'https://www.pccomponentes.com/fuentes-alimentacion/750w/fuente-modular?page=',
'https://www.pccomponentes.com/cajas-pc/atx/con-ventana/sin-ventana?page='
]
# store = 'PCComponentes'
# df_hold_list = [] # capture dataframe for each link
# extraction_date = datetime.datetime.now()
for url in url_list:
for x in range(1,2):
headers = ({'User-Agent':
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Accept-Language': 'es-ES, es;q=0.5'})
r = requests.get(url + str(x), headers = headers)
print(r.status_code)
soup = BeautifulSoup(r.content,'html.parser')
# print(soup)
items = soup.find_all('div',class_='col-xs-6 col-sm-4 col-md-4 col-lg-4')
# print(product)
store = ['PCComponentes']
df_list =[]
df_hold_list = []
df_final =[]
extraction_date = datetime.datetime.now()
for item in items:
product_name = item.find('h3',class_ = 'c-product-card__title').text
try:
price = item.find('div', class_ = 'c-product-card__prices-actual cy-product-price-normal').text[:-1]
except AttributeError:
price = item.find('div', class_ = 'c-product-card__prices-actual c-product-card__prices-actual--discount cy-product-price-discount').text[:-1]
try:
old_price = item.find('div',class_ = 'c-product-card__prices-pvp cy-product-price-normal').text[:-1]
except AttributeError:
old_price = "Sin descuento"
# try:
# availability = item.find('div', class_ = 'c-product-card__availability disponibilidad-inmediata cy-product-availability-date').text.strip()
# except AttributeError:
# availability = item.find('div', class_ = 'c-product-card__availability disponibilidad-moderada cy-product-availability-date').text.strip()
# except AttributeError:
# availability = "Sin Fecha"
try:
rating = item.find('span',class_ = 'c-star-rating__text cy-product-text').text
except AttributeError:
"Sin valoracion"
try:
reviews = item.find('span',class_ = 'c-star-rating__text cy-product-rating-result').text
except AttributeError:
"Sin reviews"
try:
brand = item.find('article')['data-brand']
except AttributeError:
"Sin Marca"
try:
category = item.find('article')['data-category']
except AttributeError:
"Sin Categoria"
# if None in (product_name, price, availability, rating, reviews, brand, category):
# continue
print(product_name, price, old_price, rating, reviews, brand, category, store, extraction_date)
df = pd.DataFrame (
{
'product_name' : product_name,
'price' : price,
#'availability' : availability,
'rating' : rating,
'reviews' : reviews,
'brand' : brand,
'category' : category,
'store' : store,
'date_extraction' : extraction_date,
})
df_list.append(df)
time.sleep(3)
df_hold_list.append(df)
data_PCCOMP = pd.concat(df_hold_list, axis=0)
store = 'PCComponentes'
# site = ‘mysite’
path = '/home/pi/Documents/WebScraping Files/pccomp/'
mydate = extraction_date.strftime('%Y%m%d')
mytime = extraction_date.strftime('%H%M%S')
filename = path+store+'_'+mydate+'_'+mytime+".csv"
data_PCCOMP.to_csv(filename)
print(data_PCCOMP)
代码在一组网页上循环,这些网页在页面上分页并提取数据以收集到数据框中。
最后将收集到的所有数据都插入到一个scv中。
它运行良好,但我无法附加数据帧以仅获取一个包含所有数据的 csv。
我需要帮助来实现我的目标,任何帮助都将不胜感激。
提前致谢。
问候。
【问题讨论】:
标签: python pandas dataframe web-scraping