【发布时间】:2021-06-17 08:54:58
【问题描述】:
我对今天编写的这个脚本几乎感到满意。今天它得到了一些帮助(感谢迄今为止提供帮助的所有人)和我的一些可疑编程,但它在一定程度上是有用的。
我想将数据转储到 JSON。 除了价格(从<span></span> 获取)似乎正确转储了所有数据。我认为问题在于缩进,但我不是 100% 确定。
谁能把目光投向这个 sn-p 并纠正我看不到的地方。认为我因尝试了多种变化而无法看到正确的变化而失明。
from bs4 import BeautifulSoup
import requests
import shutil
import csv
import pandas
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
os.environ["PYTHONIOENCODING"] = "utf-8"
#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html")
time.sleep(2)
#beautiful soup requests
#URL = 'https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html'
#page = requests.get(URL)
#soup = BeautifulSoup(page.content, 'html.parser')
soup = BeautifulSoup(browser.page_source, features="lxml")
#products = soup.find_all("div", "GC62 Product")
products = soup.find_all("div", "GC62 Product")
for product in products:
#barrel lengths
barrels = product.find('select', attrs={'name': re.compile('length')})
if barrels:
barrels_list = [x['origvalue'][:2] for x in barrels.find_all('option')[1:]]
for y in range(0, len(barrels_list)):
#title
title = product.find("h3")
titleText = title.text if title else ''
#manufacturer name
manufacturer = product.find("div", "GC5 ProductManufacturer")
manuText = manufacturer.text if manufacturer else ''
#image location
img = product.find("div", "ProductImage")
imglinks = img.find("a") if img else ''
imglinkhref = imglinks.get('href') if imglinks else ''
imgurl = 'https://www.mcavoyguns.co.uk/contents'+imglinkhref
#description
description = product.find("div", "GC12 ProductDescription")
descText = description.text if description else ''
#descStr = str(descText)
#more description
more = product.find("div", "GC12 ProductDetailedDescription")
moreText = more.text if more else ''
#price
spans = browser.find_elements_by_css_selector("div.GC20.ProductPrice span")
for i in range(0,len(spans),2):
span = spans[i].text
i+=1
#print(span)
#print(barrels_list[y])
#print(titleText)
#print(manuText)
#print(descText)
#print(moreText)
#print(imgurl.replace('..', ''))
#print("\n")
x = {
"price": span,
"barrel length": barrels_list[y],
"title": titleText,
"manufacturer": manuText,
"description": descText,
"desc cont": moreText,
"image Location": imgurl.replace('..', '')
}
dump = json.dumps(x)
print(dump)
y+=1
【问题讨论】:
标签: python json selenium web-scraping beautifulsoup