【发布时间】:2019-06-10 05:40:44
【问题描述】:
我希望能够将多个 New Egg 网页中的数据抓取到单个 csv 文件中。
我目前能够将单个 New Egg 网页中的数据抓取到 csv 文件中,但是我希望一次抓取多个页面。
from bs4 import BeautifulSoup
import requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48?PageSize=96'
my_url = 'https://www.newegg.com/Desktop-Graphics-Cards/SubCategory/ID-48/Page-1?PageSize=96'
#opening up connection and grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
#grabs each product
containers = page_soup.findAll("div", {"class":"item-container"})
filename = "99FINAL.csv"
f = open(filename, "w")
headers = "Brand, Title, Shipping, Price\n"
f.write(headers)
# only for information, not used in url
page = 0
while True:
print('---', page, '---')
r = requests.get(my_url)
soup = BeautifulSoup(r.content, "html.parser")
# String substitution for HTML
for container in containers:
brand_container = container.findAll("a", {"class":"item-title"})
brand = brand_container[0].text
title_container = container.findAll("a", {"class":"item-title"})
title = title_container[0].text
shipping_container = container.findAll("li", {"class":"price-ship"})
shipping = shipping_container[0].text.strip()
price = container.findAll("li", {"class":"price-current"})
price = price[0]
pricing_container = price.findAll("strong")
pricing = pricing_container[0].text
cents_container = price.findAll("sup")
centing = cents_container[0].text
print("brand: " + brand.partition(' ')[0])
print("title: " + title)
print("shipping: " + shipping)
print("pricing: " + pricing)
print("centing: " + centing)
f.write(brand.partition(' ')[0] + "," + title.replace(",","|") + "," + shipping.partition(' ')[0] + "," + pricing.replace(",","")+centing + "\n")
break
# link to next page
next_page = soup.find("button", {"title": "Next"})
if next_page:
my_url = next_page.get("href")
page += 1
else:
break # exit `while True`for "Page" in my_url
f.close()
该算法成功地抓取了第一页,但是在切换到下一页时出现了问题。
我得到的错误是: “MissingSchema:无效的 URL 'None':未提供架构。也许您的意思是 http://None?”
【问题讨论】:
标签: python python-3.x web-scraping