【发布时间】:2017-02-21 19:13:56
【问题描述】:
我写了一些代码,第一部分我完美地收获了一些标题作品(包含 90 个值),但第二部分的价格 a 保留了一个不完整的列表(包含 30 个)。它似乎循环不适用于该段。
我应该如何更改此代码以保留完整列表?
提前致谢!
import re
import requests
from bs4 import BeautifulSoup
url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"
DATA_CONTAINER = list()
DATA = list()
def collectData():
global DATA_CONTAINER
global DATA
for i in range(1, 5):
newUrl = url + "&sort=20a&page=" + str(i)
r = requests.get(newUrl)
soup = BeautifulSoup(r.content, "lxml")
#print(soup)
g_data_odd = soup.find_all("td", {"class": "productListing-data"})
for item in g_data_odd:
t = item.find_all("div", {"class": "product_name"})
i = list()
for name in t:
piece = name.find('a').text
i.append(piece)
#print(piece)
# for pc in piece:
# i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
# print(pc)
DATA_CONTAINER.append(piece)
spans = soup.find_all('span', {"class": "productSalePrice"})
# create a list of lines corresponding to element texts
lines = [span.get_text() for span in spans]
# collect the dates from the list of lines using regex matching groups
found_dates = []
for line in lines:
m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
if m:
found_dates.append(str(m.group(0)))
# print the dates we collected
# for date in found_dates:
# print(date)
# DATA_J = DATA_CONTAINER[:]
DATA = list(zip(DATA_CONTAINER, found_dates))
print(DATA)
def serializeToCSV(fileName):
with open(fileName, "w") as fd:
for item in DATA:
fd.write(u' '.join(item).encode('utf-8') + "\n")
collectData()
print(len(DATA))
serializeToCSV('csv.csv')
【问题讨论】:
标签: python-3.x parsing web-scraping bs4