bs4 解析器保留一个不完整的列表答案

【问题标题】：bs4 parser keeps an incomplete list（bs4 解析器只保留了一个不完整的列表）
【发布时间】：2017-02-21 19:13:56
【问题描述】：

我写了一些代码：第一部分可以完整地抓取到商品标题（共 90 个值），但第二部分抓取价格时只得到了一个不完整的列表（只有 30 个值）。看起来循环没有对这一段代码生效。
我应该如何更改此代码以保留完整列表？
提前致谢！

import re
import requests
from bs4 import BeautifulSoup

# Base listing URL; collectData() appends the sort and page query parameters.
url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"


# Product names appended by collectData().
DATA_CONTAINER = []
# (name, price) pairs assigned by collectData().
DATA = []

def collectData():
    """Scrape product names and sale prices from listing pages 1-4.

    Every product name found is appended to the module-level DATA_CONTAINER.
    After each page, the module-level DATA is rebound to the pairs produced
    by zipping DATA_CONTAINER with the prices found on that page, and DATA
    is printed.

    NOTE(review): this is the code as posted in the question. The pairing
    step at the bottom of the page loop is where the short list comes from;
    see the comment there.
    """

    global DATA_CONTAINER
    global DATA


    for i in range(1, 5):
        # Build the URL of results page i.
        newUrl = url + "&sort=20a&page=" + str(i) 
        r = requests.get(newUrl)
        soup = BeautifulSoup(r.content, "lxml")
        #print(soup)
        g_data_odd = soup.find_all("td", {"class": "productListing-data"})
        for item in g_data_odd:         
            t = item.find_all("div", {"class": "product_name"})
            # NOTE: this rebinds the page counter `i` to a list. The list is
            # only appended to and never read; the outer `for` assigns the
            # next page number to `i` again on its next iteration.
            i = list()
            for name in t:
                # The product name is the text of the first link in the div.
                piece = name.find('a').text
                i.append(piece)
                #print(piece)
                # for pc in piece:
                #   i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
                #   print(pc)
                # Names are accumulated across ALL pages.
                DATA_CONTAINER.append(piece)

        spans = soup.find_all('span', {"class": "productSalePrice"})
        # Text content of every sale-price span on this page.
        lines = [span.get_text() for span in spans]
        # Extract the price string from each span text. Despite its name,
        # found_dates holds prices; it is reset here, so it only ever
        # contains the prices of the current page.
        found_dates = []
        for line in lines:
            m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
            if m:
                found_dates.append(str(m.group(0)))
                # (debug) print the prices we collected
        # for date in found_dates:
        #     print(date)

        # DATA_J = DATA_CONTAINER[:]
        # zip() stops at the shorter input. DATA_CONTAINER holds the names of
        # every page fetched so far, found_dates only this page's prices, so
        # DATA never has more entries than one page of prices, and from the
        # second page on the earliest collected names are paired with the
        # current page's prices.
        DATA = list(zip(DATA_CONTAINER, found_dates))
        print(DATA)

def serializeToCSV(fileName):
    """Write every entry of the module-level DATA to fileName, one per line.

    Each entry of DATA must be an iterable of strings (collectData stores
    (name, price) tuples). Its elements are joined with a single space and
    terminated with a newline. An existing file is overwritten.

    Args:
        fileName: Path of the output file.
    """
    # Let the file object do the UTF-8 encoding. Encoding each line by hand
    # and then adding "\n" mixes bytes with str, which raises TypeError on
    # Python 3.
    with open(fileName, "w", encoding="utf-8") as fd:
        for item in DATA:
            fd.write(" ".join(item) + "\n")

# Run the scrape; this fills the module-level DATA_CONTAINER and DATA.
collectData()
# Report how many (name, price) pairs ended up in DATA.
print(len(DATA))
# Write the pairs in DATA to csv.csv.
serializeToCSV('csv.csv')

【问题讨论】：

标签： python-3.x parsing web-scraping bs4

【解决方案1】：

试试这段代码：

import re
import requests
from bs4 import BeautifulSoup

# Base listing URL; collectData() appends the sort and page query parameters.
url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"


# Product names appended by collectData().
DATA_CONTAINER = []
# (name, price) pairs produced by collectData().
DATA = []

def collectData():
    """Scrape product names and sale prices from listing pages 1-4.

    For each page, the names and prices found on that page are paired by
    position. The pairs are appended to the module-level DATA list and to
    ``csv.csv``; every name is also appended to the module-level
    DATA_CONTAINER. The rows of each page and the running size of DATA are
    printed.

    Pairing is done per page on purpose: zipping the names accumulated over
    all pages with only the current page's prices would stop after one
    page's worth of rows and attach later pages' prices to the first page's
    names.

    The output file is opened in append mode, so rows already present in
    ``csv.csv`` (including those from earlier runs) are kept.
    """
    global DATA_CONTAINER
    global DATA

    def serializeToCSV(fileName, rows):
        """Append rows to fileName, one space-separated row per line."""
        # "a" (append) because this is called once per page; "w" would leave
        # only the last page in the file.
        with open(fileName, "a", encoding="utf-8") as fd:
            for row in rows:
                # Join the fields of the tuple. Joining str(row) instead
                # would put a space between every character of its repr.
                fd.write(" ".join(row) + "\n")

    # Compiled once, outside the page loop.
    # NOTE(review): the dot is unescaped and only 2-3 integer digits are
    # accepted; a span whose text does not match is skipped, which shifts
    # the remaining pairs on that page. Confirm against the site's format.
    price_pattern = re.compile(r'[USD]+\d{2,3}.\d{2}')

    for page in range(1, 5):
        newUrl = url + "&sort=20a&page=" + str(page)
        r = requests.get(newUrl)
        soup = BeautifulSoup(r.content, "lxml")

        # Product names on this page: text of the first link inside each
        # product_name div of every listing cell.
        page_names = []
        for cell in soup.find_all("td", {"class": "productListing-data"}):
            for name in cell.find_all("div", {"class": "product_name"}):
                page_names.append(name.find('a').text)

        # Sale prices on this page, extracted from the price span texts.
        page_prices = []
        for span in soup.find_all('span', {"class": "productSalePrice"}):
            m = price_pattern.search(span.get_text())
            if m:
                page_prices.append(m.group(0))

        # zip() silently drops the surplus of the longer list, so make a
        # count mismatch visible instead of losing rows unnoticed.
        if len(page_names) != len(page_prices):
            print("page %d: %d names but %d prices"
                  % (page, len(page_names), len(page_prices)))

        page_rows = list(zip(page_names, page_prices))
        DATA_CONTAINER.extend(page_names)
        DATA.extend(page_rows)
        print(page_rows)

        serializeToCSV('csv.csv', page_rows)
        print(len(DATA))

# Run the scrape; collectData() itself appends each page's rows to csv.csv.
collectData()

在第 53 行打开文件时使用 "a"（追加）模式写入，这样每次写入都不会覆盖之前已写入的内容。
在循环内部调用 serializeToCSV 方法（循环从第 17 行开始），这样每抓取一页就把该页的结果追加写入文件。

【讨论】：