【问题标题】:Remove row from the CSV file if condition met如果满足条件,则从 CSV 文件中删除行
【发布时间】:2021-08-28 12:49:39
【问题描述】:

我正在尝试抓取 pickels.com.au。

如果链接相同并且价格不同,我正在尝试更新 pickels_dataset.csv 文件,我将删除列表并将新行插入 CSV 文件,但不会删除CSV 文件中的旧条目。

删除和更新 CSV 文件中的行的最佳方法是什么。

下面是我的代码...

import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep


with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
    auction_csv_writer = csv.writer(auction_csv_file)
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link)
            response = Selector(text=auction_request.text)

            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link)

            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print("NO RESULTS")
            for auction_data in auctions_data:
                if int(auction_data.get('MinimumBid')) > 0:
                    ids = auction_data.get('TargetId')
                    main_title = auction_data.get('Title')
                    short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                    make = auction_data.get('M ake')
                    model = auction_data.get('Model')
                    variant = auction_data.get('Series')
                    transmission = auction_data.get('Transmission')
                    odometer = auction_data.get('Odometer')
                    state = auction_data.get('Location').get('State')
                    sale_price = auction_data.get('MinimumBid')
                    link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                    link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                    sale_date = auction_data.get('SaleEndString')
                    
                    auction_values = [
                        main_title, short_title, make, 
                        model, variant, transmission, odometer, 
                        state, "${:,.2f}".format(sale_price).strip() , 
                        link, sale_date
                    ]
                    
                    with open('pickels_dataset.csv', 'r+') as csv_read:
                        auction_reader = list(csv.reader(csv_read))
                        for each in auction_reader:
                            if link in each:
                                each_link, each_price = each[9], each[0]
                                if (link == each_link) and (sale_price != each_price):
                                    auction_reader.clear()
                                    print('New list found, old list deleted')
                                    auction_csv_writer.writerow(auction_values)
                                    print('New value added')
                                    continue
                                elif (link == each[9]) and (sale_price == each[0]):
                                    print('Same result already exist in the file')
                                    continue
                        else:
                            auction_csv_writer.writerow(auction_values)
                            print('Unique result found and added.')
                            break

【问题讨论】:

    标签: python csv web-scraping scrapy


    【解决方案1】:

    您当前的脚本正在打开您的拍卖 CSV 文件以进行附加,然后在它仍处于打开状态时尝试再次打开以进行读取。这可能就是它没有按预期更新的原因。

    更好的方法是首先将现有保存的拍卖文件的全部内容读入字典。关键可能是link,这样您就可以轻松确定您是否已经看过现有的拍卖。

    接下来抓取当前的拍卖并根据需要更新saved_auctions 字典。

    最后,将saved_auctions的内容写回CSV文件。

    例如:

    import requests
    from scrapy.selector import Selector
    import csv
    import re
    
    
    auction_filename = 'pickels_dataset.csv'
    
    # Load existing auctions into a dictionary with link as key
    saved_auctions = {}
    
    with open(auction_filename, newline='', encoding='utf-8') as f_auction_file:
        for row in csv.reader(f_auction_file):
            saved_auctions[row[9]] = row      # dictionary key is link
        
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api)
    
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link)
            response = Selector(text=auction_request.text)
    
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            
            if sales_id == []:
                continue
            
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link)
    
            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            
            if auctions_data == []:
                print("NO RESULTS")
            
            for auction_data in auctions_data:
                if int(auction_data.get('MinimumBid')) > 0:
                    ids = auction_data.get('TargetId')
                    main_title = auction_data.get('Title')
                    short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                    make = auction_data.get('Make')
                    model = auction_data.get('Model')
                    variant = auction_data.get('Series')
                    transmission = auction_data.get('Transmission')
                    odometer = auction_data.get('Odometer')
                    state = auction_data.get('Location').get('State')
                    minimum_bid = auction_data.get('MinimumBid')
                    sale_price = "${:,.2f}".format(minimum_bid).strip()
                    link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                    link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                    sale_date = auction_data.get('SaleEndString')
                    
                    auction_values = [
                        main_title, short_title, make, 
                        model, variant, transmission, odometer, 
                        state, sale_price, 
                        link, sale_date
                    ]
    
                    if link in saved_auctions:
                        if saved_auctions[link][8] == sale_price:
                            print('Same result already exists in the file')
                        else:
                            print('New value updated')
                            saved_auctions[link] = auction_values        # Updated the entry
                    else:
                        print('New auction added')
                        saved_auctions[link] = auction_values
                        
    # Update the saved auction file                    
    with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
        csv_auction_file = csv.writer(f_auction_file)
        csv_auction_file.writerows(saved_auctions.values())
    

    如果您还想删除不再有效的拍卖,那么最好直接忽略保存的文件并按原样写入所有当前条目。

    【讨论】:

      猜你喜欢
      • 2015-10-13
      • 2020-09-11
      • 2016-10-20
      • 1970-01-01
      • 2021-06-06
      • 1970-01-01
      • 1970-01-01
      • 2023-01-19
      • 1970-01-01
      相关资源
      最近更新 更多