Python BeautifulSoup 程序初始化答案

【问题标题】：Python BeautifulSoup program initialization（Python BeautifulSoup 程序初始化）
【发布时间】：2019-05-02 13:24:58
【问题描述】：

我正在尝试调整 Python Scrapy 项目并在我的 PC 上本地运行它。目的是学习和理解它。我在“main”中包含了 start_requests() 函数，但它没有被调用。非常感谢任何帮助或对相关资源的参考。

程序编译没有错误，但它只是打开一个空白的浏览器窗口。预期的结果是浏览 .csv 中的 ASIN 代码列表并从相关页面中抓取一些数据。

# -*- coding: utf-8 -*-

import re
from os.path import splitext, basename

from bs4 import BeautifulSoup as bs
#from scrapy import Spider, Request

# Lookup table keyed by country name. Each entry carries a short country
# 'code' and the 'domain' string that is later handed to get_asin_url().
country_domain = {
    'US': {
        'code': 'us',
        'domain': 'com',
    },
    'UK': {
        'code': 'uk',
        'domain': 'co.uk',
    },
    'Germany': {
        'code': 'de',
        'domain': 'de',
    },
}


def get_asin_url(asin, domain='com'):
#function get_asin_url body omitted for clarity ...

def get_title(soup):
    """Return the text of the ``<h1 id="title">`` element, whitespace-normalized.

    Args:
        soup: A parsed document exposing ``find(name, id=...)`` (for example a
            BeautifulSoup object).

    Returns:
        The heading text with leading/trailing whitespace stripped and every
        run of whitespace collapsed to a single space, or ``""`` when no such
        heading is found.
    """
    heading = soup.find('h1', id='title')
    if not heading:
        return ""
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', heading.text.strip())

class AmazonbotSpider():
    """Reads ASINs from a text file and scrapes the title of each product page.

    NOTE(review): the class has no base class and the ``scrapy`` import is
    commented out in this file, so ``Request`` is undefined here and nothing
    invokes ``start_requests()`` / ``parse()`` automatically. Confirm the
    intended base class and imports before relying on those two methods.
    """

    print("I'm in class AmazonbotSpider")
    name = 'amazonbot'
    allowed_domains = ['amazon.*']
    start_urls = ['https://amazon.com/']
    custom_settings = {'FEED_URI': '%(input_filename)s_%(country)s_%(time)s.csv'}

    def __init__(self,
                 asin_path='C:\\Users\\Chris K\\Documents\\0_Molzi\\AmazonScraping\\customScripts\\asins.csv',
                 country='UK', *args, **kwargs):
        """Store the configuration and print the URL built for every ASIN.

        Args:
            asin_path: Path of a text file holding one ASIN per line.
            country: Key into ``country_domain`` ('US', 'UK' or 'Germany').
            *args, **kwargs: Forwarded unchanged to the parent initializer.

        Raises:
            KeyError: If ``country`` is not a key of ``country_domain``.
            OSError: If ``asin_path`` cannot be opened.
        """
        print("I'm in __init__")
        super(AmazonbotSpider, self).__init__(*args, **kwargs)
        self.asin_path = asin_path
        self.country = country
        # Despite the attribute name, this holds the 'domain' entry, which is
        # what get_asin_url() receives as its second argument.
        self.country_code = country_domain[country]['domain']
        self.input_filename = splitext(basename(asin_path))[0]

        for asin in self._read_asins():
            data = get_asin_url(asin, self.country_code)
            print("data: ",data)

    def _read_asins(self):
        """Return the stripped, non-empty lines of the ASIN file as a list."""
        with open(self.asin_path, 'r') as fp:
            stripped = [line.strip() for line in fp]
        # Blank lines would otherwise produce URLs for an empty ASIN.
        return [asin for asin in stripped if asin]

    def start_requests(self):
        """Yield one ``Request`` per ASIN, tagging it with the ASIN in ``meta['item']``."""
        print("I'm in start_requests")
        for asin in self._read_asins():
            data = Request(get_asin_url(asin, self.country_code), callback=self.parse)
            data.meta['item'] = {'asin': asin}
            print("data: ",data)
            yield data

    def parse(self, response):
        """Fill the item carried in ``response.meta`` with the page title and URL.

        Yields:
            The ``response.meta['item']`` dict with 'name' and 'url' added.
        """
        print("I'm in parse")
        item = response.meta['item']
        soup = bs(response.text, 'lxml')
        # Remove any style tags; iterating an empty result is a no-op, so no
        # separate emptiness check is needed.
        for style_tag in soup.find_all('style'):
            style_tag.extract()
        item['name'] = get_title(soup)
        item['url'] = response.url
        yield item

if __name__ == "__main__":
    # Building the object runs __init__, which reads the ASIN file and
    # prints the URL generated for each entry.
    spider = AmazonbotSpider()
    # The start_requests() call is left disabled here:
    #   spider.start_requests()
    print("I'm in __main__")

【问题讨论】：

标签： python beautifulsoup scrapy

【解决方案1】：

get_title 中的 return 语句缩进不佳。

linter 应该可以帮助你写出好的 python，例如尝试pylint。

【讨论】：

是的，还有 this 和 start_requests() 缩进不够，以 super(AmazonbotSpider, self)... 开头的行应该缩进到 __init__() 方法中。缩进在 Python 中非常重要 :)
缩进已修复。感谢关于 pylint 的建议。我目前使用的是 Notepad++。我才刚开始学习 Python。有谁知道为什么 start_requests() 没有被调用吗？另外，我不明白 super(AmazonbotSpider, self) 的作用是什么。

【解决方案2】：

您应该将 start_requests() 再缩进一级——它目前只是一个独立的函数，而不是 AmazonbotSpider 类中的方法。

【讨论】：

【解决方案3】：

我不再尝试调整上述代码。相反，我以 this 脚本为基础，只添加了我需要的方法。主要区别在于它使用了 Selenium：

from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import datetime
from collections import deque
import logging
import csv

# Module-level setting; it is not referenced by any code shown in this snippet.
# NOTE(review): presumably the Amazon marketplace domain suffix ("es") used by
# the omitted methods — confirm against their implementation.
globaldomain = "es"

class AmazonScaper(object):

    def __init__(self, asins, output_file='results.csv', sleep=2):
        """Start a Chrome browser and queue the given ASINs for crawling.

        Args:
            asins: Iterable of ASINs; copied into a deque.
            output_file: Stored on the instance as ``output_file``.
            sleep: Stored on the instance as ``sleep``; run_crawler() passes
                it to ``time.sleep`` after each page.
        """
        # Point executable_path at your local Chromedriver binary.
        driver = webdriver.Chrome(executable_path='chromedriver.exe')
        self.browser = driver
        # Work queue of ASINs still to be crawled.
        pending = deque(asins)
        self.asin_queue = pending
        self.output_file, self.sleep = output_file, sleep
        self.results = list()
    #method body omitted for clarity:
    def get_title(self, soup):
    def get_soldby(self,soup):
    def get_price_seller(self,soup):

    def run_crawler(self):
        price_seller = ""
        while len(self.asin_queue): #If we have asins to check
            asin = self.asin_queue.popleft() #We grab a asin from the left of the list
            html = self.get_page(asin)
            soup = self.get_soup(html)
            time.sleep(self.sleep) # Wait for the specified time
            if soup is not None:  #If we have soup - parse and save data
                title = self.get_title(soup)
                soldby = self.get_soldby(soup)
                price_seller = self.get_price_seller(soup)

                time.sleep(3)
            print(asin,"^^^",title,"^^^",price_seller,"^^^",soldby)
        #self.browser.quit()
        #self.csv_output() # Save the object data to csv

if __name__ == "__main__":
    # Read one entry per line from ASINs.txt, dropping the trailing newline
    # and replacing spaces with '+'. The with-block closes the file handle,
    # which the bare open() call never did.
    with open('ASINs.txt') as asin_file:
        asins = [line.rstrip('\n').replace(' ', '+') for line in asin_file]
    ranker = AmazonScaper(asins)  # Create the object
    ranker.run_crawler()  # Run the rank checker

【讨论】：