【发布时间】:2019-05-02 13:24:58
【问题描述】:
我正在尝试调整一个 Python Scrapy 项目,并在我的 PC 上本地运行它,目的是学习和理解它。我在“main”中包含了 start_requests() 函数,但它没有被调用。非常感谢任何帮助,或指向相关资源的参考。
程序编译时没有报错,但它只是打开了一个空白的浏览器窗口。预期的结果是遍历 .csv 文件中的 ASIN 代码列表,并从对应的页面中抓取一些数据。
# -*- coding: utf-8 -*-
import re
from os.path import splitext, basename
from bs4 import BeautifulSoup as bs
#from scrapy import Spider, Request
# Lookup table keyed by marketplace name. Each entry holds a short country
# 'code' and the 'domain' suffix that AmazonbotSpider reads for that country.
country_domain = {
    marketplace: {'code': code, 'domain': domain}
    for marketplace, code, domain in (
        ('US', 'us', 'com'),
        ('UK', 'uk', 'co.uk'),
        ('Germany', 'de', 'de'),
    )
}
def get_asin_url(asin, domain='com'):
    # Body of get_asin_url omitted for clarity by the original poster.
    # NOTE(review): presumably returns the product-page URL for `asin` on the
    # given domain suffix; callers below pass its result straight to Request.
    # Confirm against the real implementation.
def get_title(soup):
    """Return the page title text with whitespace runs collapsed.

    Looks up the ``<h1 id="title">`` element in *soup* and returns its text,
    stripped, with every run of whitespace replaced by a single space.

    Args:
        soup: A parsed document exposing ``find(name, id=...)`` (the caller
            passes a BeautifulSoup object).

    Returns:
        The normalized title, or ``""`` when no such element is found.
    """
    heading = soup.find('h1', id='title')
    if not heading:
        return ""
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'\s+', ' ', heading.text.strip())
class AmazonbotSpider():
    """Build one product-page request per ASIN listed in a text file.

    The class carries Scrapy-style attributes (``name``, ``allowed_domains``,
    ``start_urls``, ``custom_settings``) and callbacks (``start_requests``,
    ``parse``).

    NOTE(review): as written the class has no base class, and the
    ``from scrapy import Spider, Request`` line at the top of the file is
    commented out. ``Request`` is therefore unresolved once
    ``start_requests`` is iterated, and nothing visible in this file drives
    the spider. Confirm how it is meant to be launched.
    """

    print("I'm in class AmazonbotSpider")
    name = 'amazonbot'
    allowed_domains = ['amazon.*']
    start_urls = ['https://amazon.com/']
    # Output feed name. The %(...)s placeholders are presumably filled in by
    # Scrapy's feed export from spider attributes and the crawl time - confirm.
    custom_settings = {'FEED_URI': '%(input_filename)s_%(country)s_%(time)s.csv'}

    def __init__(self, asin_path=r'C:\Users\Chris K\Documents\0_Molzi\AmazonScraping\customScripts\asins.csv', country='UK', *args, **kwargs):
        """Store the run parameters and print the URL built for each ASIN.

        Args:
            asin_path: Path of a text file with one ASIN per line. The
                default is written as a raw string; its value is unchanged.
            country: Key into the module-level ``country_domain`` table.
            *args: Forwarded to the base-class initializer.
            **kwargs: Forwarded to the base-class initializer.

        Raises:
            KeyError: If *country* is not a key of ``country_domain``.
            OSError: If *asin_path* cannot be opened.
        """
        print("I'm in __init__")
        super().__init__(*args, **kwargs)
        self.asin_path = asin_path
        self.country = country
        # Despite the attribute name, this holds the 'domain' entry
        # (e.g. 'co.uk'), which is what get_asin_url receives below.
        self.country_code = country_domain[country]['domain']
        self.input_filename = splitext(basename(asin_path))[0]
        # Debug trace: show what would be requested for every ASIN.
        for asin in self._read_asins():
            data = get_asin_url(asin, self.country_code)
            print("data: ",data)

    def _read_asins(self):
        """Return the stripped, non-blank lines of ``self.asin_path`` as a list.

        The file is read fully and closed before returning, so no handle
        stays open while the caller works through the list. Blank lines are
        skipped so they do not turn into requests for an empty ASIN.
        """
        with open(self.asin_path, 'r') as fp:
            stripped_lines = (line.strip() for line in fp)
            return [asin for asin in stripped_lines if asin]

    def start_requests(self):
        """Yield one ``Request`` per ASIN, routed to :meth:`parse`.

        This is a generator: calling it runs nothing until it is iterated.
        Each request carries ``meta['item'] = {'asin': <asin>}`` so that
        ``parse`` can continue filling in the same item.
        """
        print("I'm in start_requests")
        for asin in self._read_asins():
            data = Request(
                get_asin_url(asin, self.country_code),
                callback=self.parse,
                meta={'item': {'asin': asin}},
            )
            print("data: ",data)
            yield data

    def parse(self, response):
        """Complete the item attached to *response* and yield it.

        Reads the partially filled item from ``response.meta['item']``,
        parses ``response.text`` with the ``lxml`` parser, then sets the
        item's ``name`` (via ``get_title``) and ``url``.
        """
        print("I'm in parse")
        item = response.meta['item']
        soup = bs(response.text, 'lxml')
        # Remove any style tags. find_all returns an empty result when there
        # are none, so no separate emptiness check is needed.
        for style_tag in soup.find_all('style'):
            style_tag.extract()
        item['name'] = get_title(soup)
        item['url'] = response.url
        yield item
if __name__ == "__main__":
    # Constructing the object runs only __init__ (which prints the URL built
    # for each ASIN). Nothing here schedules or iterates start_requests().
    spider = AmazonbotSpider() # Create the object
    # NOTE: start_requests() contains `yield`, so even when uncommented this
    # call would only create a generator object. Its body, including its
    # print, executes only once the generator is iterated.
    #spider.start_requests() # Run the rank checker
    print("I'm in __main__")
【问题讨论】:
标签: python beautifulsoup scrapy