【问题标题】:TypeError: Cannot mix str and non-str argumentsTypeError:不能混合 str 和非 str 参数
【发布时间】:2020-08-05 04:21:53
【问题描述】:
from scrapy import Spider
from scrapy.http import Request


class CourseSpider(Spider):
    name = 'course'
    allowed_domains = ['coursera.org']
    start_urls = ['https://coursera.org/about/partners']

    def parse(self, response):
        listings = response.xpath('//div[@class="rc-PartnerBox vertical-box"]')
        for listing in listings:
            title = listing.xpath('.//div[@class="partner-box-wrapper card-one-clicker flex-1"]/p').extract_first()
            relative_url = listing.xpath('.//a/@href').extract_first()
            absolute_url = response.urljoin(relative_url)

            yield Request(response.urljoin(relative_url), callback = self.parse_listing,meta={'title':title,'absolute_url':absolute_url})

    def parse_listing(self,response):
        titles = response.meta.get('title')
        absolute_url = response.meta.get('absolute_url')
        titles_course =  response.xpath('//div[@class="name headline-1-text"]/text()').extract()
        url_link = response.xpath('//div[@class="rc-Course"]/a/@href').extract()
        abs_url = response.urljoin(url_link)

        yield {'title':title,
        'titles':title,
        'absolute_url':absolute_url,
        'titles_course':titles_course,
        'abs_url':abs_url}

但是,通过 cmd 运行脚本时。我收到错误。这些错误表明我不能混合 str 和非 str 参数,我对如何处理这个问题感到困惑。任何帮助将不胜感激。

Traceback (most recent call last):
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
    yield next(it)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
    return next(self.data)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
    return next(self.data)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 28, in parse_listing
    yield {'title':title,
NameError: name 'title' is not defined
2020-08-05 00:08:48 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.coursera.org/checkpoint> (referer: https://www.coursera.org/about/partners)
Traceback (most recent call last):
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
    yield next(it)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
    return next(self.data)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
    return next(self.data)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 26, in parse_listing
    abs_url = response.urljoin(url_link)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\http\response\text.py", line 80, in urljoin
    return urljoin(get_base_url(self), url)
  File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 504, in urljoin
    base, url, _coerce_result = _coerce_args(base, url)
  File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 120, in _coerce_args
    raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2020-08-05 00:08:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.coursera.org/casewesternreserve> (referer: https://www.coursera.org/about/partners)
2020-08-05 00:08:48 [scrapy.core.scraper] ERROR: Spider error processing <GET https://www.coursera.org/casewesternreserve> (referer: https://www.coursera.org/about/partners)
Traceback (most recent call last):
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\defer.py", line 117, in iter_errback
    yield next(it)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
    return next(self.data)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\utils\python.py", line 345, in __next__
    return next(self.data)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 338, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\core\spidermw.py", line 64, in _evaluate_iterable
    for r in iterable:
  File "C:\Users\Naman Jogani\Desktop\Udemy\udemy\spiders\course.py", line 26, in parse_listing
    abs_url = response.urljoin(url_link)
  File "c:\users\naman jogani\anaconda3\lib\site-packages\scrapy\http\response\text.py", line 80, in urljoin
    return urljoin(get_base_url(self), url)
  File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 504, in urljoin
    base, url, _coerce_result = _coerce_args(base, url)
  File "c:\users\naman jogani\anaconda3\lib\urllib\parse.py", line 120, in _coerce_args
    raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments
2020-08-05 00:08:48 [scrapy.core.engine] INFO: Closing spider (finished)

我尝试添加 extract() 函数,因为它在列表容器上的一些先前 stackoverflow 问题中提到过,以消除该错误,但是我的 xpath 没有得到所需的输出。

【问题讨论】:

  • 你应该在yield {'title':title, ...中使用titles
  • 我刚刚在收益声明中进行了更改。但是,我无法摆脱运行脚本时弹出的类型错误。
  • 请更新回溯

标签: python python-3.x web-scraping scrapy


【解决方案1】:

您正在寻找.extract_first() or its new name .get(),因为.extract() 会生成一个列表,不能在.urljoin 中使用该列表

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 2015-10-06
    • 2017-12-01
    • 2020-09-18
    • 2020-08-19
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多