【问题标题】:Getting 504 Gateway Time-out while running SplashRequest through ScrapySpider通过 ScrapySpider 运行 SplashRequest 时出现 504 网关超时
【发布时间】:2019-12-02 08:49:44
【问题描述】:

在 VM VirtualBox 上运行 ubuntu。

运行 ifconfig 命令:

>docker0: flags=4099<UP,BROADCAST,MULTICAST>  mtu 1500
        inet 172.17.0.1  netmask 255.255.0.0  broadcast 172.17.255.255
        inet6 fe80::42:90ff:fe9b:4d22  prefixlen 64  scopeid 0x20<link>
        ether 02:42:90:9b:4d:22  txqueuelen 0  (Ethernet)
        RX packets 10757  bytes 5983236 (5.9 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 10909  bytes 15688953 (15.6 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
enp0s3: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 10.0.2.15  netmask 255.255.255.0  broadcast 10.0.2.255
        inet6 fe80::6ec6:7ba3:79fa:8791  prefixlen 64  scopeid 0x20<link>
        ether 08:00:27:92:a7:a7  txqueuelen 1000  (Ethernet)
        RX packets 145146  bytes 145357306 (145.3 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 79019  bytes 19069408 (19.0 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0
lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        inet6 ::1  prefixlen 128  scopeid 0x10<host>
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 8252  bytes 3265348 (3.2 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 8252  bytes 3265348 (3.2 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

Scrapy 终端:

Retrying <GET http://www.gari.pk/used-cars-search/ via http://172.17.0.1:8050/execute> (failed 1 times): 504 Gateway Time-out

Splash 终端:

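2019-07-24 08:14:07.645051 [-] Server listening on http://0.0.0.0:8050 libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile process 1: D-Bus library appears to be incorrectly set up; failed to read machine uuid: UUID file '/etc/machine-id' should contain a hex string of length 32, not length 0, with no other text See the manual page for dbus-uuidgen to correct this issue. qt.network.ssl: QSslSocket: cannot resolve SSLv2_client_method qt.network.ssl: QSslSocket: cannot resolve SSLv2_server_method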

2019-07-24 08:14:40.935910 [events] { "_id": 140385374548096, "client_ip": "172.17.0.1", "maxrss": 145496, "rendertime": 30.157784700393677, "path": "/execute", "status_code": 504, "load": [ 0.23, 0.25, 0.26 ], "error": { "type": "GlobalTimeoutError", "info": { "timeout": 30 }, "description": "Timeout exceeded rendering page", "error": 504 }, "method": "POST", "qsize": 0, "args": { "headers": { "Accept-Language": "en", "User-Agent": "Scrapy/1.6.0 (+https://scrapy.org)", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" }, "url": "http://www.gari.pk/used-cars-search/", "cookies": [ ], "uid": 140385374548096, "lua_source": "\n function find_search_input(inputs)\n if #inputs == 1 then\n return inputs[1]\n else\n for _, input in ipairs(inputs) do\n if input.node.attributes.type == \"search\" then\n return input\n end\n end\n end\n end\n\n function find_input(forms)\n local potential = {}\n\n for _, form in ipairs(forms) do\n local inputs = form.node:querySelectorAll('input:not([type=\"hidden\"])')\n if #inputs ~= 0 then\n local input = find_search_input(inputs)\n if input then\n return form, input\n end\n\n potential[#potential + 1] = {input=inputs[1], form=form}\n end\n end\n\n return potential[1].form, potential[1].input\n end\n\n function main(splash, args)\n -- find a form and submit \"splash\" to it\n local function search_for_splash()\n local forms = splash:select_all('form')\n\n if #forms == 0 then\n error('no search form is found')\n end\n\n local form, input = find_input(forms)\n\n if not input then\n error('no search form is found')\n end\n\n assert(input:send_keys('honda'))\n assert(splash:wait(0))\n assert(form:submit())\n end\n\n -- main rendering script\n assert(splash:go(splash.args.url))\n assert(splash:wait(5))\n search_for_splash()\n assert(splash:wait(15))\n --assert(splash:runjs('search_query('', (100));'))\n local button = splash:select('a[href*=\"search_query\"]')\n button.node:setAttribute('href', \"javascript: search_query('', (20))\");\n button:mouse_click()\n assert(splash:wait(120))\n \n return {html = splash:html()}\n end\n " }, "timestamp": 1563956080, "fds": 21, "active": 0, "user-agent": "Scrapy/1.6.0 (+https://scrapy.org)" }
2019-07-24 08:14:40.936842 [-] "172.17.0.1" - - [24/Jul/2019:08:14:40 +0000] "POST /execute HTTP/1.1" 504 119 "-" "Scrapy /1.6.0 (+https://scrapy.org)"

试过这个: docker run -p 8050:8050 scrapinghub/splash --max-timeout 240

garispider.py

import scrapy
import re
from scrapy_splash import SplashRequest

class GarispiderSpider(scrapy.Spider):
    """Spider that drives the gari.pk used-car search page through Splash.

    A Lua script is sent to Splash's ``execute`` endpoint. The script loads
    the start URL, types ``honda`` into the search form, submits it, clicks
    the ``search_query`` link and returns the rendered HTML.
    """
    name = 'gariSpider'
    allowed_domains = ['www.gari.pk']
    start_urls = ['http://www.gari.pk/used-cars-search/']

    # Render timeout (seconds) sent to Splash as the `timeout` argument.
    # The Lua script below sleeps for 5 + 15 + 120 = 140 seconds, so Splash's
    # default 30 second timeout always ends in a 504 GlobalTimeoutError.
    # The Splash server must be started with `--max-timeout` at least this
    # large (e.g. `--max-timeout 240`), otherwise it rejects the value.
    splash_timeout = 200

    lua_script = """
    function find_search_input(inputs)
        if #inputs == 1 then
            return inputs[1]
        else
            for _, input in ipairs(inputs) do
                if input.node.attributes.type == "search" then
                    return input
                end
            end
        end
    end

    function find_input(forms)
        local potential = {}

        for _, form in ipairs(forms) do
            local inputs = form.node:querySelectorAll('input:not([type="hidden"])')
            if #inputs ~= 0 then
                local input = find_search_input(inputs)
                if input then
                    return form, input
                end

                potential[#potential + 1] = {input=inputs[1], form=form}
            end
        end

        -- No form had a visible input: return nils so the caller can raise
        -- its own "no search form" error instead of failing on an index of
        -- a nil value.
        if #potential == 0 then
            return nil, nil
        end

        return potential[1].form, potential[1].input
    end

    function main(splash, args)
        -- find a search form, type "honda" into it and submit it
        local function search_for_splash()
            local forms = splash:select_all('form')

            if #forms == 0 then
                error('no search form is found')
            end

            local form, input = find_input(forms)

            if not input then
                error('no search form is found')
            end

            assert(input:send_keys('honda'))
            assert(splash:wait(0))
            assert(form:submit())
        end

        -- main rendering script
        assert(splash:go(splash.args.url))
        assert(splash:wait(5))
        search_for_splash()
        assert(splash:wait(15))
        --assert(splash:runjs('search_query('', (100));'))
        local button = splash:select('a[href*="search_query"]')
        -- splash:select returns nil when nothing matches; fail with a clear
        -- message instead of an "attempt to index a nil value" error.
        if not button then
            error('no search_query link is found')
        end
        button.node:setAttribute('href', "javascript: search_query('', (20))");
        button:mouse_click()
        assert(splash:wait(120))

        return {html = splash:html()}
    end
    """

    def start_requests(self):
        """Yield one SplashRequest for the first start URL.

        The request carries the Lua script and an explicit Splash ``timeout``
        so the render is not cut off at Splash's 30 second default.
        """
        url = self.start_urls[0]
        yield SplashRequest(
            url,
            callback=self.parse,
            endpoint='execute',
            args={
                'lua_source': self.lua_script,
                'timeout': self.splash_timeout,
            },
        )

    def parse(self, response):
        """Print the raw body of the Splash response."""
        print(response.body)
###########################################################################

############################Setting.py#####################################
# --- Project identity and spider discovery ---------------------------------
BOT_NAME = "ScrappyApp"
NEWSPIDER_MODULE = "ScrappyApp.spiders"
SPIDER_MODULES = ["ScrappyApp.spiders"]

# The crawler does not consult robots.txt.
ROBOTSTXT_OBEY = False

# --- Splash integration ------------------------------------------------------
# Address of the Splash HTTP API. 172.17.0.1 is the docker0 interface address;
# the enp0s3 address is kept below as a commented-out alternative.
# SPLASH_URL = 'http://10.0.2.15:8050'
SPLASH_URL = "http://172.17.0.1:8050"

# scrapy-splash dupe filter and HTTP cache storage classes.
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"

# Spider middleware from scrapy-splash, keyed by import path with its order
# value.
SPIDER_MIDDLEWARES = {
    "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
}

# Downloader middlewares, keyed by import path with their order values.
DOWNLOADER_MIDDLEWARES = {
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}

# --- Item pipelines ----------------------------------------------------------
ITEM_PIPELINES = {
    "ScrappyApp.pipelines.ScrappyappPipeline": 300,
}

我希望得到 lua 脚本执行之后的 html 正文,但实际得到的是:Retrying <GET http://www.gari.pk/used-cars-search/ via http://172.17.0.1:8050/execute> (failed 1 times): 504 Gateway Time-out

【问题讨论】:

  • 为什么要用 Splash 来提交表单请求?Scrapy 有 FormRequest,它应该可以工作。
  • 因为搜索关键字后需要点击按钮,并且是scrapy的初学者,需要学习使用splash。
  • 您可以把要查找的关键词放在表单数据中,向对应的 URL 发出 POST 请求。这里涉及的网址是 gari.pk/used-cars-search
  • 该站点使用Js加载搜索结果,这就是为什么我需要通过SplashRequest运行一个lua_script。
  • 您可以通过 AJAX 请求来模拟这种行为,请检查我的答案。它也可以在 Scrapy 中工作。

标签: python docker ubuntu scrapy scrapy-splash


【解决方案1】:

我手头没有现成的 Scrapy 项目,所以用 requests 给您演示一个示例,但它可以很容易地转换为 Scrapy 请求。

import requests
# POST the search query to the site's AJAX endpoint. The dict is passed as the
# second positional argument of requests.post (the `data` parameter).
r = requests.post('http://www.gari.pk/search-car-ajax.php', {'search_param': 'cars_mini/,/c_date desc/bmw'})

在 r.content 中,您会得到包含所需数据的响应。对于分页,站点会发出相同的请求,只是附加了一个偏移量;您只需把这个偏移量加到请求数据中即可。下面是一个例子:

# Same request as before, with a trailing "/10" appended to search_param as
# the pagination offset.
r = requests.post('http://www.gari.pk/search-car-ajax.php', {'search_param': 'cars_mini/,/c_date desc/bmw/10'})

如您所见,这里在末尾添加了 10:{'search_param': 'cars_mini/,/c_date desc/bmw/10'}。也许您还可以让每个请求返回更多结果。我建议您查看浏览器开发者工具(Developer console)-> Network -> XHR。对应的 Scrapy 文档:https://doc.scrapy.org/en/latest/topics/request-response.html#scrapy.http.FormRequest

# Scrapy equivalent of the requests.post call; this `yield` is a fragment that
# belongs inside a spider method such as start_requests or a callback.
yield scrapy.FormRequest('http://www.gari.pk/search-car-ajax.php', callback=self.parse,method='POST', formdata={'search_param': 'cars_mini/,/c_date desc/bmw/10'})

【讨论】:

  • 谢谢,它可以工作,但我还需要一个使用 Splash 的解决方案。
【解决方案2】:

这是因为您要抓取的 url 返回 transfer-encoding 标头。我在Github 上打开了一个关于此的问题。

下面的脚本可以证明这一点:所请求的 url(httpbin.org/headers)会把我在请求中发送的标头原样返回。

import requests
import json

# Splash `execute` endpoint that the test functions below POST their Lua
# scripts to; assumes a Splash instance listening on localhost:8050.
ENDPOINT_SPLASH = 'http://localhost:8050/execute'


def test_with_custom_headers():
    """Render httpbin.org/headers through Splash with an extra custom header.

    Posts a Lua script to ``ENDPOINT_SPLASH`` that sets ``x-custom-header``
    before loading the URL, then decodes Splash's JSON response.

    Returns:
        The ``html`` entry of the decoded response, or the whole decoded
        response when it has no ``html`` key (e.g. a Splash error object).

    Raises:
        requests.exceptions.Timeout: if Splash does not answer within the
            client-side timeout.
    """
    lua_script = """
    function main(splash, args)
     splash:set_custom_headers({
       ["x-custom-header"] = "splash"
     })
     assert(splash:go(args.url))
     assert(splash:wait(0.5))
     return {
       html = splash:html()
     }
    end
    """

    payload = {
        'lua_source': lua_script,
        'url': 'https://httpbin.org/headers',
        'timeout': 15,
    }

    # Without a client-side timeout requests can block forever if Splash
    # stalls. Allow slightly longer than Splash's own render timeout so that
    # Splash's 504 answer is received rather than a client abort.
    r = requests.post(url=ENDPOINT_SPLASH,
                      json=payload,
                      timeout=payload['timeout'] + 5)

    result = json.loads(r.text)

    return result.get('html', result)


def test_with_content_encoding():
    """Render httpbin.org/headers through Splash with a transfer-encoding header.

    Posts a Lua script to ``ENDPOINT_SPLASH`` that sets the request header
    ``transfer-encoding: chunked`` before loading the URL, then decodes
    Splash's JSON response. Despite the function name, the header set is
    ``transfer-encoding``, not ``content-encoding``.

    Returns:
        The ``html`` entry of the decoded response, or the whole decoded
        response when it has no ``html`` key (e.g. a Splash error object).

    Raises:
        requests.exceptions.Timeout: if Splash does not answer within the
            client-side timeout.
    """
    lua_script = """
    function main(splash, args)
     splash:set_custom_headers({
       ["transfer-encoding"] = "chunked"
     })
     assert(splash:go(args.url))
     assert(splash:wait(0.5))
     return {
       html = splash:html()
     }
    end
    """

    payload = {
        'lua_source': lua_script,
        'url': 'https://httpbin.org/headers',
        'timeout': 15,
    }

    # Without a client-side timeout requests can block forever if Splash
    # stalls. Allow slightly longer than Splash's own render timeout so that
    # Splash's 504 answer is received rather than a client abort.
    r = requests.post(url=ENDPOINT_SPLASH,
                      json=payload,
                      timeout=payload['timeout'] + 5)

    result = json.loads(r.text)

    return result.get('html', result)


# Run each check in turn and print its result right after it finishes.
custom_headers_result = test_with_custom_headers()
print("test_with_custom_headers: \n{}\n".format(custom_headers_result))

content_encoding_result = test_with_content_encoding()
print("test_with_content_encoding: \n{}".format(content_encoding_result))

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 2020-03-21
    • 1970-01-01
    • 2013-05-20
    • 2021-05-23
    • 2016-03-19
    • 2020-04-14
    • 2011-08-30
    相关资源
    最近更新 更多