【问题标题】:Code is running but no error and no output is coming(代码可以运行,没有报错,但也没有任何输出)
【发布时间】:2021-06-17 14:28:53
【问题描述】:

我正在尝试从以下链接中抓取数据:

https://www.mumzworld.com/en/johnson-johnson-baby-wipes-ultimate-clean-pack-192-wipes

我的代码是这样的

from bs4 import BeautifulSoup 
import requests

def mumzworld(URL):
    """Scrape the product name from one page and append a record to out.csv.

    The record has the form ``<url>~<product name>~`` followed by a newline.
    'NA' is written as the product name when the request fails or when the
    page contains no ``<h1 class="mtop0">`` element.

    Args:
        URL: Address of the product page. Surrounding whitespace (such as
            the trailing newline kept by ``file.readlines()``) is removed.
            Blank values are skipped without any network or file access.

    Returns:
        None.
    """
    url = URL.strip()
    if not url:
        # Blank lines in the URL list are not worth a request.
        return

    print("function start")
    # specifying user agent, You can use other user agents
    # available on the internet
    HEADERS = {
        'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64)                  AppleWebKit/537.36 (KHTML, like Gecko)                     Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5',
    }

    # Making the HTTP Request
    print('Making requests...')
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        webpage = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        product_name = 'NA'
    else:
        # Creating the Soup Object containing all data
        soup = BeautifulSoup(webpage.content, 'html.parser')
        # find() returns None when nothing matches; it does not raise
        # AttributeError, so the fallback has to be an explicit None check.
        # NOTE(review): if the site fills this heading in with JavaScript,
        # the element will be absent from the raw HTML -- confirm.
        heading = soup.find("h1", {"class": "mtop0"})
        product_name = 'NA' if heading is None else heading.get_text(strip=True)
    print(product_name)

    # Append the record and close the file even if the write fails.
    with open("out.csv", "a") as out_file:
        out_file.write(f"{url}~{product_name}~\n")


# The entry point must live at module level: when it was indented inside
# mumzworld() the function was only defined and never called, so the script
# finished without errors and without producing any output.
if __name__ == '__main__':
    header = "URL~BRAND~NAME~Description"
    # Start a fresh output file containing only the header row.
    with open("out.csv", "w") as out_file:
        out_file.write(f"{header}\n")

    with open("url.txt", "r") as url_file:
        URLs = url_file.readlines()

    for links in URLs:
        mumzworld(links)

为了先单独检查第一个标签,我已将其余代码行注释掉

程序没有任何报错,但 out.csv 文件中没有写入任何内容,终端也没有打印任何内容

请告诉我这是什么问题 我已经抓取了数据

这个链接也出现了同样的问题: https://www.betadinefemininecare.com/products/betadine-daily-intimate-foam/

【问题讨论】:

    标签: python beautifulsoup python-requests


    【解决方案1】:
    import sys
    import httpx
    import trio
    from bs4 import BeautifulSoup
    import json
    import csv
    
    
    # Path segments of the product pages to scrape; worker() appends each one
    # to the https://www.mumzworld.com/en/ base address.
    items = [
        'johnson-johnson-baby-wipes-ultimate-clean-pack-192-wipes',
    ]
    
    # Request headers that catcher() merges into the shared HTTP client.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    }
    
    # Capacity limiter with 3 tokens; every worker() holds one while it runs.
    limiter = trio.CapacityLimiter(3)
    
    # Keys read from each product record; also written as the CSV header row.
    fheaders = ['url', 'name', 'description', 'image']
    
    
    async def worker(client, url, sender):
        """Fetch one product page and send its selected fields down a channel.

        The page is requested while holding the module-level ``limiter``. The
        last ``<script>`` that is a direct child of a ``.wrapper`` element is
        parsed as JSON, and the values of the ``fheaders`` keys from the first
        item of that JSON are sent through ``sender`` as a list.

        Args:
            client: HTTP client whose awaitable ``get`` fetches the page.
            url: Path segment appended to ``https://www.mumzworld.com/en/``.
            sender: Send channel for the row; closed when this task ends.
        """
        async with limiter:
            async with sender:
                print(f'Extracting ----> {url}')
                response = await client.get(f'https://www.mumzworld.com/en/{url}')
                document = BeautifulSoup(response.text, 'lxml')
                scripts = document.select('.wrapper > script')
                payload = scripts[-1].string
                record = json.loads(payload)[0]
                row = []
                for column in fheaders:
                    row.append(record[column])
                await sender.send(row)
    
    
    async def catcher():
        """Start the CSV writer task and one worker task per entry in ``items``.

        A single HTTP client (created with ``timeout=None``) is shared by all
        workers. Rows travel from the workers to ``rec`` through a memory
        channel created with a buffer size of 0.
        """
        async with httpx.AsyncClient(timeout=None) as http_client:
            async with trio.open_nursery() as nursery:
                http_client.headers.update(headers)
                send_channel, receive_channel = trio.open_memory_channel(0)
                nursery.start_soon(rec, receive_channel)

                # Each worker gets its own clone; the original handle is
                # closed as soon as all workers have been scheduled.
                async with send_channel:
                    for slug in items:
                        worker_channel = send_channel.clone()
                        nursery.start_soon(worker, http_client, slug, worker_channel)
    
    
    async def rec(receiver):
        """Write every row received from ``receiver`` to result.csv.

        The file is opened in append mode with line buffering, and the
        ``fheaders`` row is written on every call before any received row.

        Args:
            receiver: Receive channel yielding one row per product; it is
                closed when iteration finishes.
        """
        with open('result.csv', 'a', newline='', buffering=1) as out_file:
            write_row = csv.writer(out_file).writerow
            write_row(fheaders)
            async with receiver:
                async for row in receiver:
                    write_row(row)
    
    
    async def amain():
        """Async entry point: await catcher() and return nothing."""
        await catcher()
    
    
    def main():
        """Run amain() under trio and return whatever trio.run() returns."""
        return trio.run(amain)
    
    
    # Run the scraper only when executed as a script; main()'s return value
    # is passed to sys.exit() as the exit status.
    if __name__ == "__main__":
        sys.exit(main())
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2020-09-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2021-11-04
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多