【问题标题】:NodeJS Cheerio library pagination web scrapingNodeJS Cheerio 库分页网页抓取
【发布时间】:2020-04-10 17:33:46
【问题描述】:

我希望你在这种情况下一切顺利。 Am 对使用 nodejs 和 librarycheerio 进行网络抓取(分页)有疑问。已经完成了一些代码,但是它只刮了一页有问题,我已经在寻找解决方案几个小时了,尝试按照一些步骤进行操作,但结果相同,它只刮了一页。我很感激你的任何回答,这里是代码:

const request = require('request-promise')
const cheerio = require('cheerio')
const fs      = require('fs')

const baseUrl = 'https://indotrading.com/company_hdpe_620' // the website i want to scrape
const outputFile = 'data.csv'
const parsedResults = []
var indexPage = 1
var totalPage = 0


const getWebsiteContent = async (url) => {
  try {
    request(url).then(function(body){
        const $ = cheerio.load(body)

        let page = $('.footer-page').children().children().last().children().prop("href")   //get last page navigation button
        page = page.split("/")
        totalPage = page[page.length-1]     //total page that website has

        //get some data from HTML attribute
        $('#products_container #catcom-container').each((key,element) => {
            const linkImage = $(element).find('.swiper-wrapper').children().children().children().prop('data-src')
            const companyName = $(element).find('.product_title').text().replace(/\n+/g,'')
            const companyAddress = $(element).find('i.fa.fa-map-marker.fs-18.mr-5').parent().find('p.d-flex.a-center').text().replace(/\s/,'')

            const splitLinkImage = linkImage.split("/")

            const companyID = splitLinkImage[splitLinkImage.indexOf("webp")+1]

            //calling function phone data based on company id
            const getdataPhone = getPhoneData(companyID)
            getdataPhone.then(function(result) {        // please check this one, is the promise correct?
                const listCompanyPhone = JSON.parse(result.d)
                const companyPhone = listCompanyPhone.Phone+" , "+listCompanyPhone.Phone2
                const Company = {
                    Name : companyName,
                    Phone: companyPhone,
                    Address: companyAddress
                }
                parsedResults.push(Company)
                exportResults(parsedResults)
            })
        })
    })

    const nextPageLink = baseUrl+'/'+(++indexPage)      // get next page
    indexPage++
    if(indexPage == totalPage){
        exportResults(parsedResults)    // exports to csv but not work
        return false
    }

    getWebsiteContent(nextPageLink) //it will not recursive
} catch (error) {
    console.log(error)
}

}

//function for get data by calling api and it returns json
function getPhoneData(data) {
  try {
    var options = {
        method : 'POST',
        uri : 'https://www.indotrading.com/AjaxMethod.asmx/UpdateCompanyPhoneLeads',
        body : {
            Token : "EAAAAKTheWTVifIaYce5HmctJuDKNQO5nbySwS3GGi14hbcy0oGq3yqxMhd5sE6349byCw==",
            EncCompanyID : data,
            ProductID : "undefined"
        },
        headers : {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
            'Content-Type': 'application/json'
        },
        json: true
    }

    return request(options).then(function(body){
        return body
    }).catch(function(error){
        console.log(error)
    })
} catch (error) {
    console.log("get phone data error : "+error)
}

}

//function for export to csv file
const exportResults = (parsedResults) => {
  fs.writeFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
    if (err) {
      console.log(err)
    }
    console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`)
  })
}


getWebsiteContent(baseUrl)

我该如何解决这个问题?我只想抓取所有页面,只要它存在

【问题讨论】:

    标签: node.js web-scraping cheerio request-promise


    【解决方案1】:

    我正在做类似的事情,我建议你这样做......

    现在不推荐使用请求,而是使用 axios

    const axios = require('axios')
    const cheerio = require('cheerio')
    const fs = require('fs')
    
    const baseUrl = 'https://example.com' // the website url to start scraping from
    var parsedResults = [];
    const outputFile = 'data.csv'
    var saved = false // Added this for monitoring if the scraped data was saved if an error is thrown
    var indexPage = 1
    var totalPages = 1;
    
    
    const getWebsiteContent = async (url) => {
        try {
    
            axios.get(url).then( res => {
    
                const $ = cheerio.load(res.data)
    
                totalPages = getTotalpages($);  // Get the pagination
    
    
                // Now we have the total pages for the url you want to scrap
                // Next we scrape all the data on the respective pages
    
                // Add your code here that scrapes the data
    
                });
            })
            .catch(err => {
                throw(err);
            });
    
            indexPage++; // Increment to the next page
    
            if (indexPage == totalPages) {
                exportResults(parsedResults)    // If we have surpassed the total pages we export the result to CSV
                return false
            }
    
            const nextPageLink = baseUrl + '......' + indexPage;      // get next page
        
            // Add a little  timeout to avoid getting banned by the server
            setTimeout(() => {
                getWebsiteContent(nextPageLink); // Call itself
              }, 3000);
            
    
        }
        catch (error) {
            console.log(error)
        }
        finally{
    
            // If results were written successfully to file the end else write whats in memory
            if(!saved){
                exportResults(parsedResults) ;
            }
        }
    }
    
    
    // Get the pagination
    function getTotalpages(data){
    
        // Extract the total number of pages available and return it as an integer
    }
    
    //function for export to csv file
    const exportResults = (parsedResults) => {
        fs.appendFile(outputFile, JSON.stringify(parsedResults, null, 4), (err) => {
            if (err) {
                console.log(err)
            }
            console.log(`\n ${parsedResults.length} Results exported successfully to ${outputFile}\n`);
            saved = true;
        })
    }
    
    
    getWebsiteContent(baseUrl);
    
    
    
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多