【问题标题】:Scraping IMDb episodes using Cheerio.js - only first page of TV episodes is returned(使用 Cheerio.js 抓取 IMDb 剧集 - 仅返回电视剧集的第一页)
【发布时间】:2019-04-23 10:41:17
【问题描述】:

我正在尝试从 IMDb 抓取电视剧集(以下示例中为《绝命毒师》)。问题是,使用 for 循环时,只返回了 j 第一次迭代(即第 1 季)的结果。

我的假设是 return 语句正在退出循环,但我不确定如何解决这个问题。

const fetch = require('node-fetch');
const cheerio = require('cheerio');

// Base URL of IMDb's title search; it ends with an empty `q=` query parameter.
// Not referenced by the code shown in this snippet.
const searchUrl = 'https://www.imdb.com/find?s=tt&ttype=tv&ref_=fn_tv&q=';
// Base URL of an IMDb title page; the title ID and `/episodes?season=N` are
// appended to it when building season URLs.
const movieUrl = 'https://www.imdb.com/title/';

/**
 * Scrapes episode data for a show from its IMDb season pages.
 *
 * KNOWN DEFECT (the subject of this question): the `return` inside the `for`
 * loop exits the function during the first iteration, so only the season 1
 * page is ever fetched and parsed.
 *
 * @param {string} searchTerm - Currently unused; the lookups that would
 *   consume it are commented out and a hard-coded ID is used instead.
 * @returns {Promise<Array<Object>>} Resolves with the episodes scraped from
 *   the season 1 page only (see defect above).
 */
async function getEpisodes(searchTerm) {

  //const imdbID = await getID(searchTerm);
  //const numSeasons = await getSeasons(imdbID);

  // Hard-coded stand-ins for the commented-out lookups above.
  const imdbID = 'tt0903747';
  const numSeasons = 5;
  const episodes = [];

  for (let j = 1; j <= numSeasons; j++) {
    // Returning here ends the whole function on the first pass (j === 1);
    // the loop never reaches seasons 2..numSeasons.
    return fetch(`${movieUrl}${imdbID}/episodes?season=${j}`)
      .then(response => response.text())
      .then(body => {
        const $ = cheerio.load(body);

        // One matched element per episode on the season page.
        $('div[itemProp="episodes"]').each(function (i, element) {
          const airdate = $(element).find('.airdate').text().trim();
          const episodeTitle = $(element).find('a[itemProp="name"]').text().trim();
          // Captures the text between parentheses; `[1]` throws a TypeError
          // when the element contains no parenthesised text (match is null).
          const votes = $(element).find('.ipl-rating-star__total-votes').text().trim().match(/\(([^)]+)\)/)[1];
          // Keeps only the first three characters of the rating text.
          const rating = $(element).find('.ipl-rating-star ').find('.ipl-rating-star__rating').text().trim().slice(0, 3);

          // No declaration keyword: this assigns to an implicit global
          // (and would throw a ReferenceError in strict mode).
          episode = {
            season: j,
            episodeTitle,
            airdate,
            votes,
            rating
          };
          episodes.push(episode);
        }); 
        return episodes; // Only season 1 is returned.
      }); 
  }
}

【问题讨论】:

    标签: javascript node.js cheerio imdb


    【解决方案1】:

    让我们使用 async/await 风格重写函数。这样可以确保我们调用 fetch 共 numSeasons 次,等待所有请求完成,然后逐个处理响应。

    /**
     * Parses one IMDb season page into a list of episode records.
     *
     * @param {{text: function(): Promise<string>}} response - Fetch response
     *   whose body is the HTML of a season's episode list.
     * @param {number} season - Season number stamped on every returned episode.
     * @returns {Promise<Array<{season: number, episodeTitle: string,
     *   airdate: string, votes: string, rating: string}>>} One record per
     *   `div[itemProp="episodes"]` element, in document order. Fields whose
     *   element is missing are empty strings.
     */
    async function processResponse(response, season) {
        const body = await response.text();
        const $ = cheerio.load(body);

        const episodes = [];
        $('div[itemProp="episodes"]').each(function (i, element) {
            const $episode = $(element);

            const airdate = $episode.find('.airdate').text().trim();
            const episodeTitle = $episode.find('a[itemProp="name"]').text().trim();

            // The vote count is rendered in parentheses, e.g. "(31,415)".
            // An episode without one yields no match, so fall back to ''
            // instead of indexing into null.
            const votesMatch = $episode
                .find('.ipl-rating-star__total-votes')
                .text()
                .trim()
                .match(/\(([^)]+)\)/);
            const votes = votesMatch ? votesMatch[1] : '';

            // Keep only the first three characters of the rating text.
            const rating = $episode
                .find('.ipl-rating-star ')
                .find('.ipl-rating-star__rating')
                .text()
                .trim()
                .slice(0, 3);

            // Declared locally: the undeclared assignment leaked a global and
            // throws a ReferenceError in strict mode / ES modules.
            const episode = {
                season,
                episodeTitle,
                airdate,
                votes,
                rating
            };

            episodes.push(episode);
        });

        return episodes;
    }
    
    /**
     * Fetches every season page of a show from IMDb and returns all of its
     * episodes as one flat list.
     *
     * All season pages are requested concurrently, then each response is
     * parsed with `processResponse`. Results keep season order.
     *
     * @param {string} searchTerm - Currently unused; the lookups that would
     *   consume it are commented out and a hard-coded ID is used instead.
     * @returns {Promise<Array<Object>>} Episodes of season 1, then season 2,
     *   and so on, as produced by `processResponse`.
     */
    async function getEpisodes(searchTerm) {

        //const imdbID = await getID(searchTerm);
        //const numSeasons = await getSeasons(imdbID);

        // Hard-coded stand-ins for the commented-out lookups above.
        const imdbID = 'tt0903747';
        const numSeasons = 5;

        const promises = [];

        for (let j = 1; j <= numSeasons; j++) {
            promises.push(fetch(`${movieUrl}${imdbID}/episodes?season=${j}`));
        }

        const responses = await Promise.all(promises);

        // `await` is not allowed inside a non-async reduce callback, so parse
        // every response first and only then flatten the per-season arrays.
        // Promise.all keeps input order, so index + 1 is the season number.
        const episodesBySeason = await Promise.all(
            responses.map((response, index) => processResponse(response, index + 1))
        );

        return episodesBySeason.reduce(
            (accumulator, seasonEpisodes) => accumulator.concat(seasonEpisodes),
            []
        );
    }
    

    【讨论】:

    • 谢谢!运行上面的代码时,我得到一个空数组。 getEpisodes 正在迭代 numSeasons 次,但 processResponse 似乎没有正确获取任何数据(获得 5 个空数组)。想法?
    • 哦,我刚刚注意到 response.text() 也返回一个 Promise,所以需要把 processResponse 改为 async 函数,并 await response.text()。请查看我编辑后的代码。
    猜你喜欢
    • 2023-01-24
    • 1970-01-01
    • 2021-09-15
    • 1970-01-01
    • 2020-01-07
    • 2012-07-26
    • 1970-01-01
    • 2021-12-10
    • 1970-01-01
    相关资源
    最近更新 更多