【发布时间】:2020-07-02 06:46:20
【问题描述】:
我尝试使用 puppeteer 和 Node.js 抓取 Reddit。有我的代码,我在哪里:
- 为 Reddit 的主页打开一个页面,
- 获取所有帖子。
- 对于每个帖子,我都会获得指向其内容页面的链接。
- 为每个内容页面打开一个新页面。
- 抓取每个内容页面。
const puppeteer = require("puppeteer");
const self = {
browser: null,
page: null,
initialize: async () => {
browser = await puppeteer.launch({
headless: false,
});
page = await browser.newPage();
// Go to the index page of Reddit
await page.goto("https://old.reddit.com/", { waitUntil: "networkidle0" });
},
getResults: async () => {
let platform = "Reddit";
// Get all posts on the main page of Reddit.
let mentions = await page.$$('#siteTable > div[class *= "thing"]');
let results = [];
// For each post:
for (let mention of mentions) {
let content = "";
// I get the link to its content page.
let content_URL = await mention.$eval(
'p[class="title"] > a[class*="title"]',
(node) => node.getAttribute("href").trim()
);
// if it is a inner link:
if (content_URL.substr(0, 3) === "/r/") {
// Create a new page to open that content page.
let contentPage = await browser.newPage();
await contentPage.goto("https://old.reddit.com" + content_URL, {
waitUntil: "networkidle0",
});
// Get the first paragraph of this content page.
content = await contentPage.evaluate((contentPage) => {
// Here is where the error occurred:
// Error: Evaluation failed: TypeError: Cannot read property 'querySelector' of undefined
let firstParagraph = contentPage.querySelector(
'div[class*="usertext-body"] > p'
);
if (firstParagraph != null) {
return firstParagraph.innerText.trim();
} else {
return null;
}
});
}
results.push({
title,
content,
image,
date,
popularity,
platform,
});
}
return results;
},
};
module.exports = self;
但发生错误:Error: Evaluation failed: TypeError: Cannot read property 'querySelector' of undefined。
谁能指出我哪里做错了?
谢谢!
【问题讨论】:
-
contentPage未定义。 -
@RobertHarvey 但我确实在
let contentPage = await browser.newPage();中定义了它
标签: javascript node.js web-scraping puppeteer