Puppeteer，循环通过 xpath 选择的链接答案

【问题标题】：Puppeteer, looping through links selected by xpath（Puppeteer，循环遍历通过 XPath 选择的链接）
【发布时间】：2018-06-26 01:33:01
【问题描述】：

我是 puppeteer 的新手（对 JavaScript 总体上也不太熟练），我正在尝试编写一些基本功能：

从 XPath 中获取所有链接
循环并点击这些链接
截屏并保存页面的 HTML
返回上一页，截图并保存记录列表页面的 HTML，与其他文件保存在同一目录中，然后重新开始这一流程

我得到的错误是：

评估失败：DOMException：无法在“文档”上执行“querySelector”：“0”不是有效的选择器

这是我的代码：

我相当有信心所有代码都能正常工作，除了我在使用 XPath 获得正确的东西时遇到的问题。我得到这些的网站是：

https://hrlb.oregon.gov/bspa/licenseelookup/searchdir.asp?searchby=lastname&searchfor=a&stateselect=none&Submit=Search

代码：

// Question snippet: click every record link matched by the XPath, then save a
// screenshot and HTML dump of both the record page and the list ("base") page.
// Per the Puppeteer API, page.$x resolves to an array of element handles.
const records = await page.$x('//table[2]//tr[td[a]]//td[1]/a');
// Running counter used to build a separate output directory for each record.
let int = 0;
// NOTE(review): `for...in` iterates the array's keys, so `record` is the string
// index ("0", "1", ...) and not the element handle stored at that index.
for (let record in records) {
    // Start waiting for the navigation before clicking so it is not missed.
    await Promise.all([
        page.waitForNavigation(),
        // NOTE(review): page.click() takes a CSS selector string; receiving the
        // index "0" here matches the reported error
        // '"0" is not a valid selector'.
        page.click(record)
    ]);

    // Create screenshots/item<N>/ with its base/ and record/ subdirectories.
    await Promise.all([makeDirectory('screenshots/item'+int), makeDirectory('screenshots/item'+int+'/base'), makeDirectory('screenshots/item'+int+'/record')]);
    let recordPath = "screenshots/item"+int+"/record/record.html";
    let basePath = "screenshots/item"+int+"/base/base.html";

    // NOTE(review): this screenshot promise is not awaited, unlike the
    // base-page screenshot further down.
    page.screenshot({path: "screenshots/item"+int+"/record/record.png", fullPage: true});
    // Dump the record page's body markup to record.html.
    let recordBody = await page.evaluate(() => document.body.innerHTML);
    await saveHtml(recordPath, recordBody);

    // Navigate back to the list page, waiting for that navigation to finish.
    await Promise.all([
        page.waitForNavigation(),
        page.goBack()
    ]);

    // Capture the list page again and store it next to the record output.
    await page.screenshot({path: "screenshots/item"+int+"/base/base.png", fullPage: true});
    let baseBody = await page.evaluate(() => document.body.innerHTML);
    await saveHtml(basePath, baseBody);

    int++;
    // Logs the loop key (the index string), not the link element.
    console.log(record);
}

/**
 * Creates the directory `path` via mkdirp and waits until it has finished.
 *
 * Assumes the callback-style mkdirp API that the original call relied on.
 *
 * @param {string} path - Directory to create.
 * @returns {Promise<void>} Resolves after mkdirp reports success; rejects with
 *     the error mkdirp passes to its callback.
 */
function makeDirectory(path) {
    // Adapt the callback API to a promise so `await makeDirectory(...)` really
    // waits, and so failures reject instead of being thrown inside the callback.
    return new Promise((resolve, reject) => {
        mkdirp(path, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

/**
 * Writes `html` to the file at `path` and waits until the write has finished.
 *
 * @param {string} path - Destination file path.
 * @param {string|Buffer} html - Content to write.
 * @returns {Promise<void>} Resolves once the file is written; rejects with the
 *     error reported by fs.writeFile.
 */
function saveHtml(path, html) {
    // Callback-style fs.writeFile returns undefined, so awaiting it directly
    // does not wait for the write; wrap it in a promise instead.
    return new Promise((resolve, reject) => {
        fs.writeFile(path, html, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

注意：我必须使用 XPath :(

2018 年 6 月 25 日更新：下面的代码现在可以取得 XPath 选择器匹配到的所有链接。然后我遍历这些链接，直接使用 page.goto 跳转到对应的页面。

// XPath selecting the record anchors inside the results table.
const linksXPath = '//table[2]//tr[td[a]]//td[1]/a';
// Run the XPath inside the page and collect the href of every matched node,
// in document order.
const links = await page.evaluate((xpath) => {
    const snapshot = document.evaluate(
        xpath,
        document,
        null,
        XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
        null
    );
    return Array.from(
        { length: snapshot.snapshotLength },
        (_, index) => snapshot.snapshotItem(index).href
    );
}, linksXPath);

【问题讨论】：

请将您的代码作为文本而不是图像插入问题中，这样可以测试您的代码。
@Vaviloff 谢谢，给你。
您是否使用 xpath 解决了这个问题？

标签： xpath puppeteer

【解决方案1】：

我认为问题出在你的选择器上。

我相信您的表格选择器应该是：

"body > table > tbody > tr:nth-child(2) > td > table > tbody > tr:nth-child(1) > td > table.bodytext > tbody"

为页面获取正确选择器的最简单方法是使用 Chrome 开发工具。

检查页面，然后转到“元素”选项卡。从那里，您应该会看到所有的 HTML 元素。右键单击您想要的那个（我选择了<tbody>，这样您就可以遍历<tr> 元素。）然后选择复制> 复制选择器。

【讨论】：

我会使用它，但我必须使用 XPath
啊，抱歉——我的印象实际上是 XPath 只用于处理 XML 文档。想要使用 XPath 是否有实际原因，还是纯粹的学术要求？

【解决方案2】：

我的代码现在正在做我需要它做的事情，但是我希望有一种更简单的方法来做这件事。此外，当我遍历链接时，您会看到，我正在使用 page.goto 函数去那里。我仍然不知道使用 page.click 的方法。我将不得不使用 xpath 来获取所有 td，然后单击它们，但我永远无法让它工作。所以这是工作产品：

const puppeteer = require('puppeteer');
const fs = require('fs');
const mkdirp = require('mkdirp');

/**
 * Searches the licensee lookup by last name "a", then visits every result link
 * and stores, per record, a screenshot and HTML dump of the record page and of
 * the search-results ("base") page under screenshots/item<N>/.
 *
 * @returns {Promise<void>} Resolves once every record has been saved and the
 *     browser has been closed; rejects with the first error encountered.
 */
async function run() {
    const pageToClick = 'body > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td > form > table > tbody > tr:nth-child(3) > td > div > input[type="submit"]';
    const select = 'body > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td > form > table > tbody > tr:nth-child(1) > td:nth-child(2) > select';
    const inputField = 'body > table > tbody > tr:nth-child(3) > td > table > tbody > tr > td > form > table > tbody > tr:nth-child(2) > td:nth-child(2) > input[type="text"]:nth-child(1)';
    const linksXPath = '//table[2]//tr[td[a]]//td[1]/a';
    const browser = await puppeteer.launch({
        headless: true
    });
    // Close the browser even when a step below throws, so no browser process
    // is left running.
    try {
        const page = await browser.newPage();
        await page.goto('https://hrlb.oregon.gov/bspa/licenseelookup/');
        await page.select(select, 'lastname');
        await page.focus(inputField);
        await page.keyboard.type('a');
        // Register the navigation wait before clicking so the form submit
        // navigation is not missed.
        await Promise.all([
            page.waitForNavigation(),
            page.click(pageToClick)
        ]);

        // Evaluate the XPath in the page and collect the href of every match.
        const links = await page.evaluate((selector) => {
            const results = [];
            const query = document.evaluate(selector,
                document,
                null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
            for (let i = 0, length = query.snapshotLength; i < length; ++i) {
                results.push(query.snapshotItem(i).href);
            }
            return results;
        }, linksXPath);

        // The results page is captured once and the same copy is stored
        // alongside every record.
        const basePic = await page.screenshot({fullPage: true});
        const baseBody = await page.evaluate(() => document.body.innerHTML);

        for (const [index, link] of links.entries()) {
            const itemDir = `screenshots/item${index}`;

            // page.goto() itself resolves after the navigation has finished,
            // so no separate waitForNavigation() is needed.
            await page.goto(link);

            await Promise.all([
                makeDirectory(itemDir),
                makeDirectory(`${itemDir}/base`),
                makeDirectory(`${itemDir}/record`)
            ]);

            await page.screenshot({path: `${itemDir}/record/record.png`, fullPage: true});
            const recordBody = await page.evaluate(() => document.body.innerHTML);
            await saveFile(`${itemDir}/record/record.html`, recordBody);

            // Navigate back to the results page before visiting the next link.
            await page.goBack();

            await saveFile(`${itemDir}/base/base.html`, baseBody);
            await saveFile(`${itemDir}/base/base.png`, basePic);
        }
    } finally {
        // Closing the browser also closes its pages.
        await browser.close();
    }
}

/**
 * Creates the directory `path` via mkdirp and waits until it has finished.
 *
 * Assumes the callback-style mkdirp API that the original call relied on.
 *
 * @param {string} path - Directory to create.
 * @returns {Promise<void>} Resolves after mkdirp reports success; rejects with
 *     the error mkdirp passes to its callback.
 */
function makeDirectory(path) {
    // Adapt the callback API to a promise so `await makeDirectory(...)` really
    // waits, and so failures reject instead of being thrown inside the callback.
    return new Promise((resolve, reject) => {
        mkdirp(path, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

/**
 * Writes `html` to the file at `path` and waits until the write has finished.
 *
 * @param {string} path - Destination file path.
 * @param {string|Buffer} html - Content to write (markup text or image bytes).
 * @returns {Promise<void>} Resolves once the file is written; rejects with the
 *     error reported by fs.writeFile.
 */
function saveFile(path, html) {
    // Callback-style fs.writeFile returns undefined, so awaiting it directly
    // does not wait for the write; wrap it in a promise instead.
    return new Promise((resolve, reject) => {
        fs.writeFile(path, html, (err) => {
            if (err) {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

// Report a failed run and signal it through the exit code instead of leaving
// the promise returned by run() without a rejection handler.
run().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

【讨论】：