使用cheerio 从使用cheerio 提取的链接中抓取数据答案

【问题标题】：Using cheerio to scrape data from links extracted using cheerio使用cheerio 从使用cheerio 提取的链接中抓取数据
【发布时间】：2020-09-27 23:03:03
【问题描述】：

正在使用 Cheerio 和 nodejs 从 allegro 网站获取数据，以在 API 中创建端点，返回 csv 数据，这些数据将在以后作为数据科学项目的一部分进行研究：

https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605

为了获取汽车信息，我已经成功从第一页抓取到了链接。每个链接都会跳转到对应的汽车条目页面，在那里可以查看该车的完整信息。我需要从每个链接中抓取更多数据，应该怎么做？

以及如何让 json 数据显示为 csv？

这里使用的代码：

// Listing page that getCars() fetches and scrapes for car entries.
const url =
  "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605";

// Disabled CSV export, kept for reference: it would open allegro.csv and
// write the header row before any scraped rows.
//const writeStream = fs.createWriteStream("allegro.csv");
// Write Headers
//writeStream.write(`Price,Link \n`);

/**
 * Downloads the listing page at `url` and extracts one record per car card.
 *
 * Each record carries:
 *   - price: text of the price element with runs of 2+ whitespace removed
 *   - link:  href of the first anchor inside the card
 *   - year:  text of the second sibling following the card's first <dd>
 *   - make:  text of the card's <h2>
 *
 * @returns {Promise<Array<{price: string, link: (string|undefined), year: string, make: string}>>}
 */
async function getCars() {
  const response = await fetch(`${url}`);
  const html = await response.text();
  const $ = cheerio.load(html);

  const listings = [];
  $("._9c44d_2H7Kt").each((index, node) => {
    // Wrap the card once and query everything relative to it.
    const card = $(node);
    const priceText = card.find("._9c44d_1zemI").text();
    const href = card.find("a").attr("href");
    const yearText = card.find("dd").first().next().next().text();
    const title = card.find("h2").text();

    listings.push({
      price: priceText.replace(/\s\s+/g, ""),
      link: href,
      year: yearText,
      make: title,
    });
  });

  return listings;
}

用于 nodejs 端点的代码：

/**
 * GET /scraping/:allegro — responds with the scraped car listings as JSON,
 * or with HTTP 500 and an error payload when scraping fails.
 */
app.get("/scraping/:allegro", (req, res) => {
  // Route parameters live on `req.params`; `req.param` is a method, so
  // `req.param.allegro` always evaluated to undefined.
  scraper
    .getCars(req.params.allegro)
    .then((cars) => {
      res.json(cars);
    })
    .catch((err) => {
      // Without a rejection handler a failed scrape would never answer the
      // request and would surface as an unhandled promise rejection.
      console.error("Scraping failed:", err);
      res.status(500).json({ error: "Failed to scrape car listings" });
    });
});

从每个链接获取的数据如下：添加日期，型号，电话号码，城市，vin

【问题讨论】：

标签： node.js api web-scraping cheerio

【解决方案1】：

这些页面有一个方便的地方，就是您可以通过将媒体类型设置为application/json（例如设置Accept 标头）以JSON 而不是html 格式返回数据。

例如获取列表：

curl "https://allegro.pl/kategoria/samochody-osobowe-4029?bmatch=baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605&order=dd" \
     -H "Accept: application/json"

获取特定项目：

curl "https://allegro.pl/ogloszenie/mercedes-ml320-9341716141" -H "Accept: application/json"

因此，您无需使用网络抓取工具，只需解析 JSON 即可。分页是通过添加查询参数 &p=PAGE_NUM 来完成的，同样很方便。

我在python 中做了一个小例子，可以很容易地移植到 JS。它请求汽车列表，然后请求第一个元素：

import requests 
import json
import pandas as pd

# Both requests ask for the JSON representation instead of HTML.
json_headers = {"Accept": "application/json"}
listing_query = {
    "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
    "order": "dd",
}

r = requests.get(
    "https://allegro.pl/kategoria/samochody-osobowe-4029",
    headers=json_headers,
    params=listing_query,
)

# Flatten every promoted offer into one row: the fixed fields first, then one
# column per listing parameter (only the first value of each is kept).
data = []
for offer in r.json()["pagination bottom"]["collection"]["items"]["promoted"]:
    row = {
        "name": offer["name"],
        "url": offer["url"],
        "price": offer["sellingMode"]["advertisement"]["price"]["amount"],
    }
    row.update({param["name"]: param["values"][0] for param in offer["parameters"]})
    data.append(row)

df = pd.DataFrame(data)
print(df)

print("get data for first element")
r = requests.get(data[0]["url"], headers=json_headers)
item = r.json()

# Everything of interest on the detail page sits under summary -> offer.
offer_details = item["summary"]["offer"]
item_data = {
    "phone": offer_details["contact"]["phones"][0]["number"],
    "delivery": offer_details["delivery"]["summary"][0]["value"]["text"],
    "startingAt": offer_details["publication"]["startingAt"],
    "endingAt": offer_details["publication"]["endingAt"],
}
# Add one key per parameter of the first parameter group, using its display label.
for param in offer_details["parametersGroups"]["groups"][0]["parameters"]:
    item_data[param["name"]] = param["values"][0]["valueLabel"]

print(item_data)

nodejs 中使用 axios 的实现：

const axios = require("axios");

/**
 * Fetches the promoted car listings as JSON and logs one flattened record per
 * listing, then fetches the first listing's detail document and logs its
 * phone, delivery, publication dates and first-group parameters.
 *
 * When the listing response contains no promoted items, only the empty list
 * is logged and no detail request is made.
 *
 * @returns {Promise<void>} Resolves once everything has been logged; rejects
 *   if a request fails or a response lacks the expected fields.
 */
async function process() {
    // Ask for the JSON representation explicitly on both requests.
    const jsonHeaders = { Accept: "application/json" };

    let response = await axios.get('https://allegro.pl/kategoria/samochody-osobowe-4029', {
        // axios serialises `params` into the query string; `query` is not an
        // axios option, so the filters were previously never sent.
        params: {
            "bmatch": "baseline-al-product-cl-eyesa2-engag-dict45-aut-1-3-0605",
            "order": "dd"
        },
        headers: jsonHeaders,
        responseType: "json"
    });

    const promoted = response.data["pagination bottom"].collection.items.promoted;
    // Declared locally: a bare `list = []` creates an implicit global (and
    // throws in strict mode / ES modules).
    const list = promoted.map((listing) => {
        const item = {
            name: listing.name,
            url: listing.url,
            price: listing.sellingMode.advertisement.price.amount,
        };
        // One extra key per listing parameter, keeping only its first value.
        for (const param of listing.parameters) {
            item[param.name] = param.values[0];
        }
        return item;
    });
    console.log(list);

    if (list.length === 0) {
        // No listing to follow up on; avoid dereferencing list[0].
        return;
    }

    console.log("fetching : " + list[0].url);
    response = await axios.get(list[0].url, {
        headers: jsonHeaders,
        responseType: "json"
    });

    const offer = response.data.summary.offer;
    const entry = {
        phone: offer.contact.phones[0].number,
        delivery: offer.delivery.summary[0].value.text,
        startingAt: offer.publication.startingAt,
        endingAt: offer.publication.endingAt
    };
    // Parameters of the first group are stored under their own names, using
    // the display label of each parameter's first value.
    for (const parameter of offer.parametersGroups.groups[0].parameters) {
        entry[parameter.name] = parameter.values[0].valueLabel;
    }
    console.log(entry);
}

// Start the scrape; report a failed run and exit non-zero instead of leaving
// the returned promise floating.
process().catch((err) => {
    console.error(err);
    // `process` in this module is the scraper function, which shadows Node's
    // global of the same name, so reach the real one through globalThis.
    globalThis.process.exitCode = 1;
});

【讨论】：

非常感谢您提供非常详细的回答，这是一个巨大的帮助我的问题是有没有办法让这个 python 文件每次都自动运行，并将新的抓取数据添加到以前抓取的数据中能做到吗？
这取决于网站。我看不懂波兰语，但如果可以按最新条目排序（页面上可能有相应的筛选器？），您可以在脚本中使用对应的查询参数（如果存在的话）。您可以用条目 ID（来自 JSON）把所有内容存入数据库：脚本每次运行时持续把条目写入数据库，直到遇到已存在的 ID 为止。
我明白了，谢谢你的回复。我现在唯一的问题是如何把 Python 与 Node.js 和 Express 连接起来。之前使用 cheerio 时，我只是导出了相应的模块，然后创建端点以便从浏览器访问数据。
@Souhaildq 我在 nodejs 中添加了一个实现。注意我这里使用promoted字段提取列表数据但可能还有其他字段保存项目，其他字段我没有看太多
非常感谢您是救生员，我是否需要将数据保存到数据库才能从端点访问它？