【问题标题】:Convert CSV to JSON with duplicate keys(使用重复键将 CSV 转换为 JSON)
【发布时间】:2022-01-20 05:01:09
【问题描述】:

对于我正在处理的项目,我们在 Excel 工作表中提供了一些数据,我通过 Excel 将其转换为 CSV。 这些文件包含具有不同类别但 ID 相同的测量值。

例子

readingId; category; result;
1        ; cat 1   ; A
1        ; cat 2   ; B
2        ; cat1    ; C

然后我将 CSV 转换为 JSON 并编写了一个函数来将数据输出到不同的对象中

const fs = require('fs');
const path = require('path');

exports.convertJson = (file) => {
  let rawData = fs.readFileSync(file);
  let jsonData = JSON.parse(rawData);
  let rawOutput = [];

  for (output of jsonData) {
    rawOutput.push({
      locationId: output.Meetlocatienummer,
      date: output.Aanmaakdatum_score,
      subCategorie: output.Bestekspost,
      score: output.Score,
      scoreNumber: output.Cijfer,
      categories: output.Categorie,
      coordinates: output.Coordinaten,
      neighbourhoodIndex: output.BUURTCODE,
      quality: output.KWALITEIT,
      district: output.STADSDEEL,
      distrcitIndex: output.STADSDLCD,
      street: output.STRAATNAAM,
      neighbourhood: output.WIJK,
      cluster: output.Cluster,
    });
  }

  return rawOutput;
};

输出如下结果

 [
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Meubilair-afvalbak-vullingsgraad',
    score: '',
    scoreNumber: 8,
    categories: 'Meubilair',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Meubilair-container-bijgeplaatst afval rondom container',
    score: 'A+',
    scoreNumber: 10,
    categories: 'Meubilair',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Riolering-kolk-belemmering inlaat',
    score: 'A+',
    scoreNumber: 10,
    categories: 'Riolering',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-open verharding-elementenverharding-onkruid',
    score: 'A',
    scoreNumber: 8,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-natuurlijk afval',
    score: 'A',
    scoreNumber: 8,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-uitwerpselen',
    score: 'A+',
    scoreNumber: 10,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-zwerfafval grof',
    score: 'A',
    scoreNumber: 8,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-veegvuil goten',
    score: 'A',
    scoreNumber: 8,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-onkruid rondom obstakels',
    score: 'B',
    scoreNumber: 6,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-grof vuil',
    score: 'A+',
    scoreNumber: 10,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 10215,
    date: undefined,
    subCategorie: 'Verharding-zwerfafval fijn',
    score: 'A',
    scoreNumber: 8,
    categories: 'Verharding',
    coordinates: '52.072843, 4.287723',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Xaverystraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  },
  {
    locationId: 7466,
    date: undefined,
    subCategorie: 'Meubilair-afvalbak-vullingsgraad',
    score: 'B',
    scoreNumber: 6,
    categories: 'Meubilair',
    coordinates: '52.072647, 4.288656',
    neighbourhoodIndex: 10,
    quality: 'residentiekwaliteit',
    district: 'Segbroek',
    distrcitIndex: 3,
    street: 'Jan Krosstraat',
    neighbourhood: 'Regentessekwartier',
    cluster: 'WRF'
  }
]

最后我想将此信息写入 MongoDB,并且我想到了以下方案来减少重复数据的负载

{
  locationId: output.Meetlocatienummer,
  date: output.Aanmaakdatum_score,
  subCategories: [
    {
      subCategory: output.Bestekspost,
      score: output.Score,
      scoreNumber: output.Cijfer,
    },
  ],
  categories: [{ category: output.Categorie }],
  coordinates: output.Coordinaten,
  neighbourhoodIndex: output.BUURTCODE,
  quality: output.KWALITEIT,
  district: output.STADSDEEL,
  distrcitIndex: output.STADSDLCD,
  street: output.STRAATNAAM,
  neighbourhood: output.WIJK,
  cluster: output.Cluster,
}

这个项目是我学习 NodeJS 时的一个业余项目。实际数据是我所工作的城市中街道垃圾污染程度的测量读数。在 Excel 中翻阅数千行数据来寻找城市里的热点区域很枯燥,只看一些分数和图表也同样乏味,所以我认为通过 NodeJS 把它导入 Leaflet 会很不错。

随着我继续学习 Node(将来可能还有 React),实际的后端将包含更多功能,这就是为什么我尝试自己编写它,而不是将数据导入谷歌地图——后者可以正常工作,但缺乏详细的类别过滤。

我希望我的想法有点清晰,有人可以指出我正确的方向。

编辑 1 我对 lodash 的了解更进一步。

  return _(rawOutput)
    .groupBy('locationId')
    .map((obj) => _.assignWith({}, ...obj, (val1, val2) => val1 || val2))
    .value();

我找到了上面的代码片段(snippet),现在每个唯一的 locationId 只得到 1 个输出,但我卡在了如何用子类别构建最终输出上。

我还使用 csv-parser 直接从 csv 转换为正确的 json 输出,这将是理想的,因为那时我不必手动转换它。

我明天再回复它:-)

【问题讨论】:

  • 嗯,MongoDB 可以很好地保存与“父”项相关的信息数组,并且它可以对坐标执行地理查找(您必须使用 GeoJSON 格式,但对于上面这样的点来说这并不繁重)。你最初想要做什么?创建把重复项聚集在一起的逻辑?在 MongoDB 之外做?还是在里面做?
  • 好的,我会调查的!我试图将大量重复数据放入一个对象中。在不同类别的同一坐标上大约有 10 个测量值。我希望将这些结果合并到一个对象中,并将不同的类别合并到主对象内的一个新对象中。 id:1 类别:- 类别:1 - 类别:2 坐标:0、0 而不是 id:1 类别:1 坐标:0、0 id:1 类别:2 坐标:0,0 等
  • categories 和 subCategories 是什么关系?或者说,“按 locationId 分组,把那 10 个左右的测量值共有的字段(坐标、街道、集群等)只保存一次,再构建一个类别字符串数组和一个包含 subCategory、score 和 scoreNumber 的 subCategory 对象数组”就足够了?
  • 每个类别都有一些子类别,它们之间没有直接关系。例如,有一个“垃圾”类别,其中包含“小垃圾”、“大垃圾”、“自然垃圾”等子类别。

标签: node.js json mongodb express csv


【解决方案1】:

如果您用 mongoimport 将 JSON 导入 MongoDB,您可以使用以下管道对其进行转换——虽然老实说,在数据库之外用一个小 Python 脚本同样可以轻松地构建出这个结构,然后您仍然可以导入压缩后的数据。

db.foo.aggregate([
  // Stage 1: fold every row that shares a locationId into a single document.
  {
    $group: {
      _id: '$locationId',

      // Per-row measurements are collected into arrays.
      subCategories: {
        $push: {
          subCategory: '$subCategorie',
          score: '$score',
          scoreNumber: '$scoreNumber',
        },
      },
      categories: { $push: '$categories' },

      // These fields are claimed to be identical across the rows of one
      // location, so only the first occurrence of each is kept.
      date: { $first: '$date' },
      neighbourhoodIndex: { $first: '$neighbourhoodIndex' },
      quality: { $first: '$quality' },
      district: { $first: '$district' },
      distrcitIndex: { $first: '$distrcitIndex' },
      street: { $first: '$street' },
      neighbourhood: { $first: '$neighbourhood' },
      cluster: { $first: '$cluster' },
      coordinates: { $first: '$coordinates' },
    },
  },

  // Stage 2: each grouped document carries one "lat, long" string such as
  // "52.072843, 4.287723". Overwrite `coordinates` with a GeoJSON Point, whose
  // position is a [long, lat] array of doubles - hence element 1 before 0.
  {
    $addFields: {
      coordinates: {
        $let: {
          vars: { pt: { $split: ['$coordinates', ','] } },
          in: {
            type: 'Point',
            coordinates: [
              { $toDouble: { $trim: { input: { $arrayElemAt: ['$$pt', 1] } } } },
              { $toDouble: { $trim: { input: { $arrayElemAt: ['$$pt', 0] } } } },
            ],
          },
        },
      },
    },
  },

  // Stage 3: write the transformed documents to a new collection named "foo2".
  { $out: 'foo2' },
]);

【讨论】:

  • 谢谢,我回家再仔细看看!
  • 好吧,我花了一些时间深入研究您的解决方案,实际上我让它跑起来了!朝着正确方向迈出了一大步,但我还发现 locationId 并不是某个特定坐标或日期所独有的。我猜测量时他们会回到同一个区域,因此给出了相同的 ID,但确切的位置并不一样。我要再调整一下解析数据的方式,不过有了上面的代码,我希望我能自己弄清楚!非常感谢!完成后我会上传结果,因为我采用了稍微不同的方法,使用 csv-parse 直接在 Node 中解析 CSV。
【解决方案2】:

好吧,最后来自 Buzz Moschetti 的代码正是我想要摆脱重复数据的。我还没有听说过聚合,所以谢谢。

我最终使用 CSV Parse 库将 CSV 转换为 JSON,将其放入数据库中,然后使用 Buzz 中的代码查询出重复项。 我还没有编写代码来将清理后的数据写回数据库,但这应该不难,所以我将发布我现在拥有的内容以供其他人参考。

首先,我为转换编写了一个 csv 助手。

const fs = require('fs');
const { parse } = require('csv');
const moment = require('moment');

exports.processFile = async (filePath) => {
  const records = [];
  const input = fs.createReadStream(filePath);

  const parser = parse({
    // CSV options
    bom: true,
    delimiter: ';',
    cast: (value, context) => {
      if (context.header) return value;

      // Convert data
      if (context.column === 'date') {
        const dateString = moment(value, 'dd-mm-yyyy h:mm');
        const date = dateString.toDate();

        return date;
      }

      // Convert coordinates to GeoJSON
      if (context.column === 'coordinates') {
        const coordinate = value.split(',');

        const geoData = {
          type: 'Point',
          coordinate: [coordinate[0], coordinate[1]],
        };

        return geoData;
      }

      // Output rest of the fields
      return String(value);
    },
    columns: [
      'locationId', // meetlocatienummer
      'date', // aanmaakdatum score
      'subCategory', //bestekpost
      'category', // categorie
      'score', // score
      'coordinates', //coordinaten
      undefined, // buurt
      undefined, // buurtcode
      undefined, // gebied
      undefined, // id
      'quality', //kwaliteit
      undefined, // stadsdeel
      'districtIndex', //stadsdlcd
      'street', //straatnaam
      undefined, //vaknr
      'neighbourhood', //wijk
      undefined, //wijkcode
      'cluster', //cluster
      'scoreNumber', //cijfer
      undefined, // week
      undefined, // maand
      undefined, // jr-mnd
      undefined, // jaar
    ],
    trim: true,
    from_line: 2,
    skip_records_with_empty_values: true,
  });

  // parser.on('error', (err) => {
  //   console.log(err);
  //   const error = new Error(err);
  //   error.httpStatusCode = 500;
  //   throw error;
  // });

  //const transformer = transform((record, callback) => {});

  input.pipe(parser).on('error', (err) => {
    input.close();
  });

  for await (let record of parser) {
    // Skip all lines without coordinates
    if (record.coordinates.coordinate[1] === undefined) {
      continue;
    }

    // Push filename to the record object
    record.fileName = filePath;

    // Push records for final output
    records.push(record);

    //console.log('Records converted');
  }
  return records;
};

我正在使用 Multer 库上传文件。这是我的导入数据控制器中的 POST 操作。文件上传后,转换开始。如果转换发生错误,文件将再次被删除,并且没有记录写入数据库。如果转换成功,记录将被写入 MongoDB 中的导入数据集合;这些仍然是“脏”记录,因此仍有大量重复数据,但无用的数据(基本上就是所有没有坐标的数据)已经被 CSV Parse 辅助函数过滤掉了。

exports.postImportData = (req, res, next) => {
  const uploadedCSV = req.file;
  //console.log(uploadedCSV);

  // Load imported CSV files from DB to be able to delete them
  ImportedCSVFile.find()
    .sort({ date: -1 })
    .then((result) => {
      // Check if there are already files imported
      let hasFiles = null;
      if (result.length > 0) {
        hasFiles = 1;
      }

      // If there are any erros with the file being uploaded
      if (req.fileValidationError) {
        return res.render('admin/import-data/import-data', {
          pageTitle: 'Importeer data',
          path: '/admin/import-data',
          files: result,
          activeAdmin: true,
          errorMessage: req.fileValidationError,
          validationErrors: [],
          hasFiles,
        });
      }
      // If there's no file uploaded
      if (!uploadedCSV) {
        return res.render('admin/import-data/import-data', {
          pageTitle: 'Importeer data',
          path: '/admin/import-data',
          files: result,
          activeAdmin: true,
          errorMessage: 'Geen bestand geselecteerd',
          validationErrors: [],
          hasFiles,
        });
      }
      (async () => {
        const csvFile = await fileHelper.hasFile(uploadedCSV);
        try {
          const records = await convert.processFile(csvFile);

          // Write all CSV data to importdatas in MongoDB
          await ImportData.insertMany(records)
            .then((result) => {
              console.log('Data imported');

              // Push info about the uploaded file into 'importedcsvfiles'
              const importedCSVFile = new ImportedCSVFile({
                filePath: fileHelper.hasFile(uploadedCSV),
                originalName: uploadedCSV.originalname,
              });
              return (
                importedCSVFile
                  .save() // Save all CSV data into 'importedcsvfiles' in MongoDB
                  .then((result) => {
                    res.redirect('/admin/import-data');
                  })
                  // Catch save filepath error
                  .catch((err) => {
                    console.log('save failed');
                    const error = new Error(err);
                    error.httpStatusCode = 500;
                    return next(error);
                  })
              );
            })
            // Catch insert CSV data into DB error
            .catch((err) => {
              console.log('insert many failed');
              const error = new Error(err);
              error.httpStatusCode = 500;
              return next(error);
            });
        } catch (err) {
          // console.log(error);
          fileHelper.removeFile(csvFile);
          return res.render('admin/import-data/import-data', {
            pageTitle: 'Importeer data',
            path: '/admin/import-data',
            files: result,
            activeAdmin: true,
            errorMessage:
              'Het geselecteerde bestand heeft niet de juiste indeling. Neem contact op met de beheerder.',
            validationErrors: [],
            hasFiles,
          });
        }
      })();
    });
};

还编写了一个删除选项,用于删除 CSV 文件和链接到该文件的所有数据库记录

exports.postDeleteData = (req, res, next) => {
  const dataId = req.body.dataId;
  ImportedCSVFile.findById(dataId)
    .then((result) => {
      // console.log('FilePath:');
      // console.log(result.filePath);

      const filePath = result.filePath;
      const deleteData = async () => {
        await ImportData.deleteMany({ filePath: filePath })
          .then((result) => {})
          .catch((err) => {
            const error = new Error(err);
            error.httpStatusCode = 500;
            return next(error);
          });
        await ImportedCSVFile.findByIdAndDelete(dataId)
          .then((result) => {
            console.log('Data deleted');
            fileHelper.removeFile(filePath);
            res.redirect('/admin/import-data');
          })
          .catch((err) => {
            console.log('here');
            const error = new Error(err);
            error.httpStatusCode = 500;
            return next(error);
          });
      };
      return deleteData();
    })
    .catch((err) => {
      const error = new Error(err);
      error.httpStatusCode = 500;
      return next(error);
    });
};

现在使用来自 Buzz 的聚合代码来清理数据并将其放入 Leaflet 中,这样每个位置的所有不同类别在地图上只对应 1 个点。

const { ImportData, OutputData } = require('../models/importData.model');

// Main controller for the homepage
exports.getMainController = (req, res, next) => {
  ImportData.aggregate([
    {
      $group: {
        _id: '$coordinates',
        subCategories: {
          $push: {
            subCategory: '$subCategory',
            score: '$score',
            scoreNumber: '$scoreNumber',
          },
        },
        categories: { $push: '$category' },

        // Just take the first occurance of each of these since they are claimed
        // to be the same.
        date: { $first: '$date' },
        quality: { $first: '$quality' },
        districtIndex: { $first: '$districtIndex' },
        street: { $first: '$street' },
        neighbourhood: { $first: '$neighbourhood' },
        cluster: { $first: '$cluster' },
      },
    },
  ]).exec((err, locations) => {
    if (err) {
      throw next(err);
    }
    //console.log(locations);

    res.render('index.ejs', {
      pageTitle: 'Kaart',
      path: '/kaart',
      activeAdmin: true,
      data: locations,
      errorMessage: null,
    });
  });
};

到目前为止,我只是像开头所说的那样查询这些数据。由于我仍在学习很多关于 Javascript 和 Node 的知识,我现在开始使用 React 构建前端。一旦我去那里,我会将所有这些代码转换为 API,然后我将完成项目的这一部分。

【讨论】:

    猜你喜欢
    • 2015-04-23
    • 1970-01-01
    • 1970-01-01
    • 2015-08-23
    • 1970-01-01
    • 2020-11-02
    • 2020-10-26
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多