MongoDB聚合：展开操作后计算真实文档？答案

【问题标题】：MongoDB aggregation: count real documents after unwind operation?MongoDB聚合：展开操作后计算真实文档？
【发布时间】：2015-08-25 11:50:06
【问题描述】：

这是我的架构：

company: String
model: String
tags: [String] // array of strings

这是我的查询：

{ "$unwind": "$tags" },
{ "$group": {
    "_id": {
        "company": "$company",
        "model": "$model",
        "tag": "$tags"
    },
    "tagCount": { "$sum": 1 },
    "reviewCount": { "$sum" : 1}
}},
{ "$group": {
    "_id": { 
        "company": "$_id.company",
        "model": "$_id.model",
     },
     "tags": { "$push": { "tag": "$_id.tag", "count": "$tagCount" },
     "count": { "$sum": "$reviewCount" }
}}

请注意，我有 tagCount 和 reviewCount。
tagCount 必须显示某些模型在不同评论中的标签匹配数。
reviewCount 必须显示某些模型的评论数.

但是，由于 unwind 操作，reviewCount 当前显示的数字不正确。

在每个文档中展开tags列表后，获取某些model的文档数的有效方法是什么？

注意：我也尝试过这样做：

{ "$group": {
  "_id": {
    "company": "$company",
    "model": "$model"
  },
  "reviewCount": { "$sum": 1}
}},
{ "$unwind": "$tags" },
{ "$group": {
    "_id": {
        "company": "$_id.company",
        "model": "$_id.model",
        "tag": "$tags"
    },
    "tagCount": { "$sum": 1 }
}},
{ "$group": {
    "_id": { 
        "company": "$_id.company",
        "model": "$_id.model",
     },
     "tags": { "$push": { "tag": "$_id.tag", "count": "$tagCount" },
     "count": { "$sum": "$reviewCount" }
}}

但我得到了一个空数组作为响应。

【问题讨论】：

标签： node.js mongodb mongoose mongodb-query aggregation-framework

【解决方案1】：

您面临的问题是：既要在某个分组条件内统计数组中各项数据（也就是“标签”）出现的次数，又要在把 "tags" 移出分组键、作为数组放入结果之后，统计该分组条件本身对应的文档数。

这个解决方案其实很合乎逻辑。退后一步看看数据，考虑下面这个简化的示例：

{ "a" : "a", "b" : "b", "c" : [ "c", "d" ] }
{ "a" : "a", "b" : "b", "c" : [ "c", "d" ] }
{ "a" : "a", "b" : "b", "c" : [ "d", "e" ] }
{ "a" : "a", "b" : "b", "c" : [ "d", "e" ] }
{ "a" : "a", "b" : "b", "c" : [ "e", "f" ] }

总共有“五个”文档具有相同的“a”和“b”值，而“c”的值当然各不相同。要获得各个不同“c”值的计数，您需要先 $unwind，然后按“a”、“b”和“c”进行 $group：

// Emit one document per element of the "c" array.
{ "$unwind": "$c"},
// Count how many unwound documents share each (a, b, c) combination.
{ "$group": {
  "_id": {
    "a": "$a",
    "b": "$b",
    "c": "$c"
  },
  "count": { "$sum": 1 }
}},

查看结果：

{ "_id" : { "a" : "a", "b" : "b", "c" : "e" }, "count" : 3 }
{ "_id" : { "a" : "a", "b" : "b", "c" : "f" }, "count" : 1 }
{ "_id" : { "a" : "a", "b" : "b", "c" : "d" }, "count" : 4 }
{ "_id" : { "a" : "a", "b" : "b", "c" : "c" }, "count" : 2 }

按可用的不同取值分组后，“唯一”组合减少为“四个”文档。这时您可能会看着结果说：“总计数加起来是‘十’，而数组总是有‘两个’元素，所以文档数就是‘五个’，对吗？”——但现实场景中绝不会如此，数组长度是各不相同的。

那么如何获得文档的数量呢？从这一步的结果出发是做不到的。在单个聚合管道中唯一可行的做法是：先按“a”和“b”进行 $group，并使用 $push 把所有数组内容保留在“c”中。这样就可以正确地统计出“a”和“b”组合的文档数为“五”。

然后您需要 $unwind “两次”，因为此时“c”是一个数组的数组；接着从统计各个不同键的那一步继续往下做，并用 $first 保留每个“a”/“b”组合最初得到的文档计数。

   { "$group": {
      "_id": {
        "a": "$a",
        "b": "$b"
      },
      "c": { "$push": "$c" },
      "count": { "$sum": 1 }
    }},
    { "$unwind": "$c" },
    { "$unwind": "$c" },
    { "$group": {
      "_id": {
        "a": "$_id.a",
        "b": "$_id.b",
        "c": "$c"
      },
      "docCount": { "$first": "$count" },
      "count": { "$sum": 1 }
    }},
    { "$group": {
      "_id": {
        "a": "$_id.a",
        "b": "$_id.b"
      },
      "tags": { "$push": { "tag": "$c", "count": "$count" } },
      "count": { "$first": "$docCount" }
    }}

但这并不“高效”，因为您先把数据组合在一起，然后又把它“拆开”，只是为了再附加一些别的内容。

处理此问题的“高效”方法是“并行”运行“两个”聚合操作，然后用简单的哈希合并把两者的结果组合起来；也可以像下面这样使用类似 nedb 的存储，或者以类似的方式写入一个 MongoDB 集合：

// Demonstration script: runs two aggregations in parallel and merges their
// results by upserting both into a single nedb datastore keyed on the
// shared grouping key.
var async = require('async'),
    mongoose = require('mongoose'),
    Schema = mongoose.Schema,
    DataStore = require('nedb'),
    db = new DataStore();   // nedb store (no filename given) used for the merge

mongoose.connect('mongodb://localhost/test');

// Sample documents: five share the same a/b pair, with varying "c" arrays.
var data = [
  { "a" : "a", "b" : "b", "c" : [ "c", "d" ] },
  { "a" : "a", "b" : "b", "c" : [ "c", "d" ] },
  { "a" : "a", "b" : "b", "c" : [ "d", "e" ] },
  { "a" : "a", "b" : "b", "c" : [ "d", "e" ] },
  { "a" : "a", "b" : "b", "c" : [ "e", "f" ] }
];

var dataSchema = new Schema({
  a: String,
  b: String,
  c: [String]
});

var Data = mongoose.model( 'Data', dataSchema, 'data' );

async.series(
  [
    // Step 1: clear out the collection.
    function(callback) {
      async.each([Data],function(model,callback) {
        model.remove({},callback);
      },callback);
    },

    // Step 2: insert the sample documents.
    function(callback) {
      async.each(data,function(doc,callback) {
        Data.create(doc,callback);
      },callback);
    },

    // Step 3: run both aggregations in parallel, then print the merged result.
    function(callback) {
      async.parallel(
        [
          // Query A: number of documents per (a, b) pair.
          function(callback) {
            Data.aggregate(
              [
                { "$group": {
                  "_id": {
                    "a": "$a",
                    "b": "$b"
                  },
                  "count": { "$sum": 1 }
                }}
              ],
              function(err,results) {
                // Return here so we neither iterate over undefined results
                // nor invoke the callback a second time.
                if (err) return callback(err);
                async.each(results,function(result,callback) {
                  db.update(
                    { "key": result._id },
                    { "$set": { "count": result.count } },
                    { "upsert": true },
                    callback
                  );
                },callback);
              }
            );
          },

          // Query B: per-tag counts collected into an array per (a, b) pair.
          function(callback) {
            Data.aggregate(
              [
                { "$unwind": "$c" },
                { "$group": {
                  "_id": {
                    "a": "$a",
                    "b": "$b",
                    "c": "$c"
                  },
                  "count": { "$sum": 1 }
                }},
                { "$group": {
                  "_id": {
                    "a": "$_id.a",
                    "b": "$_id.b"
                  },
                  "tags": { "$push": { "tag": "$_id.c", "count": "$count" } }
                }}
              ],
              function(err,results) {
                if (err) return callback(err);
                async.each(results,function(result,callback) {
                  // Upserting on the same "key" as query A merges "tags"
                  // into the record that holds "count".
                  db.update(
                    { "key": result._id },
                    { "$set": { "tags": result.tags } },
                    { "upsert": true },
                    callback
                  );
                },callback);
              }
            );
          }
        ],
        function(err) {
          if (err) return callback(err);
          db.find({},function(err,results) {
            if (err) return callback(err);
            console.log(JSON.stringify( results, undefined, 2 ));
            callback();
          });
        }
      );
    }
  ],
  function(err) {
    if (err) throw err;
    mongoose.disconnect();
  }
);

输出如下：这两个查询实际上是同时运行的，但得到的是合并后的结果：

[
  {
    "key": {
      "a": "a",
      "b": "b"
    },
    "count": 5,
    "_id": "cL5oaVlIrKluKSo0",
    "tags": [
      {
        "tag": "e",
        "count": 3
      },
      {
        "tag": "f",
        "count": 1
      },
      {
        "tag": "d",
        "count": 4
      },
      {
        "tag": "c",
        "count": 2
      }
    ]
  }
]

这比在单个管道中处理并得到相同结果要“高效得多”，速度也快得多，因为两个查询“并行”执行，总运行时间会短得多。

这种方法的扩展性也很好：如果结果太大、内存放不下，基本上只需把对 nedb 数据存储的写入“换成”对 MongoDB 服务器上实际集合的写入即可。这里的“upsert”操作可以很好地把数据“合并”成整体结果，这正是使用它的原因。

要考虑的关键是不要让聚合管道进程做的工作超过它真正应该做的工作。如果从不同的查询中获取结果是有意义的，那么就使用它。这往往是解决问题的最有效方法。

【讨论】：