Merging data while doing a sum of elements of an array of dict

Hello Everyone,
I’m faced with an issue in my aggregation pipeline and I can’t quite find a solution, so I hopped someone could come up with an idea.

I’m aggregating data from a collection sub into a collection agg.
Example of content of the agg collection:

[{
    "_id": "AAA",
    "count": 2,
    "location":
    [
        {
            "country": "US",
            "region": "ia",
            "city": "cedar rapids",
            "count": 1
        },
        {
            "country": "AU",
            "region": "vic",
            "city": "melbourne",
            "count": 1
        }
    ]
}]

Currently I have an aggregation pipeline getting data from the sub collection that output data with the same format. Now I want to merge the data from my pipeline with the agg collection so that all the count are updated to be a SUM of the previous and new values.

For instance, my pipeline give me the following data:

[{
    "_id": "AAA",
    "count": 2,
    "location":
    [
        {
            "country": "US",
            "region": "ca",
            "city": "los angeles",
            "count": 1
        },
        {
            "country": "AU",
            "region": "vic",
            "city": "melbourne",
            "count": 1
        }
    ]
}]

And i want the final data to look like this:

[{
    "_id": "AAA",
    "count": 4,
    "location":
    [
        {
            "country": "US",
            "region": "ia",
            "city": "cedar rapids",
            "count": 1
        },
        {
            "country": "US",
            "region": "ca",
            "city": "los angeles",
            "count": 1
        },
        {
            "country": "AU",
            "region": "vic",
            "city": "melbourne",
            "count": 2
        }
    ]
}]

I came up with the following $merge at the end of my pipeline that sums the top count:

[
{
            "$merge": {
                "into": "agg",
                "on": "_id",
                "whenMatched": [
                    {
                        "$addFields": {
                            "count": {
                                "$add": [
                                    "$count",
                                    "$$new.count"
                                ]
                            }
                        }
                    }
                ],
                "whenNotMatched": "insert"
            }
  }]

However I have no idea on how to do the same thing for the data inside the location array.

Would someone have an idea on how to do that? I could store all my data “unwinded” and add another collection to store the aggregation of the unwind, but I don’t feel that is the adequate solution.
Thanks

Hi @yc_rkc, welcome to the community!

I believe I solved your issue. The solution is quite extensive and a little complicated to understand, but it works. To make it easier to test and implement, I created one document with the fields acc and agg and on each one, I put the data referent of each collection, the aggregation stage will create a new field location with the combination of acc and agg. I’m quite in a hurry these days if you can´t migrate the implementation to a $merge stage let me know and I’ll try it, too. Here is the code:

const docBefore = {
  _id: { $oid: "6540f0237b938738dabe9558" },
  acc: [
    { country: "US", region: "ia", city: "cedar rapids", count: 1 },
    { country: "AU", region: "vic", city: "melbourne", count: 1 },
  ],
  agg: [
    { country: "US", region: "ca", city: "los angeles", count: 1 },
    { country: "AU", region: "vic", city: "melbourne", count: 1 },
  ],
};

const docAfter = {
  _id: { $oid: "6540f0237b938738dabe9558" },
  acc: [
    { country: "US", region: "ia", city: "cedar rapids", count: 1 },
    { country: "AU", region: "vic", city: "melbourne", count: 1 },
  ],
  agg: [
    { country: "US", region: "ca", city: "los angeles", count: 1 },
    { country: "AU", region: "vic", city: "melbourne", count: 1 },
  ],
  location: [
    { country: "US", region: "ia", city: "cedar rapids", count: 1 },
    { country: "US", region: "ca", city: "los angeles", count: 1 },
    { country: "AU", region: "vic", city: "melbourne", count: 2 },
  ],
};

//  Code related to the pipeline
const accExists = { $isArray: ["$acc"] };
const aggExists = { $isArray: ["$agg"] };

const accItemIsInInitialValue = {
  $eq: [
    {
      $size: {
        $filter: {
          input: "$$value",
          as: "valueItem",
          cond: {
            $and: [
              { $eq: ["$$this.country", "$$valueItem.country"] },
              { $eq: ["$$this.region", "$$valueItem.region"] },
              { $eq: ["$$this.city", "$$valueItem.city"] },
            ],
          },
        },
      },
    },
    1,
  ],
};

const sumNewItemCountToInitialValue = {
  $map: {
    input: "$$value",
    as: "valueItem",
    in: {
      $cond: {
        if: { $eq: ["$$this.country", "$$valueItem.country"] },
        then: {
          $mergeObjects: [
            "$$valueItem",
            { count: { $add: ["$$valueItem.count", "$$this.count"] } },
          ],
        },
        else: "$$valueItem",
      },
    },
  },
};

const includeNewItemInInitialValue = { $concatArrays: [["$$this"], "$$value"] };

const stage = {
  $addFields: {
    location: {
      $switch: {
        branches: [
          { case: { $and: [accExists, { $not: aggExists }] }, then: "$acc" },
          { case: { $and: [{ $not: accExists }, aggExists] }, then: "$agg" },
        ],
        default: {
          $reduce: {
            input: "$acc",
            initialValue: "$agg",
            in: {
              $cond: {
                if: accItemIsInInitialValue,
                then: sumNewItemCountToInitialValue,
                else: includeNewItemInInitialValue,
              },
            },
          },
        },
      },
    },
  },
};

Hello @Artur_57972, Thanks for your reply, it seems to be working well in production settings!

For those wondering the merge stage looks like the following. I just renamed acc to new and agg to old to have a better understanding of who is who.

{
            "$merge": {
                "into": "old",
                "on": "_id",
                "whenMatched": [
                    {
                        "$addFields": {
                            "count": {
                                "$add": [
                                    "$count",
                                    "$$new.count"
                                ]
                            },
                            "location": {
                                "$switch": {
                                    "branches": [
                                        {"case": {"$and": [newExists, {"$not": oldExists}]}, "then": "$$new.location"},
                                        {"case": {"$and": [{"$not": newExists}, oldExists]}, "then": "$location"},
                                    ],
                                    "default": {
                                        "$reduce": {
                                            "input": "$$new.location",
                                            "initialValue": "$location",
                                            "in": {
                                                "$cond": {
                                                    "if": newItemIsInInitialValue,
                                                    "then": sumNewItemCountToInitialValue,
                                                    "else": includeNewItemInInitialValue,
                                                },
                                            },
                                        },
                                    },
                                },
                            },
                        }
                    }
                ],
                "whenNotMatched": "insert"
            }
        }

With the following changes to 2 constant:

    const newExists = {"$isArray": ["$$new.location"]}

    const oldExists = {"$isArray": ["$location"]}

While waiting for the reply I also managed to find another way to do the same things in a less elegant and more bruteforce way with the following stages:

[{
            "$lookup": {
                "from": "old",
                "localField": "_id",
                "foreignField": "_id",
                "as": "existing"
            }
        },
        {
            "$unwind": "$location"
        },
        {
            "$unwind": {
                "path": "$existing",
                "preserveNullAndEmptyArrays": True
            }
        },
        {
            "$unwind": {
                "path": "$existing.location",
                "preserveNullAndEmptyArrays": True
            }
        },
        {
            "$match": {
                "$or": [
                    {
                        "$expr": {
                            "$and": [
                                {
                                    "$eq": [
                                        "$existing.location.city",
                                        "$location.city"
                                    ]
                                },
                                {
                                    "$eq": [
                                        "$existing.location.region",
                                        "$location.region"
                                    ]
                                },
                                {
                                    "$eq": [
                                        "$existing.location.country",
                                        "$location.country"
                                    ]
                                }
                            ]
                        }
                    },
                    {
                        "existing": {
                            "$exists": False
                        }
                    }
                ]
            }
        },
        {
            "$group": {
                "_id": {
                    "mainId": "$_id",
                    "country": "$location.country",
                    "region": "$location.region",
                    "city": "$location.city"
                },
                "count": {
                    "$sum": "$location.count"
                },
                "existingCount": {
                    "$sum": {
                        "$ifNull": [
                            "$existing.location.count",
                            0
                        ]
                    }
                }
            }
        },
        {
            "$group": {
                "_id": "$_id.mainId",
                "count": {
                    "$sum": {
                        "$add": [
                            "$count",
                            "$existingCount"
                        ]
                    }
                },
                "location": {
                    "$push": {
                        "country": "$_id.country",
                        "region": "$_id.region",
                        "city": "$_id.city",
                        "count": {
                            "$sum": {
                                "$add": [
                                    "$count",
                                    "$existingCount"
                                ]
                            }
                        }
                    }
                }
            }
        },
        {
            "$merge": {
                "into": "old",
                "on": "_id",
                "whenMatched": "merge",
                "whenNotMatched": "insert"
            }
        }]

I did have the time to compare performance between the 2, but I suspect that your way should be faster, or at least less memory intensive.

Thanks a lot again for your reply !

1 Like

This topic was automatically closed 5 days after the last reply. New replies are no longer allowed.