Super slow aggregation when combining stages

Hi there!

I am struggling with an aggregation that runs super fast when separate stages but becomes super slow when combining them.

The aggregation is on an about 4MM documents collection.

db.getCollection('events').aggregate([
{
    "$addFields": {
      "6059ff5a2aa6a105ae85d7f1": {
        "$cond": [
          {
            "$and": [
              {
                "$eq": [
                  "$campaign_id",
                  ObjectId( "604baadfa1a21c0c6d03901f" )
                ]
              },
              {
                "$eq": [
                  "$event",
                  "processed"
                ]
              },
              {
                "$eq": [
                  "$channel",
                  "email"
                ]
              }
            ]
          },
          1,
          0
        ]
      },
      "6059ff5a2aa6a103a585d7f0": {
        "$cond": [
          {
            "$and": [
              {
                "$eq": [
                  "$campaign_id",
                  ObjectId( "604baadfa1a21c0c6d03901f" )
                ]
              },
              {
                "$eq": [
                  "$event",
                  "open"
                ]
              },
              {
                "$eq": [
                  "$channel",
                  "email"
                ]
              }
            ]
          },
          1,
          0
        ]
      }
    }
},
{
    "$group": {
      "_id": "$contact_id",
      "6059ff5a2aa6a105ae85d7f1": {
        "$sum": "$6059ff5a2aa6a105ae85d7f1"
      },
      "6059ff5a2aa6a103a585d7f0": {
        "$sum": "$6059ff5a2aa6a103a585d7f0"
      }
    }
  }
 ], 
{ "allowDiskUse": true })

If I run the $addFields alone it takes 1 sec and if I run the $group alone it takes 1 sec. But the combo takes 70 seconds (or more).

Fields are indexed.

When running the $addFields explain gives

        {
            "$cursor" : {
                "query" : {},
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "production.events",
                    "indexFilterSet" : false,
                    "parsedQuery" : {},
                    "queryHash" : "8B3D4AB8",
                    "planCacheKey" : "8B3D4AB8",
                    "winningPlan" : {
                        "stage" : "COLLSCAN",
                        "direction" : "forward"
                    },
                    "rejectedPlans" : []
                }
            }
        }, 

For the $group alone explain shows:

        {
            "$cursor" : {
                "query" : {},
                "fields" : {
                    "contact_id" : 1,
                    "_id" : 0
                },
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "production.events",
                    "indexFilterSet" : false,
                    "parsedQuery" : {},
                    "queryHash" : "14AB7FAF",
                    "planCacheKey" : "14AB7FAF",
                    "winningPlan" : {
                        "stage" : "PROJECTION_COVERED",
                        "transformBy" : {
                            "contact_id" : 1,
                            "_id" : 0
                        },
                        "inputStage" : {
                            "stage" : "DISTINCT_SCAN",
                            "keyPattern" : {
                                "contact_id" : 1
                            },
                            "indexName" : "contact_id_1",
                            "isMultiKey" : false,
                            "multiKeyPaths" : {
                                "contact_id" : []
                            },
                            "isUnique" : false,
                            "isSparse" : false,
                            "isPartial" : false,
                            "indexVersion" : 2,
                            "direction" : "forward",
                            "indexBounds" : {
                                "contact_id" : [ 
                                    "[MinKey, MaxKey]"
                                ]
                            }
                        }
                    },
                    "rejectedPlans" : []
                }
            }
        }, 
        {
            "$groupByDistinctScan" : {
                "newRoot" : {
                    "_id" : "$contact_id"
                }
            }
        }
    ],

for the combo explain is:

    "stages" : [ 
        {
            "$cursor" : {
                "query" : {},
                "fields" : {
                    "6059ff5a2aa6a103a585d7f0" : 1,
                    "6059ff5a2aa6a105ae85d7f1" : 1,
                    "campaign_id" : 1,
                    "channel" : 1,
                    "contact_id" : 1,
                    "event" : 1,
                    "_id" : 0
                },
                "queryPlanner" : {
                    "plannerVersion" : 1,
                    "namespace" : "production.events",
                    "indexFilterSet" : false,
                    "parsedQuery" : {},
                    "queryHash" : "8B3D4AB8",
                    "planCacheKey" : "8B3D4AB8",
                    "winningPlan" : {
                        "stage" : "COLLSCAN",
                        "direction" : "forward"
                    },
                    "rejectedPlans" : []
                }
            }
        }, 

Thoughts???

Hello @Admin_MlabsPages_mLa, welcome to the MongoDB Community forum!

I suggest use a $match stage as the first stage of the aggregation with the following conditions:

$match: { 
    "campaign_id": ObjectId("604baadfa1a21c0c6d03901f"), 
    "channel": "email" 
}

The match stage will benefit from a compound index on the two fields used above, campaign_id and channel.

Then use the following $group stage to complete the aggregation.

   {
      "$group":{
           "_id":"$contact_id",
           "6059ff5a2aa6a105ae85d7f1": {
               "$sum": { 
                   $cond: [ { $eq: [ "$event", "processed" ] }, 1, 0 ]
               }
           },
           "6059ff5a2aa6a103a585d7f0": {
               "$sum": { 
                   $cond: [ { $eq: [ "$event", "open" ] }, 1, 0 ]
               }
           }
      }
   }

The solution above make the query a little faster and gave me insigth to refactor my aggregate…

  db.getCollection('contacts').aggregate([
  {
    "$match": {
      "isDeleted": false,
      "tenant_id": ObjectId( "5ec2a723a73af34fd5964c93" ),
      "$or": [
        {
          "emails": {
            "$exists": true,
            "$not": {
              "$size": 0
            }
          }
        },
        {
          "cellphones": {
            "$exists": true,
            "$not": {
              "$size": 0
            }
          }
        }
      ]
    }
  },  
  { "$lookup": {
    "from": "events",
    "let": { "cId": "$_id" },
    "pipeline": [
      { "$match": 
                 { "$expr":
                    { "$and":
                       [          
                          { "$eq": [ "$contact_id", "$$cId"] },
                          { "$eq": [ "$channel", "email" ] },  
                          { "$eq": [ "$event", "open" ] },    
                          { "$eq": [ "$campaign_id", ObjectId( "60648747f78ba3fd5b00e8ba" ) ] }
                        ]                 
                    }}},
      {
        "$addFields": {
        "6075ecec3319af23fd597b0f": 1
      }}
    ],
    "as": "events"
  }},
  {
    "$unwind": {
      "path": "$events",
      "preserveNullAndEmptyArrays": true
  }},
{
    "$group": {
      "_id": "$_id",
      "emails": {
        "$first": "$emails"
      },
      "6075ecec3319af23fd597b0f": {
        "$sum": "$events.6075ecec3319af23fd597b0f"
      }
    }
}
], 
{ "allowDiskUse": true })

In this new query all goes fine until the $group stage… The $group is taking 50 seconds… Any idea on how to make it faster?

Hello @Admin_MlabsPages_mLa,

This looks like another (or different) aggregation query. I suggest you make another post with a properly formatted code (please use the code tags), sample input documents and the output from the explain (with “executionStats” mode) which is run with the aggregation query.