Hi there!
I am struggling with an aggregation that runs super fast when separate stages but becomes super slow when combining them.
The aggregation is on an about 4MM documents collection.
db.getCollection('events').aggregate([
{
"$addFields": {
"6059ff5a2aa6a105ae85d7f1": {
"$cond": [
{
"$and": [
{
"$eq": [
"$campaign_id",
ObjectId( "604baadfa1a21c0c6d03901f" )
]
},
{
"$eq": [
"$event",
"processed"
]
},
{
"$eq": [
"$channel",
"email"
]
}
]
},
1,
0
]
},
"6059ff5a2aa6a103a585d7f0": {
"$cond": [
{
"$and": [
{
"$eq": [
"$campaign_id",
ObjectId( "604baadfa1a21c0c6d03901f" )
]
},
{
"$eq": [
"$event",
"open"
]
},
{
"$eq": [
"$channel",
"email"
]
}
]
},
1,
0
]
}
}
},
{
"$group": {
"_id": "$contact_id",
"6059ff5a2aa6a105ae85d7f1": {
"$sum": "$6059ff5a2aa6a105ae85d7f1"
},
"6059ff5a2aa6a103a585d7f0": {
"$sum": "$6059ff5a2aa6a103a585d7f0"
}
}
}
],
{ "allowDiskUse": true })
If I run the $addFields alone it takes 1 sec and if I run the $group alone it takes 1 sec. But the combo takes 70 seconds (or more).
Fields are indexed.
When running the $addFields explain gives
{
"$cursor" : {
"query" : {},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "production.events",
"indexFilterSet" : false,
"parsedQuery" : {},
"queryHash" : "8B3D4AB8",
"planCacheKey" : "8B3D4AB8",
"winningPlan" : {
"stage" : "COLLSCAN",
"direction" : "forward"
},
"rejectedPlans" : []
}
}
},
For the $group alone explain shows:
{
"$cursor" : {
"query" : {},
"fields" : {
"contact_id" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "production.events",
"indexFilterSet" : false,
"parsedQuery" : {},
"queryHash" : "14AB7FAF",
"planCacheKey" : "14AB7FAF",
"winningPlan" : {
"stage" : "PROJECTION_COVERED",
"transformBy" : {
"contact_id" : 1,
"_id" : 0
},
"inputStage" : {
"stage" : "DISTINCT_SCAN",
"keyPattern" : {
"contact_id" : 1
},
"indexName" : "contact_id_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"contact_id" : []
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"contact_id" : [
"[MinKey, MaxKey]"
]
}
}
},
"rejectedPlans" : []
}
}
},
{
"$groupByDistinctScan" : {
"newRoot" : {
"_id" : "$contact_id"
}
}
}
],
for the combo explain is:
"stages" : [
{
"$cursor" : {
"query" : {},
"fields" : {
"6059ff5a2aa6a103a585d7f0" : 1,
"6059ff5a2aa6a105ae85d7f1" : 1,
"campaign_id" : 1,
"channel" : 1,
"contact_id" : 1,
"event" : 1,
"_id" : 0
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "production.events",
"indexFilterSet" : false,
"parsedQuery" : {},
"queryHash" : "8B3D4AB8",
"planCacheKey" : "8B3D4AB8",
"winningPlan" : {
"stage" : "COLLSCAN",
"direction" : "forward"
},
"rejectedPlans" : []
}
}
},
Thoughts???