Speed Up Aggregation Pipeline

Hey guys,

We have mitigated to MongoDB recently and I`m trying to create a pipeline for grouping the matching data.
Our cluster has 1.1m documents.

Here`s our query:

db.getCollection(“xyx_data”).aggregate([{
   "$match":{
      "name":{
         "$ne":""
      },
      "time":{
         "$gte":1630420570.277,
         "$lte":1633012570.277
      },
      "$and":[
         {
            "$text":{
               "$search":"IL"
            }
         },
         {
            "country":{
               "$regex":"IL",
               "$options":"gi"
            }
         }
      ]
   }
},
{
   "$group":{
      "_id":"$name",
      "time":{
         "$first":"$time"
      }
   }
},
{
   "$sort":{
      "time":-1
   }
},
{
   "$limit":10
}])

The query takes huge amount of time to complete and the explain returned from the query telling us that most of the time was taken by the FETCH stage.
Here`s the explain:

{
   "explainVersion":“1”,
   "stages":[
      {
         "$cursor":{
            "queryPlanner":{
               "namespace":"xxx_data",
               "indexFilterSet":false,
               "parsedQuery":{
                  "$and":[
                     {
                        "time":{
                           "$lte":1633012570.277
                        }
                     },
                     {
                        "time":{
                           "$gte":1630420570.277
                        }
                     },
                     {
                        "country":"/IL/gi"
                     },
                     {
                        "name":{
                           "$not":{
                              "$eq":""
                           }
                        }
                     },
                     {
                        "$text":{
                           "$search":"IL",
                           "$language":"english",
                           "$caseSensitive":false,
                           "$diacriticSensitive":false
                        }
                     }
                  ]
               },
               "queryHash":“A9EBD8B1”,
               "planCacheKey":“A592F28E”,
               "maxIndexedOrSolutionsReached":false,
               "maxIndexedAndSolutionsReached":false,
               "maxScansToExplodeReached":false,
               "winningPlan":{
                  "stage":"PROJECTION_SIMPLE",
                  "transformBy":{
                     "_id":true,
                     "name":true
                  },
                  "inputStage":{
                     "stage":"FETCH",
                     "filter":{
                        "$and":[
                           {
                              "name":{
                                 "$not":{
                                    "$eq":""
                                 }
                              }
                           },
                           {
                              "country":"/IL/gi"
                           }
                        ]
                     },
                     "inputStage":{
                        "stage":"TEXT_MATCH",
                        "indexPrefix":{
                           
                        },
                        "indexName":"index1",
                        "parsedTextQuery":{
                           "terms":[
                              "il"
                           ],
                           "negatedTerms":[
                              
                           ],
                           "phrases":[
                              
                           ],
                           "negatedPhrases":[
                              
                           ]
                        },
                        "textIndexVersion":3,
                        "inputStage":{
                           "stage":"FETCH",
                           "inputStage":{
                              "stage":"IXSCAN",
                              "filter":{
                                 "$and":[
                                    {
                                       "time":{
                                          "$lte":1633012570.277
                                       }
                                    },
                                    {
                                       "time":{
                                          "$gte":1630420570.277
                                       }
                                    }
                                 ]
                              },
                              "keyPattern":{
                                 "_fts":"text",
                                 "_ftsx":1,
                                 "name":1,
                                 "time":-1
                              },
                              "indexName":"index1",
                              "isMultiKey":true,
                              "isUnique":false,
                              "isSparse":false,
                              "isPartial":false,
                              "indexVersion":2,
                              "direction":"backward",
                              "indexBounds":{
                                 
                              }
                           }
                        }
                     }
                  }
               },
               "rejectedPlans":[
                  
               ]
            },
            "executionStats":{
               "executionSuccess":true,
               "nReturned":1110137,
               "executionTimeMillis":1233785,
               "totalKeysExamined":1110824,
               "totalDocsExamined":2221648,
               "executionStages":{
                  "stage":"PROJECTION_SIMPLE",
                  "nReturned":1110137,
                  "executionTimeMillisEstimate":1153779,
                  "works":1110825,
                  "advanced":1110137,
                  "needTime":687,
                  "needYield":0,
                  "saveState":65665,
                  "restoreState":65665,
                  "isEOF":1,
                  "transformBy":{
                     "_id":true,
                     "name":true
                  },
                  "inputStage":{
                     "stage":"FETCH",
                     "filter":{
                        "$and":[
                           {
                              "name":{
                                 "$not":{
                                    "$eq":""
                                 }
                              }
                           },
                           {
                              "country":"/IL/gi"
                           }
                        ]
                     },
                     "nReturned":1110137,
                     "executionTimeMillisEstimate":1153200,
                     "works":1110825,
                     "advanced":1110137,
                     "needTime":687,
                     "needYield":0,
                     "saveState":65665,
                     "restoreState":65665,
                     "isEOF":1,
                     "docsExamined":1110824,
                     "alreadyHasObj":1110824,
                     "inputStage":{
                        "stage":"TEXT_MATCH",
                        "nReturned":1110824,
                        "executionTimeMillisEstimate":1150747,
                        "works":1110825,
                        "advanced":1110824,
                        "needTime":0,
                        "needYield":0,
                        "saveState":65665,
                        "restoreState":65665,
                        "isEOF":1,
                        "indexPrefix":{
                           
                        },
                        "indexName":"index1",
                        "parsedTextQuery":{
                           "terms":[
                              "il"
                           ],
                           "negatedTerms":[
                              
                           ],
                           "phrases":[
                              
                           ],
                           "negatedPhrases":[
                              
                           ]
                        },
                        "textIndexVersion":3,
                        "docsRejected":0,
                        "inputStage":{
                           "stage":"FETCH",
                           "nReturned":1110824,
                           "executionTimeMillisEstimate":1150533,
                           "works":1110825,
                           "advanced":1110824,
                           "needTime":0,
                           "needYield":0,
                           "saveState":65665,
                           "restoreState":65665,
                           "isEOF":1,
                           "docsExamined":1110824,
                           "alreadyHasObj":0,
                           "inputStage":{
                              "stage":"IXSCAN",
                              "filter":{
                                 "$and":[
                                    {
                                       "time":{
                                          "$lte":1633012570.277
                                       }
                                    },
                                    {
                                       "time":{
                                          "$gte":1630420570.277
                                       }
                                    }
                                 ]
                              },
                              "nReturned":1110824,
                              "executionTimeMillisEstimate":6914,
                              "works":1110825,
                              "advanced":1110824,
                              "needTime":0,
                              "needYield":0,
                              "saveState":65665,
                              "restoreState":65665,
                              "isEOF":1,
                              "keyPattern":{
                                 "_fts":"text",
                                 "_ftsx":1,
                                 "name":1,
                                 "time":-1
                              },
                              "indexName":"index1",
                              "isMultiKey":true,
                              "isUnique":false,
                              "isSparse":false,
                              "isPartial":false,
                              "indexVersion":2,
                              "direction":"backward",
                              "indexBounds":{
                                 
                              },
                              "keysExamined":1110824,
                              "seeks":1,
                              "dupsTested":1110824,
                              "dupsDropped":0,
                              "indexDef":{
                                 "indexName":"index1",
                                 "isMultiKey":true,
                                 "keyPattern":{
                                    "_fts":"text",
                                    "_ftsx":1,
                                    "name":1,
                                    "time":-1
                                 },
                                 "isUnique":false,
                                 "isSparse":false,
                                 "isPartial":false,
                                 "direction":"backward"
                              }
                           }
                        },
                        "indexDef":{
                           "indexName":"index1"
                        }
                     }
                  }
               }
            }
         },
         "nReturned":1110137,
         "executionTimeMillisEstimate":1233153
      },
      {
         "$group":{
            "_id":"$name"
         },
         "maxAccumulatorMemoryUsageBytes":{
            
         },
         "totalOutputDataSizeBytes":20285818,
         "usedDisk":false,
         "nReturned":73234,
         "executionTimeMillisEstimate":1233762
      },
      {
         "$sort":{
            "sortKey":{
               "time":-1
            },
            "limit":10
         },
         "totalDataSizeSortedBytesEstimate":2930,
         "usedDisk":false,
         "nReturned":10,
         "executionTimeMillisEstimate":1233769
      }
   ],
   "serverInfo":{
      "host":"localhost",
      "port":27017,
      "version":"5.0.3",
      "gitVersion":"657fea5a61a74d7a79df7aff8e4bcf0bc742b748"
   },
   "serverParameters":{
      "internalQueryFacetBufferSizeBytes":104857600,
      "internalQueryFacetMaxOutputDocSizeBytes":104857600,
      "internalLookupStageIntermediateDocumentMaxSizeBytes":104857600,
      "internalDocumentSourceGroupMaxMemoryBytes":104857600,
      "internalQueryMaxBlockingSortMemoryUsageBytes":104857600,
      "internalQueryProhibitBlockingMergeOnMongoS":0,
      "internalQueryMaxAddToSetBytes":104857600,
      "internalDocumentSourceSetWindowFieldsMaxMemoryBytes":104857600
   },
   "command":{
      "aggregate":"xxx_data",
      "pipeline":[
         {
            "$match":{
               "name":{
                  "$ne":""
               },
               "time":{
                  "$gte":1630420570.277,
                  "$lte":1633012570.277
               },
               "$and":[
                  {
                     "$text":{
                        "$search":"IL"
                     }
                  },
                  {
                     "country":"/IL/gi"
                  }
               ]
            }
         },
         {
            "$project":{
               "name":1
            }
         },
         {
            "$group":{
               "_id":"$name"
            }
         },
         {
            "$sort":{
               "time":-1
            }
         },
         {
            "$limit":10
         }
      ],
      "cursor":{
         
      },
      "$db":"db"
   },
   "ok":1,
   "$clusterTime":{
      "clusterTime":Timestamp(1633355452,
      1),
      "signature":{
         "hash":BinData(0,
         "R4d0Xu0c/CHiGxgh0NTp6s/C9Ek="")",
         "keyId":"NumberLong(""7009951518350639108"")"
      }
   },
   "operationTime":Timestamp(1633355452,
   1)
}

Visualization: https://i.imgur.com/GzMrxMk.png

I`ll be glad if someone can help me reduce the query time.
Thank you.

Hi @ASL4421,

Please format the code blocks in your initial correctly so a human can read this. This forum supports standard Markdown code blocks so it’s easy to get something formatted correctly and this will motivate more people to engage & try to solve the problem.

Can you please share with me the definition of the $text index you are using ?

It looks like to me that you are repeating yourself here:

{
  "$text": {
    "$search": "IL"
  }
},
{
  "country": {
    "$regex": "IL",
    "$options": "gi"
  }
}

I’m wondering if this $text search is absolutely necessary here.

Also, I’m wondering if you need a $regex to query for the country. Looks like you are looking for il or IL which could be resolved with a simple $in. By the way the option g doesn’t exist in the doc so I’m not sure what this is.

From what I’m seeing in your winning plan, you are just using an index on the time field which is definitely not optimal for this query.

As I think the $match stage can be improved a bit, based on what I said earlier, I can’t recommend the best possible index at the moment.

But I think an index like {name:1, time:1, country:1} would already be a lot better.

Moreover, I think your aggregation isn’t correct as you are not sorting your input before executing the $first operator in the $group. This means that you are retrieving a random time for each name basically.

It would be a LOT more efficient to add a {$sort: {name:1, time:1}} stage after the $match stage because it would make your aggregation predictable (always returns the same answer based on the sames documents) and it would allow you to use this optimization to return the first document of each group. As you are only using $first, this would be really fast.

If we can simplify the $text and/or the $regex and get the right index, I’m pretty sure this query should be below the second.

Cheers,
Maxime.

Hey @MaBeuLux88,

Thank you for your comment and sorry for the markdown problem (I cannot find the option to edit the post).
Anyway Ive followed your tips and advises and the query is running much much faster, We already had the index you mentioned but changing the MATCH stage and adding a sort before the group did the trick. now the only thing is that the group stage return an unsorted list of documents. Im trying to sort it by creating a sort stage right after the group stage but it has a huge impact on the performance (without sort it takes less than 1 sec, with sort it takes more than 50).

Here`s the query code:

[{
    $match: {

        "name": {
            "$ne": ""
        },
        "time": {
            "$gte": 1630420570.277,
            "$lte": 1633012570.277
        },

        "country": {
            $in: ["IL"]
        }

    }
}, {
    $sort: {
        name: -1,
        time: -1
    }
}, {
    $group: {
        "_id": "$name",
        "ID": {
            "$first": "$ID"
        },
        "name": {
            "$first": "$name"
        },
        "time": {
            "$first": "$time"
        }
    }
}, {
    $sort: {
        time: -1
    }
}, {
    $limit: 10
}]

Thank you.

The edit button is the small pen at the bottom of your post.

If you add more fields in the $group with $first, I think they also need to be added in the $sort before the $group and in the index that is backing the query.

Please compare the 2 winning plans with {explain:true}.

I’m still referring to this: https://docs.mongodb.com/manual/reference/operator/aggregation/group/#optimization-to-return-the-first-document-of-each-group.

There is not much you can do for the final $sort. There is no way to use an index after a $group so… If you want to speed it up, put more filters in the $match or faster hardware I guess :confused:.

How many items are you getting in the output? The final sort needs to happen entirely in memory.

Cheers,
Maxime.