Aggregation lookup stage does not use sparse index

When performing an aggregation query with a lookup stage, it does not seem to be using a sparse index defined on the field if we are using the pipeline syntax.
This causes performance of the $lookup step to become extremely bad for lookups with a lot of data.
This only occurs on more recent versions like 5.0.23 and not on version 5.0.5.

An example pipeline can be found below. In this case, there exists a sparse index on collection1 for the field foreignId. If we change the index to a non-sparse index, the aggregation will use the index.
If we just use the localField/foreignField operators, it will also use the index.
Is this a regression or an intentional change? I could not find any more information in any release notes.

[
  {
    $lookup: {
      from: "collection1",
      let: {
        foreignId: "$_id",
      },
      pipeline: [
        {
          $match: {
            $expr: {
              $eq: [
                "$foreignId",
                "$$foreignId",
              ],
            },
          },
        },
      ],
      as: "data",
    },
  },
]
1 Like

Server ticket https://jira.mongodb.org/browse/SERVER-38799 shows that $expr never was able to use partial indexes and I suspect sparse indexes also. Are you sure you’re seeing a regression and it worked differently on version 5.0.5?

You can test it by just running a find with explain with $expr:{$eq:[]} as the predicate and seeing what index is being used (if any).

Asya

I am quite sure the behavior has changed between 5.0.5 and 5.0.23 because we were seeing different behavior in development and production, which were running different versions.

If it is unsupported behavior, we will probably just stop using sparse indexes in $lookup related contexts.
For testing purposes, I have added the explain values on both versions.
Below you can find the results in which you can see, one uses the index, the other does not.

Thanks

{
  from: "assignments",
  let: { presentationId: "$_id" },
  pipeline: [
    {
      $match: {
        $expr: {
          $eq: [
            "$presentationId",
            "$$presentationId",
          ],
        },
      },
    },
  ],
  as: "assignments",
}

with sparse index present on presentationId: 1 for the assignments collection.

5.0.5

{
  "explainVersion": "1",
  "stages": [
    {
      "$cursor": {
        "queryPlanner": {
          "namespace": "meteor.presentations",
          "indexFilterSet": false,
          "parsedQuery": {},
          "queryHash": "8B3D4AB8",
          "planCacheKey": "D542626C",
          "maxIndexedOrSolutionsReached": false,
          "maxIndexedAndSolutionsReached": false,
          "maxScansToExplodeReached": false,
          "winningPlan": {
            "stage": "COLLSCAN",
            "direction": "forward"
          },
          "rejectedPlans": []
        },
        "executionStats": {
          "executionSuccess": true,
          "nReturned": 11,
          "executionTimeMillis": 1,
          "totalKeysExamined": 0,
          "totalDocsExamined": 11,
          "executionStages": {
            "stage": "COLLSCAN",
            "nReturned": 11,
            "executionTimeMillisEstimate": 0,
            "works": 13,
            "advanced": 11,
            "needTime": 1,
            "needYield": 0,
            "saveState": 1,
            "restoreState": 1,
            "isEOF": 1,
            "direction": "forward",
            "docsExamined": 11
          },
          "allPlansExecution": []
        }
      },
      "nReturned": 11,
      "executionTimeMillisEstimate": 0
    },
    {
      "$lookup": {
        "from": "assignments",
        "as": "assignments",
        "let": {
          "presentationId": "$_id"
        },
        "pipeline": [
          {
            "$match": {
              "$expr": {
                "$eq": [
                  "$presentationId",
                  "$$presentationId"
                ]
              }
            }
          }
        ]
      },
      "totalDocsExamined": 3,
      "totalKeysExamined": 3,
      "collectionScans": 0,
      "indexesUsed": [
        "presentationId_1"
      ],
      "nReturned": 11,
      "executionTimeMillisEstimate": 1
    }
  ],
  "serverInfo": {
    "host": "mongodb",
    "port": 27017,
    "version": "5.0.5",
    "gitVersion": "d65fd89df3fc039b5c55933c0f71d647a54510ae"
  },
  "serverParameters": {
    "internalQueryFacetBufferSizeBytes": 104857600,
    "internalQueryFacetMaxOutputDocSizeBytes": 104857600,
    "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
    "internalDocumentSourceGroupMaxMemoryBytes": 104857600,
    "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
    "internalQueryProhibitBlockingMergeOnMongoS": 0,
    "internalQueryMaxAddToSetBytes": 104857600,
    "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
  },
  "command": {
    "aggregate": "presentations",
    "pipeline": [
      {
        "$lookup": {
          "from": "assignments",
          "let": {
            "presentationId": "$_id"
          },
          "pipeline": [
            {
              "$match": {
                "$expr": {
                  "$eq": [
                    "$presentationId",
                    "$$presentationId"
                  ]
                }
              }
            }
          ],
          "as": "assignments"
        }
      }
    ],
    "allowDiskUse": true,
    "cursor": {},
    "maxTimeMS": 60000,
    "$db": "meteor"
  },
  "ok": 1,
  "$clusterTime": {
    "clusterTime": {
      "$timestamp": {
        "t": 1704307112,
        "i": 1
      }
    },
    "signature": {
      "hash": {
        "$binary": {
          "base64": "AAAAAAAAAAAAAAAAAAAAAAAAAAA=",
          "subType": "00"
        }
      },
      "keyId": 0
    }
  },
  "operationTime": {
    "$timestamp": {
      "t": 1704307112,
      "i": 1
    }
  }
}

5.0.23

{
  "explainVersion": "1",
  "stages": [
    {
      "$cursor": {
        "queryPlanner": {
          "namespace": "meteor.presentations",
          "indexFilterSet": false,
          "parsedQuery": {},
          "queryHash": "8B3D4AB8",
          "planCacheKey": "D542626C",
          "maxIndexedOrSolutionsReached": false,
          "maxIndexedAndSolutionsReached": false,
          "maxScansToExplodeReached": false,
          "winningPlan": {
            "stage": "COLLSCAN",
            "direction": "forward"
          },
          "rejectedPlans": []
        },
        "executionStats": {
          "executionSuccess": true,
          "nReturned": 11,
          "executionTimeMillis": 1,
          "totalKeysExamined": 0,
          "totalDocsExamined": 11,
          "executionStages": {
            "stage": "COLLSCAN",
            "nReturned": 11,
            "executionTimeMillisEstimate": 0,
            "works": 13,
            "advanced": 11,
            "needTime": 1,
            "needYield": 0,
            "saveState": 1,
            "restoreState": 1,
            "isEOF": 1,
            "direction": "forward",
            "docsExamined": 11
          },
          "allPlansExecution": []
        }
      },
      "nReturned": 11,
      "executionTimeMillisEstimate": 0
    },
    {
      "$lookup": {
        "from": "assignments",
        "as": "assignments",
        "let": {
          "presentationId": "$_id"
        },
        "pipeline": [
          {
            "$match": {
              "$expr": {
                "$eq": [
                  "$presentationId",
                  "$$presentationId"
                ]
              }
            }
          }
        ]
      },
      "totalDocsExamined": 55,
      "totalKeysExamined": 0,
      "collectionScans": 22,
      "indexesUsed": [],
      "nReturned": 11,
      "executionTimeMillisEstimate": 1
    }
  ],
  "serverInfo": {
    "host": "mongodb",
    "port": 27017,
    "version": "5.0.23",
    "gitVersion": "3367195a14d0ba2734d2ba2719294fb974ad0834"
  },
  "serverParameters": {
    "internalQueryFacetBufferSizeBytes": 104857600,
    "internalQueryFacetMaxOutputDocSizeBytes": 104857600,
    "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
    "internalDocumentSourceGroupMaxMemoryBytes": 104857600,
    "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
    "internalQueryProhibitBlockingMergeOnMongoS": 0,
    "internalQueryMaxAddToSetBytes": 104857600,
    "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
  },
  "command": {
    "aggregate": "presentations",
    "pipeline": [
      {
        "$lookup": {
          "from": "assignments",
          "let": {
            "presentationId": "$_id"
          },
          "pipeline": [
            {
              "$match": {
                "$expr": {
                  "$eq": [
                    "$presentationId",
                    "$$presentationId"
                  ]
                }
              }
            }
          ],
          "as": "assignments"
        }
      }
    ],
    "allowDiskUse": true,
    "cursor": {},
    "maxTimeMS": 60000,
    "$db": "meteor"
  },
  "ok": 1,
  "$clusterTime": {
    "clusterTime": {
      "$timestamp": {
        "t": 1704307205,
        "i": 1
      }
    },
    "signature": {
      "hash": {
        "$binary": {
          "base64": "AAAAAAAAAAAAAAAAAAAAAAAAAAA=",
          "subType": "00"
        }
      },
      "keyId": 0
    }
  },
  "operationTime": {
    "$timestamp": {
      "t": 1704307205,
      "i": 1
    }
  }
}

I’ve been trying to track down any backported fix that might have affected this and couldn’t find anything. However, I do have a question - is there a reason that $expr is used here instead of just localField/foreignField which does a regular $eq semantics query?

The index rules are different for the two and I’m wondering whether you see the same behavior in that scenario (if you still have the environment available to test).

Asya