Indexes not used when matching on a field in the $lookup collection

Kyle_Barnes1 · January 19, 2024, 11:19pm

Here is my simple database:

// ----------------------------
// Collection structure for jobs
// ----------------------------
// Drop and recreate the collection so the script can be re-run from scratch.
db.getCollection("jobs").drop();
db.createCollection("jobs");
// Ascending index on the field that holds the referenced resource's _id.
db.getCollection("jobs").createIndex({ resource: 1 }, { name: "resource_1" });

// ----------------------------
// Documents of jobs
// ----------------------------
// insertMany() replaces the deprecated insert() shell helper; one call loads
// all three jobs, each pointing at exactly one resource by ObjectId.
db.getCollection("jobs").insertMany([
    {
        _id: ObjectId("65aafea215315402290b215d"),
        name: "Job1",
        resource: ObjectId("65aafec815315402290b2160")
    },
    {
        _id: ObjectId("65aafea215315402290b215e"),
        name: "Job2",
        resource: ObjectId("65aafec815315402290b2161")
    },
    {
        _id: ObjectId("65aafea215315402290b215f"),
        name: "Job3",
        resource: ObjectId("65aafec815315402290b2162")
    }
]);

// ----------------------------
// Collection structure for resources
// ----------------------------
// Drop and recreate the collection so the script can be re-run from scratch.
db.getCollection("resources").drop();
db.createCollection("resources");
// Ascending index on the resource name (the field filtered on in the $lookup).
db.getCollection("resources").createIndex({ name: 1 }, { name: "name_1" });

// ----------------------------
// Documents of resources
// ----------------------------
// The _id values below are the ones referenced by jobs.resource.
db.getCollection("resources").insertMany([
    {
        _id: ObjectId("65aafec815315402290b2160"),
        name: "Resource1"
    },
    {
        _id: ObjectId("65aafec815315402290b2161"),
        name: "Resource2"
    },
    {
        _id: ObjectId("65aafec815315402290b2162"),
        name: "Resource3"
    }
]);

Now I would like to execute the following aggregation framework query:

// Join every job to its resource document, keeping only resources whose
// name is in the given list.
db.jobs.aggregate([
    {
        $lookup: {
            from: "resources",
            // Expose the job's resource reference to the sub-pipeline as $$resourceVar.
            let: { resourceVar: "$resource" },
            pipeline: [
                {
                    $match: {
                        // Correlate on the resource's _id ...
                        $expr: { $eq: ["$_id", "$$resourceVar"] },
                        // ... and additionally filter on the resource name.
                        name: { $in: ["Resource1"] }
                    }
                }
            ],
            // The matched documents overwrite the job's original `resource` field.
            as: "resource"
        }
    },
    // Emit one document per matched resource; jobs whose match array is empty are dropped.
    { $unwind: "$resource" }
])

Everything works this way, but when executing an explain I am always getting a COLLSCAN:

{
    "explainVersion": "1",
    "stages": [
        {
            "$cursor": {
                "queryPlanner": {
                    "namespace": "practice.jobs",
                    "indexFilterSet": false,
                    "parsedQuery": { },
                    "queryHash": "17830885",
                    "planCacheKey": "17830885",
                    "maxIndexedOrSolutionsReached": false,
                    "maxIndexedAndSolutionsReached": false,
                    "maxScansToExplodeReached": false,
                    "winningPlan": {
                        "stage": "COLLSCAN",
                        "direction": "forward"
                    },
                    "rejectedPlans": [ ]
                },
                "executionStats": {
                    "executionSuccess": true,
                    "nReturned": 3,
                    "executionTimeMillis": 6,
                    "totalKeysExamined": 0,
                    "totalDocsExamined": 3,
                    "executionStages": {
                        "stage": "COLLSCAN",
                        "nReturned": 3,
                        "executionTimeMillisEstimate": 0,
                        "works": 5,
                        "advanced": 3,
                        "needTime": 1,
                        "needYield": 0,
                        "saveState": 1,
                        "restoreState": 1,
                        "isEOF": 1,
                        "direction": "forward",
                        "docsExamined": 3
                    },
                    "allPlansExecution": [ ]
                }
            },
            "nReturned": {
                "$numberLong": "3"
            },
            "executionTimeMillisEstimate": {
                "$numberLong": "0"
            }
        },
        {
            "$lookup": {
                "from": "resources",
                "as": "resource",
                "let": {
                    "resourceVar": "$resource"
                },
                "pipeline": [
                    {
                        "$match": {
                            "$expr": {
                                "$eq": [
                                    "$_id",
                                    "$$resourceVar"
                                ]
                            },
                            "name": {
                                "$in": [
                                    "Resource1"
                                ]
                            }
                        }
                    }
                ],
                "unwinding": {
                    "preserveNullAndEmptyArrays": false
                }
            },
            "totalDocsExamined": {
                "$numberLong": "2"
            },
            "totalKeysExamined": {
                "$numberLong": "2"
            },
            "collectionScans": {
                "$numberLong": "0"
            },
            "indexesUsed": [
                "_id_"
            ],
            "nReturned": {
                "$numberLong": "1"
            },
            "executionTimeMillisEstimate": {
                "$numberLong": "4"
            }
        }
    ],
    "serverInfo": {
        "host": "e043d4b1d9df",
        "port": 27017,
        "version": "6.0.4",
        "gitVersion": "44ff59461c1353638a71e710f385a566bcd2f547"
    },
    "serverParameters": {
        "internalQueryFacetBufferSizeBytes": 104857600,
        "internalQueryFacetMaxOutputDocSizeBytes": 104857600,
        "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
        "internalDocumentSourceGroupMaxMemoryBytes": 104857600,
        "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
        "internalQueryProhibitBlockingMergeOnMongoS": 0,
        "internalQueryMaxAddToSetBytes": 104857600,
        "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
    },
    "command": {
        "aggregate": "jobs",
        "pipeline": [
            {
                "$lookup": {
                    "from": "resources",
                    "let": {
                        "resourceVar": "$resource"
                    },
                    "pipeline": [
                        {
                            "$match": {
                                "$expr": {
                                    "$eq": [
                                        "$_id",
                                        "$$resourceVar"
                                    ]
                                },
                                "name": {
                                    "$in": [
                                        "Resource1"
                                    ]
                                }
                            }
                        }
                    ],
                    "as": "resource"
                }
            },
            {
                "$unwind": "$resource"
            }
        ],
        "cursor": { },
        "$db": "practice"
    },
    "ok": 1
}

Is there any way to write the aggregation framework query such that it will honor the index?

steevej · January 20, 2024, 2:19pm

With so few documents in your collections, I think the optimizer simply determines that a COLLSCAN is optimal.

Rather than

Kyle_Barnes1:

"let": {
                resourceVar: "$resource"
            },
            "pipeline": [
                {
                    "$match": {
                        $expr: {
                            $eq: ["$_id", "$$resourceVar"]
                        },

You might be better off using

localField : "resource" ,
foreignField : "_id"

Kyle_Barnes1 · January 21, 2024, 5:13am

Thanks Steeve for the reply. I used a simple example, but my actual use case is for 100K+ documents in both the Jobs and Resources collections. When I run explain in those cases, I always get a COLLSCAN as well.

steevej · January 21, 2024, 3:17pm

Have you tried my suggestion

to see if you get different results in your real collections?

One other thing to note is that you do not $match on jobs, so the topmost stage of the aggregation will always be a COLLSCAN because you aggregate over the whole jobs collection. Also, the $lookup, even with the let:, seems to use the _id index of resources, as indicated by "indexesUsed": ["_id_"] and "collectionScans": 0 in the $lookup stage of your explain output.

It looks like it is simply a misunderstanding of the explain() output.

chris · January 21, 2024, 7:38pm

If you can modify the schema and embed some or all of the resources, then a query on the jobs collection will be orders of magnitude faster (35ms on my Atlas M0).

But using your existing schema and generating a dataset with mgodatagen for 100k jobs and 100k resources:

mgodatagen setup

mgodatagen --seed=1705859631 --file=lookup-mgo.json --uri="mongodb://....."

lookup-mgo.json

[
  {
    "database": "lookup",
    "collection": "resources",
    "count": 100000,
    "content": {
      "_id": {
        "type": "reference",
        "id": 0,
        "refContent": {
          "type": "objectId"
        }
      },
      "name": {
        "type": "stringFromParts",
        "parts": [
          {
            "type": "constant",
            "constVal": "Resource"
          },
          {
            "type": "int",
            "min": 0,
            "max": 3
          }
        ]
      }
    }
  },
  {
    "database": "lookup",
    "collection": "jobs",
    "count": 100000,
    "content": {
      "job": {
        "type": "stringFromParts",
        "parts": [
          {
            "type": "constant",
            "constVal": "Job"
          },{
            "type":"autoincrement",
            "autoType": "int",
            "start": 0
          }
        ]
      },
      "resource": {
        "type": "ref",
        "id": 0
      }
    }
  }
]

Much better performance (2s vs 12s on an Atlas M0) via looking up jobs from the resources collection:

// Pipeline that starts from the resources side: filter resources by name
// first, then pull in the jobs that reference each remaining resource.
[
  // Keep only the resources whose name is in the list.
  {
    $match: {
      name: {
        $in: ["Resource1"],
      },
    },
  },
  // Nest the whole resource document under a `resource` field.
  {
    $replaceWith: {
      resource: "$$ROOT",
    },
  },
  // Attach, as the `jobs` array, every job whose `resource` equals this resource's _id.
  {
    $lookup: {
      from: "jobs",
      localField: "resource._id",
      foreignField: "resource",
      as: "jobs",
    },
  },
  // Emit one document per job.
  {
    $unwind: {
      path: "$jobs",
    },
  },
  // Promote the job to the top level; the later object in $mergeObjects wins,
  // so the job's `resource` reference is replaced by the full resource document.
  {
    $replaceRoot: {
      newRoot: {
        $mergeObjects: [
          "$jobs",
          {
            resource: "$resource",
          },
        ],
      },
    },
  },
]

Kyle_Barnes1 · January 22, 2024, 7:34pm

Thanks @steevej for the suggestion. I tried using localField and foreignField but I’m seeing the same result. Thanks for pointing out that the _id index was used. I was hoping that the name_1 index on resources would be used, but maybe that’s the result of there being too few documents.

Kyle_Barnes1 · January 22, 2024, 7:36pm

Thanks for the suggestion @chris. If I could nest the resources that would work great, but each resource will have many jobs, so I would end up duplicating the resource many times.

steevej · January 25, 2024, 1:24pm

It cannot, since the $lookup matches on _id and therefore uses the _id:1 index. Perhaps a compound index on _id:1,name:1 could be more efficient.