Sorry for reviving an old thread, but we just ran into this problem on a MongoDB 4.4 replica set and after running some tests we came to a solution that seems somewhat silly? I was hoping to get some insight if possible. Let me know if you’d rather I opened a new thread altogether!
First some context:
We have a collection guide_progresses
with around ~2M documents, and the following indices:
[
{
"v" : 2,
"key" : {
"_id" : 1
},
"name" : "_id_"
},
{
"v" : 2,
"key" : {
"guide.slug" : 1,
"last_assignment.exercise.eid" : 1
},
"name" : "ExBibIdIndex"
},
{
"v" : 2,
"key" : {
"organization" : 1,
"course" : 1,
"student.uid" : 1
},
"name" : "organization_1_course_1_student.uid_1"
},
{
"v" : 2,
"key" : {
"_fts" : "text",
"_ftsx" : 1
},
"name" : "student.first_name_text_student.last_name_text_student.email_text",
"default_language" : "english",
"language_override" : "language",
"weights" : {
"student.email" : 1,
"student.first_name" : 1,
"student.last_name" : 1
},
"textIndexVersion" : 3
},
{
"v" : 2,
"key" : {
"organization" : 1,
"course" : 1,
"guide.slug" : 1,
"student.uid" : 1
},
"name" : "organization_1_course_1_guide.slug_1_student.uid_1"
}
]
Our application, as part of its regular work flow, runs the following query:
db.guide_progresses.aggregate([
{
"$match":
{
"organization": "wwwwwwwwwwwwww",
"course": "xxxxxxxxxxxxxx",
"guide.slug": "yyyyyyyyyyyyyy",
"detached":
{
"$exists": false
},
"$text":
{
"$search": "\"zzzzzzzzzzzzzz\""
}
}
},
{
"$sort":
{
"stats.passed": 1,
"stats.passed_with_warnings": 1,
"stats.failed": 1,
"student.last_name": 1,
"student.first_name": 1
}
},
{
"$project":
{
"_id": 0,
"assignments": 0,
"notifications": 0,
"guide._id": 0,
"student._id": 0,
"last_assignment._id": 0,
"last_assignment.guide._id": 0,
"last_assignment.exercise._id": 0,
"last_assignment.submission._id": 0
}
},
{
"$facet":
{
"results":
[
{
"$skip": 0
},
{
"$limit": 30
}
],
"total":
[
{
"$count": "count"
}
]
}
}
],
{
"allowDiskUse": true
})
Now, here comes the part I could use some insight with; upon asking for the query plan for the previous query, it returns the following:
{
"stages" : [
{
"$cursor" : {
"query" : {
"organization" : "wwwwwwwwwwwwww",
"course" : "xxxxxxxxxxxxxx",
"guide.slug" : "yyyyyyyyyyyyyy",
"detached" : {
"$exists" : false
},
"$text" : {
"$search" : "\"zzzzzzzzzzzzzz\""
}
},
"fields" : {
"$textScore" : {
"$meta" : "textScore"
}
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "classroom.guide_progresses",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"course" : {
"$eq" : "xxxxxxxxxxxxxx"
}
},
{
"guide.slug" : {
"$eq" : "yyyyyyyyyyyyyy"
}
},
{
"organization" : {
"$eq" : "wwwwwwwwwwwwww"
}
},
{
"$nor" : [
{
"detached" : {
"$exists" : true
}
}
]
},
{
"$text" : {
"$search" : "\"zzzzzzzzzzzzzz\"",
"$language" : "english",
"$caseSensitive" : false,
"$diacriticSensitive" : false
}
}
]
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"$textScore" : {
"$meta" : "textScore"
}
},
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"course" : {
"$eq" : "xxxxxxxxxxxxxx"
}
},
{
"guide.slug" : {
"$eq" : "yyyyyyyyyyyyyy"
}
},
{
"organization" : {
"$eq" : "wwwwwwwwwwwwww"
}
},
{
"$nor" : [
{
"detached" : {
"$exists" : true
}
}
]
}
]
},
"inputStage" : {
"stage" : "TEXT",
"indexPrefix" : {
},
"indexName" : "student.first_name_text_student.last_name_text_student.email_text",
"parsedTextQuery" : {
"terms" : [
"zzzzzzzzzzzzzz"
],
"negatedTerms" : [ ],
"phrases" : [
"zzzzzzzzzzzzzz"
],
"negatedPhrases" : [ ]
},
"textIndexVersion" : 3,
"inputStage" : {
"stage" : "TEXT_MATCH",
"inputStage" : {
"stage" : "TEXT_OR",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : 1
},
"indexName" : "student.first_name_text_student.last_name_text_student.email_text",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "backward",
"indexBounds" : {
}
}
}
}
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$sort" : {
"sortKey" : {
"stats.passed" : 1,
"stats.passed_with_warnings" : 1,
"stats.failed" : 1,
"student.last_name" : 1,
"student.first_name" : 1
}
}
},
{
"$project" : {
"notifications" : false,
"assignments" : false,
"_id" : false,
"student" : {
"_id" : false
},
"last_assignment" : {
"_id" : false,
"exercise" : {
"_id" : false
},
"submission" : {
"_id" : false
},
"guide" : {
"_id" : false
}
},
"guide" : {
"_id" : false
}
}
},
{
"$facet" : {
"results" : [
{
"$limit" : NumberLong(30)
}
],
"total" : [
{
"$group" : {
"_id" : {
"$const" : null
},
"count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : false,
"count" : true
}
}
]
}
}
],
"ok" : 1
}
Meaning, if I’m understanding that correctly, that Mongo seems to be ignoring all other possible index scans, and using only the text search index.
This also means that a full text search is done over our whole 2M document collection, which seems to shoot RAM usage up extremely fast, which in our case ended up causing thrasing, slow performance across all other queries too, and after a little while, a server restart.
As a temporary solution to this we’re found that changing the aforementioned query’s match stage to the following:
{
"$match":
{
"organization": "wwwwwwwwwwwwww",
"course": "xxxxxxxxxxxxxx",
"guide.slug": "yyyyyyyyyyyyyy",
"detached":
{
"$exists": false
},
"$or":
[
{
"first_name": /zzzzzzzzzzzzzz/
},
{
"last_name": /zzzzzzzzzzzzzz/
},
{
"email": /zzzzzzzzzzzzzz/
},
]
}
}
Comes up with a much more desirable query plan:
{
"stages" : [
{
"$cursor" : {
"query" : {
"organization" : "wwwwwwwwwwwwww",
"course" : "xxxxxxxxxxxxxx",
"guide.slug" : "yyyyyyyyyyyyyy",
"detached" : {
"$exists" : false
},
"$or" : [
{
"first_name" : /zzzzzzzzzzzzzz/
},
{
"last_name" : /zzzzzzzzzzzzzz/
},
{
"email" : /zzzzzzzzzzzzzz/
}
]
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "classroom.guide_progresses",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"$or" : [
{
"email" : {
"$regex" : "zzzzzzzzzzzzzz"
}
},
{
"first_name" : {
"$regex" : "zzzzzzzzzzzzzz"
}
},
{
"last_name" : {
"$regex" : "zzzzzzzzzzzzzz"
}
}
]
},
{
"course" : {
"$eq" : "xxxxxxxxxxxxxx"
}
},
{
"guide.slug" : {
"$eq" : "yyyyyyyyyyyyyy"
}
},
{
"organization" : {
"$eq" : "wwwwwwwwwwwwww"
}
},
{
"$nor" : [
{
"detached" : {
"$exists" : true
}
}
]
}
]
},
"winningPlan" : {
"stage" : "CACHED_PLAN",
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"$or" : [
{
"email" : {
"$regex" : "zzzzzzzzzzzzzz"
}
},
{
"first_name" : {
"$regex" : "zzzzzzzzzzzzzz"
}
},
{
"last_name" : {
"$regex" : "zzzzzzzzzzzzzz"
}
}
]
},
{
"$nor" : [
{
"detached" : {
"$exists" : true
}
}
]
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"organization" : 1,
"course" : 1,
"guide.slug" : 1,
"student.uid" : 1
},
"indexName" : "organization_1_course_1_guide.slug_1_student.uid_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"organization" : [ ],
"course" : [ ],
"guide.slug" : [ ],
"student.uid" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"organization" : [
"[\"wwwwwwwwwwwwww\", \"wwwwwwwwwwwwww\"]"
],
"course" : [
"[\"xxxxxxxxxxxxxx\", \"xxxxxxxxxxxxxx\"]"
],
"guide.slug" : [
"[\"yyyyyyyyyyyyyy\", \"yyyyyyyyyyyyyy\"]"
],
"student.uid" : [
"[MinKey, MaxKey]"
]
}
}
}
},
"rejectedPlans" : [ ]
}
}
},
{
"$sort" : {
"sortKey" : {
"stats.passed" : 1,
"stats.passed_with_warnings" : 1,
"stats.failed" : 1,
"student.last_name" : 1,
"student.first_name" : 1
}
}
},
{
"$project" : {
"notifications" : false,
"assignments" : false,
"_id" : false,
"student" : {
"_id" : false
},
"last_assignment" : {
"_id" : false,
"exercise" : {
"_id" : false
},
"submission" : {
"_id" : false
},
"guide" : {
"_id" : false
}
},
"guide" : {
"_id" : false
}
}
},
{
"$facet" : {
"results" : [
{
"$limit" : NumberLong(30)
}
],
"total" : [
{
"$group" : {
"_id" : {
"$const" : null
},
"count" : {
"$sum" : {
"$const" : 1
}
}
}
},
{
"$project" : {
"_id" : false,
"count" : true
}
}
]
}
}
],
"ok" : 1
}
I realise this is not a perfect replacement of the text search feature, although depending on the use case and tuning the regexp similar results (if not exactly the same) can be obtained. This way however, instead of running the full text search over the whole 2M document collection, it is first filtered via our organization_1_course_1_guide.slug_1_student.uid_1
index, resulting in the “text search” to be executed (in our use case) only over a couple thousand documents, and negligible RAM usage and response times.
So I was left wondering, is this the expected behavior? Does the text search feature just not play well with other indices? Or maybe we don’t have our text index properly setup?
Any help on this would be greatly appreciated!