Hello everyone,
I have run into a performance problem with MongoDB's full-text search and would appreciate some help.
The data I store is in Chinese, and MongoDB does not support Chinese word segmentation. To work around this, I created a new field named Terms: I segment the Chinese text into words myself and store the words in the Terms field, separated by spaces. I then created a full-text index on this field.
This approach gives me Chinese-language search, and it is fast as long as no sort is involved: the following query returns in only about 0.008 seconds.
// Unsorted text search: match any of the pre-segmented terms inside a
// publishTime window and return the second page of 10 results.
db.bidding_data.find(
    {
        // $language "none" uses simple tokenization (no stemming, no stop
        // words), so the space-separated terms are matched as written.
        $text: { $language: "none", $search: "公司 有限公司 投标 招标" },
        // Top-level conditions are ANDed implicitly with $text.
        publishTime: {
            $gte: ISODate("2020-01-17T00:00:00Z"),
            $lte: ISODate("2024-01-01T23:59:59Z")
        }
    },
    {
        // Inclusion projection: only these fields (plus _id) are returned.
        publishTime: 1,
        city: 1,
        biddingType: 1,
        downloadUrl: 1,
        industry: 1,
        updateTime: 1,
        title: 1,
        tags: 1,
        sourceUrl: 1,
        propertiesInfo: 1,
        province: 1,
        createTime: 1,
        status: 1
    }
).skip(10).limit(10)
However, as soon as I add a sort, the query becomes significantly slower, particularly when sorting by the text score together with other fields. In my test environment, which holds only 170,000 documents, the response takes 6 seconds. The impact will be even greater in my production environment, which holds 5 million documents.
// Sorted text search: same filter as the unsorted query, but the results are
// ordered by text relevance first and by newest publishTime second, then the
// second page of 10 results is returned.
db.bidding_data.find(
    {
        // $language "none" uses simple tokenization (no stemming, no stop
        // words), so the space-separated terms are matched as written.
        $text: { $language: "none", $search: "公司 有限公司 投标 招标" },
        // Top-level conditions are ANDed implicitly with $text.
        publishTime: {
            $gte: ISODate("2020-01-17T00:00:00Z"),
            $lte: ISODate("2024-01-01T23:59:59Z")
        }
    },
    {
        // Inclusion projection: only these fields (plus _id) are returned.
        publishTime: 1,
        city: 1,
        biddingType: 1,
        downloadUrl: 1,
        industry: 1,
        updateTime: 1,
        title: 1,
        tags: 1,
        sourceUrl: 1,
        propertiesInfo: 1,
        province: 1,
        createTime: 1,
        status: 1,
        // Expose the relevance score so it can be used as a sort key.
        score: { $meta: "textScore" }
    }
).sort(
    {
        // Key order matters: relevance is the primary sort key,
        // publishTime (descending) breaks ties.
        score: { $meta: "textScore" },
        publishTime: -1
    }
).skip(10).limit(10)
Here is the explain output (query plan and execution statistics) for this sorted query.
// 1
{
"queryPlanner": {
"plannerVersion": NumberInt("1"),
"namespace": "bidding_data.bidding_data",
"indexFilterSet": false,
"parsedQuery": {
"$and": [
{
"publishTime": {
"$lte": ISODate("2024-01-01T23:59:59.000Z")
}
},
{
"publishTime": {
"$gte": ISODate("2020-01-17T00:00:00.000Z")
}
},
{
"$text": {
"$search": "公司 有限公司 投标 招标",
"$language": "none",
"$caseSensitive": false,
"$diacriticSensitive": false
}
}
]
},
"winningPlan": {
"stage": "PROJECTION_DEFAULT",
"transformBy": {
"publishTime": 1,
"city": 1,
"biddingType": 1,
"downloadUrl": 1,
"industry": 1,
"updateTime": 1,
"title": 1,
"tags": 1,
"sourceUrl": 1,
"propertiesInfo": 1,
"province": 1,
"createTime": 1,
"status": 1,
"score": {
"$meta": "textScore"
}
},
"inputStage": {
"stage": "SKIP",
"skipAmount": NumberInt("0"),
"inputStage": {
"stage": "SORT",
"sortPattern": {
"$computed0": {
"$meta": "textScore"
},
"publishTime": NumberInt("-1")
},
"memLimit": NumberInt("104857600"),
"limitAmount": NumberInt("20"),
"type": "default",
"inputStage": {
"stage": "TEXT",
"indexPrefix": { },
"indexName": "full_index",
"parsedTextQuery": {
"terms": [
"公司",
"投标",
"招标",
"有限公司"
],
"negatedTerms": [ ],
"phrases": [ ],
"negatedPhrases": [ ]
},
"textIndexVersion": NumberInt("3"),
"inputStage": {
"stage": "TEXT_MATCH",
"inputStage": {
"stage": "TEXT_OR",
"filter": {
"$and": [
{
"publishTime": {
"$lte": ISODate("2024-01-01T23:59:59.000Z")
}
},
{
"publishTime": {
"$gte": ISODate("2020-01-17T00:00:00.000Z")
}
}
]
},
"inputStages": [
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
},
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
},
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
},
{
"stage": "IXSCAN",
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { }
}
]
}
}
}
}
}
},
"rejectedPlans": [ ]
},
"executionStats": {
"executionSuccess": true,
"nReturned": NumberInt("10"),
"executionTimeMillis": NumberInt("2954"),
"totalKeysExamined": NumberInt("102154"),
"totalDocsExamined": NumberInt("67415"),
"executionStages": {
"stage": "PROJECTION_DEFAULT",
"nReturned": NumberInt("10"),
"executionTimeMillisEstimate": NumberInt("2689"),
"works": NumberInt("169627"),
"advanced": NumberInt("10"),
"needTime": NumberInt("169616"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"transformBy": {
"publishTime": 1,
"city": 1,
"biddingType": 1,
"downloadUrl": 1,
"industry": 1,
"updateTime": 1,
"title": 1,
"tags": 1,
"sourceUrl": 1,
"propertiesInfo": 1,
"province": 1,
"createTime": 1,
"status": 1,
"score": {
"$meta": "textScore"
}
},
"inputStage": {
"stage": "SKIP",
"nReturned": NumberInt("10"),
"executionTimeMillisEstimate": NumberInt("2679"),
"works": NumberInt("169627"),
"advanced": NumberInt("10"),
"needTime": NumberInt("169616"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"skipAmount": NumberInt("0"),
"inputStage": {
"stage": "SORT",
"nReturned": NumberInt("20"),
"executionTimeMillisEstimate": NumberInt("2678"),
"works": NumberInt("169627"),
"advanced": NumberInt("20"),
"needTime": NumberInt("169606"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"sortPattern": {
"$computed0": {
"$meta": "textScore"
},
"publishTime": NumberInt("-1")
},
"memLimit": NumberInt("104857600"),
"limitAmount": NumberInt("20"),
"type": "default",
"totalDataSizeSorted": NumberLong("2342696268"),
"usedDisk": false,
"inputStage": {
"stage": "TEXT",
"nReturned": NumberInt("67415"),
"executionTimeMillisEstimate": NumberInt("2669"),
"works": NumberInt("169606"),
"advanced": NumberInt("67415"),
"needTime": NumberInt("102190"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"indexPrefix": { },
"indexName": "full_index",
"parsedTextQuery": {
"terms": [
"公司",
"投标",
"招标",
"有限公司"
],
"negatedTerms": [ ],
"phrases": [ ],
"negatedPhrases": [ ]
},
"textIndexVersion": NumberInt("3"),
"inputStage": {
"stage": "TEXT_MATCH",
"nReturned": NumberInt("67415"),
"executionTimeMillisEstimate": NumberInt("2668"),
"works": NumberInt("169606"),
"advanced": NumberInt("67415"),
"needTime": NumberInt("102190"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"docsRejected": NumberInt("0"),
"inputStage": {
"stage": "TEXT_OR",
"filter": {
"$and": [
{
"publishTime": {
"$lte": ISODate("2024-01-01T23:59:59.000Z")
}
},
{
"publishTime": {
"$gte": ISODate("2020-01-17T00:00:00.000Z")
}
}
]
},
"nReturned": NumberInt("67415"),
"executionTimeMillisEstimate": NumberInt("2661"),
"works": NumberInt("169606"),
"advanced": NumberInt("67415"),
"needTime": NumberInt("102190"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"docsExamined": NumberInt("67415"),
"inputStages": [
{
"stage": "IXSCAN",
"nReturned": NumberInt("31193"),
"executionTimeMillisEstimate": NumberInt("158"),
"works": NumberInt("31194"),
"advanced": NumberInt("31193"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("31193"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("31193"),
"dupsDropped": NumberInt("0")
},
{
"stage": "IXSCAN",
"nReturned": NumberInt("518"),
"executionTimeMillisEstimate": NumberInt("0"),
"works": NumberInt("519"),
"advanced": NumberInt("518"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("518"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("518"),
"dupsDropped": NumberInt("0")
},
{
"stage": "IXSCAN",
"nReturned": NumberInt("53254"),
"executionTimeMillisEstimate": NumberInt("184"),
"works": NumberInt("53255"),
"advanced": NumberInt("53254"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("53254"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("53254"),
"dupsDropped": NumberInt("0")
},
{
"stage": "IXSCAN",
"nReturned": NumberInt("17189"),
"executionTimeMillisEstimate": NumberInt("14"),
"works": NumberInt("17190"),
"advanced": NumberInt("17189"),
"needTime": NumberInt("0"),
"needYield": NumberInt("0"),
"saveState": NumberInt("225"),
"restoreState": NumberInt("225"),
"isEOF": NumberInt("1"),
"keyPattern": {
"_fts": "text",
"_ftsx": NumberInt("1"),
"publishTime": NumberInt("-1")
},
"indexName": "full_index",
"isMultiKey": true,
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": NumberInt("2"),
"direction": "backward",
"indexBounds": { },
"keysExamined": NumberInt("17189"),
"seeks": NumberInt("1"),
"dupsTested": NumberInt("17189"),
"dupsDropped": NumberInt("0")
}
]
}
}
}
}
}
}
},
"serverInfo": {
"host": "localhost.localdomain",
"port": NumberInt("27017"),
"version": "4.4.9",
"gitVersion": "b4048e19814bfebac717cf5a880076aa69aba481"
},
"ok": 1
}
I can see from the plan that the query does use the index. So why is totalDataSizeSorted almost 2.34 GB during the sorting phase? Why is there such a large amount of data to sort? My test environment contains only 170,000 documents in total, while the production environment contains 5 million, so how large would the sorted data become there? My assumption is that MongoDB scans the index once for each space-separated search term, merges all the results, calculates the scores, and only then sorts everything at the end. Is that correct? How can I address this issue?
Should I consider using a dedicated search engine such as Elasticsearch for this purpose?
I would be grateful for any help. My English is not very good, so I used translation software to write this. Thank you all!