Hi MongoDB community,
I’m using a time series collection to store IoT data and would like to get the most recent document in the collection to find out the current state of the IoT device that’s pushing the data.
My collection has about 300,000 documents, uses a timestamp field as the timeField, and has an index on timestamp, as suggested in the documentation.
I assumed that the following query would be efficient:
db.collection("iot-time-series")
.find({}, { sort: { timestamp: -1 }, limit: 1 })
.toArray()
However, it takes about two seconds to execute on an M10 cluster. If I interpret the explain output for this query correctly, MongoDB doesn’t use the index on timestamp and loads all 300,000 documents into memory to find the one with the most recent timestamp.
Explain output
{
"explainVersion": "1",
"stages": [
{
"$cursor": {
"queryPlanner": {
"namespace": "REDACTED.system.buckets.iot-time-series",
"indexFilterSet": false,
"parsedQuery": {},
"queryHash": "8B3D4AB8",
"planCacheKey": "D542626C",
"maxIndexedOrSolutionsReached": false,
"maxIndexedAndSolutionsReached": false,
"maxScansToExplodeReached": false,
"winningPlan": { "stage": "COLLSCAN", "direction": "forward" },
"rejectedPlans": []
},
"executionStats": {
"executionSuccess": true,
"nReturned": 825,
"executionTimeMillis": 1232,
"totalKeysExamined": 0,
"totalDocsExamined": 825,
"executionStages": {
"stage": "COLLSCAN",
"nReturned": 825,
"executionTimeMillisEstimate": 0,
"works": 827,
"advanced": 825,
"needTime": 1,
"needYield": 0,
"saveState": 41,
"restoreState": 41,
"isEOF": 1,
"direction": "forward",
"docsExamined": 825
},
"allPlansExecution": []
}
},
"nReturned": 825,
"executionTimeMillisEstimate": 90
},
{
"$_internalUnpackBucket": {
"exclude": [],
"timeField": "timestamp",
"metaField": "metadata",
"bucketMaxSpanSeconds": 3600
},
"nReturned": 294628,
"executionTimeMillisEstimate": 1060
},
{
"$sort": { "sortKey": { "timestamp": -1 }, "limit": 1 },
"totalDataSizeSortedBytesEstimate": 0,
"usedDisk": false,
"nReturned": 1,
"executionTimeMillisEstimate": 1227
}
],
"serverInfo": {
"host": "REDACTED",
"port": 27017,
"version": "5.0.6",
"gitVersion": "212a8dbb47f07427dae194a9c75baec1d81d9259"
},
"serverParameters": {
"internalQueryFacetBufferSizeBytes": 104857600,
"internalQueryFacetMaxOutputDocSizeBytes": 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
"internalDocumentSourceGroupMaxMemoryBytes": 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
"internalQueryProhibitBlockingMergeOnMongoS": 0,
"internalQueryMaxAddToSetBytes": 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600
},
"command": {
"aggregate": "system.buckets.iot-time-series",
"pipeline": [
{
"$_internalUnpackBucket": {
"timeField": "timestamp",
"metaField": "metadata",
"bucketMaxSpanSeconds": 3600,
"exclude": []
}
},
{ "$sort": { "timestamp": -1 } },
{ "$limit": 1 }
],
"cursor": {},
"maxTimeMS": 60000,
"collation": { "locale": "simple" }
},
"ok": 1,
"$clusterTime": {
"clusterTime": { "$timestamp": "7068709110501343233" },
"signature": {
"hash": "/scbPt2f8gVBc/Jpq0GEGEc6Ze4=",
"keyId": { "low": 2, "high": 1633075851, "unsigned": false }
}
},
"operationTime": { "$timestamp": "7068709110501343233" }
}
Is there another, more efficient way to get the latest document in a time series collection? Does MongoDB make any guarantees regarding the order of time series documents?
- It seems that the optimizer does not really now how many actual “documents” are stored in a bucket in order to push the limit step up in the query plan. In a normal collection, the index size corresponds to the the number of “documents” and it is enough to scan the index only to get the number of required document as specified by the limit op in the query. In a time-series collection, the number of documents in a bucket varies and this metadata is probably not present to the optimizer. Therefore it cannot push the step up in the plan but requires to first unpack all the buckets that match the query criteria and then perform the limit step.