We have a big collection of files in MongoDB,
each file can have multiple tags,
I want to count how many files we have on each tag to show the user
this couldn’t be cached as the count changes as users apply any tag (meaning filtering) or any other filtering for that matter.
I’ve taken multiple approaches: Indexing, aggregation pipeline, and splitting the count for each tag with indexing and aggregation pipeline.
I might need help in my approaches and indexing or trying different approaches to make it even faster.
For a large data set of 50K files, this can take up to 7 seconds.
This is then passed to the user in our platform as should be real-time data. for that, the time is way too long. (using Node.js API)
what are the best approaches to take?
I have a index for
{orgId:1,workspaces:1,status:1}
but it is rejected (in the rejected plans)
I also used hint to try to use it but got similar results
Example on the code we ran
{ "$match": {
"orgId": "64c20003cdaf92000d336ae3",
"workspaces": "64c20003cdaf92000d336ae7",
"status": {
"$ne": "pending"
}
}},
{ "$project": { "tags" : 1 }},
{ "$unwind": "$tags" },
{ "$group": { "_id": "$tags", "count": { "$sum": 1 } }}
]
)
The result of the executionStats :
* {
'$cursor': {
queryPlanner: {
plannerVersion: 1,
namespace: 'tagbox.files',
indexFilterSet: false,
parsedQuery: {
'$and': [
{ orgId: { '$eq': '64c20003cdaf92000d336ae3' } },
{ workspaces: { '$eq': '64c20003cdaf92000d336ae7' } },
{ status: { '$not': { '$eq': 'pending' } } }
]
},
queryHash: '0AB18F00',
planCacheKey: '8A6445FF',
winningPlan: {
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: {
'$and': [
{ status: { '$not': { '$eq': 'pending' } } },
{ workspaces: { '$eq': '64c20003cdaf92000d336ae7' } }
]
},
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1 },
indexName: 'orgId_1',
isMultiKey: false,
multiKeyPaths: { orgId: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
]
}
}
}
},
rejectedPlans: [
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: {
'$and': [
{ orgId: { '$eq': '64c20003cdaf92000d336ae3' } },
{
workspaces: { '$eq': '64c20003cdaf92000d336ae7' }
}
]
},
inputStage: {
stage: 'IXSCAN',
keyPattern: { status: 1 },
indexName: 'status_1',
isMultiKey: false,
multiKeyPaths: { status: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
inputStage: {
stage: 'IXSCAN',
keyPattern: {
orgId: 1,
workspaces: 1,
createdAt: -1,
<em>id: 1,
status: 1
},
indexName: 'orgId_1_workspaces_1_createdAt</em>-1__id_1_status_1',
isMultiKey: true,
multiKeyPaths: {
orgId: [],
workspaces: [ 'workspaces' ],
createdAt: [],
_id: [],
status: []
},
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
workspaces: [
'["64c20003cdaf92000d336ae7", "64c20003cdaf92000d336ae7"]'
],
createdAt: [ '[MaxKey, MinKey]' ],
_id: [ '[MinKey, MaxKey]' ],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: { workspaces: { '$eq': '64c20003cdaf92000d336ae7' } },
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1, status: 1, collections: 1 },
indexName: 'orgId_1_status_1_collections_1',
isMultiKey: true,
multiKeyPaths: {
orgId: [],
status: [],
collections: [ 'collections' ]
},
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ],
collections: [ '[MinKey, MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1, workspaces: 1, status: 1 },
indexName: 'orgId_1_workspaces_1_status_1',
isMultiKey: true,
multiKeyPaths: {
orgId: [],
workspaces: [ 'workspaces' ],
status: []
},
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
workspaces: [
'["64c20003cdaf92000d336ae7", "64c20003cdaf92000d336ae7"]'
],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: { workspaces: { '$eq': '64c20003cdaf92000d336ae7' } },
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1, status: 1, tags: 1 },
indexName: 'orgId_1_status_1_tags_1',
isMultiKey: true,
multiKeyPaths: { orgId: [], status: [], tags: [ 'tags' ] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ],
tags: [ '[MinKey, MaxKey]' ]
}
}
}
}
]
},
executionStats: {
executionSuccess: true,
nReturned: 43910,
executionTimeMillis: 8327,
totalKeysExamined: 43934,
totalDocsExamined: 43934,
executionStages: {
stage: 'PROJECTION_SIMPLE',
nReturned: 43910,
executionTimeMillisEstimate: 8149,
works: 43935,
advanced: 43910,
needTime: 24,
needYield: 0,
saveState: 470,
restoreState: 470,
isEOF: 1,
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: {
'$and': [
{ status: { '$not': { '$eq': 'pending' } } },
{ workspaces: { '$eq': '64c20003cdaf92000d336ae7' } }
]
},
nReturned: 43910,
executionTimeMillisEstimate: 8135,
works: 43935,
advanced: 43910,
needTime: 24,
needYield: 0,
saveState: 470,
restoreState: 470,
isEOF: 1,
docsExamined: 43934,
alreadyHasObj: 0,
inputStage: {
stage: 'IXSCAN',
nReturned: 43934,
executionTimeMillisEstimate: 69,
works: 43935,
advanced: 43934,
needTime: 0,
needYield: 0,
saveState: 470,
restoreState: 470,
isEOF: 1,
keyPattern: { orgId: 1 },
indexName: 'orgId_1',
isMultiKey: false,
multiKeyPaths: { orgId: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
]
},
keysExamined: 43934,
seeks: 1,
dupsTested: 0,
dupsDropped: 0
}
}
}
}
},
nReturned: Long("43910"),
executionTimeMillisEstimate: Long("8191")
},
{
'$unwind': { path: '$tags' },
nReturned: Long("233618"),
executionTimeMillisEstimate: Long("8222")
},
{
'$group': { _id: '$tags', count: { '$sum': { '$const': 1 } } },
nReturned: Long("237"),
executionTimeMillisEstimate: Long("8258")
}
],
serverInfo: {
host: 'atlas-u7e53s-shard-00-02.8avbv.mongodb.net',
port: 27017,
version: '4.4.23',
gitVersion: '36c047f935fd86b2d5ac4c4f5189e52daa044966'
},
ok: 1,
'$clusterTime': {
clusterTime: Timestamp({ t: 1692090071, i: 12 }),
signature: {
hash: Binary(Buffer.from("31ddbd309d0f50634f3086763a42616ee28da83c", "hex"), 0),
keyId: Long("7221999452528050178")
}
},
operationTime: Timestamp({ t: 1692090071, i: 12 })
}