Creating a fast and efficient query to count dynamic data

We have a big collection of files in MongoDB,
each file can have multiple tags,
I want to count how many files we have on each tag to show the user
this couldn’t be cached as the count changes as users apply any tag (meaning filtering) or any other filtering for that matter.

I’ve taken multiple approaches: Indexing, aggregation pipeline, and splitting the count for each tag with indexing and aggregation pipeline.

I might need help in my approaches and indexing or trying different approaches to make it even faster.

For a large data set of 50K files, this can take up to 7 seconds.
This is then passed to the user in our platform as should be real-time data. for that, the time is way too long. (using Node.js API)

what are the best approaches to take?

I have a index for
{orgId:1,workspaces:1,status:1} but it is rejected (in the rejected plans)
I also used hint to try to use it but got similar results

Example on the code we ran

{ "$match": {
"orgId": "64c20003cdaf92000d336ae3",
"workspaces": "64c20003cdaf92000d336ae7",
"status": {
"$ne": "pending"
}
}},
{ "$project": { "tags" : 1 }},
{ "$unwind": "$tags" },
{ "$group": { "_id": "$tags", "count": { "$sum": 1 } }}
]
) 

The result of the executionStats :

* {
'$cursor': {
queryPlanner: {
plannerVersion: 1,
namespace: 'tagbox.files',
indexFilterSet: false,
parsedQuery: {
'$and': [
{ orgId: { '$eq': '64c20003cdaf92000d336ae3' } },
{ workspaces: { '$eq': '64c20003cdaf92000d336ae7' } },
{ status: { '$not': { '$eq': 'pending' } } }
]
},
queryHash: '0AB18F00',
planCacheKey: '8A6445FF',
winningPlan: {
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: {
'$and': [
{ status: { '$not': { '$eq': 'pending' } } },
{ workspaces: { '$eq': '64c20003cdaf92000d336ae7' } }
]
},
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1 },
indexName: 'orgId_1',
isMultiKey: false,
multiKeyPaths: { orgId: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
]
}
}
}
},
rejectedPlans: [
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: {
'$and': [
{ orgId: { '$eq': '64c20003cdaf92000d336ae3' } },
{
workspaces: { '$eq': '64c20003cdaf92000d336ae7' }
}
]
},
inputStage: {
stage: 'IXSCAN',
keyPattern: { status: 1 },
indexName: 'status_1',
isMultiKey: false,
multiKeyPaths: { status: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
inputStage: {
stage: 'IXSCAN',
keyPattern: {
orgId: 1,
workspaces: 1,
createdAt: -1,
<em>id: 1,
status: 1
},
indexName: 'orgId_1_workspaces_1_createdAt</em>-1__id_1_status_1',
isMultiKey: true,
multiKeyPaths: {
orgId: [],
workspaces: [ 'workspaces' ],
createdAt: [],
_id: [],
status: []
},
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
workspaces: [
'["64c20003cdaf92000d336ae7", "64c20003cdaf92000d336ae7"]'
],
createdAt: [ '[MaxKey, MinKey]' ],
_id: [ '[MinKey, MaxKey]' ],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: { workspaces: { '$eq': '64c20003cdaf92000d336ae7' } },
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1, status: 1, collections: 1 },
indexName: 'orgId_1_status_1_collections_1',
isMultiKey: true,
multiKeyPaths: {
orgId: [],
status: [],
collections: [ 'collections' ]
},
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ],
collections: [ '[MinKey, MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1, workspaces: 1, status: 1 },
indexName: 'orgId_1_workspaces_1_status_1',
isMultiKey: true,
multiKeyPaths: {
orgId: [],
workspaces: [ 'workspaces' ],
status: []
},
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
workspaces: [
'["64c20003cdaf92000d336ae7", "64c20003cdaf92000d336ae7"]'
],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ]
}
}
}
},
{
stage: 'PROJECTION_SIMPLE',
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: { workspaces: { '$eq': '64c20003cdaf92000d336ae7' } },
inputStage: {
stage: 'IXSCAN',
keyPattern: { orgId: 1, status: 1, tags: 1 },
indexName: 'orgId_1_status_1_tags_1',
isMultiKey: true,
multiKeyPaths: { orgId: [], status: [], tags: [ 'tags' ] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
],
status: [ '[MinKey, "pending")', '("pending", MaxKey]' ],
tags: [ '[MinKey, MaxKey]' ]
}
}
}
}
]
},
executionStats: {
executionSuccess: true,
nReturned: 43910,
executionTimeMillis: 8327,
totalKeysExamined: 43934,
totalDocsExamined: 43934,
executionStages: {
stage: 'PROJECTION_SIMPLE',
nReturned: 43910,
executionTimeMillisEstimate: 8149,
works: 43935,
advanced: 43910,
needTime: 24,
needYield: 0,
saveState: 470,
restoreState: 470,
isEOF: 1,
transformBy: { _id: true, tags: true },
inputStage: {
stage: 'FETCH',
filter: {
'$and': [
{ status: { '$not': { '$eq': 'pending' } } },
{ workspaces: { '$eq': '64c20003cdaf92000d336ae7' } }
]
},
nReturned: 43910,
executionTimeMillisEstimate: 8135,
works: 43935,
advanced: 43910,
needTime: 24,
needYield: 0,
saveState: 470,
restoreState: 470,
isEOF: 1,
docsExamined: 43934,
alreadyHasObj: 0,
inputStage: {
stage: 'IXSCAN',
nReturned: 43934,
executionTimeMillisEstimate: 69,
works: 43935,
advanced: 43934,
needTime: 0,
needYield: 0,
saveState: 470,
restoreState: 470,
isEOF: 1,
keyPattern: { orgId: 1 },
indexName: 'orgId_1',
isMultiKey: false,
multiKeyPaths: { orgId: [] },
isUnique: false,
isSparse: false,
isPartial: false,
indexVersion: 2,
direction: 'forward',
indexBounds: {
orgId: [
'["64c20003cdaf92000d336ae3", "64c20003cdaf92000d336ae3"]'
]
},
keysExamined: 43934,
seeks: 1,
dupsTested: 0,
dupsDropped: 0
}
}
}
}
},
nReturned: Long("43910"),
executionTimeMillisEstimate: Long("8191")
},
{
'$unwind': { path: '$tags' },
nReturned: Long("233618"),
executionTimeMillisEstimate: Long("8222")
},
{
'$group': { _id: '$tags', count: { '$sum': { '$const': 1 } } },
nReturned: Long("237"),
executionTimeMillisEstimate: Long("8258")
}
],
serverInfo: {
host: 'atlas-u7e53s-shard-00-02.8avbv.mongodb.net',
port: 27017,
version: '4.4.23',
gitVersion: '36c047f935fd86b2d5ac4c4f5189e52daa044966'
},
ok: 1,
'$clusterTime': {
clusterTime: Timestamp({ t: 1692090071, i: 12 }),
signature: {
hash: Binary(Buffer.from("31ddbd309d0f50634f3086763a42616ee28da83c", "hex"), 0),
keyId: Long("7221999452528050178")
}
},
operationTime: Timestamp({ t: 1692090071, i: 12 })
}

Did you really need to use $ne operator?

It can be very hard to database to execute not operator, probably it’s the reason of your index isn’t in use.

If status is an Enum ( I am trying to guess based on value pending), could be better to you use $in operator instead $ne. Try to do it and probably will use your index.

Other approach that you can think about is split the documents between file in fileCollection and metadata in metadataFileCollection. So you can just make your queries on metadata collection that will be smaller than file collection and can improve your performance.

1 Like

Thanks for the response
you had a promising idea
but it seems not to really help at the end
it still takes 8 seconds to get the 40K files

db.files.explain("executionStats").aggregate([
     { "$match": {
          "orgId": "64c2d9a3cdaf92000d336ae3",
          "workspaces": "64c2d9a3cdaf92000d336ae7",
          "status": {
               "$eq": "complete"
          }
     }},
     { "$project": { "tags" : 1 }},  
     { "$unwind": "$tags" },  
     { "$group": { "_id": "$tags", "count": { "$sum": 1 } }}  
   ]
   ,{hint: 'orgId_1_workspaces_1_status_1'}

and got result

{
  stages: [
    {
      '$cursor': {
        queryPlanner: {
          plannerVersion: 1,
          namespace: 'tagbox.files',
          indexFilterSet: false,
          parsedQuery: {
            '$and': [
              { orgId: { '$eq': '64c2d9a3cdaf92000d336ae3' } },
              { status: { '$eq': 'complete' } },
              { workspaces: { '$eq': '64c2d9a3cdaf92000d336ae7' } }
            ]
          },
          queryHash: '04612910',
          planCacheKey: 'E40C51EF',
          winningPlan: {
            stage: 'PROJECTION_SIMPLE',
            transformBy: { _id: true, tags: true },
            inputStage: {
              stage: 'FETCH',
              inputStage: {
                stage: 'IXSCAN',
                keyPattern: { orgId: 1, workspaces: 1, status: 1 },
                indexName: 'orgId_1_workspaces_1_status_1',
                isMultiKey: true,
                multiKeyPaths: { orgId: [], workspaces: [ 'workspaces' ], status: [] },
                isUnique: false,
                isSparse: false,
                isPartial: false,
                indexVersion: 2,
                direction: 'forward',
                indexBounds: {
                  orgId: [
                    '["64c2d9a3cdaf92000d336ae3", "64c2d9a3cdaf92000d336ae3"]'
                  ],
                  workspaces: [
                    '["64c2d9a3cdaf92000d336ae7", "64c2d9a3cdaf92000d336ae7"]'
                  ],
                  status: [ '["complete", "complete"]' ]
                }
              }
            }
          },
          rejectedPlans: []
        },
        executionStats: {
          executionSuccess: true,
          nReturned: 43799,
          executionTimeMillis: 8169,
          totalKeysExamined: 43799,
          totalDocsExamined: 43799,
          executionStages: {
            stage: 'PROJECTION_SIMPLE',
            nReturned: 43799,
            executionTimeMillisEstimate: 8055,
            works: 43800,
            advanced: 43799,
            needTime: 0,
            needYield: 0,
            saveState: 448,
            restoreState: 448,
            isEOF: 1,
            transformBy: { _id: true, tags: true },
            inputStage: {
              stage: 'FETCH',
              nReturned: 43799,
              executionTimeMillisEstimate: 8001,
              works: 43800,
              advanced: 43799,
              needTime: 0,
              needYield: 0,
              saveState: 448,
              restoreState: 448,
              isEOF: 1,
              docsExamined: 43799,
              alreadyHasObj: 0,
              inputStage: {
                stage: 'IXSCAN',
                nReturned: 43799,
                executionTimeMillisEstimate: 62,
                works: 43800,
                advanced: 43799,
                needTime: 0,
                needYield: 0,
                saveState: 448,
                restoreState: 448,
                isEOF: 1,
                keyPattern: { orgId: 1, workspaces: 1, status: 1 },
                indexName: 'orgId_1_workspaces_1_status_1',
                isMultiKey: true,
                multiKeyPaths: { orgId: [], workspaces: [ 'workspaces' ], status: [] },
                isUnique: false,
                isSparse: false,
                isPartial: false,
                indexVersion: 2,
                direction: 'forward',
                indexBounds: {
                  orgId: [
                    '["64c2d9a3cdaf92000d336ae3", "64c2d9a3cdaf92000d336ae3"]'
                  ],
                  workspaces: [
                    '["64c2d9a3cdaf92000d336ae7", "64c2d9a3cdaf92000d336ae7"]'
                  ],
                  status: [ '["complete", "complete"]' ]
                },
                keysExamined: 43799,
                seeks: 1,
                dupsTested: 43799,
                dupsDropped: 0
              }
            }
          }
        }
      },
      nReturned: Long("43799"),
      executionTimeMillisEstimate: Long("8105")
    },
    {
      '$unwind': { path: '$tags' },
      nReturned: Long("232843"),
      executionTimeMillisEstimate: Long("8142")
    },
    {
      '$group': { _id: '$tags', count: { '$sum': { '$const': 1 } } },
      nReturned: Long("237"),
      executionTimeMillisEstimate: Long("8165")
    }
  ],
  serverInfo: {
    host: 'atlas-u7e53s-shard-00-02.8avbv.mongodb.net',
    port: 27017,
    version: '4.4.23',
    gitVersion: '36c047f935fd86b2d5ac4c4f5189e52daa044966'
  },
  ok: 1,
  '$clusterTime': {
    clusterTime: Timestamp({ t: 1692170598, i: 40 }),
    signature: {
      hash: Binary(Buffer.from("514609968c6aab0748ec202e591120dff1b3466b", "hex"), 0),
      keyId: Long("7221999452528050178")
    }
  },
  operationTime: Timestamp({ t: 1692170598, i: 40 })
}

Your FETCH stage looks like a degraded performance.

I don’t know the reason for it.

What is the avg size of each document in this collection?
How much memory does your cluster have?

Try to share your collection stats, can help to identify the problem
db.files.stats()

1 Like