Atlas Search Cluster Sizing

Hi, I am trying out MongoDB Atlas Search and have some questions about choosing the right cluster:
Here’s a bit about my data:
There have been about 7.5M search requests over the last 6 months.
I have a collection with ~2000 records that looks like this:

    {
    "data": {
        "idFS": "xx",
        "jobNumber": "Dxxx8",
        "applicationUrl": "xxxx",
        "idClient": "xxx",
        "title": "Werkstudent – IT",
        "language": "DE",
        "businessUnit": "Automotive Technology",
        "remote": "No specification",
        "company": "xxx xxx xxx",
        "additionalInfo": "",
        "cityState": "Hombergen,North Rhine-Westphalia",
        "city": "Hombergen",
        "zipCode": "58256",
        "state": "North Rhine-Westphalia",
        "address": "xxxx",
        "country": "Germany",
        "locations": [
            {
                "country": "Germany",
                "zipCode": "58256",
                "city": "Hombergen",
                "state": "North Rhine-Westphalia",
                "stateShort": "NRW",
                "cityState": "Hombergen,North Rhine-Westphalia",
                "address": "xxxx"
            },
            {
                "country": "Germany",
                "zipCode": "54429",
                "city": "Mandern",
                "state": "Rhineland-Palatinate",
                "stateShort": "RP",
                "cityState": "Mandern,Rhineland-Palatinate",
                "address": ""
            }
        ],
        "employmentType": "Part-time",
        "google_employmentType": "PART_TIME",
        "contract": "Limited",
        "socialInsurance": "Ja",
        "entryLevel": "Student job",
        "entryLevel_order": 8,
        "jobField": "IT",
        "category": "Automotive supply",
        "recruiter": [
            "xxx",
            "xxxx",
            "xxxx",
            "xxxx"
        ],
        "applicationEnd": null,
        "postingDate": "2021-11-29T00:00:00+01:00",
        "postingDate_timestamp": 1638140400,
        "new_postingDate": "2021-11-28 23:00:00+00:00",
        "expectedStartDate": null,
        "subClients": null
    },
    "content": {
        "employmentType": "Teilzeit",
        "contract": "Befristet",
        "entryLevel": "Studienjob",
        "jobField": "IT",
        "category": "Automobilzulieferung",
        "applicationEnd": null,
        "businessHL": "Unternehmen",
        "business": "<p>xxxx.</p>",
        "taskHL": "Aufgaben",
        "task": "<ul><lixxxx",
        "profileHL": "Profil",
        "profile": "<ul><li>xxx>",
        "offerHL": "<p>Ihre Vorteile bei uns</p>",
        "offer": "xxxxx",
        "contactHL": "Kontakt",
        "contact": "xxxx",
        "diversityHL": "Das bieten wir",
        "diversity": "xxxx>",
        "headerImage": "xxxx",
        "mobileHeaderImage": "xxxx",
        "compensation": "",
        "employerSeal": ""
    },
    "_geoloc": [
        {
            "lat": 123.123,
            "lng": 123.123
        },
        {
            "lat": 123.123,
            "lng": 123.123
        }
    ],
    "arbeitsAgentur": {
        "argeId": null,
        "baReferenzeId": "811389-002"
    }
}
]

I have created a search index that looks like this:

{
  "mappings": {
    "dynamic": false,
    "fields": {
      "data": {
        "fields": {
          "businessUnit": {
            "analyzer": "lucene.keyword",
            "searchAnalyzer": "lucene.keyword",
            "type": "string"
          },
          "category": {
            "type": "string"
          },
          "company": {
            "analyzer": "lucene.keyword",
            "searchAnalyzer": "lucene.keyword",
            "type": "string"
          },
          "contract": {
            "analyzer": "lucene.keyword",
            "searchAnalyzer": "lucene.keyword",
            "type": "string"
          },
          "employmentType": {
            "analyzer": "lucene.keyword",
            "searchAnalyzer": "lucene.keyword",
            "type": "string"
          },
          "jobField": {
            "analyzer": "lucene.keyword",
            "searchAnalyzer": "lucene.keyword",
            "type": "string"
          },
          "postingDate": {
            "type": "string"
          },
          "postingDate_timestamp": {
            "type": "number"
          },
          "title": {
            "type": "string"
          }
        },
        "type": "document"
      }
    }
  },
  "storedSource": {
    "include": [
      "data"
    ]
  }
}

Then there is also another index for facets (I can probably combine them, but while reading and watching some material I saw examples that used more than one index):

{
  "mappings": {
    "dynamic": false,
    "fields": {
      "data": {
        "fields": {
          "businessUnit": [
            {
              "analyzer": "lucene.keyword",
              "searchAnalyzer": "lucene.keyword",
              "type": "string"
            },
            {
              "type": "stringFacet"
            }
          ],
          "employmentType": [
            {
              "type": "stringFacet"
            },
            {
              "analyzer": "lucene.keyword",
              "searchAnalyzer": "lucene.keyword",
              "type": "string"
            }
          ],
          "entryLevel": [
            {
              "type": "stringFacet"
            },
            {
              "analyzer": "lucene.keyword",
              "searchAnalyzer": "lucene.keyword",
              "type": "string"
            }
          ],
          "jobField": [
            {
              "type": "stringFacet"
            },
            {
              "type": "string"
            }
          ],
          "locations": {
            "fields": {
              "country": [
                {
                  "type": "stringFacet"
                },
                {
                  "type": "string"
                }
              ]
            },
            "type": "document"
          },
          "title": {
            "type": "string"
          }
        },
        "type": "document"
      }
    }
  },
  "storedSource": true
}

Finally, here is my example query. I would still have to add some more fields for filtering, but this is the gist of it:

mainQuery [
  {
    '$search': {
      returnStoredSource: true,
      index: 'tkag_en',
      compound: {
        must: [
          {
            text: {
              query: 'Operations Manager',
              path: 'data.title',
              fuzzy: { maxEdits: 2 }
            }
          },
          {
            text: {
              path: 'data.jobField',
              query: [ 'Engineering & Science' ]
            }
          },
          {
            text: { path: 'data.employmentType', query: [ 'Full-time' ] }
          },
          {
            text: {
              path: 'data.businessUnit',
              query: [ 'Automotive Technology' ]
            }
          },
          { 
          // instead of $sort [which is expensive], use the near operator for sorting
            near: {
              path: 'data.postingDate_timestamp', 
              origin: 1686729595572, // today
              pivot: 7776000000,// far in the future to give me the latest records based on timestamp

              score: { boost: { value: 1000 } }
            }
          }
        ]
      }
    }
  },
  {
    '$project': {
      'data.title': 1,
      'data.idClient': 1,
      'data.city': 1,
      'data.state': 1,
      'data.country': 1,
      'data.company': 1,
      'data.postingDate': 1,
      'data.locations': 1,
      _geoloc: 1,
      score: { '$meta': 'searchScore' }
    }
  },
  { '$skip': 0 },
  { '$limit': 50 }
]

And here is the $searchMeta query:

 [
    {
      '$searchMeta': {
        index: 'tkag_en_facets',
        returnStoredSource: true,
        facet: {
          operator: {
            compound: {
              must: [
                {
                  text: {
                    query: 'Operations Manager',
                    path: 'data.title',
                    fuzzy: { maxEdits: 2 }
                  }
                },
                {
                  text: {
                    path: 'data.jobField',
                    query: [ 'Engineering & Science' ]
                  }
                },
                {
                  text: {
                    path: 'data.employmentType',
                    query: [ 'Full-time' ]
                  }
                },
                {
                  text: {
                    path: 'data.businessUnit',
                    query: [ 'Automotive Technology' ]
                  }
                }
              ]
            }
          },
          facets: {
            data_DOT_businessUnit: { type: 'string', path: 'data.businessUnit' },
            data_DOT_employmentType: { type: 'string', path: 'data.employmentType' },
            data_DOT_jobField: { type: 'string', path: 'data.jobField' }
          }
        }
      }
    }
  ],
  [
    {
      '$searchMeta': {
        returnStoredSource: true,
        index: 'tkag_en_facets',
        facet: {
          operator: {
            compound: {
              must: [
                {
                  text: {
                    path: 'data.employmentType',
                    query: [ 'Full-time' ]
                  }
                },
                {
                  text: {
                    path: 'data.businessUnit',
                    query: [ 'Automotive Technology' ]
                  }
                }
              ]
            }
          },
          facets: {
            data_DOT_jobField: { type: 'string', path: 'data.jobField' }
          }
        }
      }
    }
  ],

I am happy with the performance so far, searching / filtering is around 15-45ms.
First question: am I doing something wrong? I have only been researching this for the last week.
Second question: what type of cluster do I need given my requirements? Note that I will have multiple collections, each with about 2000 records.
Thanks in advance

Hi @Ed_Durguti and welcome to MongoDB community forums!!

Could you state your concerns with this particular question? I understand you’ve noted you are happy with the performance so I just wish to clarify the concerns here.

For further details on Atlas Search Performance, please view the Tune Atlas Search Performance documentation which may be of use.

However, we usually recommend contacting MongoDB consulting, who can help in understanding the current workload and suggest what would be best suited for the application. This may be of additional use if you have further use cases beyond the one described in this post.

Let us know if you have any further questions.

Regards
Aasawari

@Aasawari I was just wondering whether I am doing the indexing/querying correctly, and whether there is any room for improvement, etc.
Thank you for pointing me to the docs.

1 Like

This topic was automatically closed 5 days after the last reply. New replies are no longer allowed.