Hi, I am trying out MongoDB Atlas Search and have some questions about choosing the right cluster.
Here’s a bit about my data:
There have been about 7.5M search requests over the last 6 months.
I have a collection with ~2000 records, each of which looks like this:
{
"data": {
"idFS": "xx",
"jobNumber": "Dxxx8",
"applicationUrl": "xxxx",
"idClient": "xxx",
"title": "Werkstudent – IT",
"language": "DE",
"businessUnit": "Automotive Technology",
"remote": "No specification",
"company": "xxx xxx xxx",
"additionalInfo": "",
"cityState": "Hombergen,North Rhine-Westphalia",
"city": "Hombergen",
"zipCode": "58256",
"state": "North Rhine-Westphalia",
"address": "xxxx",
"country": "Germany",
"locations": [
{
"country": "Germany",
"zipCode": "58256",
"city": "Hombergen",
"state": "North Rhine-Westphalia",
"stateShort": "NRW",
"cityState": "Hombergen,North Rhine-Westphalia",
"address": "xxxx"
},
{
"country": "Germany",
"zipCode": "54429",
"city": "Mandern",
"state": "Rhineland-Palatinate",
"stateShort": "RP",
"cityState": "Mandern,Rhineland-Palatinate",
"address": ""
}
],
"employmentType": "Part-time",
"google_employmentType": "PART_TIME",
"contract": "Limited",
"socialInsurance": "Ja",
"entryLevel": "Student job",
"entryLevel_order": 8,
"jobField": "IT",
"category": "Automotive supply",
"recruiter": [
"xxx",
"xxxx",
"xxxx",
"xxxx"
],
"applicationEnd": null,
"postingDate": "2021-11-29T00:00:00+01:00",
"postingDate_timestamp": 1638140400,
"new_postingDate": "2021-11-28 23:00:00+00:00",
"expectedStartDate": null,
"subClients": null
},
"content": {
"employmentType": "Teilzeit",
"contract": "Befristet",
"entryLevel": "Studienjob",
"jobField": "IT",
"category": "Automobilzulieferung",
"applicationEnd": null,
"businessHL": "Unternehmen",
"business": "<p>xxxx.</p>",
"taskHL": "Aufgaben",
"task": "<ul><lixxxx",
"profileHL": "Profil",
"profile": "<ul><li>xxx>",
"offerHL": "<p>Ihre Vorteile bei uns</p>",
"offer": "xxxxx",
"contactHL": "Kontakt",
"contact": "xxxx",
"diversityHL": "Das bieten wir",
"diversity": "xxxx>",
"headerImage": "xxxx",
"mobileHeaderImage": "xxxx",
"compensation": "",
"employerSeal": ""
},
"_geoloc": [
{
"lat": 123.123,
"lng": 123.123
},
{
"lat": 123.123,
"lng": 123.123
}
],
"arbeitsAgentur": {
"argeId": null,
"baReferenzeId": "811389-002"
}
}
I have created a search index that looks like this:
"mappings": {
"dynamic": false,
"fields": {
"data": {
"fields": {
"businessUnit": {
"analyzer": "lucene.keyword",
"searchAnalyzer": "lucene.keyword",
"type": "string"
},
"category": {
"type": "string"
},
"company": {
"analyzer": "lucene.keyword",
"searchAnalyzer": "lucene.keyword",
"type": "string"
},
"contract": {
"analyzer": "lucene.keyword",
"searchAnalyzer": "lucene.keyword",
"type": "string"
},
"employmentType": {
"analyzer": "lucene.keyword",
"searchAnalyzer": "lucene.keyword",
"type": "string"
},
"jobField": {
"analyzer": "lucene.keyword",
"searchAnalyzer": "lucene.keyword",
"type": "string"
},
"postingDate": {
"type": "string"
},
"postingDate_timestamp": {
"type": "number"
},
"title": {
"type": "string"
}
},
"type": "document"
}
}
},
"storedSource": {
"include": [
"data"
]
}
}
I also have a second index for facets (I could probably combine the two, but in the material I read and watched I saw examples that use more than one index):
{
  "mappings": {
    "dynamic": false,
    "fields": {
      "data": {
        "fields": {
          "businessUnit": [
            {
              "analyzer": "lucene.keyword",
              "searchAnalyzer": "lucene.keyword",
              "type": "string"
            },
            {
              "type": "stringFacet"
            }
          ],
          "employmentType": [
            {
              "type": "stringFacet"
            },
            {
              "analyzer": "lucene.keyword",
              "searchAnalyzer": "lucene.keyword",
              "type": "string"
            }
          ],
          "entryLevel": [
            {
              "type": "stringFacet"
            },
            {
              "analyzer": "lucene.keyword",
              "searchAnalyzer": "lucene.keyword",
              "type": "string"
            }
          ],
          "jobField": [
            {
              "type": "stringFacet"
            },
            {
              "analyzer": "lucene.keyword",
              "searchAnalyzer": "lucene.keyword",
              "type": "string"
            }
          ],
          "locations": {
            "fields": {
              "country": [
                {
                  "type": "stringFacet"
                },
                {
                  "type": "string"
                }
              ]
            },
            "type": "document"
          },
          "title": {
            "type": "string"
          }
        },
        "type": "document"
      }
    }
  },
  "storedSource": true
}
Finally, here is my example query. I will need to add some more fields for filtering, but this is the basic shape:
mainQuery [
  {
    // Full-text search stage: fuzzy title match plus exact-value filters,
    // with results ranked by how recent the posting is.
    '$search': {
      returnStoredSource: true,
      index: 'tkag_en',
      compound: {
        must: [
          {
            // Fuzzy match on the job title (up to 2 character edits per term).
            text: {
              query: 'Operations Manager',
              path: 'data.title',
              fuzzy: { maxEdits: 2 }
            }
          },
          {
            text: {
              path: 'data.jobField',
              query: [ 'Engineering & Science' ]
            }
          },
          {
            text: { path: 'data.employmentType', query: [ 'Full-time' ] }
          },
          {
            text: {
              path: 'data.businessUnit',
              query: [ 'Automotive Technology' ]
            }
          },
          {
            // Instead of $sort (which is expensive), use near to rank recent postings first.
            // data.postingDate_timestamp holds epoch SECONDS (e.g. 1638140400), so origin
            // and pivot must be expressed in seconds as well, not milliseconds.
            near: {
              path: 'data.postingDate_timestamp',
              origin: 1686729595, // today, in epoch seconds
              pivot: 7776000, // 90 days in seconds; a posting this far from origin gets half the score
              score: { boost: { value: 1000 } }
            }
          }
        ]
      }
    }
  },
  {
    // Keep only the fields needed to render a result row, plus the search score.
    '$project': {
      'data.title': 1,
      'data.idClient': 1,
      'data.city': 1,
      'data.state': 1,
      'data.country': 1,
      'data.company': 1,
      'data.postingDate': 1,
      'data.locations': 1,
      _geoloc: 1,
      score: { '$meta': 'searchScore' }
    }
  },
  // Pagination: first page of 50 results.
  { '$skip': 0 },
  { '$limit': 50 }
]
And here are the $searchMeta queries:
[
{
'$searchMeta': {
index: 'tkag_en_facets',
returnStoredSource: true,
facet: {
operator: {
compound: {
must: [
{
text: {
query: 'Operations Manager',
path: 'data.title',
fuzzy: { maxEdits: 2 }
}
},
{
text: {
path: 'data.jobField',
query: [ 'Engineering & Science' ]
}
},
{
text: {
path: 'data.employmentType',
query: [ 'Full-time' ]
}
},
{
text: {
path: 'data.businessUnit',
query: [ 'Automotive Technology' ]
}
}
]
}
},
facets: {
data_DOT_businessUnit: { type: 'string', path: 'data.businessUnit' },
data_DOT_employmentType: { type: 'string', path: 'data.employmentType' },
data_DOT_jobField: { type: 'string', path: 'data.jobField' }
}
}
}
}
],
[
{
'$searchMeta': {
returnStoredSource: true,
index: 'tkag_en_facets',
facet: {
operator: {
compound: {
must: [
{
text: {
path: 'data.employmentType',
query: [ 'Full-time' ]
}
},
{
text: {
path: 'data.businessUnit',
query: [ 'Automotive Technology' ]
}
}
]
}
},
facets: {
data_DOT_jobField: { type: 'string', path: 'data.jobField' }
}
}
}
}
],
I am happy with the performance so far: searching and filtering take around 15-45 ms.
First question: am I doing anything wrong? I have only been researching this for the past week.
Second question: what type of cluster do I need given these requirements? Note that I will have multiple collections, each with about 2000 records.
Thanks in advance