I'm trying to remove duplicate documents from a collection. The collection has millions of records, and I estimate roughly 100k of them are duplicates. I used an aggregation pipeline to find and delete the duplicates, but it is slow and not ideal for production. Is there a better, faster way to remove duplicates from a collection?
What I have tried so far:
```
db.mycollection.aggregate([
  // Only look at documents created in the window where the duplicates were inserted
  { "$match": {
      "create_date_audit": {
        $gte: ISODate('2022-07-25T18:27:56.084+00:00'),
        $lte: ISODate('2022-07-26T20:15:50.561+00:00')
      }
  }},
  // Newest _id first, so the first element of "dups" is the document we keep
  { "$sort": { _id: -1 } },
  // Group by the fields that define a duplicate and collect the _ids
  { "$group": {
      _id: {
        notification_id: '$notifId',
        empId: '$empId',
        date: '$date'
      },
      dups: { $push: '$_id' },
      creationTimestamp: { $push: '$create_date' },
      count: { $sum: 1 }
  }},
  // Keep only the groups that actually contain duplicates
  { "$match": {
      _id: { $ne: null },
      count: { $gt: 1 }
  }}
], { allowDiskUse: true }).forEach(function(doc) {
  doc.dups.shift();                                        // keep the first (newest) _id
  db.mycollection.deleteMany({ _id: { $in: doc.dups } });  // delete the remaining duplicates
});
```
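One idea I had was to batch the deletes with `bulkWrite` instead of issuing one `deleteMany` per duplicate group, roughly like the sketch below (same collection and field names as above; the batch size of 1000 is just a guess, and I haven't measured whether this is actually faster on a prod-sized collection):

```
// Sketch: collect duplicate _ids and delete them in unordered bulk batches,
// instead of one deleteMany round trip per duplicate group.
var ops = [];
db.mycollection.aggregate([
  { "$match": {
      "create_date_audit": {
        $gte: ISODate('2022-07-25T18:27:56.084+00:00'),
        $lte: ISODate('2022-07-26T20:15:50.561+00:00')
      }
  }},
  { "$sort": { _id: -1 } },
  { "$group": {
      _id: { notification_id: '$notifId', empId: '$empId', date: '$date' },
      dups: { $push: '$_id' },
      count: { $sum: 1 }
  }},
  { "$match": { count: { $gt: 1 } } }
], { allowDiskUse: true }).forEach(function(doc) {
  doc.dups.shift();                       // keep one document per group
  doc.dups.forEach(function(id) {
    ops.push({ deleteOne: { filter: { _id: id } } });
  });
  if (ops.length >= 1000) {               // flush in batches of ~1000 operations
    db.mycollection.bulkWrite(ops, { ordered: false });
    ops = [];
  }
});
if (ops.length > 0) {                     // flush whatever is left
  db.mycollection.bulkWrite(ops, { ordered: false });
}
```

This cuts the number of round trips for the deletes, but the aggregation scan itself is unchanged, so I'm not sure it addresses the real bottleneck.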