Hey @Abishan_Parameswaran,
I think this looks a bit like something I have done for the Open Data COVID-19 project.
In this project, I have documents like this:
{
"_id": {
"$oid": "612e13c475ca254df57156e1"
},
"uid": 860,
"country_iso2": "UZ",
"country_iso3": "UZB",
"country_code": 860,
"country": "Uzbekistan",
"combined_name": "Uzbekistan",
"population": 33469199,
"loc": {
"type": "Point",
"coordinates": [
64.5853,
41.3775
]
},
"date": {
"$date": "2021-08-30T00:00:00.000Z"
},
"confirmed": 155639,
"deaths": 1075,
"recovered": 0,
"confirmed_daily": 795,
"deaths_daily": 5,
"recovered_daily": 0
}
The raw data I retrieve from JHU only contains the cumulative values for confirmed, deaths and recovered. The 3 fields confirmed_daily
, deaths_daily
and recovered_daily
are calculated by applying a simple math operation: cumulative_value_today - cumulative_value_yesterday = daily_count
.
I feel like you are trying to achieve the same thing here. Initially my documents don’t contain these 3 daily fields. But they are added by an aggregation pipeline that calculates the differences for each document and then merges these result within the relevant document.
Here is the function I’m using in my Python code. Hopefully you can read it
.
def calculate_daily_counts(client, collection, unique_daily_field):
start = time.time()
coll = client.get_database(DB).get_collection(collection)
pipeline = [
{"$sort": {unique_daily_field: 1, "date": 1}},
{"$group": {"_id": "$" + unique_daily_field, "docs": {"$push": {"dt": "$date", "c": "$confirmed", "d": "$deaths", "r": "$recovered"}}}},
{
"$set": {
"docs": {
"$map": {
"input": {"$range": [0, {"$size": "$docs"}]},
"as": "idx",
"in": {
"$let": {
"vars": {"d0": {"$arrayElemAt": ["$docs", {"$max": [0, {"$subtract": ["$$idx", 1]}]}]}, "d1": {"$arrayElemAt": ["$docs", "$$idx"]}},
"in": {"dt": "$$d1.dt", "dc": {"$subtract": ["$$d1.c", "$$d0.c"]}, "dd": {"$subtract": ["$$d1.d", "$$d0.d"]},
"dr": {"$subtract": ["$$d1.r", "$$d0.r"]}}
}
}
}
}
}
},
{"$unwind": "$docs"},
{"$project": {"_id": "$$REMOVE", unique_daily_field: "$_id", "date": "$docs.dt", "confirmed_daily": {"$ifNull": ["$docs.dc", "$$REMOVE"]},
"deaths_daily": {"$ifNull": ["$docs.dd", "$$REMOVE"]}, "recovered_daily": {"$ifNull": ["$docs.dr", "$$REMOVE"]}}},
{"$merge": {"into": collection, "on": [unique_daily_field, "date"], "whenNotMatched": "fail"}}
]
coll.aggregate(pipeline, allowDiskUse=True)
print('Calculated daily fields for', collection, 'in', round(time.time() - start, 2), 's')
I hope this helps.
Cheers,
Maxime.