Over the last month, we’ve been encountering issues with our HA configuration, specifically with replication between the primary and secondary servers. We are currently running MongoDB v4.4.8 across our environment.
HA Config:
Site 1: 1 Primary server, 22TB of data
Site 2: 1 Secondary server, voting, electable
Site 3: 1 Arbiter
The specs on the primary and secondary are: 28-core/56-thread CPU, 256GB RAM, and 10Gbit network interfaces.
Oplog is set to 450GB
rs0:SECONDARY> rs.printReplicationInfo()
configured oplog size: 450000MB
log length start to end: 153741secs (42.71hrs)
oplog first event time: Sat Nov 13 2021 12:05:09 GMT-0600 (CST)
oplog last event time: Mon Nov 15 2021 06:47:30 GMT-0600 (CST)
now: Mon Nov 15 2021 13:52:15 GMT-0600 (CST)
Our secondary has fallen completely out of sync twice in the last month, which is very concerning considering that the oplog gives us 40+ hours of runway. It became even more of an issue when creating indexes, as we had to remove our secondary’s voting and priority rights to get our primary back to the expected performance for our application.
From my research, this typically comes down to network speed/bandwidth, latency, hardware specs, or configuration issues. The network and hardware specs are not a concern between the sites, but how can I narrow down the latency and potential configuration issues? Long term, I think we need to scale out with sharding to handle our data intake, but we need to at least restore our desired HA configuration so we can work towards the desired end state. We currently have a lot of pain points with the current setup, which was not designed correctly for the current use case, so any feedback/suggestions would be very greatly appreciated!
rs.conf():
rs0:SECONDARY> rs.conf()
{
"_id" : "rs0",
"version" : 171450,
"term" : 18,
"protocolVersion" : NumberLong(1),
"writeConcernMajorityJournalDefault" : true,
"members" : [
{
"_id" : 2,
"host" : "mongodb04:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
},
{
"_id" : 3,
"host" : "arbiter0:27017",
"arbiterOnly" : true,
"buildIndexes" : true,
"hidden" : false,
"priority" : 0,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
},
{
"_id" : 4,
"host" : "mongodb03:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 0,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 0
}
],
"settings" : {
"chainingAllowed" : true,
"heartbeatIntervalMillis" : 2000,
"heartbeatTimeoutSecs" : 10,
"electionTimeoutMillis" : 10000,
"catchUpTimeoutMillis" : -1,
"catchUpTakeoverDelayMillis" : 30000,
"getLastErrorModes" : {
},
"getLastErrorDefaults" : {
"w" : 1,
"wtimeout" : 0
},
"replicaSetId" : ObjectId("5ef3f5c0ea85cbd0890c2fa4")
}
}
rs.status():
rs0:SECONDARY> rs.status()
{
"set" : "rs0",
"date" : ISODate("2021-11-15T20:02:40.060Z"),
"myState" : 2,
"term" : NumberLong(18),
"syncSourceHost" : "mongodb04:27017",
"syncSourceId" : 2,
"heartbeatIntervalMillis" : NumberLong(2000),
"majorityVoteCount" : 2,
"writeMajorityCount" : 1,
"votingMembersCount" : 2,
"writableVotingMembersCount" : 1,
"optimes" : {
"lastCommittedOpTime" : {
"ts" : Timestamp(1637006807, 3168),
"t" : NumberLong(18)
},
"lastCommittedWallTime" : ISODate("2021-11-15T20:06:47.588Z"),
"readConcernMajorityOpTime" : {
"ts" : Timestamp(1636981009, 2780),
"t" : NumberLong(18)
},
"readConcernMajorityWallTime" : ISODate("2021-11-15T12:56:50.315Z"),
"appliedOpTime" : {
"ts" : Timestamp(1636981009, 2780),
"t" : NumberLong(18)
},
"durableOpTime" : {
"ts" : Timestamp(1636981009, 2780),
"t" : NumberLong(18)
},
"lastAppliedWallTime" : ISODate("2021-11-15T12:56:50.315Z"),
"lastDurableWallTime" : ISODate("2021-11-15T12:56:50.315Z")
},
"members" : [
{
"_id" : 2,
"name" : "mongodb04:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 34617,
"optime" : {
"ts" : Timestamp(1637006807, 3771),
"t" : NumberLong(18)
},
"optimeDurable" : {
"ts" : Timestamp(1637006807, 3168),
"t" : NumberLong(18)
},
"optimeDate" : ISODate("2021-11-15T20:06:47Z"),
"optimeDurableDate" : ISODate("2021-11-15T20:06:47Z"),
"lastHeartbeat" : ISODate("2021-11-15T20:02:39.879Z"),
"lastHeartbeatRecv" : ISODate("2021-11-15T20:02:39.513Z"),
"pingMs" : NumberLong(30),
"lastHeartbeatMessage" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "",
"electionTime" : Timestamp(1630031492, 1),
"electionDate" : ISODate("2021-08-27T02:31:32Z"),
"configVersion" : 171450,
"configTerm" : 18
},
{
"_id" : 3,
"name" : "arbiter0:27017",
"health" : 1,
"state" : 7,
"stateStr" : "ARBITER",
"uptime" : 34617,
"lastHeartbeat" : ISODate("2021-11-15T20:02:38.930Z"),
"lastHeartbeatRecv" : ISODate("2021-11-15T20:02:38.173Z"),
"pingMs" : NumberLong(39),
"lastHeartbeatMessage" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "",
"configVersion" : 171450,
"configTerm" : 18
},
{
"_id" : 4,
"name" : "mongodb03:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 34692,
"optime" : {
"ts" : Timestamp(1636981009, 2780),
"t" : NumberLong(18)
},
"optimeDate" : ISODate("2021-11-15T12:56:49Z"),
"syncSourceHost" : "mongodb04:27017",
"syncSourceId" : 2,
"infoMessage" : "",
"configVersion" : 171450,
"configTerm" : 18,
"self" : true,
"lastHeartbeatMessage" : ""
}
],
"ok" : 1,
"$clusterTime" : {
"clusterTime" : Timestamp(1637006807, 3771),
"signature" : {
"hash" : BinData(0,"Y2W6fSCoS7BqKUvYnc/Ww7+mjPw="),
"keyId" : NumberLong("6976771418520289346")
}
},
"operationTime" : Timestamp(1636981009, 2780)
}