Hi,
we have a couple of “microservices”, based on NodeJS 18 with MongoDB driver 4, run via Docker. Those microservices connect to a PSA replica set (MongoDB 5.0.13).
There’s no problem with the connection on startup. However, some of these services fail to reconnect in a failover situation (MongoServerSelectionError: Server selection timed out after 30000 ms). It’s not the same services everytime. It seems, that services are especially prone to error if they are constantly writing to the Replica Set, but others are affected as well.
Reconnect is actually handled inside MongoDB driver, but we are logging events like topologyDescriptionChanged and serverDescriptionChanged, so we know the driver topology goes from “ReplicaSetWithPrimary” to “ReplicaSetNoPrimary”. But rs.status() shows, that, of course, there is a PRIMARY.
Actually there is another point that bothers us - the reason for the failover situation: the SECONDARY calls out for ELECTION: “Starting an election, since we’ve seen no PRIMARY in election timeout period”
which is acknowledged by the former PRIMARY, but not by ARBITER, which says “can see a healthy primary (mongodb1.domain.local:27017) of equal or greater priority”. The last action in SECONDARY’s log before ELECTION is “a long time” (30-60s) before to set WT_SESSION.checkpoint, no sign of losing connection to the PRIMARY.
Of course, this will somewhat be due to the network issues, but could such behaviour be related? Clients should reconnect after the PRIMARY changed, so why can’t they recognize the new PRIMARY?
I had opened a report on JIRA (https://jira.mongodb.org/browse/NODE-4643), when I tried to reproduce our findings from production in a simple test replica set, but it turned out, the specific information had been influenced by debugger.
Nevertheless, the actual problem remains, and we are seeking advice how to reconnect our services reliably.
I want to mention that we never experienced such problems as long as we were running MongoDB 4. It started after the upgrade to 5.0. However, I cannot see what could cause these reconnect failures. We had
“replication.enableMajorityReadConcern: false”, though. Also, the performance degradation with the PSA replica set should either not be related or be irrelevant since PRIMARY and SECONDARY are not absent for long. When we restart the services they are connecting immediately.
Finally here’s some diagnostic output.
rs.conf()
{
"_id" : "rs0",
"version" : 14,
"term" : 1547,
"members" : [
{
"_id" : 2,
"host" : "mongodb1.domain.local:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"secondaryDelaySecs" : NumberLong(0),
"votes" : 1
},
{
"_id" : 3,
"host" : "mongodb2.domain.local:27017",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"secondaryDelaySecs" : NumberLong(0),
"votes" : 1
},
{
"_id" : 4,
"host" : "mongodb3.domain.local:27017",
"arbiterOnly" : true,
"buildIndexes" : true,
"hidden" : false,
"priority" : 0,
"tags" : {
},
"secondaryDelaySecs" : NumberLong(0),
"votes" : 1
}
],
"protocolVersion" : NumberLong(1),
"writeConcernMajorityJournalDefault" : true,
"settings" : {
"chainingAllowed" : true,
"heartbeatIntervalMillis" : 2000,
"heartbeatTimeoutSecs" : 10,
"electionTimeoutMillis" : 10000,
"catchUpTimeoutMillis" : -1,
"catchUpTakeoverDelayMillis" : 30000,
"getLastErrorModes" : {
},
"getLastErrorDefaults" : {
"w" : 1,
"wtimeout" : 0
},
"replicaSetId" : ObjectId("5caf7fbcfdb32bdedd3d0fbe")
}
}
rs.status()
{
"set" : "rs0",
"date" : ISODate("2022-10-26T09:59:24.613Z"),
"myState" : 1,
"term" : NumberLong(1547),
"syncSourceHost" : "",
"syncSourceId" : -1,
"heartbeatIntervalMillis" : NumberLong(2000),
"majorityVoteCount" : 2,
"writeMajorityCount" : 2,
"votingMembersCount" : 3,
"writableVotingMembersCount" : 2,
"optimes" : {
"lastCommittedOpTime" : {
"ts" : Timestamp(1666778360, 1),
"t" : NumberLong(1547)
},
"lastCommittedWallTime" : ISODate("2022-10-26T09:59:20.111Z"),
"readConcernMajorityOpTime" : {
"ts" : Timestamp(1666778360, 1),
"t" : NumberLong(1547)
},
"appliedOpTime" : {
"ts" : Timestamp(1666778360, 1),
"t" : NumberLong(1547)
},
"durableOpTime" : {
"ts" : Timestamp(1666778360, 1),
"t" : NumberLong(1547)
},
"lastAppliedWallTime" : ISODate("2022-10-26T09:59:20.111Z"),
"lastDurableWallTime" : ISODate("2022-10-26T09:59:20.111Z")
},
"lastStableRecoveryTimestamp" : Timestamp(1666778345, 1),
"electionCandidateMetrics" : {
"lastElectionReason" : "e
"_id" : 3,
"name" : "mongodb2.domain.local:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 139435,
"optime" : {
"ts" : Timestamp(1666778360, 1),
"t" : NumberLong(1547)
},
"optimeDate" : ISODate("2022-10-26T09:59:20Z"),
"lastAppliedWallTime" : ISODate("2022-10-26T09:59:20.111Z"),
"lastDurableWallTime" : ISODate("2022-10-26T09:59:20.111Z"),
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "",
"electionTime" : Timestamp(1666742992, 1),
"electionDate" : ISODate("2022-10-26T00:09:52Z"),
"configVersion" : 14,
"configTerm" : 1547,
"self" : true,
"lastHeartbeatMessage" : ""
},
{
"_id" : 4,
"name" : "mongodb3.domain.local:27017",
"health" : 1,
"state" : 7,
"stateStr" : "ARBITER",
"uptime" : 77769,
"lastHeartbeat" : ISODate("2022-10-26T09:59:23.173Z"),
"lastHeartbeatRecv" : ISODate("2022-10-26T09:59:23.792Z"),
"pingMs" : NumberLong(0),
"lastHeartbeatMessage" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "",
"configVersion" : 14,
"configTerm" : 1547
}
],
"ok" : 1,
"$clusterTime" : {
"clusterTime" : Timestamp(1666778360, 1),
"signature" : {
"hash" : BinData(0,"gOSNYK9Irr+uJ3V5qJXz+tx/9QA="),
"keyId" : NumberLong("7114560952992792606")
}
},
"operationTime" : Timestamp(1666778360, 1)
}
serverDescriptionChanged (last one in error case):
{
"topologyId": 0,
"address": "mongodb1.domain.local:27017",
"previousDescription": {
"address": "mongodb1.domain.local:27017",
"type": "RSPrimary",
"hosts": [
"mongodb1.domain.local:27017",
"mongodb2.domain.local:27017"
],
"passives": [],
"arbiters": [
"mongodb3.domain.local:27017"
],
"tags": {},
"minWireVersion": 0,
"maxWireVersion": 13,
"roundTripTime": 0.6801772302843527,
"lastUpdateTime": 1069475265,
"lastWriteDate": "2022-10-26T00:09:50.000Z",
"error": null,
"topologyVersion": {
"processId": "6356e3fb92313b936c04b9c6",
"counter": 6
},
"setName": "rs0",
"setVersion": 14,
"electionId": "7fffffff000000000000060a",
"logicalSessionTimeoutMinutes": 30,
"primary": "mongodb1.domain.local:27017",
"me": "mongodb1.domain.local:27017",
"$clusterTime": {
"clusterTime": {
"$timestamp": "7158606632887255041"
},
"signature": {
"hash": "gzCLGs886hw9kx0T5gEF9/J1dCU=",
"keyId": {
"low": 30,
"high": 1656487806,
"unsigned": false
}
}
}
},
"newDescription": {
"address": "mongodb1.domain.local:27017",
"type": "Unknown",
"hosts": [],
"passives": [],
"arbiters": [],
"tags": {},
"minWireVersion": 0,
"maxWireVersion": 0,
"roundTripTime": -1,
"lastUpdateTime": 1070049651,
"lastWriteDate": 0,
"error": {
"topologyVersion": {
"processId": "6356e3fb92313b936c04b9c6",
"counter": 8
},
"ok": 0,
"code": 13435,
"codeName": "NotPrimaryNoSecondaryOk",
"$clusterTime": {
"clusterTime": {
"$timestamp": "7158609102493450243"
},
"signature": {
"hash": "/9kVPKo5sMY3xHu9XI4UPv3liTM=",
"keyId": {
"low": 30,
"high": 1656487806,
"unsigned": false
}
}
},
"operationTime": {
"$timestamp": "7158609102493450243"
}
},
"topologyVersion": {
"processId": "6356e3fb92313b936c04b9c6",
"counter": 8
},
"setName": null,
"setVersion": null,
"electionId": null,
"logicalSessionTimeoutMinutes": null,
"primary": null,
"me": null,
"$clusterTime": null
}
}
topologyDescriptionChanged (also last one in error case):
{
"topologyId": 0,
"previousDescription": {
"type": "ReplicaSetWithPrimary",
"servers": {},
"stale": false,
"compatible": true,
"heartbeatFrequencyMS": 10000,
"localThresholdMS": 15,
"setName": "rs0",
"maxElectionId": "7fffffff000000000000060a",
"maxSetVersion": 14,
"commonWireVersion": 0,
"logicalSessionTimeoutMinutes": 30
},
"newDescription": {
"type": "ReplicaSetNoPrimary",
"servers": {},
"stale": false,
"compatible": true,
"heartbeatFrequencyMS": 10000,
"localThresholdMS": 15,
"setName": "rs0",
"maxElectionId": "7fffffff000000000000060a",
"maxSetVersion": 14,
"commonWireVersion": 0,
"logicalSessionTimeoutMinutes": 30
}
}
Best regards