MongoDB version: 4.4.6
Yesterday I found that a replica set member in my sharded cluster had been down for about 5 months. I started it back up, but it is stuck in the RECOVERING state.
shard1:RECOVERING> rs.status();
{
    "set" : "shard1",
    "date" : ISODate("2022-09-01T08:24:29.761Z"),
    "myState" : 3,
    "term" : NumberLong(3),
    "tooStale" : true,
    "syncSourceHost" : "",
    "syncSourceId" : -1,
    "heartbeatIntervalMillis" : NumberLong(2000),
    "majorityVoteCount" : 2,
    "writeMajorityCount" : 2,
    "votingMembersCount" : 3,
    "writableVotingMembersCount" : 3,
    "optimes" : {
        "lastCommittedOpTime" : {
            "ts" : Timestamp(0, 0),
            "t" : NumberLong(-1)
        },
        "lastCommittedWallTime" : ISODate("1970-01-01T00:00:00Z"),
        "appliedOpTime" : {
            "ts" : Timestamp(1653075884, 37),
            "t" : NumberLong(2)
        },
        "durableOpTime" : {
            "ts" : Timestamp(1653075884, 37),
            "t" : NumberLong(2)
        },
        "lastAppliedWallTime" : ISODate("2022-05-20T19:44:44.059Z"),
        "lastDurableWallTime" : ISODate("2022-05-20T19:44:44.059Z")
    },
    "lastStableRecoveryTimestamp" : Timestamp(1653075807, 10),
    "members" : [
        {
            "_id" : 0,
            "name" : "172.16.70.197:27001",
            "health" : 1,
            "state" : 3,
            "stateStr" : "RECOVERING",
            "uptime" : 194681,
            "optime" : {
                "ts" : Timestamp(1653075884, 37),
                "t" : NumberLong(2)
            },
            "optimeDate" : ISODate("2022-05-20T19:44:44Z"),
            "syncSourceHost" : "",
            "syncSourceId" : -1,
            "maintenanceMode" : 1,
            "infoMessage" : "",
            "configVersion" : 2,
            "configTerm" : 3,
            "self" : true,
            "lastHeartbeatMessage" : ""
        },
        {
            "_id" : 1,
            "name" : "172.16.70.198:27001",
            "health" : 1,
            "state" : 2,
            "stateStr" : "SECONDARY",
            "uptime" : 194673,
            "optime" : {
                "ts" : Timestamp(1662020669, 359),
                "t" : NumberLong(3)
            },
            "optimeDurable" : {
                "ts" : Timestamp(1662020669, 353),
                "t" : NumberLong(3)
            },
            "optimeDate" : ISODate("2022-09-01T08:24:29Z"),
            "optimeDurableDate" : ISODate("2022-09-01T08:24:29Z"),
            "lastHeartbeat" : ISODate("2022-09-01T08:24:29.606Z"),
            "lastHeartbeatRecv" : ISODate("2022-09-01T08:24:28.497Z"),
            "pingMs" : NumberLong(0),
            "lastHeartbeatMessage" : "",
            "syncSourceHost" : "172.16.70.199:27001",
            "syncSourceId" : 2,
            "infoMessage" : "",
            "configVersion" : 2,
            "configTerm" : 3
        },
        {
            "_id" : 2,
            "name" : "172.16.70.199:27001",
            "health" : 1,
            "state" : 1,
            "stateStr" : "PRIMARY",
            "uptime" : 194673,
            "optime" : {
                "ts" : Timestamp(1662020669, 359),
                "t" : NumberLong(3)
            },
            "optimeDurable" : {
                "ts" : Timestamp(1662020669, 312),
                "t" : NumberLong(3)
            },
            "optimeDate" : ISODate("2022-09-01T08:24:29Z"),
            "optimeDurableDate" : ISODate("2022-09-01T08:24:29Z"),
            "lastHeartbeat" : ISODate("2022-09-01T08:24:29.607Z"),
            "lastHeartbeatRecv" : ISODate("2022-09-01T08:24:28.496Z"),
            "pingMs" : NumberLong(0),
            "lastHeartbeatMessage" : "",
            "syncSourceHost" : "",
            "syncSourceId" : -1,
            "infoMessage" : "",
            "electionTime" : Timestamp(1659657002, 1),
            "electionDate" : ISODate("2022-08-04T23:50:02Z"),
            "configVersion" : 2,
            "configTerm" : 3
        }
    ],
    "ok" : 1,
    "$gleStats" : {
        "lastOpTime" : Timestamp(0, 0),
        "electionId" : ObjectId("000000000000000000000000")
    },
    "lastCommittedOpTime" : Timestamp(0, 0),
    "$configServerState" : {
        "opTime" : {
            "ts" : Timestamp(1662020658, 663),
            "t" : NumberLong(4)
        }
    }
}
shard1:RECOVERING> db.printReplicationInfo()
configured oplog size: 20453.31494140625MB
log length start to end: 193968secs (53.88hrs)
oplog first event time: Wed May 18 2022 21:51:56 GMT+0800 (CST)
oplog last event time: Sat May 21 2022 03:44:44 GMT+0800 (CST)
now: Thu Sep 01 2022 16:47:46 GMT+0800 (CST)
There is about 100 GB of data in the databases. I think the numbers above explain the "tooStale" : true in rs.status(): this member's oplog ends at 2022-05-20, and after more than three months of downtime the other members' oplogs surely no longer contain that point in time, so there is no common point left to resume syncing from.
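A quick check that I believe confirms the gap (a sketch to run on a healthy member; local.oplog.rs is the replica set's oplog collection):

// run on a healthy member: oldest entry still present in its oplog
db.getSiblingDB("local").oplog.rs.find({}, { ts : 1 }).sort({ $natural : 1 }).limit(1)
// if that ts is newer than the stale member's last applied optime,
// Timestamp(1653075884, 37), no common sync point remains and the member
// cannot catch up by replaying the oplog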
I found these two links:
- https://www.mongodb.com/docs/v4.4/tutorial/resync-replica-set-member/
- What is the major reason for secondary server going into Recovery state
How can I get this member back to the SECONDARY state? The options I can think of are below (a rough sketch for each follows the list):
- Delete the data directory and restart mongod so the member performs an initial sync (sketch 1 below).
- Stop another SECONDARY in the replica set and copy its data files over; I do not know whether it is acceptable to have two of the three members down at the same time (sketch 2 below).
- Increase the oplog size; I do not know how to estimate a good value (sketch 3 below).
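Sketch 1, the initial-sync route. The dbPath /data/shard1 and the service name are assumptions; my actual paths may differ:

# on the stale member (172.16.70.197)
systemctl stop mongod                 # stop the stale mongod
mv /data/shard1 /data/shard1.bak      # keep the old files until the sync succeeds
mkdir /data/shard1                    # mongod finds an empty dbPath ...
systemctl start mongod                # ... and runs a full initial sync from a healthy member

Sketch 2, the file-copy route. As I understand it, the donor must be shut down cleanly first, and both nodes are down while the copy runs, which is exactly my worry about stopping two instances at once:

# on the donor SECONDARY (172.16.70.198)
systemctl stop mongod                                # clean shutdown so the files are consistent
rsync -a /data/shard1/ 172.16.70.197:/data/shard1/   # copy the data files to the stale member
systemctl start mongod                               # bring the donor back
# then start mongod on 172.16.70.197 with the copied files;
# it should resume replication from the donor's recent optime

Sketch 3, resizing the oplog. 4.4 supports resizing online with replSetResizeOplog, and since the oplog window is roughly size divided by write rate, my current numbers (about 20 GB covering about 54 hours) give a way to estimate. But as far as I know, resizing cannot bring back entries that have already rolled off, so this would only prevent a recurrence, not fix the stale member:

// run on each data-bearing member; size is in MB (51200 MB = 50 GB, an assumed target)
db.adminCommand({ replSetResizeOplog : 1, size : 51200 })
// estimate: ~20 GB covers ~54 h of writes here, so surviving e.g. a 2-week
// outage (336 h) would need roughly 20 GB * 336 / 54 ≈ 125 GB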
I need your help; thanks in advance.