I receive this error when attempting to drop any collection in any database on my lab’s sharded MongoDB cluster. All of our machines run AlmaLinux 8.10 and we are using MongoDB 7.0.1. The cluster consists of one config server replica set (three machines), five shard replica sets (15 machines total), and a mongos instance on each of the 25 machines in our cluster.
Here is an example of the error output:

```
[direct: mongos] sustaindb> db.MyCollection.drop()
MongoServerError: Failed to acquire DDL lock for namespace 'sustaindb' after 300000 ms that is currently locked with reason 'dropCollection_V2'
[direct: mongos] devdb> db.test_drop.drop()
MongoServerError: Failed to acquire DDL lock for namespace 'devdb' after 300000 ms that is currently locked with reason 'dropCollection_V2'
```
In an attempt to fix the issue, we tried shutting down every config server, mongos, and mongod process, but the problem persisted after restarting the cluster. I have also tried to identify the operations that could be holding the DDL lock, and I believe they are the following three:
```
[direct: mongos] devdb> db.currentOp({"command.reason.command": 'dropCollection'})
{
inprog: [
{
shard: 'shard1rs',
type: 'op',
host: 'lattice-151:27017',
desc: 'conn174',
connectionId: 174,
client_s: '129.82.208.173:35544',
clientMetadata: {
driver: {
name: 'NetworkInterfaceTL-ShardingDDLCoordinatorNetwork',
version: '7.0.1'
},
os: {
type: 'Linux',
name: 'AlmaLinux release 8.10 (Cerulean Leopard)',
architecture: 'x86_64',
version: 'Kernel 4.18.0-553.34.1.el8_10.x86_64'
}
},
active: true,
currentOpTime: '2025-02-06T11:51:18.054-07:00',
effectiveUsers: [ { user: '__system', db: 'local' } ],
threaded: true,
opid: 'shard1rs:17320448',
lsid: {
id: new UUID("3944ee01-0b7a-49b4-9c00-d18ca46c3ce8"),
uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
},
secs_running: Long("0"),
microsecs_running: Long("128232"),
op: 'command',
ns: 'MMM_FFAR.$cmd',
redacted: false,
command: {
_shardsvrParticipantBlock: 'FFAR_Flux_Measurements',
blockType: 'ReadsAndWrites',
reason: {
command: 'dropCollection',
ns: 'MMM_FFAR.FFAR_Flux_Measurements'
},
allowViews: true,
writeConcern: { w: 'majority', wtimeout: 60000 },
lsid: {
id: new UUID("3944ee01-0b7a-49b4-9c00-d18ca46c3ce8"),
uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
},
txnNumber: Long("19381"),
'$audit': {
'$impersonatedUser': { user: 'root', db: 'admin' },
'$impersonatedRoles': [ { role: 'root', db: 'admin' } ]
},
mayBypassWriteBlocking: true,
'$clusterTime': {
clusterTime: Timestamp({ t: 1738867877, i: 1 }),
signature: {
hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
keyId: Long("0")
}
},
'$configTime': Timestamp({ t: 1738867876, i: 7 }),
'$topologyTime': Timestamp({ t: 1718904344, i: 6 }),
'$db': 'MMM_FFAR'
},
numYields: 0,
waitForWriteConcernDurationMillis: Long("126"),
locks: {},
waitingForLock: false,
lockStats: {
ParallelBatchWriterMode: { acquireCount: { r: Long("2") } },
FeatureCompatibilityVersion: { acquireCount: { w: Long("2") } },
ReplicationStateTransition: { acquireCount: { w: Long("3") } },
Global: { acquireCount: { w: Long("2") } },
Database: { acquireCount: { w: Long("2") } },
Collection: { acquireCount: { w: Long("2") } },
Mutex: { acquireCount: { r: Long("8") } }
},
waitingForFlowControl: false,
flowControlStats: { acquireCount: Long("1") }
},
{
shard: 'shard1rs',
type: 'op',
host: 'lattice-151:27017',
desc: 'conn173',
connectionId: 173,
client_s: '129.82.208.170:58842',
clientMetadata: {
driver: {
name: 'NetworkInterfaceTL-ShardingDDLCoordinatorNetwork',
version: '7.0.1'
},
os: {
type: 'Linux',
name: 'AlmaLinux release 8.10 (Cerulean Leopard)',
architecture: 'x86_64',
version: 'Kernel 4.18.0-553.34.1.el8_10.x86_64'
}
},
active: true,
currentOpTime: '2025-02-06T11:51:18.054-07:00',
effectiveUsers: [ { user: '__system', db: 'local' } ],
threaded: true,
opid: 'shard1rs:17318642',
lsid: {
id: new UUID("a102efe5-3c0f-4aff-9f21-5052ca596841"),
uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
},
secs_running: Long("55"),
microsecs_running: Long("55515458"),
op: 'command',
ns: 'sustaindb.$cmd',
redacted: false,
command: {
_shardsvrParticipantBlock: 'ssurgo_mupolygon_all_unsimplified_polygon',
blockType: 'ReadsAndWrites',
reason: {
command: 'dropCollection',
ns: 'sustaindb.ssurgo_mupolygon_all_unsimplified_polygon'
},
allowViews: true,
writeConcern: { w: 'majority', wtimeout: 60000 },
lsid: {
id: new UUID("a102efe5-3c0f-4aff-9f21-5052ca596841"),
uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
},
txnNumber: Long("32124"),
'$audit': {
'$impersonatedUser': { user: 'root', db: 'admin' },
'$impersonatedRoles': [ { role: 'root', db: 'admin' } ]
},
mayBypassWriteBlocking: true,
'$clusterTime': {
clusterTime: Timestamp({ t: 1738867822, i: 1062 }),
signature: {
hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
keyId: Long("0")
}
},
'$configTime': Timestamp({ t: 1738867822, i: 1056 }),
'$topologyTime': Timestamp({ t: 1718904344, i: 6 }),
'$db': 'sustaindb'
},
numYields: 0,
waitForWriteConcernDurationMillis: Long("55513"),
locks: {},
waitingForLock: false,
lockStats: {
ParallelBatchWriterMode: { acquireCount: { r: Long("2") } },
FeatureCompatibilityVersion: { acquireCount: { w: Long("2") } },
ReplicationStateTransition: { acquireCount: { w: Long("3") } },
Global: { acquireCount: { w: Long("2") } },
Database: { acquireCount: { w: Long("2") } },
Collection: { acquireCount: { w: Long("2") } },
Mutex: { acquireCount: { r: Long("8") } }
},
waitingForFlowControl: false,
flowControlStats: { acquireCount: Long("1") }
},
{
shard: 'shard1rs',
type: 'op',
host: 'lattice-151:27017',
desc: 'conn172',
connectionId: 172,
client_s: '129.82.208.167:37256',
clientMetadata: {
driver: {
name: 'NetworkInterfaceTL-ShardingDDLCoordinatorNetwork',
version: '7.0.1'
},
os: {
type: 'Linux',
name: 'AlmaLinux release 8.10 (Cerulean Leopard)',
architecture: 'x86_64',
version: 'Kernel 4.18.0-553.34.1.el8_10.x86_64'
}
},
active: true,
currentOpTime: '2025-02-06T11:51:18.054-07:00',
effectiveUsers: [ { user: '__system', db: 'local' } ],
threaded: true,
opid: 'shard1rs:17319288',
lsid: {
id: new UUID("ba740141-0f32-4b6c-9251-1d2dcf88cf02"),
uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
},
secs_running: Long("36"),
microsecs_running: Long("36322705"),
op: 'command',
ns: 'devdb.$cmd',
redacted: false,
command: {
_shardsvrParticipantBlock: 'quench_coagmet_weather_TEST',
blockType: 'ReadsAndWrites',
reason: {
command: 'dropCollection',
ns: 'devdb.quench_coagmet_weather_TEST'
},
allowViews: true,
writeConcern: { w: 'majority', wtimeout: 60000 },
lsid: {
id: new UUID("ba740141-0f32-4b6c-9251-1d2dcf88cf02"),
uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
},
txnNumber: Long("20949"),
'$audit': {
'$impersonatedUser': { user: 'root', db: 'admin' },
'$impersonatedRoles': [ { role: 'root', db: 'admin' } ]
},
mayBypassWriteBlocking: true,
'$clusterTime': {
clusterTime: Timestamp({ t: 1738867841, i: 1 }),
signature: {
hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
keyId: Long("0")
}
},
'$configTime': Timestamp({ t: 1738867838, i: 5 }),
'$topologyTime': Timestamp({ t: 1718904344, i: 6 }),
'$db': 'devdb'
},
numYields: 0,
waitForWriteConcernDurationMillis: Long("36321"),
locks: {},
waitingForLock: false,
lockStats: {
ParallelBatchWriterMode: { acquireCount: { r: Long("2") } },
FeatureCompatibilityVersion: { acquireCount: { w: Long("2") } },
ReplicationStateTransition: { acquireCount: { w: Long("3") } },
Global: { acquireCount: { w: Long("2") } },
Database: { acquireCount: { w: Long("2") } },
Collection: { acquireCount: { w: Long("2") } },
Mutex: { acquireCount: { r: Long("8") } }
},
waitingForFlowControl: false,
flowControlStats: { acquireCount: Long("1") }
}
],
ok: 1
}
```
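The clientMetadata on these connections shows they come from the ShardingDDLCoordinator service, so the coordinator state itself seems like the next thing to look at. Below is a minimal sketch of that check; the collection name config.system.sharding_ddl_coordinators is an assumption on my part (it is an internal, undocumented collection), and it would have to be run directly against each shard primary (for example lattice-151:27017) rather than through mongos:

```
// Assumption: each shard primary persists its active DDL coordinators in the
// internal collection config.system.sharding_ddl_coordinators. Connect directly
// to the shard primary (not via mongos) and list whatever coordinator documents
// are still registered; a stuck dropCollection coordinator should show up here.
db.getSiblingDB("config")
  .getCollection("system.sharding_ddl_coordinators")
  .find()
  .forEach(doc => printjson(doc));
```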
I've tried to kill the three operations shown in the currentOp output above with `db.killOp(<op_id>)`, but they continue to hang around. I cannot find any information online about resolving the 'Failed to acquire DDL lock' issue; any help would be appreciated. Let me know what additional information I can provide.
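For completeness, this is roughly how the kills were attempted, using the shard-qualified opids exactly as mongos reports them (I am assuming here that db.killOp() on mongos accepts the 'shardId:opid' string form, since that is the form currentOp returns):

```
// Roughly what was run from mongos as root: find the participant-block ops tied
// to dropCollection and kill each one by the opid reported above
// (e.g. 'shard1rs:17320448'). They continue to show up in currentOp afterwards.
db.currentOp({ "command.reason.command": "dropCollection" }).inprog.forEach(op => {
  print("killing " + op.opid + " for " + op.command.reason.ns);
  db.killOp(op.opid);
});
```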
Currently, our planned solution is to tear down the cluster and rebuild it, but we would like to avoid that if possible.