MongoServerError: Failed to acquire DDL lock for namespace 'sustaindb' after 300000 ms that is currently locked with reason 'dropCollection_V2'

jjvolesky · February 7, 2025, 2:00pm

I receive this error when attempting to drop any collection on any database on my lab’s sharded MongoDB cluster. All of our machines run AlmaLinux 8.10 and we are using MongoDB 7.0.1. Our sharded cluster consists of one config replica set (three machines total), five shard replica sets (15 machines total), and a mongos on all 25 machines in our cluster.

Here is an example of the error output:

[direct: mongos] sustaindb> db.MyCollection.drop()
MongoServerError: Failed to acquire DDL lock for namespace 'sustaindb' after 300000 ms that is currently locked with reason 'dropCollection_V2'

[direct: mongos] devdb> db.test_drop.drop()
MongoServerError: Failed to acquire DDL lock for namespace 'devdb' after 300000 ms that is currently locked with reason 'dropCollection_V2'

`

In an attempt to fix the issue we tried shutting down all config servers and mongos and mongod processes. However, the issue persisted after restarting the cluster. I've also tried to identify the operations that could possibly be holding the DDL lock, and believe they are the following three operations:

`
[direct: mongos] devdb> db.currentOp({"command.reason.command": 'dropCollection'})
{
  inprog: [
    {
      shard: 'shard1rs',
      type: 'op',
      host: 'lattice-151:27017',
      desc: 'conn174',
      connectionId: 174,
      client_s: '129.82.208.173:35544',
      clientMetadata: {
        driver: {
          name: 'NetworkInterfaceTL-ShardingDDLCoordinatorNetwork',
          version: '7.0.1'
        },
        os: {
          type: 'Linux',
          name: 'AlmaLinux release 8.10 (Cerulean Leopard)',
          architecture: 'x86_64',
          version: 'Kernel 4.18.0-553.34.1.el8_10.x86_64'
        }
      },
      active: true,
      currentOpTime: '2025-02-06T11:51:18.054-07:00',
      effectiveUsers: [ { user: '__system', db: 'local' } ],
      threaded: true,
      opid: 'shard1rs:17320448',
      lsid: {
        id: new UUID("3944ee01-0b7a-49b4-9c00-d18ca46c3ce8"),
        uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
      },
      secs_running: Long("0"),
      microsecs_running: Long("128232"),
      op: 'command',
      ns: 'MMM_FFAR.$cmd',
      redacted: false,
      command: {
        _shardsvrParticipantBlock: 'FFAR_Flux_Measurements',
        blockType: 'ReadsAndWrites',
        reason: {
          command: 'dropCollection',
          ns: 'MMM_FFAR.FFAR_Flux_Measurements'
        },
        allowViews: true,
        writeConcern: { w: 'majority', wtimeout: 60000 },
        lsid: {
          id: new UUID("3944ee01-0b7a-49b4-9c00-d18ca46c3ce8"),
          uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
        },
        txnNumber: Long("19381"),
        '$audit': {
          '$impersonatedUser': { user: 'root', db: 'admin' },
          '$impersonatedRoles': [ { role: 'root', db: 'admin' } ]
        },
        mayBypassWriteBlocking: true,
        '$clusterTime': {
          clusterTime: Timestamp({ t: 1738867877, i: 1 }),
          signature: {
            hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
            keyId: Long("0")
          }
        },
        '$configTime': Timestamp({ t: 1738867876, i: 7 }),
        '$topologyTime': Timestamp({ t: 1718904344, i: 6 }),
        '$db': 'MMM_FFAR'
      },
      numYields: 0,
      waitForWriteConcernDurationMillis: Long("126"),
      locks: {},
      waitingForLock: false,
      lockStats: {
        ParallelBatchWriterMode: { acquireCount: { r: Long("2") } },
        FeatureCompatibilityVersion: { acquireCount: { w: Long("2") } },
        ReplicationStateTransition: { acquireCount: { w: Long("3") } },
        Global: { acquireCount: { w: Long("2") } },
        Database: { acquireCount: { w: Long("2") } },
        Collection: { acquireCount: { w: Long("2") } },
        Mutex: { acquireCount: { r: Long("8") } }
      },
      waitingForFlowControl: false,
      flowControlStats: { acquireCount: Long("1") }
    },
    {
      shard: 'shard1rs',
      type: 'op',
      host: 'lattice-151:27017',
      desc: 'conn173',
      connectionId: 173,
      client_s: '129.82.208.170:58842',
      clientMetadata: {
        driver: {
          name: 'NetworkInterfaceTL-ShardingDDLCoordinatorNetwork',
          version: '7.0.1'
        },
        os: {
          type: 'Linux',
          name: 'AlmaLinux release 8.10 (Cerulean Leopard)',
          architecture: 'x86_64',
          version: 'Kernel 4.18.0-553.34.1.el8_10.x86_64'
        }
      },
      active: true,
      currentOpTime: '2025-02-06T11:51:18.054-07:00',
      effectiveUsers: [ { user: '__system', db: 'local' } ],
      threaded: true,
      opid: 'shard1rs:17318642',
      lsid: {
        id: new UUID("a102efe5-3c0f-4aff-9f21-5052ca596841"),
        uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
      },
      secs_running: Long("55"),
      microsecs_running: Long("55515458"),
      op: 'command',
      ns: 'sustaindb.$cmd',
      redacted: false,
      command: {
        _shardsvrParticipantBlock: 'ssurgo_mupolygon_all_unsimplified_polygon',
        blockType: 'ReadsAndWrites',
        reason: {
          command: 'dropCollection',
          ns: 'sustaindb.ssurgo_mupolygon_all_unsimplified_polygon'
        },
        allowViews: true,
        writeConcern: { w: 'majority', wtimeout: 60000 },
        lsid: {
          id: new UUID("a102efe5-3c0f-4aff-9f21-5052ca596841"),
          uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
        },
        txnNumber: Long("32124"),
        '$audit': {
          '$impersonatedUser': { user: 'root', db: 'admin' },
          '$impersonatedRoles': [ { role: 'root', db: 'admin' } ]
        },
        mayBypassWriteBlocking: true,
        '$clusterTime': {
          clusterTime: Timestamp({ t: 1738867822, i: 1062 }),
          signature: {
            hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
            keyId: Long("0")
          }
        },
        '$configTime': Timestamp({ t: 1738867822, i: 1056 }),
        '$topologyTime': Timestamp({ t: 1718904344, i: 6 }),
        '$db': 'sustaindb'
      },
      numYields: 0,
      waitForWriteConcernDurationMillis: Long("55513"),
      locks: {},
      waitingForLock: false,
      lockStats: {
        ParallelBatchWriterMode: { acquireCount: { r: Long("2") } },
        FeatureCompatibilityVersion: { acquireCount: { w: Long("2") } },
        ReplicationStateTransition: { acquireCount: { w: Long("3") } },
        Global: { acquireCount: { w: Long("2") } },
        Database: { acquireCount: { w: Long("2") } },
        Collection: { acquireCount: { w: Long("2") } },
        Mutex: { acquireCount: { r: Long("8") } }
      },
      waitingForFlowControl: false,
      flowControlStats: { acquireCount: Long("1") }
    },
    {
      shard: 'shard1rs',
      type: 'op',
      host: 'lattice-151:27017',
      desc: 'conn172',
      connectionId: 172,
      client_s: '129.82.208.167:37256',
      clientMetadata: {
        driver: {
          name: 'NetworkInterfaceTL-ShardingDDLCoordinatorNetwork',
          version: '7.0.1'
        },
        os: {
          type: 'Linux',
          name: 'AlmaLinux release 8.10 (Cerulean Leopard)',
          architecture: 'x86_64',
          version: 'Kernel 4.18.0-553.34.1.el8_10.x86_64'
        }
      },
      active: true,
      currentOpTime: '2025-02-06T11:51:18.054-07:00',
      effectiveUsers: [ { user: '__system', db: 'local' } ],
      threaded: true,
      opid: 'shard1rs:17319288',
      lsid: {
        id: new UUID("ba740141-0f32-4b6c-9251-1d2dcf88cf02"),
        uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
      },
      secs_running: Long("36"),
      microsecs_running: Long("36322705"),
      op: 'command',
      ns: 'devdb.$cmd',
      redacted: false,
      command: {
        _shardsvrParticipantBlock: 'quench_coagmet_weather_TEST',
        blockType: 'ReadsAndWrites',
        reason: {
          command: 'dropCollection',
          ns: 'devdb.quench_coagmet_weather_TEST'
        },
        allowViews: true,
        writeConcern: { w: 'majority', wtimeout: 60000 },
        lsid: {
          id: new UUID("ba740141-0f32-4b6c-9251-1d2dcf88cf02"),
          uid: Binary(Buffer.from("bb89d3175fb0981c86826c27759082a37160471f60504b4612415186c630ab70", "hex"), 0)
        },
        txnNumber: Long("20949"),
        '$audit': {
          '$impersonatedUser': { user: 'root', db: 'admin' },
          '$impersonatedRoles': [ { role: 'root', db: 'admin' } ]
        },
        mayBypassWriteBlocking: true,
        '$clusterTime': {
          clusterTime: Timestamp({ t: 1738867841, i: 1 }),
          signature: {
            hash: Binary(Buffer.from("0000000000000000000000000000000000000000", "hex"), 0),
            keyId: Long("0")
          }
        },
        '$configTime': Timestamp({ t: 1738867838, i: 5 }),
        '$topologyTime': Timestamp({ t: 1718904344, i: 6 }),
        '$db': 'devdb'
      },
      numYields: 0,
      waitForWriteConcernDurationMillis: Long("36321"),
      locks: {},
      waitingForLock: false,
      lockStats: {
        ParallelBatchWriterMode: { acquireCount: { r: Long("2") } },
        FeatureCompatibilityVersion: { acquireCount: { w: Long("2") } },
        ReplicationStateTransition: { acquireCount: { w: Long("3") } },
        Global: { acquireCount: { w: Long("2") } },
        Database: { acquireCount: { w: Long("2") } },
        Collection: { acquireCount: { w: Long("2") } },
        Mutex: { acquireCount: { r: Long("8") } }
      },
      waitingForFlowControl: false,
      flowControlStats: { acquireCount: Long("1") }
    }
  ],
  ok: 1
}
`

I've tried to kill these operations with `db.killOp(<op_id>)`, but they continue to hang around. I cannot find any information online about resolving the 'Failed to acquire DDL lock' issue, any help would be appreciated. Let me know what additional information I can provide.

Currently, our planned solution is to tear down the cluster and rebuild it, but we would like to avoid that if possible.

Renato_Riccio · February 9, 2025, 3:59pm

Hi Jackson,

Are the majority of the data bearing voting nodes up and running for shard1rs?
You can check that executing rs.status() directly on the shard1rs.
How many data nodes do you have in the replica set? Do you use an arbiter?

Drop needs to be committed on the majority of the voting nodes and if this condition is not respected the operation will not succeed.

jjvolesky · February 9, 2025, 5:17pm

Hi Renato,

Thanks for the response.

All three data nodes for shard1rs are up running. At least, all three nodes have an active mongod process.
There are three data nodes in the replica set. I am not using an arbiter.

I get an error when trying to use rs.status() in mongosh. Below is the output of sh.status().

[direct: mongos] admin> sh.status()
shardingVersion
{ _id: 1, clusterId: ObjectId("6517a9c3513b5e1ea5598afb") }
---
shards
[
  {
    _id: 'shard1rs',
    host: 'shard1rs/lattice-151:27017,lattice-152:27017,lattice-153:27017',
    state: 1,
    topologyTime: Timestamp({ t: 1696049773, i: 2 })
  },
  {
    _id: 'shard2rs',
    host: 'shard2rs/lattice-154:27017,lattice-155:27017,lattice-156:27017',
    state: 1,
    topologyTime: Timestamp({ t: 1696049917, i: 1 })
  },
  {
    _id: 'shard3rs',
    host: 'shard3rs/lattice-157:27017,lattice-158:27017,lattice-159:27017',
    state: 1,
    topologyTime: Timestamp({ t: 1696049999, i: 5 })
  },
  {
    _id: 'shard4rs',
    host: 'shard4rs/lattice-160:27017,lattice-161:27017,lattice-162:27017',
    state: 1,
    topologyTime: Timestamp({ t: 1696050169, i: 5 })
  },
  {
    _id: 'shard5rs',
    host: 'shard5rs/lattice-163:27017,lattice-164:27017,lattice-165:27017',
    state: 1,
    topologyTime: Timestamp({ t: 1718904344, i: 6 })
  }
]
---
active mongoses
[ { '7.0.1': 60 } ]
---
autosplit
{ 'Currently enabled': 'yes' }
---
balancer
{ 'Currently enabled': 'yes', 'Currently running': 'no' }
---
databases
[
  {
    database: {
      _id: 'auth_test',
      primary: 'shard2rs',
      partitioned: false,
      version: {
        uuid: new UUID("feacc160-f91b-4564-a993-0788334bb858"),
        timestamp: Timestamp({ t: 1696482050, i: 7 }),
        lastMod: 1
      }
    },
    collections: {}
  },
  {
    database: {
      _id: 'census_ppmf',
      primary: 'shard1rs',
      partitioned: false,
      version: {
        uuid: new UUID("48b6c178-148d-48da-8579-58d447d35146"),
        timestamp: Timestamp({ t: 1718647921, i: 7 }),
        lastMod: 1
      }
    },
    collections: {}
  },
  {
    database: { _id: 'config', primary: 'config', partitioned: true },
    collections: {
      'config.system.sessions': {
        shardKey: { _id: 1 },
        unique: false,
        balancing: true,
        chunkMetadata: [
          { shard: 'shard1rs', nChunks: 1 },
          { shard: 'shard2rs', nChunks: 222 },
          { shard: 'shard3rs', nChunks: 98 },
          { shard: 'shard4rs', nChunks: 98 },
          { shard: 'shard5rs', nChunks: 255 }
        ],
        chunks: [
          'too many chunks to print, use verbose if you want to force print'
        ],
        tags: []
      }
    }
  },
  {
    database: {
      _id: 'devdb',
      primary: 'shard3rs',
      partitioned: false,
      version: {
        uuid: new UUID("39e6e499-c80a-410b-ac77-712fa9d29f03"),
        timestamp: Timestamp({ t: 1697219035, i: 1 }),
        lastMod: 1
      }
    },
    collections: {}
  },
  {
    database: {
      _id: 'MMM_FFAR',
      primary: 'shard5rs',
      partitioned: false,
      version: {
        uuid: new UUID("e452ef90-0cb3-4971-a552-f4ecc2359c7e"),
        timestamp: Timestamp({ t: 1719426193, i: 1 }),
        lastMod: 1
      }
    },
    collections: {}
  },
  {
    database: {
      _id: 'scsc_data',
      primary: 'shard2rs',
      partitioned: false,
      version: {
        uuid: new UUID("093c96e1-4a2e-4a94-8c09-52268cfe8248"),
        timestamp: Timestamp({ t: 1696728208, i: 1 }),
        lastMod: 1
      }
    },
    collections: {}
  },
  {
    database: {
      _id: 'sustaindb',
      primary: 'shard4rs',
      partitioned: false,
      version: {
        uuid: new UUID("c66eb0cc-aa7e-4d63-9cee-ef89c657075f"),
        timestamp: Timestamp({ t: 1696050410, i: 5 }),
        lastMod: 1
      }
    },
    collections: {
      'sustaindb.tract_centroid_geo_old': {
        shardKey: { _id: 1 },
        unique: false,
        balancing: true,
        chunkMetadata: [ { shard: 'shard4rs', nChunks: 1 } ],
        chunks: [
          { min: { _id: MinKey() }, max: { _id: MaxKey() }, 'on shard': 'shard4rs', 'last modified': Timestamp({ t: 1, i: 1 }) }
        ],
        tags: []
      }
    }
  },
  {
    database: {
      _id: 'test',
      primary: 'shard1rs',
      partitioned: false,
      version: {
        uuid: new UUID("0465ba40-a547-43a1-af96-4ebde10f4394"),
        timestamp: Timestamp({ t: 1708580643, i: 2 }),
        lastMod: 1
      }
    },
    collections: {}
  },
  {
    database: {
      _id: 'user',
      primary: 'shard3rs',
      partitioned: false,
      version: {
        uuid: new UUID("8d403324-f7a3-4baf-925a-be8b6391551a"),
        timestamp: Timestamp({ t: 1696725450, i: 5 }),
        lastMod: 1
      }
    },
    collections: {}
  }
]