ShardingTaskExecutor-PoolMinSize

hi gurus,
we run community version 4.4.2 on Ubuntu 18.04, on 90 shards of 3 LXC-containers each (thus 270 containers), on top of XFS filesystems.
When we start our continuous load stream, some shard servers soon crash with the following logs:

{"t":{"$date":"2021-01-04T13:45:47.800+01:00"},"s":"F",  "c":"CONTROL",  "id":4757800, "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"Writing fatal message","attr":{"message":"terminate() called. An exception is active; attempting to gather more information"}}
{"t":{"$date":"2021-01-04T13:45:47.800+01:00"},"s":"F",  "c":"CONTROL",  "id":4757800, "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"Writing fatal message","attr":{"message":"DBException::toString(): NetworkInterfaceExceededTimeLimit: Remote command timed out while waiting to get a connection from the pool, took 31481ms, timeout was set to 20000ms\nActual exception type: mongo::error_details::ExceptionForImpl<(mongo::ErrorCodes::Error)202, mongo::ExceptionForCat<(mongo::ErrorCategory)1>, mongo::ExceptionForCat<(mongo::ErrorCategory)10> >\n"}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31431,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"BACKTRACE: {bt}","attr":{"bt":{"backtrace":[{"a":"563F1146B921","b":"563F0E788000","o":"2CE3921","s":"_ZN5mongo18stack_trace_detail12_GLOBAL__N_119printStackTraceImplERKNS1_7OptionsEPNS_14StackTraceSinkE.constprop.606","s+":"1E1"},{"a":"563F1146CF59","b":"563F0E788000","o":"2CE4F59","s":"_ZN5mongo15printStackTraceEv","s+":"29"},{"a":"563F1146A5F6","b":"563F0E788000","o":"2CE25F6","s":"_ZN5mongo12_GLOBAL__N_111myTerminateEv","s+":"A6"},{"a":"563F115F5A16","b":"563F0E788000","o":"2E6DA16","s":"_ZN10__cxxabiv111__terminateEPFvvE","s+":"6"},{"a":"563F11689DB9","b":"563F0E788000","o":"2F01DB9","s":"__cxa_call_terminate","s+":"39"},{"a":"563F115F5435","b":"563F0E788000","o":"2E6D435","s":"__gxx_personality_v0","s+":"2C5"},{"a":"7F394622F763","b":"7F394621F000","o":"10763","s":"_Unwind_GetTextRelBase","s+":"1E13"},{"a":"7F394623007D","b":"7F394621F000","o":"1107D","s":"_Unwind_Resume","s+":"12D"},{"a":"563F0F5FDC30","b":"563F0E788000","o":"E75C30","s":"_ZN5mongo8executor18NetworkInterfaceTL16CommandStateBase8setTimerEv.cold.1687","s+":"78"},{"a":"563F10E17678","b":"563F0E788000","o":"268F678","s":"_ZN5mongo8executor18NetworkInterfaceTL19ExhaustCommandState11sendRequestESt10shared_ptrINS1_12RequestStateEE","s+":"38"},{"a":"563F10E1B2A1","b":"563F0E788000","o":"26932A1","s":"_ZN5mongo8executor18NetworkInterfaceTL14RequestManager7trySendENS_10StatusWithISt10unique_ptrINS0_14ConnectionPool19ConnectionInterfaceESt8functionIFvPS6_EEEEEm","s+":"C41"},{"a":"563F10E1BB7E","b":"563F0E788000","o":"2693B7E","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZZNOS_14ExecutorFutureISt10unique_ptrINS_8executor14ConnectionPool19ConnectionInterfaceESt8functionIFvPS9_EEEE8getAsyncIZNS7_18NetworkInterfaceTL19startExhaustCommandERKNS7_12TaskExecutor14CallbackHandleERNS7_24RemoteCommandRequestImplISt6vectorINS_11HostAndPortESaISO_EEEEONS0_IFvRKNS7_26RemoteC
ommandOnAnyResponseEEEERKSt10shared_ptrINS_5BatonEEEUlT_E0_Li0EEEvOS14_ENUlNS_10StatusWithISE_EEE_clES18_EUlS1_E_EEDaS16_EN12SpecificImpl4callEOS1_","s+":"CE"},{"a":"563F10E4EA09","b":"563F0E788000","o":"26C6A09","s":"_ZN4asio6detail11executor_opINS0_15work_dispatcherIZN5mongo9transport18TransportLayerASIO11ASIOReactor8scheduleENS3_15unique_functionIFvNS3_6StatusEEEEEUlvE_EESaIvENS0_19scheduler_operationEE11do_completeEPvPSE_RKSt10error_codem","s+":"89"},{"a":"563F10F8F714","b":"563F0E788000","o":"2807714","s":"_ZN4asio6detail9scheduler10do_run_oneERNS0_27conditionally_enabled_mutex11scoped_lockERNS0_21scheduler_thread_infoERKSt10error_code","s+":"3B4"},{"a":"563F10F8F9A5","b":"563F0E788000","o":"28079A5","s":"_ZN4asio6detail9scheduler3runERSt10error_code","s+":"115"},{"a":"563F10F9762E","b":"563F0E788000","o":"280F62E","s":"_ZN4asio10io_context3runEv","s+":"3E"},{"a":"563F10E400C6","b":"563F0E788000","o":"26B80C6","s":"_ZN5mongo9transport18TransportLayerASIO11ASIOReactor3runEv","s+":"36"},{"a":"563F10E0D348","b":"563F0E788000","o":"2685348","s":"_ZN5mongo8executor18NetworkInterfaceTL4_runEv","s+":"C8"},{"a":"563F10E0D58D","b":"563F0E788000","o":"268558D","s":"_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZN5mongo4stdx6threadC4IZNS3_8executor18NetworkInterfaceTL7startupEvEUlvE_JELi0EEET_DpOT0_EUlvE_EEEEE6_M_runEv","s+":"6D"},{"a":"563F1161147F","b":"563F0E788000","o":"2E8947F","s":"execute_native_thread_routine","s+":"F"},{"a":"7F39460076DB","b":"7F3946000000","o":"76DB","s":"start_thread","s+":"DB"},{"a":"7F3945D3088F","b":"7F3945C0F000","o":"12188F","s":"clone","s+":"3F"}],"processInfo":{"mongodbVersion":"4.4.2","gitVersion":"15e73dc5738d2278b688f8929aee605fe4279b0e","compiledModules":[],"uname":{"sysname":"Linux","release":"4.15.0-122-generic","version":"#124-Ubuntu SMP Thu Oct 15 13:03:05 UTC 
2020","machine":"x86_64"},"somap":[{"b":"563F0E788000","elfType":3,"buildId":"D18F657A1E06C333C2AEE534E3047044B0653DBF"},{"b":"7F394621F000","path":"/lib/x86_64-linux-gnu/libgcc_s.so.1","elfType":3,"buildId":"039AE85FEF075EC14FE3528762A0645C8CF73B29"},{"b":"7F3946000000","path":"/lib/x86_64-linux-gnu/libpthread.so.0","elfType":3,"buildId":"28C6AADE70B2D40D1F0F3D0A1A0CAD1AB816448F"},{"b":"7F3945C0F000","path":"/lib/x86_64-linux-gnu/libc.so.6","elfType":3,"buildId":"B417C0BA7CC5CF06D1D1BED6652CEDB9253C60D0"}]}}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F1146B921","b":"563F0E788000","o":"2CE3921","s":"_ZN5mongo18stack_trace_detail12_GLOBAL__N_119printStackTraceImplERKNS1_7OptionsEPNS_14StackTraceSinkE.constprop.606","s+":"1E1"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F1146CF59","b":"563F0E788000","o":"2CE4F59","s":"_ZN5mongo15printStackTraceEv","s+":"29"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F1146A5F6","b":"563F0E788000","o":"2CE25F6","s":"_ZN5mongo12_GLOBAL__N_111myTerminateEv","s+":"A6"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F115F5A16","b":"563F0E788000","o":"2E6DA16","s":"_ZN10__cxxabiv111__terminateEPFvvE","s+":"6"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F11689DB9","b":"563F0E788000","o":"2F01DB9","s":"__cxa_call_terminate","s+":"39"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F115F5435","b":"563F0E788000","o":"2E6D435","s":"__gxx_personality_v0","s+":"2C5"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F394622F763","b":"7F394621F000","o":"10763","s":"_Unwind_GetTextRelBase","s+":"1E13"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F394623007D","b":"7F394621F000","o":"1107D","s":"_Unwind_Resume","s+":"12D"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F0F5FDC30","b":"563F0E788000","o":"E75C30","s":"_ZN5mongo8executor18NetworkInterfaceTL16CommandStateBase8setTimerEv.cold.1687","s+":"78"}}}
{"t":{"$date":"2021-01-04T13:45:48.267+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10E17678","b":"563F0E788000","o":"268F678","s":"_ZN5mongo8executor18NetworkInterfaceTL19ExhaustCommandState11sendRequestESt10shared_ptrINS1_12RequestStateEE","s+":"38"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10E1B2A1","b":"563F0E788000","o":"26932A1","s":"_ZN5mongo8executor18NetworkInterfaceTL14RequestManager7trySendENS_10StatusWithISt10unique_ptrINS0_14ConnectionPool19ConnectionInterfaceESt8functionIFvPS6_EEEEEm","s+":"C41"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10E1BB7E","b":"563F0E788000","o":"2693B7E","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZZNOS_14ExecutorFutureISt10unique_ptrINS_8executor14ConnectionPool19ConnectionInterfaceESt8functionIFvPS9_EEEE8getAsyncIZNS7_18NetworkInterfaceTL19startExhaustCommandERKNS7_12TaskExecutor14CallbackHandleERNS7_24RemoteCommandRequestImplISt6vectorINS_11HostAndPortESaISO_EEEEONS0_IFvRKNS7_26RemoteCommandOnAnyResponseEEEERKSt10shared_ptrINS_5BatonEEEUlT_E0_Li0EEEvOS14_ENUlNS_10StatusWithISE_EEE_clES18_EUlS1_E_EEDaS16_EN12SpecificImpl4callEOS1_","s+":"CE"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10E4EA09","b":"563F0E788000","o":"26C6A09","s":"_ZN4asio6detail11executor_opINS0_15work_dispatcherIZN5mongo9transport18TransportLayerASIO11ASIOReactor8scheduleENS3_15unique_functionIFvNS3_6StatusEEEEEUlvE_EESaIvENS0_19scheduler_operationEE11do_completeEPvPSE_RKSt10error_codem","s+":"89"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10F8F714","b":"563F0E788000","o":"2807714","s":"_ZN4asio6detail9scheduler10do_run_oneERNS0_27conditionally_enabled_mutex11scoped_lockERNS0_21scheduler_thread_infoERKSt10error_code","s+":"3B4"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10F8F9A5","b":"563F0E788000","o":"28079A5","s":"_ZN4asio6detail9scheduler3runERSt10error_code","s+":"115"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10F9762E","b":"563F0E788000","o":"280F62E","s":"_ZN4asio10io_context3runEv","s+":"3E"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10E400C6","b":"563F0E788000","o":"26B80C6","s":"_ZN5mongo9transport18TransportLayerASIO11ASIOReactor3runEv","s+":"36"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10E0D348","b":"563F0E788000","o":"2685348","s":"_ZN5mongo8executor18NetworkInterfaceTL4_runEv","s+":"C8"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F10E0D58D","b":"563F0E788000","o":"268558D","s":"_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZN5mongo4stdx6threadC4IZNS3_8executor18NetworkInterfaceTL7startupEvEUlvE_JELi0EEET_DpOT0_EUlvE_EEEEE6_M_runEv","s+":"6D"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"563F1161147F","b":"563F0E788000","o":"2E8947F","s":"execute_native_thread_routine","s+":"F"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F39460076DB","b":"7F3946000000","o":"76DB","s":"start_thread","s+":"DB"}}}
{"t":{"$date":"2021-01-04T13:45:48.268+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F3945D3088F","b":"7F3945C0F000","o":"12188F","s":"clone","s+":"3F"}}}

{"t":{"$date":"2021-01-04T13:46:01.892+01:00"},"s":"I",  "c":"CONTROL",  "id":20698,   "ctx":"main","msg":"***** SERVER RESTARTED *****"}

I could not find this error documented together with an official solution, but some users reported that an identical problem was solved by adding extra pool-size parameters to the startup configuration. Thus I added the following settings to my “/etc/mongod.conf” files on all shard servers:

setParameter:
  ShardingTaskExecutorPoolMinSize: 90

But this causes even more shard servers to crash, now with the following log message:

{"t":{"$date":"2021-01-04T23:39:51.326+01:00"},"s":"F",  "c":"CONTROL",  "id":4757800, "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"Writing fatal message","attr":{"message":"terminate() called. An exception is active; attempting to gather more information"}}
{"t":{"$date":"2021-01-04T23:39:51.327+01:00"},"s":"F",  "c":"CONTROL",  "id":4757800, "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"Writing fatal message","attr":{"message":"DBException::toString(): NetworkInterfaceExceededTimeLimit: Remote command timed out while waiting to get a connection from the pool, took 30851ms, timeout was set to 20000ms\nActual exception type: mongo::error_details::ExceptionForImpl<(mongo::ErrorCodes::Error)202, mongo::ExceptionForCat<(mongo::ErrorCategory)1>, mongo::ExceptionForCat<(mongo::ErrorCategory)10> >\n"}}
{"t":{"$date":"2021-01-04T23:39:51.369+01:00"},"s":"I",  "c":"NETWORK",  "id":22944,   "ctx":"conn56560","msg":"Connection ended","attr":{"remote":"10.100.22.58:48090","connectionId":56560,"connectionCount":823}}
{"t":{"$date":"2021-01-04T23:39:51.374+01:00"},"s":"I",  "c":"NETWORK",  "id":22944,   "ctx":"conn56561","msg":"Connection ended","attr":{"remote":"10.100.22.58:48708","connectionId":56561,"connectionCount":822}}
{"t":{"$date":"2021-01-04T23:39:51.379+01:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.100.22.58:53872","connectionId":57245,"connectionCount":823}}
{"t":{"$date":"2021-01-04T23:39:51.389+01:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn57245","msg":"client metadata","attr":{"remote":"10.100.22.58:53872","client":"conn57245","doc":{"driver":{"name":"NetworkInterfaceTL","version":"4.4.2"},"os":{"type":"Linux","name":"Ubuntu","architecture":"x86_64","version":"18.04"}}}}
{"t":{"$date":"2021-01-04T23:39:51.401+01:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"192.168.201.122:17792","connectionId":57246,"connectionCount":824}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31431,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"BACKTRACE: {bt}","attr":{"bt":{"backtrace":[{"a":"55AD0DFDF921","b":"55AD0B2FC000","o":"2CE3921","s":"_ZN5mongo18stack_trace_detail12_GLOBAL__N_119printStackTraceImplERKNS1_7OptionsEPNS_14StackTraceSinkE.constprop.606","s+":"1E1"},{"a":"55AD0DFE0F59","b":"55AD0B2FC000","o":"2CE4F59","s":"_ZN5mongo15printStackTraceEv","s+":"29"},{"a":"55AD0DFDE5F6","b":"55AD0B2FC000","o":"2CE25F6","s":"_ZN5mongo12_GLOBAL__N_111myTerminateEv","s+":"A6"},{"a":"55AD0E169A16","b":"55AD0B2FC000","o":"2E6DA16","s":"_ZN10__cxxabiv111__terminateEPFvvE","s+":"6"},{"a":"55AD0E1FDDB9","b":"55AD0B2FC000","o":"2F01DB9","s":"__cxa_call_terminate","s+":"39"},{"a":"55AD0E169435","b":"55AD0B2FC000","o":"2E6D435","s":"__gxx_personality_v0","s+":"2C5"},{"a":"7F680F90E763","b":"7F680F8FE000","o":"10763","s":"_Unwind_GetTextRelBase","s+":"1E13"},{"a":"7F680F90F07D","b":"7F680F8FE000","o":"1107D","s":"_Unwind_Resume","s+":"12D"},{"a":"55AD0C171C30","b":"55AD0B2FC000","o":"E75C30","s":"_ZN5mongo8executor18NetworkInterfaceTL16CommandStateBase8setTimerEv.cold.1687","s+":"78"},{"a":"55AD0D98B678","b":"55AD0B2FC000","o":"268F678","s":"_ZN5mongo8executor18NetworkInterfaceTL19ExhaustCommandState11sendRequestESt10shared_ptrINS1_12RequestStateEE","s+":"38"},{"a":"55AD0D98F2A1","b":"55AD0B2FC000","o":"26932A1","s":"_ZN5mongo8executor18NetworkInterfaceTL14RequestManager7trySendENS_10StatusWithISt10unique_ptrINS0_14ConnectionPool19ConnectionInterfaceESt8functionIFvPS6_EEEEEm","s+":"C41"},{"a":"55AD0D98FB7E","b":"55AD0B2FC000","o":"2693B7E","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZZNOS_14ExecutorFutureISt10unique_ptrINS_8executor14ConnectionPool19ConnectionInterfaceESt8functionIFvPS9_EEEE8getAsyncIZNS7_18NetworkInterfaceTL19startExhaustCommandERKNS7_12TaskExecutor14CallbackHandleERNS7_24RemoteCommandRequestImplISt6vectorINS_11HostAndPortESaISO_EEEEONS0_IFvRKNS7_26Remote
CommandOnAnyResponseEEEERKSt10shared_ptrINS_5BatonEEEUlT_E0_Li0EEEvOS14_ENUlNS_10StatusWithISE_EEE_clES18_EUlS1_E_EEDaS16_EN12SpecificImpl4callEOS1_","s+":"CE"},{"a":"55AD0D9C2A09","b":"55AD0B2FC000","o":"26C6A09","s":"_ZN4asio6detail11executor_opINS0_15work_dispatcherIZN5mongo9transport18TransportLayerASIO11ASIOReactor8scheduleENS3_15unique_functionIFvNS3_6StatusEEEEEUlvE_EESaIvENS0_19scheduler_operationEE11do_completeEPvPSE_RKSt10error_codem","s+":"89"},{"a":"55AD0DB03714","b":"55AD0B2FC000","o":"2807714","s":"_ZN4asio6detail9scheduler10do_run_oneERNS0_27conditionally_enabled_mutex11scoped_lockERNS0_21scheduler_thread_infoERKSt10error_code","s+":"3B4"},{"a":"55AD0DB039A5","b":"55AD0B2FC000","o":"28079A5","s":"_ZN4asio6detail9scheduler3runERSt10error_code","s+":"115"},{"a":"55AD0DB0B62E","b":"55AD0B2FC000","o":"280F62E","s":"_ZN4asio10io_context3runEv","s+":"3E"},{"a":"55AD0D9B40C6","b":"55AD0B2FC000","o":"26B80C6","s":"_ZN5mongo9transport18TransportLayerASIO11ASIOReactor3runEv","s+":"36"},{"a":"55AD0D981348","b":"55AD0B2FC000","o":"2685348","s":"_ZN5mongo8executor18NetworkInterfaceTL4_runEv","s+":"C8"},{"a":"55AD0D98158D","b":"55AD0B2FC000","o":"268558D","s":"_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZN5mongo4stdx6threadC4IZNS3_8executor18NetworkInterfaceTL7startupEvEUlvE_JELi0EEET_DpOT0_EUlvE_EEEEE6_M_runEv","s+":"6D"},{"a":"55AD0E18547F","b":"55AD0B2FC000","o":"2E8947F","s":"execute_native_thread_routine","s+":"F"},{"a":"7F680F6E66DB","b":"7F680F6DF000","o":"76DB","s":"start_thread","s+":"DB"},{"a":"7F680F40F88F","b":"7F680F2EE000","o":"12188F","s":"clone","s+":"3F"}],"processInfo":{"mongodbVersion":"4.4.2","gitVersion":"15e73dc5738d2278b688f8929aee605fe4279b0e","compiledModules":[],"uname":{"sysname":"Linux","release":"4.15.0-128-generic","version":"#131-Ubuntu SMP Wed Dec 9 06:57:35 UTC 
2020","machine":"x86_64"},"somap":[{"b":"55AD0B2FC000","elfType":3,"buildId":"D18F657A1E06C333C2AEE534E3047044B0653DBF"},{"b":"7F680F8FE000","path":"/lib/x86_64-linux-gnu/libgcc_s.so.1","elfType":3,"buildId":"039AE85FEF075EC14FE3528762A0645C8CF73B29"},{"b":"7F680F6DF000","path":"/lib/x86_64-linux-gnu/libpthread.so.0","elfType":3,"buildId":"28C6AADE70B2D40D1F0F3D0A1A0CAD1AB816448F"},{"b":"7F680F2EE000","path":"/lib/x86_64-linux-gnu/libc.so.6","elfType":3,"buildId":"B417C0BA7CC5CF06D1D1BED6652CEDB9253C60D0"}]}}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0DFDF921","b":"55AD0B2FC000","o":"2CE3921","s":"_ZN5mongo18stack_trace_detail12_GLOBAL__N_119printStackTraceImplERKNS1_7OptionsEPNS_14StackTraceSinkE.constprop.606","s+":"1E1"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0DFE0F59","b":"55AD0B2FC000","o":"2CE4F59","s":"_ZN5mongo15printStackTraceEv","s+":"29"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0DFDE5F6","b":"55AD0B2FC000","o":"2CE25F6","s":"_ZN5mongo12_GLOBAL__N_111myTerminateEv","s+":"A6"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0E169A16","b":"55AD0B2FC000","o":"2E6DA16","s":"_ZN10__cxxabiv111__terminateEPFvvE","s+":"6"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0E1FDDB9","b":"55AD0B2FC000","o":"2F01DB9","s":"__cxa_call_terminate","s+":"39"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0E169435","b":"55AD0B2FC000","o":"2E6D435","s":"__gxx_personality_v0","s+":"2C5"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F680F90E763","b":"7F680F8FE000","o":"10763","s":"_Unwind_GetTextRelBase","s+":"1E13"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F680F90F07D","b":"7F680F8FE000","o":"1107D","s":"_Unwind_Resume","s+":"12D"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0C171C30","b":"55AD0B2FC000","o":"E75C30","s":"_ZN5mongo8executor18NetworkInterfaceTL16CommandStateBase8setTimerEv.cold.1687","s+":"78"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0D98B678","b":"55AD0B2FC000","o":"268F678","s":"_ZN5mongo8executor18NetworkInterfaceTL19ExhaustCommandState11sendRequestESt10shared_ptrINS1_12RequestStateEE","s+":"38"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0D98F2A1","b":"55AD0B2FC000","o":"26932A1","s":"_ZN5mongo8executor18NetworkInterfaceTL14RequestManager7trySendENS_10StatusWithISt10unique_ptrINS0_14ConnectionPool19ConnectionInterfaceESt8functionIFvPS6_EEEEEm","s+":"C41"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0D98FB7E","b":"55AD0B2FC000","o":"2693B7E","s":"_ZZN5mongo15unique_functionIFvNS_6StatusEEE8makeImplIZZNOS_14ExecutorFutureISt10unique_ptrINS_8executor14ConnectionPool19ConnectionInterfaceESt8functionIFvPS9_EEEE8getAsyncIZNS7_18NetworkInterfaceTL19startExhaustCommandERKNS7_12TaskExecutor14CallbackHandleERNS7_24RemoteCommandRequestImplISt6vectorINS_11HostAndPortESaISO_EEEEONS0_IFvRKNS7_26RemoteCommandOnAnyResponseEEEERKSt10shared_ptrINS_5BatonEEEUlT_E0_Li0EEEvOS14_ENUlNS_10StatusWithISE_EEE_clES18_EUlS1_E_EEDaS16_EN12SpecificImpl4callEOS1_","s+":"CE"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0D9C2A09","b":"55AD0B2FC000","o":"26C6A09","s":"_ZN4asio6detail11executor_opINS0_15work_dispatcherIZN5mongo9transport18TransportLayerASIO11ASIOReactor8scheduleENS3_15unique_functionIFvNS3_6StatusEEEEEUlvE_EESaIvENS0_19scheduler_operationEE11do_completeEPvPSE_RKSt10error_codem","s+":"89"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0DB03714","b":"55AD0B2FC000","o":"2807714","s":"_ZN4asio6detail9scheduler10do_run_oneERNS0_27conditionally_enabled_mutex11scoped_lockERNS0_21scheduler_thread_infoERKSt10error_code","s+":"3B4"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0DB039A5","b":"55AD0B2FC000","o":"28079A5","s":"_ZN4asio6detail9scheduler3runERSt10error_code","s+":"115"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0DB0B62E","b":"55AD0B2FC000","o":"280F62E","s":"_ZN4asio10io_context3runEv","s+":"3E"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0D9B40C6","b":"55AD0B2FC000","o":"26B80C6","s":"_ZN5mongo9transport18TransportLayerASIO11ASIOReactor3runEv","s+":"36"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0D981348","b":"55AD0B2FC000","o":"2685348","s":"_ZN5mongo8executor18NetworkInterfaceTL4_runEv","s+":"C8"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0D98158D","b":"55AD0B2FC000","o":"268558D","s":"_ZNSt6thread11_State_implINS_8_InvokerISt5tupleIJZN5mongo4stdx6threadC4IZNS3_8executor18NetworkInterfaceTL7startupEvEUlvE_JELi0EEET_DpOT0_EUlvE_EEEEE6_M_runEv","s+":"6D"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"55AD0E18547F","b":"55AD0B2FC000","o":"2E8947F","s":"execute_native_thread_routine","s+":"F"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F680F6E66DB","b":"7F680F6DF000","o":"76DB","s":"start_thread","s+":"DB"}}}
{"t":{"$date":"2021-01-04T23:39:51.427+01:00"},"s":"I",  "c":"CONTROL",  "id":31427,   "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"  Frame: {frame}","attr":{"frame":{"a":"7F680F40F88F","b":"7F680F2EE000","o":"12188F","s":"clone","s+":"3F"}}}

{"t":{"$date":"2021-01-04T23:40:01.713+01:00"},"s":"I",  "c":"CONTROL",  "id":20698,   "ctx":"main","msg":"***** SERVER RESTARTED *****"}

So I will remove that “setParameter” again from “/etc/mongod.conf”, because it only makes things worse.

Does anyone have suggestions to fix my initial problem, or know what safe values for the pool-size parameters are, please?

thx!
Rob

It turned out that some of my servers were not (NTP-)time synchronized. Fixing that, however, still did not eliminate the many cases where a shard server shut down with a log message like the one below:

{"t":{"$date":"2021-01-04T13:45:47.800+01:00"},"s":"F",  "c":"CONTROL",  "id":4757800, "ctx":"ReplicaSetMonitor-TaskExecutor","msg":"Writing fatal message","attr":{"message":"DBException::toString(): NetworkInterfaceExceededTimeLimit: Remote command timed out while waiting to get a connection from the pool, took 31481ms, timeout was set to 20000ms\nActual exception type: mongo::error_details::ExceptionForImpl<(mongo::ErrorCodes::Error)202, mongo::ExceptionForCat<(mongo::ErrorCategory)1>, mongo::ExceptionForCat<(mongo::ErrorCategory)10> >\n"}}