git.itanic.dy.fi Git - linux-stable/commitdiff
nvme: fix reconnection fail due to reserved tag allocation
author Chunguang Xu <chunguang.xu@shopee.com>
Mon, 11 Mar 2024 02:09:27 +0000 (10:09 +0800)
committer Sasha Levin <sashal@kernel.org>
Tue, 26 Mar 2024 22:17:33 +0000 (18:17 -0400)
[ Upstream commit de105068fead55ed5c07ade75e9c8e7f86a00d1d ]

We found an issue in a production environment while using NVMe over RDMA:
admin_q reconnect failed forever while the remote target and network were ok.
After digging into it, we found it may be caused by an ABBA deadlock due to
tag allocation. In my case, the tag was held by a keep alive request
waiting inside admin_q; as we quiesce admin_q while resetting the ctrl, the
request is marked as idle and will not be processed before the reset
succeeds. As fabric_q shares its tagset with admin_q, while reconnecting to
the remote target we need a tag for the connect command, but the only
reserved tag was held by the keep alive command waiting inside admin_q. As a
result, we failed to reconnect admin_q forever. In order to fix this issue, I
think we should keep two reserved tags for the admin queue.

Fixes: ed01fee283a0 ("nvme-fabrics: only reserve a single tag")
Signed-off-by: Chunguang Xu <chunguang.xu@shopee.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.h

index 0a96362912ceda0a035b916f432b24afb261f6a7..fe3627c5bdc9940c1eb6844332ea98f629cf0c3a 100644 (file)
@@ -4359,7 +4359,8 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
        set->ops = ops;
        set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
        if (ctrl->ops->flags & NVME_F_FABRICS)
-               set->reserved_tags = NVMF_RESERVED_TAGS;
+               /* Reserved for fabric connect and keep alive */
+               set->reserved_tags = 2;
        set->numa_node = ctrl->numa_node;
        set->flags = BLK_MQ_F_NO_SCHED;
        if (ctrl->ops->flags & NVME_F_BLOCKING)
@@ -4428,7 +4429,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
        if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
                set->reserved_tags = NVME_AQ_DEPTH;
        else if (ctrl->ops->flags & NVME_F_FABRICS)
-               set->reserved_tags = NVMF_RESERVED_TAGS;
+               /* Reserved for fabric connect */
+               set->reserved_tags = 1;
        set->numa_node = ctrl->numa_node;
        set->flags = BLK_MQ_F_SHOULD_MERGE;
        if (ctrl->ops->flags & NVME_F_BLOCKING)
index 06cc54851b1be39615cdfa6eed1a935dec472f82..37c974c38dcb077a5018c728c8f33ffc421e53b9 100644 (file)
 /* default is -1: the fail fast mechanism is disabled  */
 #define NVMF_DEF_FAIL_FAST_TMO         -1
 
-/*
- * Reserved one command for internal usage.  This command is used for sending
- * the connect command, as well as for the keep alive command on the admin
- * queue once live.
- */
-#define NVMF_RESERVED_TAGS     1
-
 /*
  * Define a host as seen by the target.  We allocate one at boot, but also
  * allow the override it when creating controllers.  This is both to provide