git.itanic.dy.fi Git - linux-stable/commitdiff
RDMA/mlx5: Add a umr recovery flow
author Aharon Landau <aharonl@nvidia.com>
Sun, 15 May 2022 04:19:53 +0000 (07:19 +0300)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 20 Sep 2022 10:43:45 +0000 (12:43 +0200)
[ Upstream commit 158e71bb69e368b8b33e8b7c4ac8c111da0c1ae2 ]

When a UMR fails, the UMR QP state changes to an error state. Therefore,
all the further UMR operations will fail too.

Add a recovery flow to the UMR QP, and repost the flushed WQEs.

Link: https://lore.kernel.org/r/6cc24816cca049bd8541317f5e41d3ac659445d3.1652588303.git.leonro@nvidia.com
Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Stable-dep-of: 9b7d4be967f1 ("RDMA/mlx5: Fix UMR cleanup on error flow of driver init")
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/umr.c

index 08371a80fdc269e3c00a9219c242483c434c6e98..be189e0525de6c33549f68b727c5c95317455d80 100644 (file)
@@ -523,6 +523,10 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq,
                            "Requestor" : "Responder", cq->mcq.cqn);
                mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
                            err_cqe->syndrome, err_cqe->vendor_err_synd);
+               if (wc->status != IB_WC_WR_FLUSH_ERR &&
+                   (*cur_qp)->type == MLX5_IB_QPT_REG_UMR)
+                       dev->umrc.state = MLX5_UMR_STATE_RECOVER;
+
                if (opcode == MLX5_CQE_REQ_ERR) {
                        wq = &(*cur_qp)->sq;
                        wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
index 998b67509a5338d3c48ca5ff8b0d5c02813484d0..7460e0dfe6db4ae4b248d3e467437b7887a806ce 100644 (file)
@@ -717,13 +717,23 @@ struct mlx5_ib_umr_context {
        struct completion       done;
 };
 
+enum {
+       MLX5_UMR_STATE_ACTIVE,
+       MLX5_UMR_STATE_RECOVER,
+       MLX5_UMR_STATE_ERR,
+};
+
 struct umr_common {
        struct ib_pd    *pd;
        struct ib_cq    *cq;
        struct ib_qp    *qp;
-       /* control access to UMR QP
+       /* Protects from UMR QP overflow
         */
        struct semaphore        sem;
+       /* Protects from using UMR while the UMR is not active
+        */
+       struct mutex lock;
+       unsigned int state;
 };
 
 struct mlx5_cache_ent {
index 3a48364c09181c4edf7e6aac8075407b62595c4e..e00b94d1b1ea1e9c039d06548185a6ae4aa03ce0 100644 (file)
@@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
        dev->umrc.pd = pd;
 
        sema_init(&dev->umrc.sem, MAX_UMR_WR);
+       mutex_init(&dev->umrc.lock);
 
        return 0;
 
@@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
        ib_dealloc_pd(dev->umrc.pd);
 }
 
+static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
+{
+       struct umr_common *umrc = &dev->umrc;
+       struct ib_qp_attr attr;
+       int err;
+
+       attr.qp_state = IB_QPS_RESET;
+       err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
+       if (err) {
+               mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+               goto err;
+       }
+
+       err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
+       if (err)
+               goto err;
+
+       umrc->state = MLX5_UMR_STATE_ACTIVE;
+       return 0;
+
+err:
+       umrc->state = MLX5_UMR_STATE_ERR;
+       return err;
+}
+
 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
                               struct mlx5r_umr_wqe *wqe, bool with_data)
 {
@@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
 
        id.ib_cqe = cqe;
        mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
-                        MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR);
+                        MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
 
        mlx5r_ring_db(qp, 1, ctrl);
 
@@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
        mlx5r_umr_init_context(&umr_context);
 
        down(&umrc->sem);
-       err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
-                                 with_data);
-       if (err)
-               mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err);
-       else {
-               wait_for_completion(&umr_context.done);
-               if (umr_context.status != IB_WC_SUCCESS) {
-                       mlx5_ib_warn(dev, "reg umr failed (%u)\n",
-                                    umr_context.status);
+       while (true) {
+               mutex_lock(&umrc->lock);
+               if (umrc->state == MLX5_UMR_STATE_ERR) {
+                       mutex_unlock(&umrc->lock);
                        err = -EFAULT;
+                       break;
+               }
+
+               if (umrc->state == MLX5_UMR_STATE_RECOVER) {
+                       mutex_unlock(&umrc->lock);
+                       usleep_range(3000, 5000);
+                       continue;
+               }
+
+               err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
+                                         with_data);
+               mutex_unlock(&umrc->lock);
+               if (err) {
+                       mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
+                                    err);
+                       break;
                }
+
+               wait_for_completion(&umr_context.done);
+
+               if (umr_context.status == IB_WC_SUCCESS)
+                       break;
+
+               if (umr_context.status == IB_WC_WR_FLUSH_ERR)
+                       continue;
+
+               WARN_ON_ONCE(1);
+               mlx5_ib_warn(dev,
+                       "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n",
+                       umr_context.status);
+               mutex_lock(&umrc->lock);
+               err = mlx5r_umr_recover(dev);
+               mutex_unlock(&umrc->lock);
+               if (err)
+                       mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
+                                    err);
+               err = -EFAULT;
+               break;
        }
        up(&umrc->sem);
        return err;