Skip to content

Commit 2a0165a

Browse files
Mohamad Haj Yahia authored and Saeed Mahameed committed
net/mlx5: Cancel delayed recovery work when unloading the driver
Draining the health workqueue will ignore future health works, including the one that reports hardware failure, and thus we can't enter the error state. Instead, cancel the recovery flow and make sure that only the recovery flow won't be scheduled.

Fixes: 5e44fca ('net/mlx5: Only cancel recovery work when cleaning up device')
Signed-off-by: Mohamad Haj Yahia <[email protected]>
Signed-off-by: Moshe Shemesh <[email protected]>
Signed-off-by: Saeed Mahameed <[email protected]>
1 parent 8ce59b1 commit 2a0165a

File tree

3 files changed

+16
-2
lines changed

3 files changed

+16
-2
lines changed

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ enum {
6767

6868
enum {
6969
MLX5_DROP_NEW_HEALTH_WORK,
70+
MLX5_DROP_NEW_RECOVERY_WORK,
7071
};
7172

7273
static u8 get_nic_state(struct mlx5_core_dev *dev)
@@ -193,7 +194,7 @@ static void health_care(struct work_struct *work)
193194
mlx5_handle_bad_state(dev);
194195

195196
spin_lock(&health->wq_lock);
196-
if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
197+
if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
197198
schedule_delayed_work(&health->recover_work, recover_delay);
198199
else
199200
dev_err(&dev->pdev->dev,
@@ -313,6 +314,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
313314
init_timer(&health->timer);
314315
health->sick = 0;
315316
clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
317+
clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
316318
health->health = &dev->iseg->health;
317319
health->health_counter = &dev->iseg->health_counter;
318320

@@ -335,11 +337,22 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
335337

336338
spin_lock(&health->wq_lock);
337339
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
340+
set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
338341
spin_unlock(&health->wq_lock);
339342
cancel_delayed_work_sync(&health->recover_work);
340343
cancel_work_sync(&health->work);
341344
}
342345

346+
void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
347+
{
348+
struct mlx5_core_health *health = &dev->priv.health;
349+
350+
spin_lock(&health->wq_lock);
351+
set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
352+
spin_unlock(&health->wq_lock);
353+
cancel_delayed_work_sync(&dev->priv.health.recover_work);
354+
}
355+
343356
void mlx5_health_cleanup(struct mlx5_core_dev *dev)
344357
{
345358
struct mlx5_core_health *health = &dev->priv.health;

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1228,7 +1228,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
12281228
int err = 0;
12291229

12301230
if (cleanup)
1231-
mlx5_drain_health_wq(dev);
1231+
mlx5_drain_health_recovery(dev);
12321232

12331233
mutex_lock(&dev->intf_state_mutex);
12341234
if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) {

include/linux/mlx5/driver.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev);
925925
void mlx5_start_health_poll(struct mlx5_core_dev *dev);
926926
void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
927927
void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
928+
void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
928929
int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
929930
struct mlx5_buf *buf, int node);
930931
int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);

0 commit comments

Comments
 (0)