diff --git a/configs/1.8B_MoE16_sft.py b/configs/1.8B_MoE16_sft.py
index f85302778..9eefb3312 100644
--- a/configs/1.8B_MoE16_sft.py
+++ b/configs/1.8B_MoE16_sft.py
@@ -213,7 +213,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/57B_qwen2_MoE.py b/configs/57B_qwen2_MoE.py
index 0fd676036..44aaa3b85 100644
--- a/configs/57B_qwen2_MoE.py
+++ b/configs/57B_qwen2_MoE.py
@@ -220,10 +220,9 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
         queue_max_length=10,
     ),
-)
\ No newline at end of file
+)
diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py
index c558427cc..f837bd74e 100644
--- a/configs/7B_MoE4_sft.py
+++ b/configs/7B_MoE4_sft.py
@@ -211,7 +211,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/7B_baichuan2.py b/configs/7B_baichuan2.py
index eaa26a867..0517da59e 100644
--- a/configs/7B_baichuan2.py
+++ b/configs/7B_baichuan2.py
@@ -22,7 +22,7 @@
 CHECKPOINT_EVERY = 50
 ckpt = dict(
     enable_save_ckpt=False,  # enable ckpt save.
-    enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
+    enable_internevo2hf_ckpt=False,  # enable ckpt save for huggingface format.
     save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicate ckpt path,
@@ -196,7 +196,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/7B_gemma.py b/configs/7B_gemma.py
index aff448232..6ee9979a3 100644
--- a/configs/7B_gemma.py
+++ b/configs/7B_gemma.py
@@ -24,7 +24,7 @@
 CHECKPOINT_EVERY = 50
 ckpt = dict(
     enable_save_ckpt=False,  # enable ckpt save.
-    enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
+    enable_internevo2hf_ckpt=False,  # enable ckpt save for huggingface format.
     save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicate ckpt path,
@@ -203,7 +203,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/7B_internlm2.py b/configs/7B_internlm2.py
index 97758bba4..2dbf8d4c8 100644
--- a/configs/7B_internlm2.py
+++ b/configs/7B_internlm2.py
@@ -194,7 +194,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/7B_isp_sft.py b/configs/7B_isp_sft.py
index 2698a82f6..59edc3511 100644
--- a/configs/7B_isp_sft.py
+++ b/configs/7B_isp_sft.py
@@ -31,7 +31,7 @@
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicate ckpt path,
     # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined 
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
     # load function such as "llama"
     load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"),
     # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
@@ -188,17 +188,17 @@
     2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
 sequence_2D (dict):
     1. enable: bool, whether enable the 2D sequence parallel or not.
-    2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses). 
+    2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses).
                   head_size * context_size should be equal tensor size.
     3. context_size: int, the parallel degree of context parallelism.
                   head_size * context_size should be equal tensor size.
     4. window_size: int, the sliding window size in context parallelism.
     5. device_placement_strategy: dict,
-        head_first: bool, if `True`, ranks of the same head parallel group are 
+        head_first: bool, if `True`, ranks of the same head parallel group are
                               given high priority for colocation on the same node;
                               if `False`, ranks of the same context parallel group are
                               given high priority for colocation on the same node;
-        interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could 
+        interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could
                            interleaved the ranks in the same window to make full use of NIC as much as possible.
 """
 parallel = dict(
@@ -223,7 +223,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/7B_llama2.py b/configs/7B_llama2.py
index b0a173c8d..9bc026882 100644
--- a/configs/7B_llama2.py
+++ b/configs/7B_llama2.py
@@ -195,7 +195,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/7B_qwen2.py b/configs/7B_qwen2.py
index 09b536ccc..f5de53e8f 100644
--- a/configs/7B_qwen2.py
+++ b/configs/7B_qwen2.py
@@ -23,7 +23,7 @@
 CHECKPOINT_EVERY = 50
 ckpt = dict(
     enable_save_ckpt=False,  # enable ckpt save.
-    enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format.
+    enable_internevo2hf_ckpt=False,  # enable ckpt save for huggingface format.
     save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicate ckpt path,
@@ -203,7 +203,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/7B_sft.py b/configs/7B_sft.py
index 4799b5f35..b7f62cf4a 100644
--- a/configs/7B_sft.py
+++ b/configs/7B_sft.py
@@ -205,7 +205,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/8x22B_mixtral.py b/configs/8x22B_mixtral.py
index 56206bd4b..f7ebbb100 100644
--- a/configs/8x22B_mixtral.py
+++ b/configs/8x22B_mixtral.py
@@ -221,7 +221,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/8x7B_mixtral.py b/configs/8x7B_mixtral.py
index f589c9670..41a2a2478 100644
--- a/configs/8x7B_mixtral.py
+++ b/configs/8x7B_mixtral.py
@@ -221,7 +221,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/demo.py b/configs/demo.py
index e66f007f4..22735018d 100644
--- a/configs/demo.py
+++ b/configs/demo.py
@@ -34,7 +34,7 @@
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicate ckpt path,
     # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined 
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
     # load function such as "llama"
     load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internevo"),
     # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
@@ -141,7 +141,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/configs/demo_llava.py b/configs/demo_llava.py
index e138e886a..a96cbc7d5 100644
--- a/configs/demo_llava.py
+++ b/configs/demo_llava.py
@@ -178,7 +178,6 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po
index 4d61a4353..a92bdd527 100644
--- a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po
@@ -7,7 +7,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: InternLM \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2024-08-30 16:07+0800\n"
+"POT-Creation-Date: 2024-11-20 15:01+0800\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: en\n"
@@ -16,7 +16,7 @@ msgstr ""
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=utf-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.15.0\n"
+"Generated-By: Babel 2.14.0\n"
 
 #: ../../source/monitor.rst:2
 msgid "监控和告警"
@@ -56,25 +56,12 @@ msgstr ""
 "``internlm.monitor.alert.send_feishu_msg_with_webhook()``."
 
 #: ../../source/monitor.rst:25
-msgid "轻量监控"
-msgstr "Light Monitoring"
+msgid "监控告警配置"
+msgstr "Monitoring and Alert Configuration"
 
-#: ../../source/monitor.rst:27
+#: ../../source/monitor.rst:28
 msgid ""
-"InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的各项指标，如loss、grad_norm、训练阶段的耗时等。同时，InternEvo还可以通过"
-" `grafana dashboard <https://grafana.com/grafana/dashboards/>`_ "
-"直观地呈现这些指标信息，以便用户进行更加全面和深入的训练分析。"
-msgstr ""
-"The InternEvo light monitoring tool employs a heartbeat mechanism to "
-"real-time monitor various metrics during the training process, such as "
-"loss, grad_norm, and training phase duration. Additionally, InternEvo can"
-" present these metric details through a `grafana dashboard "
-"<https://grafana.com/grafana/dashboards/>`_, allowing users to conduct "
-"more comprehensive and in-depth training analysis in an intuitive manner."
-
-#: ../../source/monitor.rst:29
-msgid ""
-"轻量监控的配置由配置文件中的 ``monitor`` 字段指定， 用户可以通过修改配置文件 `config file "
+"配置由配置文件中的 ``monitor`` 字段指定， 用户可以通过修改配置文件 `config file "
 "<https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_ "
 "来更改监控配置。以下是一个监控配置的示例："
 msgstr ""
@@ -84,23 +71,17 @@ msgstr ""
 "<https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_."
 " Here is an example of a monitoring configuration:"
 
-#: ../../source/monitor.rst:42
+#: ../../source/monitor.rst:40
 msgid "enable_feishu_alert (bool)：是否启用飞书告警。默认值：False。"
 msgstr "enable_feishu_alert: Whether to enable Feishu alerts. Defaults: False."
 
-#: ../../source/monitor.rst:43
+#: ../../source/monitor.rst:41
 msgid "feishu_alert_address (str)：飞书告警的 Webhook 地址。默认值：None。"
 msgstr ""
 "feishu_alert_address: The webhook address for Feishu alerts. Defaults: "
 "None."
 
-#: ../../source/monitor.rst:44
-msgid "light_monitor_address (str)：轻量监控的地址。默认值：None。"
-msgstr ""
-"light_monitor_address: The address for lightweight monitoring. Defaults: "
-"None."
-
-#: ../../source/monitor.rst:45
+#: ../../source/monitor.rst:42
 msgid "alert_file_path (str)：告警存储路径。默认值：None。"
 msgstr "alert_file_path: path of alert. Defaults: None."
 
@@ -213,60 +194,3 @@ msgstr "alert_file_path: path of alert. Defaults: None."
 
 #~ msgid "示例"
 #~ msgstr "Example"
-
-#~ msgid ""
-#~ "Initialize the monitoring module with "
-#~ "the default address ``initialize_light_monitor()``"
-#~ msgstr ""
-
-#~ msgid "Send a heartbeat message to a monitoring server."
-#~ msgstr ""
-
-#~ msgid ""
-#~ "The type of heartbeat message, e.g., "
-#~ "\"train_metrics\", \"init_time\", \"stage_time\"."
-#~ msgstr ""
-
-#~ msgid "A dictionary containing message data to be included in the heartbeat."
-#~ msgstr ""
-
-#~ msgid ""
-#~ "Sending a heartbeat message for training"
-#~ " metrics ``send_heartbeat(\"train_metrics\", {\"loss\":"
-#~ " 0.1, \"accuracy\": 0.95})``"
-#~ msgstr ""
-
-#~ msgid ""
-#~ "Sending a heartbeat message for "
-#~ "initialization time ``send_heartbeat(\"init_time\", "
-#~ "{\"import_time\": 0.25})``"
-#~ msgstr ""
-
-#~ msgid ""
-#~ "Sending a heartbeat message for stage"
-#~ " time ``send_heartbeat(\"stage_time\", {\"fwd_time\":"
-#~ " 2.3, \"bwd_time\": 6.2})``"
-#~ msgstr ""
-
-#~ msgid ""
-#~ "InternEvo 使用 "
-#~ "``internlm.monitor.alert.initialize_light_monitor`` "
-#~ "来初始化轻量监控客户端。一旦初始化完成，它会建立与监控服务器的连接。在训练过程中，使用 "
-#~ "``internlm.monitor.alert.send_heartbeat`` "
-#~ "来发送不同类型的心跳信息至监控服务器。监控服务器会根据这些心跳信息来检测训练是否出现异常，并在需要时发送警报消息。"
-#~ msgstr ""
-#~ "InternEvo uses "
-#~ "``internlm.monitor.alert.initialize_light_monitor`` to "
-#~ "initialize the lightweight monitoring client."
-#~ " Once initialization is complete, it "
-#~ "establishes a connection with the "
-#~ "monitoring server. During the training "
-#~ "process, it uses "
-#~ "``internlm.monitor.alert.send_heartbeat`` to send "
-#~ "various types of heartbeat messages to"
-#~ " the monitoring server. The monitoring "
-#~ "server uses these heartbeat messages to"
-#~ " detect if the training encounters "
-#~ "any abnormalities and sends alert "
-#~ "messages as needed."
-
diff --git a/doc/code-docs/locales/en/LC_MESSAGES/usage.po b/doc/code-docs/locales/en/LC_MESSAGES/usage.po
index 615fabf77..0f39adb9f 100644
--- a/doc/code-docs/locales/en/LC_MESSAGES/usage.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/usage.po
@@ -7,7 +7,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: InternLM \n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2024-09-02 11:13+0800\n"
+"POT-Creation-Date: 2024-11-20 15:01+0800\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language: en\n"
@@ -16,7 +16,7 @@ msgstr ""
 "MIME-Version: 1.0\n"
 "Content-Type: text/plain; charset=utf-8\n"
 "Content-Transfer-Encoding: 8bit\n"
-"Generated-By: Babel 2.15.0\n"
+"Generated-By: Babel 2.14.0\n"
 
 #: ../../../usage.md:2
 msgid "使用教程"
@@ -241,74 +241,74 @@ msgstr ""
 "Taking the configuration file `configs/7B_sft.py` for the 7B demo as an "
 "example,"
 
-#: ../../../usage.md:312
+#: ../../../usage.md:310
 msgid "接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。"
 msgstr ""
 "let's discuss the data, model, parallel and monitoring configurations "
 "required to start a model training."
 
-#: ../../../usage.md:314
+#: ../../../usage.md:312
 msgid "数据配置"
 msgstr "Data Configuration"
 
-#: ../../../usage.md:315
+#: ../../../usage.md:313
 msgid "数据相关的关键参数配置及释义如下所示："
 msgstr "Here are the key parameters and their explanations for data configuration:"
 
-#: ../../../usage.md:330
+#: ../../../usage.md:328
 msgid "![pack_into_one](./imgs/pack_into_one.png)"
 msgstr ""
 
-#: ../../../usage.md:330
+#: ../../../usage.md:328
 msgid "pack_into_one"
 msgstr ""
 
-#: ../../../usage.md:333
+#: ../../../usage.md:331
 msgid "目前支持传入数据集文件路径`train_folder`，且要求文件格式如下："
 msgstr ""
 "Currently, it supports passing the dataset file path `train_folder`, and "
 "the file format is required to be as follows:"
 
-#: ../../../usage.md:340
+#: ../../../usage.md:338
 msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。"
 msgstr ""
 "For detailed information about the dataset, please refer to the \"Data "
 "Preparation\" section."
 
-#: ../../../usage.md:342
+#: ../../../usage.md:340
 msgid "同时，也支持huggingface格式的数据集处理。"
 msgstr ""
 "Additionally, it supports processing of datasets in the Hugging Face "
 "format."
 
-#: ../../../usage.md:344
+#: ../../../usage.md:342
 msgid "train_folder设置为从huggingface上下载的本地数据集路径，如：\"/mnt/petrelfs/hf-TinyStories\""
 msgstr ""
 "Set the train_folder to the local path of the dataset downloaded from "
 "Hugging Face, for example: \"/mnt/petrelfs/hf-TinyStories\"."
 
-#: ../../../usage.md:346
+#: ../../../usage.md:344
 msgid "在data中，需要新增type及tokenizer_path字段，标示数据集是huggingface格式，并指定tokenizer路径，如："
 msgstr ""
 "In the data section, you need to add new fields for type and "
 "tokenizer_path to indicate that the dataset is in Hugging Face format and"
 " to specify the path of the tokenizer, for example:"
 
-#: ../../../usage.md:364
+#: ../../../usage.md:362
 msgid "模型配置"
 msgstr "Model Configuration"
 
-#: ../../../usage.md:366
+#: ../../../usage.md:364
 msgid "如果在启动训练时要加载模型 `checkpoint`，可进行如下相关配置："
 msgstr ""
 "If you want to load a model checkpoint when starting the training, you "
 "can configure it as follows:"
 
-#: ../../../usage.md:390
+#: ../../../usage.md:388
 msgid "注意："
 msgstr "Note:"
 
-#: ../../../usage.md:391
+#: ../../../usage.md:389
 msgid ""
 "路径若以 `local:` 为前缀，则存储在本地文件系统；若以 `boto3:` 为前缀，则存储在远程 oss "
 "上；若无前缀，为huggingface上可以直接下载的模型路径。"
@@ -317,11 +317,11 @@ msgstr ""
 "local file system. If it starts with `boto3:`, it means the file is "
 "stored in the remote OSS."
 
-#: ../../../usage.md:393
+#: ../../../usage.md:391
 msgid "模型相关关键参数配置如下所示："
 msgstr "The configuration for the model is as follows:"
 
-#: ../../../usage.md:417
+#: ../../../usage.md:415
 msgid "注意：用户可自定义模型类型名和模型结构，并配置相对应的模型参数。通过`internlm/model/registry.py`下的`model_initializer`对象进行模型初始化函数接口注册，在训练主函数`train.py`中初始化模型时，可通过`model_type`配置获取指定的模型初始化接口函数。"
 msgstr ""
 "Note: Users can customize the model type name and model structure, and "
@@ -332,7 +332,7 @@ msgstr ""
 "interface function can be obtained through the `model_type` "
 "configuration."
 
-#: ../../../usage.md:419
+#: ../../../usage.md:417
 msgid ""
 "*如果基于 InternLM 7B继续训练，可以参考 "
 "[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 "
@@ -342,130 +342,126 @@ msgstr ""
 "OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-"
 "zoo) to download weights*."
 
-#: ../../../usage.md:421
+#: ../../../usage.md:419
 msgid "并行配置"
 msgstr "Parallel Configuration"
 
-#: ../../../usage.md:423
+#: ../../../usage.md:421
 msgid "训练并行配置样例如下："
 msgstr "Training parallel configuration example:"
 
-#: ../../../usage.md:432
+#: ../../../usage.md:430
 msgid "zero1（字典）："
 msgstr "zero1 (dict): "
 
-#: ../../../usage.md:433
+#: ../../../usage.md:431
 msgid "size: 整数"
 msgstr "size: int "
 
-#: ../../../usage.md:434
+#: ../../../usage.md:432
 msgid "当`zero1 <= 0`，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配"
 msgstr ""
 "When `zero1 <= 0` , the size of the zero1 process group is equal to the "
 "size of the data parallel process group, so the optimizer state "
 "parameters will be split within the data parallel range."
 
-#: ../../../usage.md:435
+#: ../../../usage.md:433
 msgid "当`zero1 == 1`，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数"
 msgstr ""
 "When `zero1 == 1`, zero1 is not used, and all data parallel groups retain"
 " the complete optimizer state parameters."
 
-#: ../../../usage.md:436
+#: ../../../usage.md:434
 msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`，则 zero1 进程组是数据并行进程组的子集"
 msgstr ""
 "When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 "
 "process group is a subset of the data parallel process group."
 
-#: ../../../usage.md:437
+#: ../../../usage.md:435
 msgid "fsdp: 布尔值，启用/禁用torch的完全分片数据并行，默认为False。"
 msgstr ""
 "fsdp: A boolean value that enables or disables fully sharded data "
 "parallelism in torch, with the default being False."
 
-#: ../../../usage.md:438
+#: ../../../usage.md:436
 msgid "tensor（字典）："
 msgstr "tensor (dict): "
 
-#: ../../../usage.md:439
+#: ../../../usage.md:437
 msgid "size: 整数，张量并行的大小。"
 msgstr "size: int, size of tensor parallem"
 
-#: ../../../usage.md:440
+#: ../../../usage.md:438
 msgid "mode: 字符串，张量并行模式，应该是 ['mtp', 'msp', 'fsp', 'isp'] 中的一个，"
 msgstr ""
 "mode: string, tensor parallel mode, should be one of ['mtp', 'msp', "
 "'fsp', 'isp'] "
 
-#: ../../../usage.md:441
+#: ../../../usage.md:439
 msgid "默认为 'mtp'，意味着没有序列并行的纯Megatron张量并行。"
 msgstr ""
 "Default is 'mtp', which means there is no sequence parallelism, just pure"
 " tensor parallelism for Megatron."
 
-#: ../../../usage.md:442
+#: ../../../usage.md:440
 msgid "msp: 带序列并行的Megatron张量并行，序列并行大小 = 张量并行大小。"
 msgstr ""
 "msp: Megatron Tensor Parallelism with Sequence Parallelism, where the "
 "size of sequence parallelism is equal to the size of tensor parallelism."
 
-#: ../../../usage.md:443
+#: ../../../usage.md:441
 msgid "fsp: 通过flash-attn带序列并行的张量并行，序列并行大小 = 张量并行大小。"
 msgstr ""
 "fsp: Tensor Parallelism with Sequence Parallelism facilitated by flash-"
 "attn, where the size of sequence parallelism is equal to the size of "
 "tensor parallelism."
 
-#: ../../../usage.md:444
+#: ../../../usage.md:442
 msgid "isp: 定制的内部序列并行，不带张量并行，可以与权重并行一起使用。"
 msgstr ""
 "isp: Custom internal sequence parallelism, without tensor parallelism, "
 "which can be used in conjunction with weight parallelism."
 
-#: ../../../usage.md:445
+#: ../../../usage.md:443
 msgid "pipeline（字典）："
 msgstr "pipeline: pipeline parallel strategy"
 
-#: ../../../usage.md:446
+#: ../../../usage.md:444
 msgid "size: 整数，流水线并行的大小。"
 msgstr "size: int, size of pipeline parallel"
 
-#: ../../../usage.md:447
+#: ../../../usage.md:445
 msgid "interleaved_overlap: 布尔值，启用/禁用在使用交错流水线调度器时的通信重叠，默认为False。"
 msgstr ""
 "interleaved_overlap: A boolean value that enables or disables "
 "communication overlapping when using an interleaved pipeline scheduler, "
 "with the default being False."
 
-#: ../../../usage.md:448
+#: ../../../usage.md:446
 msgid "weight（字典）："
 msgstr "weight (dict):"
 
-#: ../../../usage.md:449
+#: ../../../usage.md:447
 msgid "size: 整数，权重并行的大小。"
 msgstr "size: int, size of weight parallel"
 
-#: ../../../usage.md:450
+#: ../../../usage.md:448
 msgid "overlap: 布尔值，启用/禁用all_gather/reduce_scatter通信重叠，默认为False。"
 msgstr ""
 "overlap: bool, enable/disable all_gather/reduce_scatter communication "
 "overlap, default is False"
 
-#: ../../../usage.md:451
-msgid "memory_pool: 布尔值，启用/禁用内存池，默认为False。"
-msgstr "memory_pool: bool, enable/disable memory pool, default is False"
-
-#: ../../../usage.md:453
+#: ../../../usage.md:450
 msgid "注意：`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`"
 msgstr ""
 "Note: `Data parallel size = Total number of GPUs / Pipeline parallel size"
 " / Tensor parallel size`"
 
-#: ../../../usage.md:455
+#: ../../../usage.md:452
 msgid "启动训练"
 msgstr "Start Training"
 
-#: ../../../usage.md:457
+#: ../../../usage.md:454
 msgid "完成了以上数据集准备和相关训练配置后，可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例，介绍训练启动方式。"
 msgstr ""
 "After completing the data preparation and relevant training "
@@ -473,19 +469,19 @@ msgstr ""
 "following examples demonstrate how to start the training in both slurm "
 "and torch environments."
 
-#: ../../../usage.md:459 ../../../usage.md:496
+#: ../../../usage.md:456 ../../../usage.md:493
 msgid "若在 slurm 上启动分布式运行环境，多节点 16 卡的运行命令如下所示："
 msgstr ""
 "If you want to start distributed training on slurm with 16 GPUs across "
 "multiple nodes, use the following command:"
 
-#: ../../../usage.md:464
+#: ../../../usage.md:461
 msgid "若在 torch 上启动分布式运行环境，单节点 8 卡的运行命令如下所示："
 msgstr ""
 "If you want to start distributed training on torch with 8 GPUs on a "
 "single node, use the following command:"
 
-#: ../../../usage.md:469
+#: ../../../usage.md:466
 msgid ""
 "其中，train.py文件的内容，请参考： [训练脚本](https://internevo.readthedocs.io/zh-"
 "cn/latest/training.html)"
@@ -493,29 +489,29 @@ msgstr ""
 "The content of train.py, please refer to: [training "
 "script](https://internevo.readthedocs.io/en/latest/training.html) "
 
-#: ../../../usage.md:471
+#: ../../../usage.md:468
 msgid "运行结果"
 msgstr "Training Results"
 
-#: ../../../usage.md:473
+#: ../../../usage.md:470
 msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例，训练结果日志展示如下："
 msgstr ""
 "Taking the configuration of the demo training on a single machine with 8 "
 "GPUs on slurm as an example, the training result log is shown below:"
 
-#: ../../../usage.md:494
+#: ../../../usage.md:491
 msgid "加载训练的checkpoint并生成"
 msgstr "Load the training checkpoint and generate."
 
-#: ../../../usage.md:501
+#: ../../../usage.md:498
 msgid "在配置文件中添加`generation`配置"
 msgstr "Add generation configuration to the configuration file."
 
-#: ../../../usage.md:519
+#: ../../../usage.md:516
 msgid "长文本生成"
 msgstr "Long Text Generation"
 
-#: ../../../usage.md:521
+#: ../../../usage.md:518
 msgid ""
 "在推理阶段，我们可以使用 Dynamic NTK RoPE 来代替原始的 RoPE，从而使得模型能够适应长文本的输入输出，达到 16K "
 "的外推效果。 目前 InternLM 支持在 huggingface 格式和 InternLM 本身格式的模型中使用 Dynamic NTK "
@@ -527,7 +523,7 @@ msgstr ""
 "the use of Dynamic NTK RoPE in models formatted in both Hugging Face "
 "format and InternLM's native format."
 
-#: ../../../usage.md:524
+#: ../../../usage.md:521
 msgid ""
 "对于 huggingface 格式的模型，dynamic ntk rope 目前是被默认使用的。如果用户想要关闭该行为，请将 "
 "`config.json` 中的 `rotary.type` 修改为 `origin`；"
@@ -536,7 +532,7 @@ msgstr ""
 "by default. If users wish to disable this behavior, they can modify "
 "`rotary.type` in the `config.json` file to `origin`."
 
-#: ../../../usage.md:525
+#: ../../../usage.md:522
 msgid ""
 "对于 InternLM "
 "本身格式的模型，在推理时，通过在初始化模型的配置字典中添加`use_dynamic_ntk_rope=True`来开启这一行为。"
@@ -545,7 +541,7 @@ msgstr ""
 "this behavior by adding use_dynamic_ntk_rope=True to the configuration "
 "dictionary when initializing the model."
 
-#: ../../../usage.md:527
+#: ../../../usage.md:524
 msgid ""
 "用户可以直接通过 web_demo 来直观地对比查看 Dynamic NTK RoPE "
 "是如何生效的。例如文件[长文本示例](../../aux_materials/long_text_example.txt)中存放着一个token长度超过2200的文本，如果不使用"
@@ -559,23 +555,23 @@ msgstr ""
 "text. However, after applying Dynamic NTK RoPE, the response from the "
 "InternLM Chat 7B v1.1 model is as follows:"
 
-#: ../../../usage.md:530
+#: ../../../usage.md:527
 msgid "![dynamic_ntk_answer](./imgs/dynamic_ntk_answer.png)"
 msgstr ""
 
-#: ../../../usage.md:530
+#: ../../../usage.md:527
 msgid "dynamic_ntk_answer"
 msgstr ""
 
-#: ../../../usage.md:532
+#: ../../../usage.md:529
 msgid "关于 Dyanmic NTK 的原理，详细请参考"
 msgstr "Regarding the principle of Dyanmic NTK, please refer to"
 
-#: ../../../usage.md:533
+#: ../../../usage.md:530
 msgid "[dynamically_scaled_rope_further_increases](https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases)"
 msgstr ""
 
-#: ../../../usage.md:534
+#: ../../../usage.md:531
 msgid "[https://kexue.fm/archives/9675](https://kexue.fm/archives/9675)"
 msgstr ""
 
@@ -633,4 +629,3 @@ msgstr ""
 
 #~ msgid "数据准备 （预训练）"
 #~ msgstr "Dataset Preparation (Pre-training)"
-
diff --git a/doc/code-docs/source/monitor.rst b/doc/code-docs/source/monitor.rst
index e6a872162..a358f9f27 100644
--- a/doc/code-docs/source/monitor.rst
+++ b/doc/code-docs/source/monitor.rst
@@ -21,12 +21,11 @@ InternEvo 监控线程会周期性地检查模型训练过程中是否出现 los
 
 .. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook
 
-轻量监控
+监控告警配置
 -----------------
 
-InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的各项指标，如loss、grad_norm、训练阶段的耗时等。同时，InternEvo还可以通过 `grafana dashboard <https://grafana.com/grafana/dashboards/>`_ 直观地呈现这些指标信息，以便用户进行更加全面和深入的训练分析。
 
-轻量监控的配置由配置文件中的 ``monitor`` 字段指定， 用户可以通过修改配置文件 `config file <https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_ 来更改监控配置。以下是一个监控配置的示例：
+配置由配置文件中的 ``monitor`` 字段指定， 用户可以通过修改配置文件 `config file <https://github.com/InternLM/InternEvo/blob/develop/configs/7B_sft.py>`_ 来更改监控配置。以下是一个监控配置的示例：
 
 .. code-block:: python
 
@@ -34,12 +33,10 @@ InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的
         alert=dict(
             enable_feishu_alert=False,
             feishu_alert_address=None,
-            light_monitor_address=None,
             alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
         ),
     )
 
 - enable_feishu_alert (bool)：是否启用飞书告警。默认值：False。
 - feishu_alert_address (str)：飞书告警的 Webhook 地址。默认值：None。
-- light_monitor_address (str)：轻量监控的地址。默认值：None。
 - alert_file_path (str)：告警存储路径。默认值：None。
diff --git a/doc/en/usage.md b/doc/en/usage.md
index 8e1670c2f..8d803ba46 100644
--- a/doc/en/usage.md
+++ b/doc/en/usage.md
@@ -269,7 +269,6 @@ monitor = dict(
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
     ),
 )
 ```
diff --git a/doc/usage.md b/doc/usage.md
index 67ae1edf5..014477e0a 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -123,7 +123,7 @@ ckpt = dict(
     # 'load_ckpt_info' setting guide:
     # 1. the 'path' indicate ckpt path,
     # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined 
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined
     # load function such as "llama"
     load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"),
     # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
@@ -299,7 +299,6 @@ monitor = dict(
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
         alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
     ),
     tensorboard=dict(
diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py
index fc63b8a23..1f1f0bfbe 100644
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@@ -467,7 +467,6 @@ def args_sanity_check():
             "alert": {
                 "enable_feishu_alert": False,
                 "feishu_alert_address": None,
-                "light_monitor_address": None,
                 "alert_file_path": None,
             }
         },
diff --git a/internlm/monitor/monitor.py b/internlm/monitor/monitor.py
index fc33de62a..0fbb45022 100644
--- a/internlm/monitor/monitor.py
+++ b/internlm/monitor/monitor.py
@@ -38,11 +38,13 @@ def execute_with_exception_handling(func, *args, **kwargs):
                 return func(*args, **kwargs)
             try:
                 return func(*args, **kwargs)
-            except Exception:
+            except Exception as e:
                 hostname = socket.gethostname()
                 logger.error(
                     f"Raise exception from {hostname} with rank id: {gpc.get_global_rank()}\n{traceback.format_exc()}",
                 )
+                monitor_manager.monitor_exception(excp_info=traceback.format_exc())
+                raise e
             finally:
                 devices_per_node = internlm_accelerator.device_count()
                 local_rank = gpc.get_global_rank() % devices_per_node
@@ -178,7 +180,7 @@ def __init__(self, loss_spike_limit: float = 1.5) -> None:
         self.last_step_loss = -1
         self.alert_file_path = None
         self.enable_alert = False
-        self.light_monitor_address = None
+        self.alert_address = None
 
     def monitor_loss_spike(self, alert_address: str = None, step_count: int = 0, cur_step_loss: float = 0.0):
         """Check loss value, if loss spike occurs, send alert message to Feishu."""
@@ -219,6 +221,8 @@ def exception_should_be_alert(self, msg: str, alert_address: str = None):
     def monitor_exception(self, alert_address: str = None, excp_info: str = None):
         """Catch and format exception information, send alert message to Feishu."""
         if self.enable_alert:
+            if alert_address is None:
+                alert_address = self.alert_address
             filtered_trace = excp_info.split("\n")[-10:]
             format_trace = ""
             for line in filtered_trace:
@@ -271,9 +275,9 @@ def start_monitor(
         # initialize some variables for monitoring
         set_env_var(key="JOB_NAME", value=job_name)
         self.enable_alert = gpc.config.monitor.alert.get("enable_feishu_alert", False)
+        self.alert_address = alert_address
 
         if self.enable_alert:
-            self.light_monitor_address = gpc.config.monitor.alert.get("light_monitor_address", None)
             # initialize alert file
             self.alert_file_path = gpc.config.monitor.alert.get("alert_file_path")
             if self.alert_file_path and gpc.is_rank_for_log():
@@ -289,9 +293,13 @@ def start_monitor(
                 loss_spike_limit=loss_spike_limit,
             )
 
+            self.handle_sigterm(alert_address=alert_address)
+            send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} is starting.")
+
     def stop_monitor(self):
         """Stop the monitor and alert thread."""
-        if self.monitor_thread is not None:
+        if self.enable_alert:
+            send_alert_message(address=self.alert_address, message=f"Training in {socket.gethostname()} completed.")
             self.monitor_thread.stop()
 
 
@@ -311,11 +319,8 @@ def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
     if alert_address is not None:
         try:
             monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address)
-            monitor_manager.handle_sigterm(alert_address=alert_address)
-            send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} is starting.")
             yield
         finally:
-            send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} completed.")
             monitor_manager.stop_monitor()
     else:
         yield
diff --git a/tests/common_fixture.py b/tests/common_fixture.py
index e5a8b9aa1..22e52c8c0 100644
--- a/tests/common_fixture.py
+++ b/tests/common_fixture.py
@@ -57,7 +57,7 @@
         ),
         model_type="INTERNLM",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
         grad_scaler=dict(
             fp16=dict(
                 initial_scale=2**16,
diff --git a/tests/test_core/test_pipeline.py b/tests/test_core/test_pipeline.py
index 180fe4b71..ba8a7c399 100644
--- a/tests/test_core/test_pipeline.py
+++ b/tests/test_core/test_pipeline.py
@@ -44,7 +44,7 @@
         resume_tb_folder="",
         tensorboard_folder="",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
         grad_scaler=dict(
             fp16=dict(
                 initial_scale=1,
diff --git a/tests/test_model/test_model_internlm.py b/tests/test_model/test_model_internlm.py
index 3ce6f530e..9e33b9a99 100644
--- a/tests/test_model/test_model_internlm.py
+++ b/tests/test_model/test_model_internlm.py
@@ -69,7 +69,7 @@
         resume_tb_folder="",
         tensorboard_folder="",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
     )
 )
 
diff --git a/tests/test_solver/test_optimizer.py b/tests/test_solver/test_optimizer.py
index ca470ffc9..7f0fc34e7 100644
--- a/tests/test_solver/test_optimizer.py
+++ b/tests/test_solver/test_optimizer.py
@@ -56,7 +56,7 @@ def forward(self, x):
         resume_tb_folder="",
         tensorboard_folder="",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
         grad_scaler=dict(
             fp16=dict(
                 initial_scale=1,
diff --git a/tests/test_training/7B_check_acc.py b/tests/test_training/7B_check_acc.py
index 3b727d7c9..0e6249e34 100644
--- a/tests/test_training/7B_check_acc.py
+++ b/tests/test_training/7B_check_acc.py
@@ -164,6 +164,5 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
     ),
 )
diff --git a/tests/test_training/7B_check_init.py b/tests/test_training/7B_check_init.py
index 6f72c7d75..3fcb5c11f 100644
--- a/tests/test_training/7B_check_init.py
+++ b/tests/test_training/7B_check_init.py
@@ -160,6 +160,5 @@
     alert=dict(
         enable_feishu_alert=DO_ALERT,
         feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
     ),
 )
diff --git a/tests/test_training/test_forward_output_no_fa.py b/tests/test_training/test_forward_output_no_fa.py
index e846594e5..287658e63 100644
--- a/tests/test_training/test_forward_output_no_fa.py
+++ b/tests/test_training/test_forward_output_no_fa.py
@@ -70,7 +70,7 @@
         ),
         model_type="INTERNLM",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
         grad_scaler=dict(
             fp16=dict(
                 initial_scale=2**16,
@@ -177,7 +177,7 @@ def train_check_output(args):
 
     optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)
 
-    train_dl, dataset_types = build_train_loader_with_data_type()
+    _, dataset_types = build_train_loader_with_data_type()
 
     metric = AccPerplex(
         device=get_current_device(),
diff --git a/tests/test_training/test_load_ckpt_loss.py b/tests/test_training/test_load_ckpt_loss.py
index 45cd319c4..8c52252f8 100644
--- a/tests/test_training/test_load_ckpt_loss.py
+++ b/tests/test_training/test_load_ckpt_loss.py
@@ -28,8 +28,8 @@
     Config,
 )
 from internlm.core.trainer import (  # noqa: E402  #pylint: disable=wrong-import-position
-    TrainState,
     Trainer,
+    TrainState,
 )
 from internlm.data import (  # noqa: E402  #pylint: disable=wrong-import-position
     build_train_loader_with_data_type,
@@ -109,7 +109,7 @@
         ),
         model_type="INTERNLM",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
         grad_scaler=dict(
             fp16=dict(
                 initial_scale=2**16,
diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
index 73534f5f7..d3e8b4859 100644
--- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py
+++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py
@@ -79,7 +79,7 @@
         ),
         model_type="INTERNLM",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
         grad_scaler=dict(
             fp16=dict(
                 initial_scale=2**16,
@@ -277,7 +277,7 @@ def exam_loss(args):
     criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing)
 
     # initialize the train and validation data loader
-    train_dl, dataset_types = build_train_loader_with_data_type()
+    _, dataset_types = build_train_loader_with_data_type()
     val_dls = build_valid_loader_with_data_type()
 
     optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)
diff --git a/tests/test_utils/common_fixture.py b/tests/test_utils/common_fixture.py
index f4b34ddee..329a58c69 100644
--- a/tests/test_utils/common_fixture.py
+++ b/tests/test_utils/common_fixture.py
@@ -84,7 +84,7 @@
         resume_tb_folder="",
         tensorboard_folder="",
         alert_address=None,
-        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)),
+        monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)),
     )
 )