diff --git a/configs/1.8B_MoE16_sft.py b/configs/1.8B_MoE16_sft.py index f85302778..9eefb3312 100644 --- a/configs/1.8B_MoE16_sft.py +++ b/configs/1.8B_MoE16_sft.py @@ -213,7 +213,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/57B_qwen2_MoE.py b/configs/57B_qwen2_MoE.py index 0fd676036..44aaa3b85 100644 --- a/configs/57B_qwen2_MoE.py +++ b/configs/57B_qwen2_MoE.py @@ -220,10 +220,9 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( queue_max_length=10, ), -) \ No newline at end of file +) diff --git a/configs/7B_MoE4_sft.py b/configs/7B_MoE4_sft.py index c558427cc..f837bd74e 100644 --- a/configs/7B_MoE4_sft.py +++ b/configs/7B_MoE4_sft.py @@ -211,7 +211,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/7B_baichuan2.py b/configs/7B_baichuan2.py index eaa26a867..0517da59e 100644 --- a/configs/7B_baichuan2.py +++ b/configs/7B_baichuan2.py @@ -22,7 +22,7 @@ CHECKPOINT_EVERY = 50 ckpt = dict( enable_save_ckpt=False, # enable ckpt save. - enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. + enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. # 'load_ckpt_info' setting guide: # 1. the 'path' indicate ckpt path, @@ -196,7 +196,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/7B_gemma.py b/configs/7B_gemma.py index aff448232..6ee9979a3 100644 --- a/configs/7B_gemma.py +++ b/configs/7B_gemma.py @@ -24,7 +24,7 @@ CHECKPOINT_EVERY = 50 ckpt = dict( enable_save_ckpt=False, # enable ckpt save. - enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. + enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. # 'load_ckpt_info' setting guide: # 1. the 'path' indicate ckpt path, @@ -203,7 +203,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/7B_internlm2.py b/configs/7B_internlm2.py index 97758bba4..2dbf8d4c8 100644 --- a/configs/7B_internlm2.py +++ b/configs/7B_internlm2.py @@ -194,7 +194,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/7B_isp_sft.py b/configs/7B_isp_sft.py index 2698a82f6..59edc3511 100644 --- a/configs/7B_isp_sft.py +++ b/configs/7B_isp_sft.py @@ -31,7 +31,7 @@ # 'load_ckpt_info' setting guide: # 1. the 'path' indicate ckpt path, # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined # load function such as "llama" load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"), # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering @@ -188,17 +188,17 @@ 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. sequence_2D (dict): 1. enable: bool, whether enable the 2D sequence parallel or not. - 2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses). + 2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses). head_size * context_size should be equal tensor size. 3. context_size: int, the parallel degree of context parallelism. head_size * context_size should be equal tensor size. 4. window_size: int, the sliding window size in context parallelism. 5. device_placement_strategy: dict, - head_first: bool, if `True`, ranks of the same head parallel group are + head_first: bool, if `True`, ranks of the same head parallel group are given high priority for colocation on the same node; if `False`, ranks of the same context parallel group are given high priority for colocation on the same node; - interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could + interleaved: bool, if `head_first` is `False` and `window_size` > 1, this config could interleaved the ranks in the same window to make full use of NIC as much as possible. """ parallel = dict( @@ -223,7 +223,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/7B_llama2.py b/configs/7B_llama2.py index b0a173c8d..9bc026882 100644 --- a/configs/7B_llama2.py +++ b/configs/7B_llama2.py @@ -195,7 +195,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/7B_qwen2.py b/configs/7B_qwen2.py index 09b536ccc..f5de53e8f 100644 --- a/configs/7B_qwen2.py +++ b/configs/7B_qwen2.py @@ -23,7 +23,7 @@ CHECKPOINT_EVERY = 50 ckpt = dict( enable_save_ckpt=False, # enable ckpt save. - enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. + enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. # 'load_ckpt_info' setting guide: # 1. the 'path' indicate ckpt path, @@ -203,7 +203,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/7B_sft.py b/configs/7B_sft.py index 4799b5f35..b7f62cf4a 100644 --- a/configs/7B_sft.py +++ b/configs/7B_sft.py @@ -205,7 +205,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/8x22B_mixtral.py b/configs/8x22B_mixtral.py index 56206bd4b..f7ebbb100 100644 --- a/configs/8x22B_mixtral.py +++ b/configs/8x22B_mixtral.py @@ -221,7 +221,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/8x7B_mixtral.py b/configs/8x7B_mixtral.py index f589c9670..41a2a2478 100644 --- a/configs/8x7B_mixtral.py +++ b/configs/8x7B_mixtral.py @@ -221,7 +221,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/demo.py b/configs/demo.py index e66f007f4..22735018d 100644 --- a/configs/demo.py +++ b/configs/demo.py @@ -34,7 +34,7 @@ # 'load_ckpt_info' setting guide: # 1. the 'path' indicate ckpt path, # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined # load function such as "llama" load_ckpt_info=dict(path=LOAD_CKPT_FOLDER, content=("model",), ckpt_type="internevo"), # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering @@ -141,7 +141,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/configs/demo_llava.py b/configs/demo_llava.py index e138e886a..a96cbc7d5 100644 --- a/configs/demo_llava.py +++ b/configs/demo_llava.py @@ -178,7 +178,6 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po index 4d61a4353..a92bdd527 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-08-30 16:07+0800\n" +"POT-Creation-Date: 2024-11-20 15:01+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -16,7 +16,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.15.0\n" +"Generated-By: Babel 2.14.0\n" #: ../../source/monitor.rst:2 msgid "监控和告警" @@ -56,25 +56,12 @@ msgstr "" "``internlm.monitor.alert.send_feishu_msg_with_webhook()``." #: ../../source/monitor.rst:25 -msgid "轻量监控" -msgstr "Light Monitoring" +msgid "监控告警配置" +msgstr "Monitor Config" -#: ../../source/monitor.rst:27 +#: ../../source/monitor.rst:28 msgid "" -"InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的各项指标,如loss、grad_norm、训练阶段的耗时等。同时,InternEvo还可以通过" -" `grafana dashboard `_ " -"直观地呈现这些指标信息,以便用户进行更加全面和深入的训练分析。" -msgstr "" -"The InternEvo light monitoring tool employs a heartbeat mechanism to " -"real-time monitor various metrics during the training process, such as " -"loss, grad_norm, and training phase duration. Additionally, InternEvo can" -" present these metric details through a `grafana dashboard " -"`_, allowing users to conduct " -"more comprehensive and in-depth training analysis in an intuitive manner." - -#: ../../source/monitor.rst:29 -msgid "" -"轻量监控的配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file " +"配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file " "`_ " "来更改监控配置。以下是一个监控配置的示例:" msgstr "" @@ -84,23 +71,17 @@ msgstr "" "`_." " Here is an example of a monitoring configuration:" -#: ../../source/monitor.rst:42 +#: ../../source/monitor.rst:40 msgid "enable_feishu_alert (bool):是否启用飞书告警。默认值:False。" msgstr "enable_feishu_alert: Whether to enable Feishu alerts. Defaults: False." -#: ../../source/monitor.rst:43 +#: ../../source/monitor.rst:41 msgid "feishu_alert_address (str):飞书告警的 Webhook 地址。默认值:None。" msgstr "" "feishu_alert_address: The webhook address for Feishu alerts. Defaults: " "None." -#: ../../source/monitor.rst:44 -msgid "light_monitor_address (str):轻量监控的地址。默认值:None。" -msgstr "" -"light_monitor_address: The address for lightweight monitoring. Defaults: " -"None." - -#: ../../source/monitor.rst:45 +#: ../../source/monitor.rst:42 msgid "alert_file_path (str):告警存储路径。默认值:None。" msgstr "alert_file_path: path of alert. Defaults: None." @@ -213,60 +194,3 @@ msgstr "alert_file_path: path of alert. Defaults: None." #~ msgid "示例" #~ msgstr "Example" - -#~ msgid "" -#~ "Initialize the monitoring module with " -#~ "the default address ``initialize_light_monitor()``" -#~ msgstr "" - -#~ msgid "Send a heartbeat message to a monitoring server." -#~ msgstr "" - -#~ msgid "" -#~ "The type of heartbeat message, e.g., " -#~ "\"train_metrics\", \"init_time\", \"stage_time\"." -#~ msgstr "" - -#~ msgid "A dictionary containing message data to be included in the heartbeat." -#~ msgstr "" - -#~ msgid "" -#~ "Sending a heartbeat message for training" -#~ " metrics ``send_heartbeat(\"train_metrics\", {\"loss\":" -#~ " 0.1, \"accuracy\": 0.95})``" -#~ msgstr "" - -#~ msgid "" -#~ "Sending a heartbeat message for " -#~ "initialization time ``send_heartbeat(\"init_time\", " -#~ "{\"import_time\": 0.25})``" -#~ msgstr "" - -#~ msgid "" -#~ "Sending a heartbeat message for stage" -#~ " time ``send_heartbeat(\"stage_time\", {\"fwd_time\":" -#~ " 2.3, \"bwd_time\": 6.2})``" -#~ msgstr "" - -#~ msgid "" -#~ "InternEvo 使用 " -#~ "``internlm.monitor.alert.initialize_light_monitor`` " -#~ "来初始化轻量监控客户端。一旦初始化完成,它会建立与监控服务器的连接。在训练过程中,使用 " -#~ "``internlm.monitor.alert.send_heartbeat`` " -#~ "来发送不同类型的心跳信息至监控服务器。监控服务器会根据这些心跳信息来检测训练是否出现异常,并在需要时发送警报消息。" -#~ msgstr "" -#~ "InternEvo uses " -#~ "``internlm.monitor.alert.initialize_light_monitor`` to " -#~ "initialize the lightweight monitoring client." -#~ " Once initialization is complete, it " -#~ "establishes a connection with the " -#~ "monitoring server. During the training " -#~ "process, it uses " -#~ "``internlm.monitor.alert.send_heartbeat`` to send " -#~ "various types of heartbeat messages to" -#~ " the monitoring server. The monitoring " -#~ "server uses these heartbeat messages to" -#~ " detect if the training encounters " -#~ "any abnormalities and sends alert " -#~ "messages as needed." - diff --git a/doc/code-docs/locales/en/LC_MESSAGES/usage.po b/doc/code-docs/locales/en/LC_MESSAGES/usage.po index 615fabf77..0f39adb9f 100644 --- a/doc/code-docs/locales/en/LC_MESSAGES/usage.po +++ b/doc/code-docs/locales/en/LC_MESSAGES/usage.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: InternLM \n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2024-09-02 11:13+0800\n" +"POT-Creation-Date: 2024-11-20 15:01+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language: en\n" @@ -16,7 +16,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.15.0\n" +"Generated-By: Babel 2.14.0\n" #: ../../../usage.md:2 msgid "使用教程" @@ -241,74 +241,74 @@ msgstr "" "Taking the configuration file `configs/7B_sft.py` for the 7B demo as an " "example," -#: ../../../usage.md:312 +#: ../../../usage.md:310 msgid "接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。" msgstr "" "let's discuss the data, model, parallel and monitoring configurations " "required to start a model training." -#: ../../../usage.md:314 +#: ../../../usage.md:312 msgid "数据配置" msgstr "Data Configuration" -#: ../../../usage.md:315 +#: ../../../usage.md:313 msgid "数据相关的关键参数配置及释义如下所示:" msgstr "Here are the key parameters and their explanations for data configuration:" -#: ../../../usage.md:330 +#: ../../../usage.md:328 msgid "![pack_into_one](./imgs/pack_into_one.png)" msgstr "" -#: ../../../usage.md:330 +#: ../../../usage.md:328 msgid "pack_into_one" msgstr "" -#: ../../../usage.md:333 +#: ../../../usage.md:331 msgid "目前支持传入数据集文件路径`train_folder`,且要求文件格式如下:" msgstr "" "Currently, it supports passing the dataset file path `train_folder`, and " "the file format is required to be as follows:" -#: ../../../usage.md:340 +#: ../../../usage.md:338 msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。" msgstr "" "For detailed information about the dataset, please refer to the \"Data " "Preparation\" section." -#: ../../../usage.md:342 +#: ../../../usage.md:340 msgid "同时,也支持huggingface格式的数据集处理。" msgstr "" "Additionally, it supports processing of datasets in the Hugging Face " "format." -#: ../../../usage.md:344 +#: ../../../usage.md:342 msgid "train_folder设置为从huggingface上下载的本地数据集路径,如:\"/mnt/petrelfs/hf-TinyStories\"" msgstr "" "Set the train_folder to the local path of the dataset downloaded from " "Hugging Face, for example: \"/mnt/petrelfs/hf-TinyStories\"." -#: ../../../usage.md:346 +#: ../../../usage.md:344 msgid "在data中,需要新增type及tokenizer_path字段,标示数据集是huggingface格式,并指定tokenizer路径,如:" msgstr "" "In the data section, you need to add new fields for type and " "tokenizer_path to indicate that the dataset is in Hugging Face format and" " to specify the path of the tokenizer, for example:" -#: ../../../usage.md:364 +#: ../../../usage.md:362 msgid "模型配置" msgstr "Model Configuration" -#: ../../../usage.md:366 +#: ../../../usage.md:364 msgid "如果在启动训练时要加载模型 `checkpoint`,可进行如下相关配置:" msgstr "" "If you want to load a model checkpoint when starting the training, you " "can configure it as follows:" -#: ../../../usage.md:390 +#: ../../../usage.md:388 msgid "注意:" msgstr "Note:" -#: ../../../usage.md:391 +#: ../../../usage.md:389 msgid "" "路径若以 `local:` 为前缀,则存储在本地文件系统;若以 `boto3:` 为前缀,则存储在远程 oss " "上;若无前缀,为huggingface上可以直接下载的模型路径。" @@ -317,11 +317,11 @@ msgstr "" "local file system. If it starts with `boto3:`, it means the file is " "stored in the remote OSS." -#: ../../../usage.md:393 +#: ../../../usage.md:391 msgid "模型相关关键参数配置如下所示:" msgstr "The configuration for the model is as follows:" -#: ../../../usage.md:417 +#: ../../../usage.md:415 msgid "注意:用户可自定义模型类型名和模型结构,并配置相对应的模型参数。通过`internlm/model/registry.py`下的`model_initializer`对象进行模型初始化函数接口注册,在训练主函数`train.py`中初始化模型时,可通过`model_type`配置获取指定的模型初始化接口函数。" msgstr "" "Note: Users can customize the model type name and model structure, and " @@ -332,7 +332,7 @@ msgstr "" "interface function can be obtained through the `model_type` " "configuration." -#: ../../../usage.md:419 +#: ../../../usage.md:417 msgid "" "*如果基于 InternLM 7B继续训练,可以参考 " "[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 " @@ -342,130 +342,126 @@ msgstr "" "OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-" "zoo) to download weights*." -#: ../../../usage.md:421 +#: ../../../usage.md:419 msgid "并行配置" msgstr "Parallel Configuration" -#: ../../../usage.md:423 +#: ../../../usage.md:421 msgid "训练并行配置样例如下:" msgstr "Training parallel configuration example:" -#: ../../../usage.md:432 +#: ../../../usage.md:430 msgid "zero1(字典):" msgstr "zero1 (dict): " -#: ../../../usage.md:433 +#: ../../../usage.md:431 msgid "size: 整数" msgstr "size: int " -#: ../../../usage.md:434 +#: ../../../usage.md:432 msgid "当`zero1 <= 0`,则 zero1 进程组的大小等于数据并行进程组的大小,因此优化器状态参数将在数据并行范围内分配" msgstr "" "When `zero1 <= 0` , the size of the zero1 process group is equal to the " "size of the data parallel process group, so the optimizer state " "parameters will be split within the data parallel range." -#: ../../../usage.md:435 +#: ../../../usage.md:433 msgid "当`zero1 == 1`,则不使用 zero1 ,所有数据并行组保留完整的优化器状态参数" msgstr "" "When `zero1 == 1`, zero1 is not used, and all data parallel groups retain" " the complete optimizer state parameters." -#: ../../../usage.md:436 +#: ../../../usage.md:434 msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`,则 zero1 进程组是数据并行进程组的子集" msgstr "" "When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 " "process group is a subset of the data parallel process group." -#: ../../../usage.md:437 +#: ../../../usage.md:435 msgid "fsdp: 布尔值,启用/禁用torch的完全分片数据并行,默认为False。" msgstr "" "fsdp: A boolean value that enables or disables fully sharded data " "parallelism in torch, with the default being False." -#: ../../../usage.md:438 +#: ../../../usage.md:436 msgid "tensor(字典):" msgstr "tensor (dict): " -#: ../../../usage.md:439 +#: ../../../usage.md:437 msgid "size: 整数,张量并行的大小。" msgstr "size: int, size of tensor parallem" -#: ../../../usage.md:440 +#: ../../../usage.md:438 msgid "mode: 字符串,张量并行模式,应该是 ['mtp', 'msp', 'fsp', 'isp'] 中的一个," msgstr "" "mode: string, tensor parallel mode, should be one of ['mtp', 'msp', " "'fsp', 'isp'] " -#: ../../../usage.md:441 +#: ../../../usage.md:439 msgid "默认为 'mtp',意味着没有序列并行的纯Megatron张量并行。" msgstr "" "Default is 'mtp', which means there is no sequence parallelism, just pure" " tensor parallelism for Megatron." -#: ../../../usage.md:442 +#: ../../../usage.md:440 msgid "msp: 带序列并行的Megatron张量并行,序列并行大小 = 张量并行大小。" msgstr "" "msp: Megatron Tensor Parallelism with Sequence Parallelism, where the " "size of sequence parallelism is equal to the size of tensor parallelism." -#: ../../../usage.md:443 +#: ../../../usage.md:441 msgid "fsp: 通过flash-attn带序列并行的张量并行,序列并行大小 = 张量并行大小。" msgstr "" "fsp: Tensor Parallelism with Sequence Parallelism facilitated by flash-" "attn, where the size of sequence parallelism is equal to the size of " "tensor parallelism." -#: ../../../usage.md:444 +#: ../../../usage.md:442 msgid "isp: 定制的内部序列并行,不带张量并行,可以与权重并行一起使用。" msgstr "" "isp: Custom internal sequence parallelism, without tensor parallelism, " "which can be used in conjunction with weight parallelism." -#: ../../../usage.md:445 +#: ../../../usage.md:443 msgid "pipeline(字典):" msgstr "pipeline: pipeline parallel strategy" -#: ../../../usage.md:446 +#: ../../../usage.md:444 msgid "size: 整数,流水线并行的大小。" msgstr "size: int, size of pipeline parallel" -#: ../../../usage.md:447 +#: ../../../usage.md:445 msgid "interleaved_overlap: 布尔值,启用/禁用在使用交错流水线调度器时的通信重叠,默认为False。" msgstr "" "interleaved_overlap: A boolean value that enables or disables " "communication overlapping when using an interleaved pipeline scheduler, " "with the default being False." -#: ../../../usage.md:448 +#: ../../../usage.md:446 msgid "weight(字典):" msgstr "weight (dict):" -#: ../../../usage.md:449 +#: ../../../usage.md:447 msgid "size: 整数,权重并行的大小。" msgstr "size: int, size of weight parallel" -#: ../../../usage.md:450 +#: ../../../usage.md:448 msgid "overlap: 布尔值,启用/禁用all_gather/reduce_scatter通信重叠,默认为False。" msgstr "" "overlap: bool, enable/disable all_gather/reduce_scatter communication " "overlap, default is False" -#: ../../../usage.md:451 -msgid "memory_pool: 布尔值,启用/禁用内存池,默认为False。" -msgstr "memory_pool: bool, enable/disable memory pool, default is False" - -#: ../../../usage.md:453 +#: ../../../usage.md:450 msgid "注意:`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`" msgstr "" "Note: `Data parallel size = Total number of GPUs / Pipeline parallel size" " / Tensor parallel size`" -#: ../../../usage.md:455 +#: ../../../usage.md:452 msgid "启动训练" msgstr "Start Training" -#: ../../../usage.md:457 +#: ../../../usage.md:454 msgid "完成了以上数据集准备和相关训练配置后,可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例,介绍训练启动方式。" msgstr "" "After completing the data preparation and relevant training " @@ -473,19 +469,19 @@ msgstr "" "following examples demonstrate how to start the training in both slurm " "and torch environments." -#: ../../../usage.md:459 ../../../usage.md:496 +#: ../../../usage.md:456 ../../../usage.md:493 msgid "若在 slurm 上启动分布式运行环境,多节点 16 卡的运行命令如下所示:" msgstr "" "If you want to start distributed training on slurm with 16 GPUs across " "multiple nodes, use the following command:" -#: ../../../usage.md:464 +#: ../../../usage.md:461 msgid "若在 torch 上启动分布式运行环境,单节点 8 卡的运行命令如下所示:" msgstr "" "If you want to start distributed training on torch with 8 GPUs on a " "single node, use the following command:" -#: ../../../usage.md:469 +#: ../../../usage.md:466 msgid "" "其中,train.py文件的内容,请参考: [训练脚本](https://internevo.readthedocs.io/zh-" "cn/latest/training.html)" @@ -493,29 +489,29 @@ msgstr "" "The content of train.py, please refer to: [training " "script](https://internevo.readthedocs.io/en/latest/training.html) " -#: ../../../usage.md:471 +#: ../../../usage.md:468 msgid "运行结果" msgstr "Training Results" -#: ../../../usage.md:473 +#: ../../../usage.md:470 msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例,训练结果日志展示如下:" msgstr "" "Taking the configuration of the demo training on a single machine with 8 " "GPUs on slurm as an example, the training result log is shown below:" -#: ../../../usage.md:494 +#: ../../../usage.md:491 msgid "加载训练的checkpoint并生成" msgstr "Load the training checkpoint and generate." -#: ../../../usage.md:501 +#: ../../../usage.md:498 msgid "在配置文件中添加`generation`配置" msgstr "Add generation configuration to the configuration file." -#: ../../../usage.md:519 +#: ../../../usage.md:516 msgid "长文本生成" msgstr "Long Text Generation" -#: ../../../usage.md:521 +#: ../../../usage.md:518 msgid "" "在推理阶段,我们可以使用 Dynamic NTK RoPE 来代替原始的 RoPE,从而使得模型能够适应长文本的输入输出,达到 16K " "的外推效果。 目前 InternLM 支持在 huggingface 格式和 InternLM 本身格式的模型中使用 Dynamic NTK " @@ -527,7 +523,7 @@ msgstr "" "the use of Dynamic NTK RoPE in models formatted in both Hugging Face " "format and InternLM's native format." -#: ../../../usage.md:524 +#: ../../../usage.md:521 msgid "" "对于 huggingface 格式的模型,dynamic ntk rope 目前是被默认使用的。如果用户想要关闭该行为,请将 " "`config.json` 中的 `rotary.type` 修改为 `origin`;" @@ -536,7 +532,7 @@ msgstr "" "by default. If users wish to disable this behavior, they can modify " "`rotary.type` in the `config.json` file to `origin`." -#: ../../../usage.md:525 +#: ../../../usage.md:522 msgid "" "对于 InternLM " "本身格式的模型,在推理时,通过在初始化模型的配置字典中添加`use_dynamic_ntk_rope=True`来开启这一行为。" @@ -545,7 +541,7 @@ msgstr "" "this behavior by adding use_dynamic_ntk_rope=True to the configuration " "dictionary when initializing the model." -#: ../../../usage.md:527 +#: ../../../usage.md:524 msgid "" "用户可以直接通过 web_demo 来直观地对比查看 Dynamic NTK RoPE " "是如何生效的。例如文件[长文本示例](../../aux_materials/long_text_example.txt)中存放着一个token长度超过2200的文本,如果不使用" @@ -559,23 +555,23 @@ msgstr "" "text. However, after applying Dynamic NTK RoPE, the response from the " "InternLM Chat 7B v1.1 model is as follows:" -#: ../../../usage.md:530 +#: ../../../usage.md:527 msgid "![dynamic_ntk_answer](./imgs/dynamic_ntk_answer.png)" msgstr "" -#: ../../../usage.md:530 +#: ../../../usage.md:527 msgid "dynamic_ntk_answer" msgstr "" -#: ../../../usage.md:532 +#: ../../../usage.md:529 msgid "关于 Dyanmic NTK 的原理,详细请参考" msgstr "Regarding the principle of Dyanmic NTK, please refer to" -#: ../../../usage.md:533 +#: ../../../usage.md:530 msgid "[dynamically_scaled_rope_further_increases](https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases)" msgstr "" -#: ../../../usage.md:534 +#: ../../../usage.md:531 msgid "[https://kexue.fm/archives/9675](https://kexue.fm/archives/9675)" msgstr "" @@ -633,4 +629,3 @@ msgstr "" #~ msgid "数据准备 (预训练)" #~ msgstr "Dataset Preparation (Pre-training)" - diff --git a/doc/code-docs/source/monitor.rst b/doc/code-docs/source/monitor.rst index e6a872162..a358f9f27 100644 --- a/doc/code-docs/source/monitor.rst +++ b/doc/code-docs/source/monitor.rst @@ -21,12 +21,11 @@ InternEvo 监控线程会周期性地检查模型训练过程中是否出现 los .. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook -轻量监控 +监控告警配置 ----------------- -InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的各项指标,如loss、grad_norm、训练阶段的耗时等。同时,InternEvo还可以通过 `grafana dashboard `_ 直观地呈现这些指标信息,以便用户进行更加全面和深入的训练分析。 -轻量监控的配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file `_ 来更改监控配置。以下是一个监控配置的示例: +配置由配置文件中的 ``monitor`` 字段指定, 用户可以通过修改配置文件 `config file `_ 来更改监控配置。以下是一个监控配置的示例: .. code-block:: python @@ -34,12 +33,10 @@ InternEvo轻量级监控工具采用心跳机制实时监测训练过程中的 alert=dict( enable_feishu_alert=False, feishu_alert_address=None, - light_monitor_address=None, alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), ) - enable_feishu_alert (bool):是否启用飞书告警。默认值:False。 - feishu_alert_address (str):飞书告警的 Webhook 地址。默认值:None。 -- light_monitor_address (str):轻量监控的地址。默认值:None。 - alert_file_path (str):告警存储路径。默认值:None。 diff --git a/doc/en/usage.md b/doc/en/usage.md index 8e1670c2f..8d803ba46 100644 --- a/doc/en/usage.md +++ b/doc/en/usage.md @@ -269,7 +269,6 @@ monitor = dict( alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat ), ) ``` diff --git a/doc/usage.md b/doc/usage.md index 67ae1edf5..014477e0a 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -123,7 +123,7 @@ ckpt = dict( # 'load_ckpt_info' setting guide: # 1. the 'path' indicate ckpt path, # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined # load function such as "llama" load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"), # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering @@ -299,7 +299,6 @@ monitor = dict( alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", ), tensorboard=dict( diff --git a/internlm/initialize/launch.py b/internlm/initialize/launch.py index fc63b8a23..1f1f0bfbe 100644 --- a/internlm/initialize/launch.py +++ b/internlm/initialize/launch.py @@ -467,7 +467,6 @@ def args_sanity_check(): "alert": { "enable_feishu_alert": False, "feishu_alert_address": None, - "light_monitor_address": None, "alert_file_path": None, } }, diff --git a/internlm/monitor/monitor.py b/internlm/monitor/monitor.py index fc33de62a..0fbb45022 100644 --- a/internlm/monitor/monitor.py +++ b/internlm/monitor/monitor.py @@ -38,11 +38,13 @@ def execute_with_exception_handling(func, *args, **kwargs): return func(*args, **kwargs) try: return func(*args, **kwargs) - except Exception: + except Exception as e: hostname = socket.gethostname() logger.error( f"Raise exception from {hostname} with rank id: {gpc.get_global_rank()}\n{traceback.format_exc()}", ) + monitor_manager.monitor_exception(excp_info=traceback.format_exc()) + raise e finally: devices_per_node = internlm_accelerator.device_count() local_rank = gpc.get_global_rank() % devices_per_node @@ -178,7 +180,7 @@ def __init__(self, loss_spike_limit: float = 1.5) -> None: self.last_step_loss = -1 self.alert_file_path = None self.enable_alert = False - self.light_monitor_address = None + self.alert_address = None def monitor_loss_spike(self, alert_address: str = None, step_count: int = 0, cur_step_loss: float = 0.0): """Check loss value, if loss spike occurs, send alert message to Feishu.""" @@ -219,6 +221,8 @@ def exception_should_be_alert(self, msg: str, alert_address: str = None): def monitor_exception(self, alert_address: str = None, excp_info: str = None): """Catch and format exception information, send alert message to Feishu.""" if self.enable_alert: + if alert_address is None: + alert_address = self.alert_address filtered_trace = excp_info.split("\n")[-10:] format_trace = "" for line in filtered_trace: @@ -271,9 +275,9 @@ def start_monitor( # initialize some variables for monitoring set_env_var(key="JOB_NAME", value=job_name) self.enable_alert = gpc.config.monitor.alert.get("enable_feishu_alert", False) + self.alert_address = alert_address if self.enable_alert: - self.light_monitor_address = gpc.config.monitor.alert.get("light_monitor_address", None) # initialize alert file self.alert_file_path = gpc.config.monitor.alert.get("alert_file_path") if self.alert_file_path and gpc.is_rank_for_log(): @@ -289,9 +293,13 @@ def start_monitor( loss_spike_limit=loss_spike_limit, ) + self.handle_sigterm(alert_address=alert_address) + send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} is starting.") + def stop_monitor(self): """Stop the monitor and alert thread.""" - if self.monitor_thread is not None: + if self.enable_alert: + send_alert_message(address=self.alert_address, message=f"Training in {socket.gethostname()} completed.") self.monitor_thread.stop() @@ -311,11 +319,8 @@ def initialize_monitor_manager(job_name: str = None, alert_address: str = None): if alert_address is not None: try: monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address) - monitor_manager.handle_sigterm(alert_address=alert_address) - send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} is starting.") yield finally: - send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} completed.") monitor_manager.stop_monitor() else: yield diff --git a/tests/common_fixture.py b/tests/common_fixture.py index e5a8b9aa1..22e52c8c0 100644 --- a/tests/common_fixture.py +++ b/tests/common_fixture.py @@ -57,7 +57,7 @@ ), model_type="INTERNLM", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), grad_scaler=dict( fp16=dict( initial_scale=2**16, diff --git a/tests/test_core/test_pipeline.py b/tests/test_core/test_pipeline.py index 180fe4b71..ba8a7c399 100644 --- a/tests/test_core/test_pipeline.py +++ b/tests/test_core/test_pipeline.py @@ -44,7 +44,7 @@ resume_tb_folder="", tensorboard_folder="", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), grad_scaler=dict( fp16=dict( initial_scale=1, diff --git a/tests/test_model/test_model_internlm.py b/tests/test_model/test_model_internlm.py index 3ce6f530e..9e33b9a99 100644 --- a/tests/test_model/test_model_internlm.py +++ b/tests/test_model/test_model_internlm.py @@ -69,7 +69,7 @@ resume_tb_folder="", tensorboard_folder="", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), ) ) diff --git a/tests/test_solver/test_optimizer.py b/tests/test_solver/test_optimizer.py index ca470ffc9..7f0fc34e7 100644 --- a/tests/test_solver/test_optimizer.py +++ b/tests/test_solver/test_optimizer.py @@ -56,7 +56,7 @@ def forward(self, x): resume_tb_folder="", tensorboard_folder="", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), grad_scaler=dict( fp16=dict( initial_scale=1, diff --git a/tests/test_training/7B_check_acc.py b/tests/test_training/7B_check_acc.py index 3b727d7c9..0e6249e34 100644 --- a/tests/test_training/7B_check_acc.py +++ b/tests/test_training/7B_check_acc.py @@ -164,6 +164,5 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat ), ) diff --git a/tests/test_training/7B_check_init.py b/tests/test_training/7B_check_init.py index 6f72c7d75..3fcb5c11f 100644 --- a/tests/test_training/7B_check_init.py +++ b/tests/test_training/7B_check_init.py @@ -160,6 +160,5 @@ alert=dict( enable_feishu_alert=DO_ALERT, feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat ), ) diff --git a/tests/test_training/test_forward_output_no_fa.py b/tests/test_training/test_forward_output_no_fa.py index e846594e5..287658e63 100644 --- a/tests/test_training/test_forward_output_no_fa.py +++ b/tests/test_training/test_forward_output_no_fa.py @@ -70,7 +70,7 @@ ), model_type="INTERNLM", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), grad_scaler=dict( fp16=dict( initial_scale=2**16, @@ -177,7 +177,7 @@ def train_check_output(args): optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) - train_dl, dataset_types = build_train_loader_with_data_type() + _, dataset_types = build_train_loader_with_data_type() metric = AccPerplex( device=get_current_device(), diff --git a/tests/test_training/test_load_ckpt_loss.py b/tests/test_training/test_load_ckpt_loss.py index 45cd319c4..8c52252f8 100644 --- a/tests/test_training/test_load_ckpt_loss.py +++ b/tests/test_training/test_load_ckpt_loss.py @@ -28,8 +28,8 @@ Config, ) from internlm.core.trainer import ( # noqa: E402 #pylint: disable=wrong-import-position - TrainState, Trainer, + TrainState, ) from internlm.data import ( # noqa: E402 #pylint: disable=wrong-import-position build_train_loader_with_data_type, @@ -109,7 +109,7 @@ ), model_type="INTERNLM", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), grad_scaler=dict( fp16=dict( initial_scale=2**16, diff --git a/tests/test_training/test_swap_nb_loss_and_gradnorm.py b/tests/test_training/test_swap_nb_loss_and_gradnorm.py index 73534f5f7..d3e8b4859 100644 --- a/tests/test_training/test_swap_nb_loss_and_gradnorm.py +++ b/tests/test_training/test_swap_nb_loss_and_gradnorm.py @@ -79,7 +79,7 @@ ), model_type="INTERNLM", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), grad_scaler=dict( fp16=dict( initial_scale=2**16, @@ -277,7 +277,7 @@ def exam_loss(args): criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=gpc.config.loss.label_smoothing) # initialize the train and validation data loader - train_dl, dataset_types = build_train_loader_with_data_type() + _, dataset_types = build_train_loader_with_data_type() val_dls = build_valid_loader_with_data_type() optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model) diff --git a/tests/test_utils/common_fixture.py b/tests/test_utils/common_fixture.py index f4b34ddee..329a58c69 100644 --- a/tests/test_utils/common_fixture.py +++ b/tests/test_utils/common_fixture.py @@ -84,7 +84,7 @@ resume_tb_folder="", tensorboard_folder="", alert_address=None, - monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None, light_monitor_address=None)), + monitor=dict(alert=dict(enable_feishu_alert=False, feishu_alert_address=None)), ) )