!1074 Remove the apex dependency from requirements.txt; standardize the model training scripts with log archiving

* Remove the apex dependency from requirements.txt; standardize the model training scripts with log archiving
LeiZhenzhen 2024-03-19 10:55:11 +00:00 committed by guhangsong
parent 670cad5dfe
commit bf6456e04c
27 changed files with 53 additions and 22 deletions
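
Most of the 27 files are per-model launch scripts, and they all receive the same two-part change: the `--save` argument gains a trailing backslash, and the command's output is piped through `tee` into a per-model file under `logs/`. A minimal sketch of the resulting pattern (abridged; the variable names follow the scripts below, and each real script defines them earlier):

```shell
# Standardized launch pattern applied across the diffs below.
# $DISTRIBUTED_ARGS, $GPT_ARGS, etc. are placeholders defined in each script.
mkdir -p logs                        # tee does not create missing directories

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CKPT_SAVE_DIR \
    | tee logs/train_<model>.log     # console output is also archived here
```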

View File

@@ -84,4 +84,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan_13b.log

View File

@@ -87,4 +87,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan_7b.log

View File

@@ -92,4 +92,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan2_13b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan2_7b.log

View File

@@ -222,6 +222,8 @@ Hardware configuration for Bloom-176B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
+mkdir ckpt
 ```
 2. Set up the environment
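
The setup steps gain `mkdir logs` (and, for Bloom, `mkdir ckpt`) because the new `| tee logs/...` pipelines fail outright when `logs/` is missing: `tee` creates the log file but not its parent directory. A minimal illustration (not from the repo):

```shell
# tee creates files, not directories: without logs/ the pipeline aborts with
# "tee: logs/demo.log: No such file or directory".
echo demo | tee logs/demo.log   # fails if logs/ does not exist yet
mkdir logs
echo demo | tee logs/demo.log   # succeeds once the directory is there
```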

View File

@@ -226,6 +226,8 @@ Here's a hardware summary of pre-training Bloom-176B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
+mkdir ckpt
 ```
 2. Build environment

View File

@@ -89,4 +89,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_bloom_176b.log

View File

@@ -82,4 +82,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_bloom_7b.log

View File

@@ -90,5 +90,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR} | tee train_internlm_65B.log
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_internlm_65B.log

View File

@@ -91,4 +91,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR} | tee train_internlm_7b.log
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_internlm_7b.log
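
The two InternLM scripts previously wrote their logs to the working directory; here they are normalized to the shared `logs/` layout. One general bash caveat with the `| tee` pattern (an observation, not something these diffs are shown to address): a pipeline's exit status is that of its last command, so a crashed `torchrun` still exits 0 unless `pipefail` is enabled. A hedged sketch:

```shell
# Without pipefail, $? reflects tee (almost always 0), masking training failures;
# with it, the pipeline reports torchrun's own exit status.
set -o pipefail
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py "$@" | tee logs/train.log
echo "torchrun exit status: $?"
```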

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${SAVE_CHECKPOINT_PATH}
+    --save ${SAVE_CHECKPOINT_PATH} \
+    | tee logs/train_llama_13b.log

View File

@@ -87,4 +87,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama_33b.log

View File

@@ -87,5 +87,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${SAVE_CHECKPOINT_PATH}
+    --save ${SAVE_CHECKPOINT_PATH} \
+    | tee logs/train_llama_65b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${SAVE_CHECKPOINT_PATH}
+    --save ${SAVE_CHECKPOINT_PATH} \
+    | tee logs/train_llama_7b.log

View File

@@ -86,4 +86,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_13b.log

View File

@@ -88,4 +88,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_34b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_70b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
     --load $CKPT_LOAD_DIR \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_7b.log

View File

@@ -55,6 +55,7 @@
 pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
 cd ModelLink
+mkdir logs
 pip install -r requirements.txt
 cd ..

View File

@@ -55,6 +55,7 @@ Recommended hardware configuration for inference:
 pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
 cd ModelLink
+mkdir logs
 pip install -r requirements.txt
 cd ..

View File

@@ -108,4 +108,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $GPT_ARGS \
     $DATA_ARGS \
     $OUTPUT_ARGS \
-    --distributed-backend nccl | tee train.log
+    --distributed-backend nccl \
+    | tee logs/train_mixtral.log

View File

@@ -47,6 +47,7 @@ Hardware configuration for Qwen-7B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment
@@ -266,7 +267,8 @@ Hardware configuration for Qwen-14B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment
@@ -492,6 +494,7 @@ Hardware configuration for Qwen-72B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment

View File

@@ -46,6 +46,7 @@ Here's a hardware summary of pre-training Qwen-7B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Build environment
@@ -261,6 +262,7 @@ Here's a hardware summary of pre-training Qwen-14B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Build environment
@@ -483,6 +485,7 @@ Here's a hardware summary of pre-training Qwen-72B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Build environment

View File

@@ -89,4 +89,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR}
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_qwen_14b.log

View File

@@ -90,4 +90,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR}
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_qwen_72b.log

View File

@@ -89,4 +89,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR}
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_qwen_7b.log

View File

@@ -1,4 +1,3 @@
-apex
 numpy
 transformers==4.36.0
 transformers_stream_generator
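
Dropping `apex` from requirements.txt matches the install flow shown in the inference README hunks above: the Ascend build of apex ships as a prebuilt wheel and is installed by hand, so a bare `apex` entry would make pip try to resolve the name from PyPI, which does not provide this Ascend build. The implied order, assembled from the hunks in this commit:

```shell
# Install order implied by the diffs above: the Ascend apex wheel first,
# then the remaining Python dependencies (which no longer name apex).
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
cd ModelLink
mkdir logs
pip install -r requirements.txt
```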