!1074 Remove the apex dependency from requirements.txt; standardize the model training scripts with log archiving

* Remove the apex dependency from requirements.txt; standardize the model training scripts with log archiving
LeiZhenzhen 2024-03-19 10:55:11 +00:00 committed by guhangsong
parent 670cad5dfe
commit bf6456e04c
27 changed files with 53 additions and 22 deletions
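
Most of the 27 files are per-model launch scripts, and they all receive the same two-part change: the `--save` argument gains a trailing backslash, and the command's output is piped through `tee` into a per-model file under `logs/`. A minimal sketch of the resulting pattern (abridged; the variable names follow the scripts below, and each real script defines them earlier):

```shell
# Standardized launch pattern applied across the diffs below.
# $DISTRIBUTED_ARGS, $GPT_ARGS, etc. are placeholders defined in each script.
mkdir -p logs                        # tee does not create missing directories

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CKPT_SAVE_DIR \
    | tee logs/train_<model>.log     # console output is also archived here
```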

View File

@@ -84,4 +84,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan_13b.log

View File

@@ -87,4 +87,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan_7b.log

View File

@@ -92,4 +92,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan2_13b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_baichuan2_7b.log

View File

@@ -222,6 +222,8 @@ Hardware configuration for Bloom-176B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
+mkdir ckpt
 ```
 2. Set up the environment
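
The setup steps gain `mkdir logs` (and, for Bloom, `mkdir ckpt`) because the new `| tee logs/...` pipelines fail outright when `logs/` is missing: `tee` creates the log file but not its parent directory. A minimal illustration (not from the repo):

```shell
# tee creates files, not directories: without logs/ the pipeline aborts with
# "tee: logs/demo.log: No such file or directory".
echo demo | tee logs/demo.log   # fails if logs/ does not exist yet
mkdir logs
echo demo | tee logs/demo.log   # succeeds once the directory is there
```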

View File

@@ -226,6 +226,8 @@ Here's a hardware summary of pre-training Bloom-176B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
+mkdir ckpt
 ```
 2. Build environment

View File

@@ -89,4 +89,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_bloom_176b.log

View File

@@ -82,4 +82,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_bloom_7b.log

View File

@@ -90,5 +90,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR} | tee train_internlm_65B.log
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_internlm_65B.log

View File

@@ -91,4 +91,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR} | tee train_internlm_7b.log
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_internlm_7b.log
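
The two InternLM scripts previously wrote their logs to the working directory; here they are normalized to the shared `logs/` layout. One general bash caveat with the `| tee` pattern (an observation, not something these diffs are shown to address): a pipeline's exit status is that of its last command, so a crashed `torchrun` still exits 0 unless `pipefail` is enabled. A hedged sketch:

```shell
# Without pipefail, $? reflects tee (almost always 0), masking training failures;
# with it, the pipeline reports torchrun's own exit status.
set -o pipefail
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py "$@" | tee logs/train.log
echo "torchrun exit status: $?"
```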

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${SAVE_CHECKPOINT_PATH}
+    --save ${SAVE_CHECKPOINT_PATH} \
+    | tee logs/train_llama_13b.log

View File

@@ -87,4 +87,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama_33b.log

View File

@@ -87,5 +87,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${SAVE_CHECKPOINT_PATH}
+    --save ${SAVE_CHECKPOINT_PATH} \
+    | tee logs/train_llama_65b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save ${SAVE_CHECKPOINT_PATH}
+    --save ${SAVE_CHECKPOINT_PATH} \
+    | tee logs/train_llama_7b.log

View File

@@ -86,4 +86,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_13b.log

View File

@@ -88,4 +88,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_34b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $DATA_ARGS \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_70b.log

View File

@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --distributed-backend nccl \
     --load $CKPT_LOAD_DIR \
-    --save $CKPT_SAVE_DIR
+    --save $CKPT_SAVE_DIR \
+    | tee logs/train_llama2_7b.log

View File

@@ -55,6 +55,7 @@
 pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
 cd ModelLink
+mkdir logs
 pip install -r requirements.txt
 cd ..

View File

@@ -55,6 +55,7 @@ Recommended hardware configuration for inference:
 pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
 cd ModelLink
+mkdir logs
 pip install -r requirements.txt
 cd ..

View File

@@ -108,4 +108,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $GPT_ARGS \
     $DATA_ARGS \
     $OUTPUT_ARGS \
-    --distributed-backend nccl | tee train.log
+    --distributed-backend nccl \
+    | tee logs/train_mixtral.log

View File

@@ -47,6 +47,7 @@ Hardware configuration for Qwen-7B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment
@@ -266,7 +267,8 @@ Hardware configuration for Qwen-14B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment
@@ -492,6 +494,7 @@ Hardware configuration for Qwen-72B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment

View File

@@ -46,6 +46,7 @@ Here's a hardware summary of pre-training Qwen-7B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Build environment
@@ -261,6 +262,7 @@ Here's a hardware summary of pre-training Qwen-14B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Build environment
@@ -483,6 +485,7 @@ Here's a hardware summary of pre-training Qwen-72B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Build environment

View File

@@ -89,4 +89,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR}
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_qwen_14b.log

View File

@@ -90,4 +90,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR}
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_qwen_72b.log

View File

@@ -89,4 +89,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $OUTPUT_ARGS \
     --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
     --distributed-backend nccl \
-    --save ${CKPT_SAVE_DIR}
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_qwen_7b.log

View File

@@ -1,4 +1,3 @@
-apex
 numpy
 transformers==4.36.0
 transformers_stream_generator
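
Dropping `apex` from requirements.txt matches the install flow shown in the inference README hunks above: the Ascend build of apex ships as a prebuilt wheel and is installed by hand, so a bare `apex` entry would make pip try to resolve the name from PyPI, which does not provide this Ascend build. The implied order, assembled from the hunks in this commit:

```shell
# Install order implied by the diffs above: the Ascend apex wheel first,
# then the remaining Python dependencies (which no longer name apex).
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
cd ModelLink
mkdir logs
pip install -r requirements.txt
```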