Mirror of https://gitee.com/ascend/ModelLink.git (synced 2024-12-05 21:37:43 +08:00)
!1074 Remove the apex dependency from requirements.txt; standardize the model training scripts and add log archiving
* Remove the apex dependency from requirements.txt; standardize the model training scripts and add log archiving
This commit is contained in:
parent
670cad5dfe
commit
bf6456e04c
@@ -84,4 +84,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_baichuan_13b.log
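Every training-script hunk in this commit applies the same normalization: the final argument gains a trailing backslash, and the whole `torchrun` (or `torch.distributed.launch`) invocation is piped through `tee`, which streams output to the console while archiving a copy under `logs/`. A minimal sketch of the resulting script tail, based on the Baichuan-13B hunk above (the `GPT_ARGS` variable and the other `*_ARGS` definitions are assumed to be set earlier in the real script):

```shell
# Stream training output to the console and archive it under logs/.
# DISTRIBUTED_ARGS, GPT_ARGS, DATA_ARGS, OUTPUT_ARGS and CKPT_SAVE_DIR
# are assumed to be defined earlier in the script.
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CKPT_SAVE_DIR \
    | tee logs/train_baichuan_13b.log
```

Note that `tee` does not create missing directories, which is why the README setup hunks later in this commit add `mkdir logs` before the scripts run.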
@@ -87,4 +87,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_baichuan_7b.log
@@ -92,4 +92,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_baichuan2_13b.log
@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_baichuan2_7b.log
@@ -222,6 +222,8 @@ Hardware configuration for Bloom-176B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
+mkdir ckpt
 ```

 2. Set up the environment
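The README setup hunks support the logging change: `tee` does not create its output directory, so the clone step now creates `logs/` (and, for Bloom, a `ckpt/` directory for checkpoints) up front. A sketch of the adjusted setup step from the Bloom-176B hunk above:

```shell
git clone https://gitee.com/ascend/ModelLink.git
cd ModelLink
mkdir logs   # destination for the train_*.log files written via tee
mkdir ckpt   # destination for the checkpoints written via --save
```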
@@ -226,6 +226,8 @@ Here's a hardware summary of pre-training Bloom-176B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
+mkdir ckpt
 ```

 2. Build environment
@@ -89,4 +89,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_bloom_176b.log
@@ -82,4 +82,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_bloom_7b.log
@@ -90,5 +90,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save ${CKPT_SAVE_DIR} | tee train_internlm_65B.log
+--save ${CKPT_SAVE_DIR} \
+| tee logs/train_internlm_65B.log
@@ -91,4 +91,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save ${CKPT_SAVE_DIR} | tee train_internlm_7b.log
+--save ${CKPT_SAVE_DIR} \
+| tee logs/train_internlm_7b.log
@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save ${SAVE_CHECKPOINT_PATH}
+--save ${SAVE_CHECKPOINT_PATH} \
+| tee logs/train_llama_13b.log
@@ -87,4 +87,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_llama_33b.log
@@ -87,5 +87,6 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save ${SAVE_CHECKPOINT_PATH}
+--save ${SAVE_CHECKPOINT_PATH} \
+| tee logs/train_llama_65b.log
@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save ${SAVE_CHECKPOINT_PATH}
+--save ${SAVE_CHECKPOINT_PATH} \
+| tee logs/train_llama_7b.log
@@ -86,4 +86,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_llama2_13b.log
@@ -88,4 +88,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_llama2_34b.log
@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $DATA_ARGS \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_llama2_70b.log
@@ -86,4 +86,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $OUTPUT_ARGS \
 --distributed-backend nccl \
 --load $CKPT_LOAD_DIR \
---save $CKPT_SAVE_DIR
+--save $CKPT_SAVE_DIR \
+| tee logs/train_llama2_7b.log
@@ -55,6 +55,7 @@
 pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl

 cd ModelLink
+mkdir logs
 pip install -r requirements.txt
 cd ..
@@ -55,6 +55,7 @@ Recommended hardware configuration for inference:
 pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl

 cd ModelLink
+mkdir logs
 pip install -r requirements.txt
 cd ..
@@ -108,4 +108,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $GPT_ARGS \
 $DATA_ARGS \
 $OUTPUT_ARGS \
---distributed-backend nccl | tee train.log
+--distributed-backend nccl \
+| tee logs/train_mixtral.log
@@ -47,6 +47,7 @@ Hardware configuration for Qwen-7B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment
@@ -266,7 +267,8 @@ Hardware configuration for Qwen-14B training:

 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment
@@ -492,6 +494,7 @@ Hardware configuration for Qwen-72B training:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```
 2. Set up the environment
@@ -46,6 +46,7 @@ Here's a hardware summary of pre-training Qwen-7B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```

 2. Build environment
@@ -261,6 +262,7 @@ Here's a hardware summary of pre-training Qwen-14B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```

 2. Build environment
@@ -483,6 +485,7 @@ Here's a hardware summary of pre-training Qwen-72B:
 ```shell
 git clone https://gitee.com/ascend/ModelLink.git
 cd ModelLink
+mkdir logs
 ```

 2. Build environment
@@ -89,4 +89,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $OUTPUT_ARGS \
 --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
 --distributed-backend nccl \
---save ${CKPT_SAVE_DIR}
+--save ${CKPT_SAVE_DIR} \
+| tee logs/train_qwen_14b.log
@@ -90,4 +90,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $OUTPUT_ARGS \
 --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
 --distributed-backend nccl \
---save ${CKPT_SAVE_DIR}
+--save ${CKPT_SAVE_DIR} \
+| tee logs/train_qwen_72b.log
@@ -89,4 +89,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
 $OUTPUT_ARGS \
 --tokenizer-kwargs 'eos_token' '<|endoftext|>' 'pad_token' '<|extra_0|>' \
 --distributed-backend nccl \
---save ${CKPT_SAVE_DIR}
+--save ${CKPT_SAVE_DIR} \
+| tee logs/train_qwen_7b.log
@@ -1,4 +1,3 @@
-apex
 numpy
 transformers==4.36.0
 transformers_stream_generator
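The requirements.txt change complements the setup instructions above: on Ascend machines, apex is installed from a prebuilt wheel (`apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl`), so listing `apex` in requirements.txt would make `pip install -r requirements.txt` try to resolve it from PyPI instead, which is likely to fail or fetch an unrelated package. A sketch of the resulting install order, following the updated READMEs:

```shell
# Install the Ascend-specific apex wheel first ...
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl

# ... then the remaining dependencies. apex is no longer listed in
# requirements.txt, so pip will not attempt to fetch it from PyPI.
cd ModelLink
mkdir logs
pip install -r requirements.txt
cd ..
```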