!1373 Fix typos and issues

Merge pull request !1373 from fengliangjun/master
fengliangjun 2024-06-24 13:00:04 +00:00 committed by i-robot
parent aa3013f9b7
commit f80514ad86
61 changed files with 12 additions and 146 deletions

OWNERS

@@ -7,6 +7,7 @@ reviewers:
- guhangsong
- leizhenzhen
- liuyanghan
- jzh
- guo-xinjie-1
- wucong0509
- dhrhank
@@ -15,7 +16,7 @@ reviewers:
- sunjunjie1587
- dingzicha
- zhangjianxiang4
- guozhihua
- guozhihua9814
- huangyiming123
- zhangshengdong
- xiong-liangcheng_admin

@@ -2,7 +2,6 @@
# See README, please remember to source the set_env.sh file in CLI, or here
# source /path/to/your/ascend-toolkit/set_env.sh
export TOKENIZERS_PARALLELISM=false
export CUDA_DEVICE_MAX_CONNECTIONS=1
CKPT_LOAD_DIR="your checkpoint load dir"
TOKENIZER_PATH="your tokenizer path"

@@ -3,7 +3,6 @@
# See README, please remember to source the set_env.sh file in CLI, or here
# source /path/to/your/ascend-toolkit/set_env.sh
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
CKPT_SAVE_DIR="your checkpoint save dir"
DATA_PATH="your training data dir"

@@ -2,7 +2,6 @@
# See README, please remember to source the set_env.sh file in CLI, or here
# source /path/to/your/ascend-toolkit/set_env.sh
export TOKENIZERS_PARALLELISM=false
export CUDA_DEVICE_MAX_CONNECTIONS=1
# please fill these path configurations

@@ -2,7 +2,6 @@
# See README, please remember to source the set_env.sh file in CLI, or here
# source /path/to/your/ascend-toolkit/set_env.sh
export TOKENIZERS_PARALLELISM=false
export CUDA_DEVICE_MAX_CONNECTIONS=1
# please fill these path configurations

@@ -3,7 +3,6 @@
# See README, please remember to source the set_env.sh file in CLI, or here
# source /path/to/your/ascend-toolkit/set_env.sh
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
# please fill these path configurations
CKPT_SAVE_DIR="your checkpoint save dir"

@@ -3,7 +3,6 @@
# See README, please remember to source the set_env.sh file in CLI, or here
# source /path/to/your/ascend-toolkit/set_env.sh
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
# please fill these path configurations
CKPT_SAVE_DIR="your checkpoint save dir"

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -2,7 +2,6 @@
# The number of parameters is not aligned
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM=false
export HCCL_CONNECT_TIMEOUT=1200

@@ -2,7 +2,6 @@
# The number of parameters is not aligned
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM=false
export HCCL_CONNECT_TIMEOUT=1200

@@ -2,7 +2,6 @@
# The number of parameters is not aligned
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM=false
export HCCL_CONNECT_TIMEOUT=1200

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HCCL_CONNECT_TIMEOUT=1200
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HCCL_CONNECT_TIMEOUT=1200
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,6 +1,5 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,6 +1,5 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -5,7 +5,6 @@ export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LI
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
# please fill these path configurations
CHECKPOINT="your model path"

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -5,7 +5,6 @@ export COMBINED_ENABLE=1
export AZUREML_EXPERIMENT_ID=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,6 +1,5 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,5 +1,4 @@
#!/bin/bash
export NPU_ASD_ENABLE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -5,7 +5,6 @@ export COMBINED_ENABLE=1
export AZUREML_EXPERIMENT_ID=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -5,7 +5,6 @@ export COMBINED_ENABLE=1
export AZUREML_EXPERIMENT_ID=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,5 +1,4 @@
#!/bin/bash
export NPU_ASD_ENABLE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -4,7 +4,6 @@
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM=false
# please fill these path configurations
CHECKPOINT="your model ckpt path"

@@ -1,6 +1,5 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR="your master node IP"

@@ -98,7 +98,7 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# HF to tp1-pp8-ep2
# HF to tp8-pp4-ep1
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_hf \
@@ -137,7 +137,7 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to HF
# tp8-pp4-ep1 to HF
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_mg \
@@ -261,7 +261,7 @@ Mixtral-8x7B on four nodes with 32 cards (tp8 pp4): **Ascend chips** and **reference chips**
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Modify the model weight path and tokenizer path
CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/"
CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp8-pp1-ep1/"
TOKENIZER_MODEL="./model_from_hf/Mixtral-8x7B/"
# Modify the parallel configuration according to the model weights actually loaded
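
The conversion commands in the hunks above are cut off by the hunk boundaries right after `--loader`. Written out in full, the HF → tp8-pp4-ep1 call would look roughly like the sketch below; the `--saver` name, the `--target-*-parallel-size` flags, and the exact paths are assumptions inferred from the surrounding README, not taken from this diff.

```bash
# Rough sketch only: HF weights -> Megatron weights sliced as tp8-pp4-ep1.
# Flags beyond --model-type/--loader are assumed; verify against the ModelLink docs.
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_hf \
    --saver mixtral \
    --target-tensor-parallel-size 8 \
    --target-pipeline-parallel-size 4 \
    --target-expert-parallel-size 1 \
    --load-dir ./model_from_hf/Mixtral-8x7B/ \
    --save-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
    --tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model
```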

@@ -99,7 +99,7 @@ Recommended hardware configuration for inference:
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# HF to tp1-pp8-ep2
# HF to tp8-pp4-ep1
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_hf \
@@ -113,13 +113,13 @@ Recommended hardware configuration for inference:
```
Any Megatron weights with parallel slicing strategy --> Any Megatron weights with parallel slicing strategy
***(This scenario is generally used to reconfigure the sliced model weights, such as training on a dual-node 16-card EP2-PP8 strategy, and then wanting to infer on a single-node 8-card TP8)***
***(This scenario is generally used to reconfigure the sliced model weights, such as training on a four-node 32-card TP8-PP4 strategy, and then wanting to infer on a single-node 8-card TP8)***
```bash
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to tp1-pp8-ep1
# tp8-pp4-ep1 to tp8-pp1-ep1
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_mg \
@@ -138,7 +138,7 @@ Recommended hardware configuration for inference:
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to HF
# tp8-pp4-ep1 to HF
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_mg \
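
The re-slicing scenario described a few hunks above (train as tp8-pp4-ep1 on four nodes and 32 cards, then infer as tp8-pp1-ep1 on a single node) uses the same converter with the Megatron loader. A rough sketch, with the saver name and target-parallel flags assumed rather than copied from this diff:

```bash
# Rough sketch only: re-slice Megatron weights from tp8-pp4-ep1 to tp8-pp1-ep1.
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_mg \
    --saver mixtral \
    --target-tensor-parallel-size 8 \
    --target-pipeline-parallel-size 1 \
    --target-expert-parallel-size 1 \
    --load-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
    --save-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp1-ep1/
```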

@@ -4,7 +4,6 @@
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM=false
MASTER_ADDR=localhost
MASTER_PORT=6000

@@ -4,7 +4,6 @@
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM=false
# please fill these path configurations
CHECKPOINT="your model ckpt path"

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -2,7 +2,6 @@
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HCCL_CONNECT_TIMEOUT=1800
export NPU_ASD_ENABLE=0
MASTER_ADDR=localhost
NPUS_PER_NODE=8

@@ -1,7 +1,7 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
# Change for multinode config

@@ -2,7 +2,6 @@
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HCCL_CONNECT_TIMEOUT=1800
export NPU_ASD_ENABLE=0
MASTER_ADDR=localhost
NPUS_PER_NODE=8

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -2,7 +2,6 @@
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HCCL_CONNECT_TIMEOUT=1800
export NPU_ASD_ENABLE=0
MASTER_ADDR=localhost
NPUS_PER_NODE=8

@@ -2,7 +2,6 @@
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HCCL_CONNECT_TIMEOUT=1800
export NPU_ASD_ENABLE=0
MASTER_ADDR=localhost
NPUS_PER_NODE=8

@@ -1,7 +1,6 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
NPUS_PER_NODE=8
MASTER_ADDR=localhost

@@ -21,7 +21,7 @@ import setuptools
if sys.version_info < (3,):
    raise Exception("Python 2 is not supported by ModelLink.")
__description__ = 'AscendSpeed for LLMs of Ascend'
__description__ = 'ModelLink for LLMs of Ascend'
__version__ = '0.0.1'
__author__ = 'Ascend'
__long_description__ = 'ModelLink for LLMs of Ascend'

@@ -1,5 +1,4 @@
#!/bin/bash
export TOKENIZERS_PARALLELISM=false
export CUDA_DEVICE_MAX_CONNECTIONS=1
MASTER_ADDR=localhost

@@ -60,7 +60,7 @@ def main():
    parser.add_argument('--loader', type=str, default='megatron',
                        help='Module name to load checkpoint, should be on python path')
    parser.add_argument('--saver', type=str, default='megatron',
                        help='Module name to save checkpoint, shdoul be on python path')
                        help='Module name to save checkpoint, should be on python path')
    parser.add_argument('--load-dir', type=str, required=True,
                        help='Directory to load model checkpoint from')
    parser.add_argument('--lora-dir', type=str,

@@ -1,81 +0,0 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# This source code is licensed under the Apache license found in the
# LICENSE file in the root directory of this source tree.

import os
import types

import torch

from megatron import get_retro_args
from megatron.tokenizer.tokenizer import (
    _BertWordPieceTokenizer,
    _GPT2BPETokenizer,
    _GPTSentencePieceTokenizer,
)


def get_args_path(workdir):
    '''Argument copy stored within retro workdir.'''
    return os.path.join(workdir, "args.json")


def get_num_chunks_per_sample():
    '''Compute seq_length // chunk_length.'''
    args = get_retro_args()
    sample_length = args.retro_gpt_seq_length
    chunk_length = args.retro_gpt_chunk_length
    if sample_length % chunk_length != 0:
        raise ValueError('chunk_length should be divisible by sample_length.')
    return sample_length // chunk_length


def get_gpt_tokenizer():
    '''GPT (BPE) tokenizer.'''
    args = get_retro_args()
    tokenizer_type = args.retro_gpt_tokenizer_type
    if tokenizer_type == "GPT2BPETokenizer":
        if not args.retro_gpt_vocab_file or not args.retro_gpt_merge_file:
            raise ValueError('retro_gpt_vocab_file and retro_gpt_merge_file should not be none.')
        return _GPT2BPETokenizer(
            vocab_file=args.retro_gpt_vocab_file,
            merge_file=args.retro_gpt_merge_file,
        )
    elif tokenizer_type == 'GPTSentencePieceTokenizer':
        if args.retro_gpt_tokenizer_model is None:
            raise ValueError('retro_gpt_tokenizer_model is None.')
        return _GPTSentencePieceTokenizer(args.retro_gpt_tokenizer_model)
    else:
        raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type)


def get_bert_tokenizer():
    '''Bert (Wordpiece) tokenizer.'''
    args = get_retro_args()
    lower_case = {
        "BertWordPieceLowerCase" : True,
        "BertWordPieceCase" : False,
    }[args.retro_bert_tokenizer_type]
    return _BertWordPieceTokenizer(
        vocab_file=args.retro_bert_vocab_file,
        lower_case=lower_case,
    )


class GPTToTextDataset(torch.utils.data.Dataset):
    '''Dataset to convert GPT tokens to text.'''

    def __init__(self, gpt_dataset):
        super().__init__()
        self.gpt_dataset = gpt_dataset
        self.gpt_tokenizer = get_gpt_tokenizer()

    def __len__(self):
        return len(self.gpt_dataset)

    def __getitem__(self, idx):
        gpt_token_ids = self.gpt_dataset[idx]["text"].tolist()
        text = self.gpt_tokenizer.detokenize(gpt_token_ids)
        return {"text": text}