Mirror of https://gitee.com/ascend/ModelLink.git (synced 2024-12-01 19:39:02 +08:00)

Commit 393f78a3a3 (parent 4a42cf21f2): remove unused functions and files.
@ -193,6 +193,9 @@ wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/t
or wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/torch_npu-1.11.0.post1-cp37-cp37m-linux_x86_64.whl (X86)
pip install torch_npu-1.11.0.post1-cp37-cp37m-linux_XXXXXX.whl

# install megatron-core
pip3 install -e git+https://github.com/NVIDIA/Megatron-LM.git@23.05#egg=megatron-core

# install deepspeed and deepspeed_npu
pip install deepspeed==0.9.2
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import logging

import torch
@ -36,6 +37,10 @@ from .package_info import (
    __version__,
)

if 'amp_C' in sys.modules:
    del sys.modules['amp_C']
sys.modules['amp_C'] = __import__('ascendspeed.amp_C')

from .global_vars import get_args
from .global_vars import get_current_global_batch_size
from .global_vars import get_num_microbatches
@ -360,17 +360,17 @@ def get_state_dict_and_release(load_dir, lora_load_dir=None):
        state_dict = load_state_dict_from_checkpoint_with_megatron(checkpoint_name,
                                                                   model_checkpoint_name=model_checkpoint_name)
    except ModuleNotFoundError:
        from ascendspeed.fp16_deprecated import loss_scaler
        from megatron.fp16_deprecated import loss_scaler
        # For backward compatibility.
        print_rank_0(' > deserializing using the old code structure ...')
        sys.modules['fp16.loss_scaler'] = sys.modules[
            'ascendspeed.fp16_deprecated.loss_scaler']
        sys.modules['ascendspeed.fp16.loss_scaler'] = sys.modules[
            'ascendspeed.fp16_deprecated.loss_scaler']
            'megatron.fp16_deprecated.loss_scaler']
        sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
            'megatron.fp16_deprecated.loss_scaler']
        state_dict = load_state_dict_from_checkpoint_with_megatron(checkpoint_name,
                                                                   model_checkpoint_name=model_checkpoint_name)
        sys.modules.pop('fp16.loss_scaler', None)
        sys.modules.pop('ascendspeed.fp16.loss_scaler', None)
        sys.modules.pop('megatron.fp16.loss_scaler', None)
    except BaseException as e:
        print_rank_0('could not load the checkpoint')
        print_rank_0(e)
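The hunk above works because old checkpoints were pickled while the loss-scaler classes lived under a module path that no longer exists; temporarily aliasing the old path in `sys.modules` lets pickle resolve them, and the `pop` calls clean the aliases up afterwards. A minimal, self-contained sketch of the same pattern — the package and class names (`old_pkg`, `new_pkg`, `LossScaler`) are hypothetical stand-ins, not names from this repository:

```python
import pickle
import sys
import types

# Stand-in for a class that an old checkpoint pickled under a module path
# that no longer exists after a package rename (all names here are made up).
class LossScaler:
    def __init__(self, scale):
        self.scale = scale

# Make the class look like it lives in 'old_pkg.scaler', then pickle an instance.
old_pkg = types.ModuleType("old_pkg")
old_mod = types.ModuleType("old_pkg.scaler")
old_mod.LossScaler = LossScaler
LossScaler.__module__ = "old_pkg.scaler"
sys.modules["old_pkg"], sys.modules["old_pkg.scaler"] = old_pkg, old_mod
blob = pickle.dumps(LossScaler(2.0))

# The rename removes the old module path; the class now lives somewhere new.
del sys.modules["old_pkg"], sys.modules["old_pkg.scaler"]
new_mod = types.ModuleType("new_pkg.scaler")
new_mod.LossScaler = LossScaler
sys.modules["new_pkg.scaler"] = new_mod

# Plain unpickling would now raise ModuleNotFoundError, so alias the old path
# onto the new module for the duration of the load, then drop the aliases.
sys.modules["old_pkg"] = types.ModuleType("old_pkg")
sys.modules["old_pkg.scaler"] = sys.modules["new_pkg.scaler"]
restored = pickle.loads(blob)
sys.modules.pop("old_pkg.scaler", None)
sys.modules.pop("old_pkg", None)
assert restored.scale == 2.0
```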
@ -1,319 +0,0 @@
|
||||
"""AutoAugment data augmentation policy for ImageNet.
|
||||
|
||||
-- Begin license text.
|
||||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2018 Philip Popien
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
-- End license text.
|
||||
|
||||
|
||||
This module implements the fixed AutoAugment data augmentation policy for ImageNet provided in
|
||||
Appendix A, Table 9 of reference [1]. It does not include any of the search code for augmentation
|
||||
policies.
|
||||
|
||||
Reference:
|
||||
[1] https://arxiv.org/abs/1805.09501
|
||||
"""
|
||||
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from PIL import ImageEnhance
|
||||
from PIL import ImageOps
|
||||
|
||||
_MAX_LEVEL = 10 # Maximum integer strength of an augmentation, if applicable.
|
||||
|
||||
|
||||
class ImageNetPolicy:
|
||||
"""Definition of an ImageNetPolicy.
|
||||
|
||||
Implements a fixed AutoAugment data augmentation policy targeted at
|
||||
ImageNet training by randomly applying at runtime one of the 25 pre-defined
|
||||
data augmentation sub-policies provided in Reference [1].
|
||||
|
||||
Usage example as a Pytorch Transform:
|
||||
>>> transform=transforms.Compose([transforms.Resize(256),
|
||||
>>> ImageNetPolicy(),
|
||||
>>> transforms.ToTensor()])
|
||||
"""
|
||||
|
||||
def __init__(self, fillcolor=(128, 128, 128)):
|
||||
"""Initialize an ImageNetPolicy.
|
||||
|
||||
Args:
|
||||
fillcolor (tuple): RGB color components of the color to be used for
|
||||
filling when needed (default: (128, 128, 128), which
|
||||
corresponds to gray).
|
||||
"""
|
||||
# Instantiate a list of sub-policies.
|
||||
# Each entry of the list is a SubPolicy which consists of
|
||||
# two augmentation operations,
|
||||
# each of those parametrized as operation, probability, magnitude.
|
||||
# Those two operations are applied sequentially on the image upon call.
|
||||
self.policies = [
|
||||
SubPolicy("posterize", 0.4, 8, "rotate", 0.6, 9, fillcolor),
|
||||
SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor),
|
||||
SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor),
|
||||
SubPolicy("posterize", 0.6, 7, "posterize", 0.6, 6, fillcolor),
|
||||
SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor),
|
||||
SubPolicy("equalize", 0.4, 4, "rotate", 0.8, 8, fillcolor),
|
||||
SubPolicy("solarize", 0.6, 3, "equalize", 0.6, 7, fillcolor),
|
||||
SubPolicy("posterize", 0.8, 5, "equalize", 1.0, 2, fillcolor),
|
||||
SubPolicy("rotate", 0.2, 3, "solarize", 0.6, 8, fillcolor),
|
||||
SubPolicy("equalize", 0.6, 8, "posterize", 0.4, 6, fillcolor),
|
||||
SubPolicy("rotate", 0.8, 8, "color", 0.4, 0, fillcolor),
|
||||
SubPolicy("rotate", 0.4, 9, "equalize", 0.6, 2, fillcolor),
|
||||
SubPolicy("equalize", 0.0, 7, "equalize", 0.8, 8, fillcolor),
|
||||
SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor),
|
||||
SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor),
|
||||
SubPolicy("rotate", 0.8, 8, "color", 1.0, 2, fillcolor),
|
||||
SubPolicy("color", 0.8, 8, "solarize", 0.8, 7, fillcolor),
|
||||
SubPolicy("sharpness", 0.4, 7, "invert", 0.6, 8, fillcolor),
|
||||
SubPolicy("shearX", 0.6, 5, "equalize", 1.0, 9, fillcolor),
|
||||
SubPolicy("color", 0.4, 0, "equalize", 0.6, 3, fillcolor),
|
||||
SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor),
|
||||
SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor),
|
||||
SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor),
|
||||
SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor),
|
||||
SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor),
|
||||
]
|
||||
|
||||
def __call__(self, img):
|
||||
"""Define call method for ImageNetPolicy class."""
|
||||
policy_idx = random.randint(0, len(self.policies) - 1)
|
||||
return self.policies[policy_idx](img)
|
||||
|
||||
def __repr__(self):
|
||||
"""Define repr method for ImageNetPolicy class."""
|
||||
return "ImageNetPolicy"
|
||||
|
||||
|
||||
class SubPolicy:
|
||||
"""Definition of a SubPolicy.
|
||||
|
||||
A SubPolicy consists of two augmentation operations,
|
||||
each of those parametrized as operation, probability, magnitude.
|
||||
The two operations are applied sequentially on the image upon call.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
operation1,
|
||||
probability1,
|
||||
magnitude_idx1,
|
||||
operation2,
|
||||
probability2,
|
||||
magnitude_idx2,
|
||||
fillcolor,
|
||||
):
|
||||
"""Initialize a SubPolicy.
|
||||
|
||||
Args:
|
||||
operation1 (str): Key specifying the first augmentation operation.
|
||||
There are fourteen key values altogether (see supported_ops below
|
||||
listing supported operations). probability1 (float): Probability
|
||||
within [0., 1.] of applying the first augmentation operation.
|
||||
magnitude_idx1 (int): Integer specifying the strength of the first
|
||||
operation as an index further used to derive the magnitude from a
|
||||
range of possible values.
|
||||
operation2 (str): Key specifying the second augmentation operation.
|
||||
probability2 (float): Probability within [0., 1.] of applying the
|
||||
second augmentation operation.
|
||||
magnitude_idx2 (int): Integer specifying the strength of the
|
||||
second operation as an index further used to derive the magnitude
|
||||
from a range of possible values.
|
||||
fillcolor (tuple): RGB color components of the color to be used for
|
||||
filling.
|
||||
Returns:
|
||||
"""
|
||||
# List of supported operations for operation1 and operation2.
|
||||
supported_ops = [
|
||||
"shearX",
|
||||
"shearY",
|
||||
"translateX",
|
||||
"translateY",
|
||||
"rotate",
|
||||
"color",
|
||||
"posterize",
|
||||
"solarize",
|
||||
"contrast",
|
||||
"sharpness",
|
||||
"brightness",
|
||||
"autocontrast",
|
||||
"equalize",
|
||||
"invert",
|
||||
]
|
||||
assert (operation1 in supported_ops) and (
|
||||
operation2 in supported_ops
|
||||
), "SubPolicy:one of oper1 or oper2 refers to an unsupported operation."
|
||||
|
||||
assert (
|
||||
0.0 <= probability1 <= 1.0 and 0.0 <= probability2 <= 1.0
|
||||
), "SubPolicy: prob1 and prob2 should be within [0., 1.]."
|
||||
|
||||
assert (
|
||||
isinstance(magnitude_idx1, int) and 0 <= magnitude_idx1 <= 10
|
||||
), "SubPolicy: idx1 should be specified as an integer within [0, 10]."
|
||||
|
||||
assert (
|
||||
isinstance(magnitude_idx2, int) and 0 <= magnitude_idx2 <= 10
|
||||
), "SubPolicy: idx2 should be specified as an integer within [0, 10]."
|
||||
|
||||
# Define a dictionary where each key refers to a specific type of
|
||||
# augmentation and the corresponding value is a range of ten possible
|
||||
# magnitude values for that augmentation.
|
||||
num_levels = _MAX_LEVEL + 1
|
||||
ranges = {
|
||||
"shearX": np.linspace(0, 0.3, num_levels),
|
||||
"shearY": np.linspace(0, 0.3, num_levels),
|
||||
"translateX": np.linspace(0, 150 / 331, num_levels),
|
||||
"translateY": np.linspace(0, 150 / 331, num_levels),
|
||||
"rotate": np.linspace(0, 30, num_levels),
|
||||
"color": np.linspace(0.0, 0.9, num_levels),
|
||||
"posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(
|
||||
np.int
|
||||
),
|
||||
"solarize": np.linspace(256, 0, num_levels), # range [0, 256]
|
||||
"contrast": np.linspace(0.0, 0.9, num_levels),
|
||||
"sharpness": np.linspace(0.0, 0.9, num_levels),
|
||||
"brightness": np.linspace(0.0, 0.9, num_levels),
|
||||
"autocontrast": [0]
|
||||
* num_levels, # This augmentation doesn't use magnitude parameter.
|
||||
"equalize": [0]
|
||||
* num_levels, # This augmentation doesn't use magnitude parameter.
|
||||
"invert": [0]
|
||||
* num_levels, # This augmentation doesn't use magnitude parameter.
|
||||
}
|
||||
|
||||
def rotate_with_fill(img, magnitude):
|
||||
"""Define rotation transformation with fill.
|
||||
|
||||
The input image is first rotated, then it is blended together with
|
||||
a gray mask of the same size. Note that fillcolor as defined
|
||||
elsewhere in this module doesn't apply here.
|
||||
|
||||
Args:
|
||||
magnitude (float): rotation angle in degrees.
|
||||
Returns:
|
||||
rotated_filled (PIL Image): rotated image with gray filling for
|
||||
disoccluded areas unveiled by the rotation.
|
||||
"""
|
||||
rotated = img.convert("RGBA").rotate(magnitude)
|
||||
rotated_filled = Image.composite(
|
||||
rotated, Image.new("RGBA", rotated.size, (128,) * 4), rotated
|
||||
)
|
||||
return rotated_filled.convert(img.mode)
|
||||
|
||||
# Define a dictionary of augmentation functions where each key refers
|
||||
# to a specific type of augmentation and the corresponding value defines
|
||||
# the augmentation itself using a lambda function.
|
||||
# pylint: disable=unnecessary-lambda
|
||||
func_dict = {
|
||||
"shearX": lambda img, magnitude: img.transform(
|
||||
img.size,
|
||||
Image.AFFINE,
|
||||
(1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
|
||||
Image.BICUBIC,
|
||||
fillcolor=fillcolor,
|
||||
),
|
||||
"shearY": lambda img, magnitude: img.transform(
|
||||
img.size,
|
||||
Image.AFFINE,
|
||||
(1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
|
||||
Image.BICUBIC,
|
||||
fillcolor=fillcolor,
|
||||
),
|
||||
"translateX": lambda img, magnitude: img.transform(
|
||||
img.size,
|
||||
Image.AFFINE,
|
||||
(
|
||||
1,
|
||||
0,
|
||||
magnitude * img.size[0] * random.choice([-1, 1]),
|
||||
0,
|
||||
1,
|
||||
0,
|
||||
),
|
||||
fillcolor=fillcolor,
|
||||
),
|
||||
"translateY": lambda img, magnitude: img.transform(
|
||||
img.size,
|
||||
Image.AFFINE,
|
||||
(
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
1,
|
||||
magnitude * img.size[1] * random.choice([-1, 1]),
|
||||
),
|
||||
fillcolor=fillcolor,
|
||||
),
|
||||
"rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
|
||||
"color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
|
||||
1 + magnitude * random.choice([-1, 1])
|
||||
),
|
||||
"posterize": lambda img, magnitude: ImageOps.posterize(
|
||||
img, magnitude
|
||||
),
|
||||
"solarize": lambda img, magnitude: ImageOps.solarize(
|
||||
img, magnitude
|
||||
),
|
||||
"contrast": lambda img, magnitude: ImageEnhance.Contrast(
|
||||
img
|
||||
).enhance(1 + magnitude * random.choice([-1, 1])),
|
||||
"sharpness": lambda img, magnitude: ImageEnhance.Sharpness(
|
||||
img
|
||||
).enhance(1 + magnitude * random.choice([-1, 1])),
|
||||
"brightness": lambda img, magnitude: ImageEnhance.Brightness(
|
||||
img
|
||||
).enhance(1 + magnitude * random.choice([-1, 1])),
|
||||
"autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
|
||||
"equalize": lambda img, magnitude: ImageOps.equalize(img),
|
||||
"invert": lambda img, magnitude: ImageOps.invert(img),
|
||||
}
|
||||
|
||||
# Store probability, function and magnitude of the first augmentation
|
||||
# for the sub-policy.
|
||||
self.probability1 = probability1
|
||||
self.operation1 = func_dict[operation1]
|
||||
self.magnitude1 = ranges[operation1][magnitude_idx1]
|
||||
|
||||
# Store probability, function and magnitude of the second augmentation
|
||||
# for the sub-policy.
|
||||
self.probability2 = probability2
|
||||
self.operation2 = func_dict[operation2]
|
||||
self.magnitude2 = ranges[operation2][magnitude_idx2]
|
||||
|
||||
def __call__(self, img):
|
||||
"""Define call method for SubPolicy class."""
|
||||
# Randomly apply operation 1.
|
||||
if random.random() < self.probability1:
|
||||
img = self.operation1(img, self.magnitude1)
|
||||
|
||||
# Randomly apply operation 2.
|
||||
if random.random() < self.probability2:
|
||||
img = self.operation2(img, self.magnitude2)
|
||||
|
||||
return img
|
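For reference, the removed `ImageNetPolicy` above is designed to sit inside a standard torchvision preprocessing pipeline, as its class docstring already hints. A hedged usage sketch (torchvision is assumed to be installed and `example.jpg` is a made-up input path):

```python
from PIL import Image
from torchvision import transforms

# AutoAugment-style policies operate on PIL images, so the policy must run
# before ToTensor() converts the image to a tensor.
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    ImageNetPolicy(),   # the class defined in the removed file above
    transforms.ToTensor(),
])

img = Image.open("example.jpg").convert("RGB")  # hypothetical input image
augmented = train_transform(img)                # tensor of shape [3, 224, 224]
```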
@ -48,7 +48,7 @@ class BlendableDataset(torch.utils.data.Dataset):
        self.dataset_index = np.zeros(self.size, dtype=np.uint8)
        self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

        from ascendspeed.data import helpers
        from megatron.data import helpers
        helpers.build_blending_indices(self.dataset_index,
                                       self.dataset_sample_index,
                                       weights, num_datasets, self.size,
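`build_blending_indices`, whose import is being switched here (its C++ source appears further down in this diff), greedily assigns each global sample to whichever dataset currently lags its target weight the most. A rough pure-Python equivalent, shown only to illustrate the algorithm, not to replace the compiled helper:

```python
import numpy as np

def build_blending_indices_py(weights, size):
    """Greedy weighted interleaving, mirroring the C++ helper's logic."""
    weights = np.asarray(weights, dtype=np.float64)
    dataset_index = np.zeros(size, dtype=np.uint8)
    dataset_sample_index = np.zeros(size, dtype=np.int64)
    current = np.zeros(len(weights), dtype=np.int64)
    for i in range(size):
        # Pick the dataset with the largest sampling deficit so far.
        errors = weights * max(i, 1) - current
        d = int(np.argmax(errors))
        dataset_index[i] = d
        dataset_sample_index[i] = current[d]
        current[d] += 1
    return dataset_index, dataset_sample_index

# Example: a 70/30 blend of two datasets over 10 samples.
idx, sample_idx = build_blending_indices_py([0.7, 0.3], size=10)
```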
@ -19,7 +19,6 @@ import time
import glob
import json
import logging
from typing import Tuple

import torch
import numpy as np
@ -28,7 +28,6 @@ from ascendspeed import (
    print_rank_0
)
from ascendspeed.core import parallel_state
from ascendspeed.data.blendable_dataset import BlendableDataset
from ascendspeed.data.indexed_dataset import make_dataset as make_indexed_dataset
from deepspeed.accelerator import get_accelerator
DSET_TYPE_BERT = 'standard_bert'
@ -70,19 +69,6 @@ def get_datasets_weights_and_num_samples(data_prefix,
    return prefixes, weights, datasets_train_valid_test_num_samples


def compile_helper():
    """Compile helper function at runtime. Make sure this
    is invoked on a single process."""
    import os
    import subprocess
    path = os.path.abspath(os.path.dirname(__file__))
    ret = subprocess.run(['make', '-C', path])
    if ret.returncode != 0:
        print("Making C++ dataset helpers module failed, exiting.")
        import sys
        sys.exit(1)


def get_a_and_b_segments(sample, np_rng):
    """Divide sample into a and b segments."""
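Because the removed `compile_helper` shells out to `make`, it must not be run by every rank at once. One common guard, sketched here under the assumption that `torch.distributed` is initialized elsewhere (the wrapper name `maybe_compile_helpers` is made up):

```python
import torch

def maybe_compile_helpers(compile_fn):
    """Run compile_fn (e.g. compile_helper) on rank 0 only; other ranks wait."""
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            compile_fn()                 # compiles the C++ helpers once
        torch.distributed.barrier()      # everyone waits until the .so exists
    else:
        compile_fn()                     # single-process run: compile directly
```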
@ -410,175 +396,6 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
|
||||
return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
|
||||
|
||||
|
||||
def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
|
||||
train_valid_test_num_samples,
|
||||
max_seq_length,
|
||||
masked_lm_prob, short_seq_prob, seed,
|
||||
skip_warmup, binary_head=False,
|
||||
max_seq_length_dec=None,
|
||||
dataset_type='standard_bert'):
|
||||
|
||||
if len(data_prefix) == 1:
|
||||
return _build_train_valid_test_datasets(data_prefix[0],
|
||||
data_impl, splits_string,
|
||||
train_valid_test_num_samples,
|
||||
max_seq_length, masked_lm_prob,
|
||||
short_seq_prob, seed,
|
||||
skip_warmup,
|
||||
binary_head,
|
||||
max_seq_length_dec,
|
||||
dataset_type=dataset_type)
|
||||
# Blending dataset.
|
||||
# Parse the values.
|
||||
output = get_datasets_weights_and_num_samples(data_prefix,
|
||||
train_valid_test_num_samples)
|
||||
prefixes, weights, datasets_train_valid_test_num_samples = output
|
||||
|
||||
# Build individual datasets.
|
||||
train_datasets = []
|
||||
valid_datasets = []
|
||||
test_datasets = []
|
||||
for i in range(len(prefixes)):
|
||||
train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
|
||||
prefixes[i], data_impl, splits_string,
|
||||
datasets_train_valid_test_num_samples[i],
|
||||
max_seq_length, masked_lm_prob, short_seq_prob,
|
||||
seed, skip_warmup, binary_head, dataset_type=dataset_type)
|
||||
if train_ds:
|
||||
train_datasets.append(train_ds)
|
||||
if valid_ds:
|
||||
valid_datasets.append(valid_ds)
|
||||
if test_ds:
|
||||
test_datasets.append(test_ds)
|
||||
|
||||
# Blend.
|
||||
blending_train_dataset = None
|
||||
if train_datasets:
|
||||
blending_train_dataset = BlendableDataset(train_datasets, weights)
|
||||
blending_valid_dataset = None
|
||||
if valid_datasets:
|
||||
blending_valid_dataset = BlendableDataset(valid_datasets, weights)
|
||||
blending_test_dataset = None
|
||||
if test_datasets:
|
||||
blending_test_dataset = BlendableDataset(test_datasets, weights)
|
||||
|
||||
return (blending_train_dataset, blending_valid_dataset,
|
||||
blending_test_dataset)
|
||||
|
||||
|
||||
def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
|
||||
train_valid_test_num_samples,
|
||||
max_seq_length,
|
||||
masked_lm_prob, short_seq_prob, seed,
|
||||
skip_warmup, binary_head,
|
||||
max_seq_length_dec,
|
||||
dataset_type='standard_bert'):
|
||||
|
||||
if dataset_type not in DSET_TYPES:
|
||||
raise ValueError("Invalid dataset_type: ", dataset_type)
|
||||
|
||||
# Indexed dataset.
|
||||
indexed_dataset = get_indexed_dataset_(data_prefix,
|
||||
data_impl,
|
||||
skip_warmup)
|
||||
|
||||
if dataset_type == DSET_TYPE_ICT:
|
||||
args = get_args()
|
||||
title_dataset = get_indexed_dataset_(args.titles_data_path,
|
||||
data_impl,
|
||||
skip_warmup)
|
||||
|
||||
# Get start and end indices of train/valid/test into doc-idx
# Note that doc-idx is designed to be num-docs + 1 so we can
|
||||
# easily iterate over it.
|
||||
total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1
|
||||
splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
|
||||
|
||||
# Print stats about the splits.
|
||||
print_rank_0(' > dataset split:')
|
||||
|
||||
def print_split_stats(name, index):
|
||||
print_rank_0(' {}:'.format(name))
|
||||
print_rank_0(' document indices in [{}, {}) total of {} '
|
||||
'documents'.format(splits[index], splits[index + 1],
|
||||
splits[index + 1] - splits[index]))
|
||||
start_index = indexed_dataset.doc_idx[splits[index]]
|
||||
end_index = indexed_dataset.doc_idx[splits[index + 1]]
|
||||
print_rank_0(' sentence indices in [{}, {}) total of {} '
|
||||
'sentences'.format(start_index, end_index,
|
||||
end_index - start_index))
|
||||
print_split_stats('train', 0)
|
||||
print_split_stats('validation', 1)
|
||||
print_split_stats('test', 2)
|
||||
|
||||
def build_dataset(index, name):
|
||||
from ascendspeed.data.bert_dataset import BertDataset
|
||||
from ascendspeed.data.ict_dataset import ICTDataset
|
||||
from ascendspeed.data.t5_dataset import T5Dataset
|
||||
dataset = None
|
||||
if splits[index + 1] > splits[index]:
|
||||
# Get the pointer to the original doc-idx so we can set it later.
|
||||
doc_idx_ptr = indexed_dataset.get_doc_idx()
|
||||
# Slice the doc-idx
|
||||
start_index = splits[index]
|
||||
# Add +1 so we can index into the dataset to get the upper bound.
|
||||
end_index = splits[index + 1] + 1
|
||||
# New doc_idx view.
|
||||
indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index])
|
||||
# Build the dataset accordingly.
|
||||
kwargs = dict(
|
||||
name=name,
|
||||
data_prefix=data_prefix,
|
||||
num_epochs=None,
|
||||
max_num_samples=train_valid_test_num_samples[index],
|
||||
max_seq_length=max_seq_length,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
if dataset_type == DSET_TYPE_ICT:
|
||||
args = get_args()
|
||||
dataset = ICTDataset(
|
||||
block_dataset=indexed_dataset,
|
||||
title_dataset=title_dataset,
|
||||
query_in_block_prob=args.query_in_block_prob,
|
||||
use_one_sent_docs=args.use_one_sent_docs,
|
||||
binary_head=binary_head,
|
||||
**kwargs
|
||||
)
|
||||
elif dataset_type == DSET_TYPE_T5:
|
||||
dataset = T5Dataset(
|
||||
indexed_dataset=indexed_dataset,
|
||||
masked_lm_prob=masked_lm_prob,
|
||||
max_seq_length_dec=max_seq_length_dec,
|
||||
short_seq_prob=short_seq_prob,
|
||||
**kwargs
|
||||
)
|
||||
elif dataset_type == DSET_TYPE_BERT:
|
||||
dataset = BertDataset(
|
||||
indexed_dataset=indexed_dataset,
|
||||
masked_lm_prob=masked_lm_prob,
|
||||
short_seq_prob=short_seq_prob,
|
||||
binary_head=binary_head,
|
||||
**kwargs
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError("Dataset type not fully implemented.")
|
||||
|
||||
# Set the original pointer so dataset remains the main dataset.
|
||||
indexed_dataset.set_doc_idx(doc_idx_ptr)
|
||||
# Checks.
|
||||
assert indexed_dataset.doc_idx[0] == 0
|
||||
assert indexed_dataset.doc_idx.shape[0] == \
|
||||
(total_num_of_documents + 1)
|
||||
return dataset
|
||||
|
||||
train_dataset = build_dataset(0, 'train')
|
||||
valid_dataset = build_dataset(1, 'valid')
|
||||
test_dataset = build_dataset(2, 'test')
|
||||
|
||||
return (train_dataset, valid_dataset, test_dataset)
|
||||
|
||||
|
||||
def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
|
||||
|
||||
print_rank_0(' > building dataset index ...')
|
||||
@ -702,7 +519,7 @@ def get_samples_mapping(indexed_dataset,
    print_rank_0(' > building samples index mapping for {} ...'.format(
        name))
    # First compile and then import.
    from ascendspeed.data import helpers
    from megatron.data import helpers
    samples_mapping = helpers.build_mapping(
        indexed_dataset.doc_idx,
        indexed_dataset.sizes,
@ -1,220 +0,0 @@
|
||||
import os
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.distributed as dist
|
||||
|
||||
class DistDataError(Exception):
|
||||
"""Defines an empty exception to throw when some other rank hit a real exception."""
|
||||
pass
|
||||
|
||||
class DistData(object):
|
||||
def __init__(self, backend='gloo'):
|
||||
assert backend in ['gloo', 'mpi'], f"torch.distributed backend '{backend}' is not supported, valid options are 'gloo' or 'mpi'"
|
||||
|
||||
dist.init_process_group(backend, init_method="env://")
|
||||
|
||||
# lookup our process rank and the group size
|
||||
self.rank = dist.get_rank()
|
||||
self.numranks = dist.get_world_size()
|
||||
|
||||
def allassert(self, cond, msg):
|
||||
"""Check that cond is True on all ranks, assert with msg everywhere if not.
|
||||
|
||||
To prevent deadlocks in cases where an assertion might only fail on one rank,
|
||||
this executes an allreduce to ensure that if any rank finds that an assertion
|
||||
has been violated, all ranks fail an assertion check.
|
||||
The condition must be true on all ranks for this not to assert.
|
||||
"""
|
||||
alltrue = self.alltrue(cond)
|
||||
assert alltrue, msg
|
||||
|
||||
def allraise_if(self, err):
|
||||
"""Raise exception if err is not None on any rank.
|
||||
|
||||
Similarly to allassert, this raises an exception on all ranks if err
|
||||
is set to an exception on any rank. Rank(s) where err is not None
|
||||
re-raise err as exception, and ranks where err is None raise DistDataError.
|
||||
Thus all ranks raise an exception if any rank has an active exception,
|
||||
which helps avoid deadlocks in cases where an exception may be raised
|
||||
on a subset of ranks.
|
||||
"""
|
||||
alltrue = self.alltrue(err is None)
|
||||
if not alltrue:
|
||||
# At least one rank raised an exception.
|
||||
# Re-raise the actual exception if this rank threw one.
|
||||
if err is not None:
|
||||
raise err
|
||||
|
||||
# TODO: is there a better exception to use here?
|
||||
# On other ranks, raise an "empty" exception to indicate
|
||||
# that we're only failing because someone else did.
|
||||
raise DistDataError
|
||||
|
||||
def barrier(self):
|
||||
"""Globally synchronize all processes"""
|
||||
dist.barrier()
|
||||
|
||||
def bcast(self, val, root):
|
||||
"""Broadcast a scalar value from root to all ranks"""
|
||||
vals = [val]
|
||||
dist.broadcast_object_list(vals, src=root)
|
||||
return vals[0]
|
||||
|
||||
def scatterv_(self, invals: np.array, counts: list, root:int=0):
|
||||
"""Scatter int64 values from invals according to counts array, return received portion in a new tensor"""
|
||||
|
||||
self.allassert(len(counts) == self.numranks,
|
||||
f"Length of counts list {len(counts)} does not match number of ranks {self.numranks}")
|
||||
|
||||
# Define list of tensors to scatter on the root.
|
||||
# torch.distributed.scatter requires each tensor to be the same shape,
|
||||
# so find the max size across all count values and pad.
|
||||
max_size = max(counts)
|
||||
scatterlist = None
|
||||
if self.rank == root:
|
||||
slices = list(torch.split(torch.from_numpy(invals), counts))
|
||||
scatterlist = [F.pad(s, (0, max_size - len(s))) for s in slices]
|
||||
|
||||
# Receive a tensor of the max count size from the root,
|
||||
# then copy values into output numpy array, which may be smaller.
|
||||
recvtensor = torch.zeros(max_size, dtype=torch.int64)
|
||||
dist.scatter(recvtensor, scatterlist, src=root)
|
||||
return recvtensor[:counts[self.rank]]
|
||||
|
||||
def alltrue(self, val):
|
||||
"""Returns True if all procs input True, False otherwise"""
|
||||
# torch.dist does not support reductions with bool types
|
||||
# so we cast to int and cast the result back to bool
|
||||
tensor = torch.tensor([int(val)], dtype=torch.int32)
|
||||
dist.all_reduce(tensor, op=dist.ReduceOp.BAND)
|
||||
return bool(tensor[0])
|
||||
|
||||
def sum(self, val):
|
||||
"""Compute sum of a scalar val, and return total on all ranks."""
|
||||
tensor = torch.tensor([val])
|
||||
dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
|
||||
return tensor[0]
|
||||
|
||||
def exscan(self, val: int):
|
||||
"""Compute prefix sum (exclusive scan) of int64 val, and return offset of each rank."""
|
||||
# torch.distributed doesn't have a scan, so fallback to allreduce
|
||||
tensor = torch.zeros(self.numranks, dtype=torch.int64)
|
||||
tensor[self.rank:] = val
|
||||
dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
|
||||
return int(tensor[self.rank]) - val
|
||||
|
||||
def min(self, val):
|
||||
"""Return minimum of scalar val to all ranks."""
|
||||
tensor = torch.tensor([val])
|
||||
dist.all_reduce(tensor, op=dist.ReduceOp.MIN)
|
||||
return tensor[0]
|
||||
|
||||
def minrank(self, cond):
|
||||
"""Find first rank whose condition is True, return that rank if any, None otherwise."""
|
||||
minrank = self.numranks
|
||||
if cond:
|
||||
minrank = self.rank
|
||||
minrank = self.min(minrank)
|
||||
|
||||
if minrank < self.numranks:
|
||||
return minrank
|
||||
return None
|
||||
|
||||
def bcast_first(self, val):
|
||||
"""Broadcast val from first rank where it is not None, return val if any, None otherwise"""
|
||||
# Find the first rank with a valid value.
|
||||
minrank = self.minrank(val is not None)
|
||||
|
||||
# If there is no rank with a valid value, return None
|
||||
if minrank is None:
|
||||
return None
|
||||
|
||||
# Otherwise broadcast the value from the first valid rank.
|
||||
val = self.bcast(val, root=minrank)
|
||||
return val
|
||||
|
||||
def all_sum_(self, vals: np.array):
|
||||
"""Sums values in numpy array vals element-wise and update vals in place with final result on all ranks"""
|
||||
# Builds torch.tensor with from_numpy to use same underlying memory as numpy array.
|
||||
tensor = torch.from_numpy(vals)
|
||||
dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
|
||||
|
||||
def open(self, filename, truncate=None):
|
||||
"""Create, truncate, and open a file shared by all ranks."""
|
||||
|
||||
# Don't truncate existing file until all ranks reach this point
|
||||
self.barrier()
|
||||
|
||||
# We'll capture any exception in this variable
|
||||
err = None
|
||||
|
||||
# Rank 0 creates and truncates file.
|
||||
if self.rank == 0:
|
||||
try:
|
||||
f = open(filename, 'wb')
|
||||
|
||||
# Some file systems like GPFS deliver faster write speed
|
||||
# if the file size is known before data is written to the file.
|
||||
if truncate is not None:
|
||||
f.truncate(truncate)
|
||||
|
||||
except Exception as e:
|
||||
err = e
|
||||
|
||||
# Verify that rank 0 created the file
|
||||
self.allraise_if(err)
|
||||
|
||||
# Wait for rank 0 to open (and truncate) file,
|
||||
# then have all ranks open file for writing.
|
||||
if self.rank != 0:
|
||||
try:
|
||||
f = open(filename, 'r+b')
|
||||
except Exception as e:
|
||||
err = e
|
||||
|
||||
# Verify that all ranks successfully opened the file
|
||||
self.allraise_if(err)
|
||||
|
||||
return f
|
||||
|
||||
def remove(self, filename):
|
||||
"""Remove a shared file."""
|
||||
|
||||
# Don't remove the file until all are ready
|
||||
self.barrier()
|
||||
|
||||
# We'll capture any exception in this variable
|
||||
err = None
|
||||
|
||||
# Rank 0 removes the file if it exists.
|
||||
if self.rank == 0:
|
||||
try:
|
||||
if os.path.exists(filename):
|
||||
os.remove(filename)
|
||||
except Exception as e:
|
||||
err = e
|
||||
|
||||
# Verify that rank 0 successfully removed the file.
|
||||
self.allraise_if(err)
|
||||
|
||||
def rename(self, srcfile, destfile):
|
||||
"""Rename a shared file."""
|
||||
|
||||
# Don't rename until all are ready
|
||||
self.barrier()
|
||||
|
||||
# We'll capture any exception in this variable
|
||||
err = None
|
||||
|
||||
# Rank 0 renames the file.
|
||||
if self.rank == 0:
|
||||
try:
|
||||
if os.path.exists(srcfile):
|
||||
os.rename(srcfile, destfile)
|
||||
except Exception as e:
|
||||
err = e
|
||||
|
||||
# Verify that the rename succeeded
|
||||
self.allraise_if(err)
|
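Among the removed `DistData` utilities, `exscan` is worth a note: `torch.distributed` has no scan collective, so an exclusive prefix sum is emulated with a single `all_reduce`. A standalone sketch of the same trick (a process group is assumed to have been initialized already):

```python
import torch
import torch.distributed as dist

def exclusive_scan(val: int) -> int:
    """Sum of `val` over all lower-numbered ranks (exclusive prefix sum)."""
    rank = dist.get_rank()
    world = dist.get_world_size()
    # Rank r contributes `val` to its own slot and every higher slot, so after
    # the SUM all-reduce, slot r holds the inclusive prefix sum for rank r.
    buf = torch.zeros(world, dtype=torch.int64)
    buf[rank:] = val
    dist.all_reduce(buf, op=dist.ReduceOp.SUM)
    return int(buf[rank]) - val  # subtract own value to make it exclusive
```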
@ -381,7 +381,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
    start_time = time.time()
    # Use C++ implementation for speed.
    # First compile and then import.
    from ascendspeed.data import helpers
    from megatron.data import helpers
    assert doc_idx.dtype == np.int32
    assert sizes.dtype == np.int32
    sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
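`build_sample_idx` (its C++ source is the next hunk) walks the flattened document stream and records, for every training sample, which document it starts in and at what token offset, producing a `[num_samples + 1, 2]` array. A rough Python rendering of that loop, for illustration only:

```python
import numpy as np

def build_sample_idx_py(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
    """Pure-Python illustration of the C++ build_sample_idx helper."""
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int32)
    doc_idx_index, doc_offset = 0, 0
    sample_idx[0] = (doc_idx_index, doc_offset)
    for sample_index in range(1, num_samples + 1):
        remaining = seq_length + 1  # +1 because targets are inputs shifted by one
        while remaining != 0:
            doc_length = sizes[doc_idx[doc_idx_index]] - doc_offset
            remaining -= doc_length
            if remaining <= 0:
                # Sample ends inside this document; remember the offset.
                doc_offset += remaining + doc_length - 1
                remaining = 0
            else:
                # Sample continues into the next document.
                doc_idx_index += 1
                doc_offset = 0
        sample_idx[sample_index] = (doc_idx_index, doc_offset)
    return sample_idx
```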
@ -1,717 +0,0 @@
|
||||
/*
|
||||
coding=utf-8
|
||||
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/* Helper methods for fast index mapping builds */
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <math.h>
|
||||
#include <stdexcept>
|
||||
#include <pybind11/pybind11.h>
|
||||
#include <pybind11/numpy.h>
|
||||
#include <random>
|
||||
|
||||
namespace py = pybind11;
|
||||
using namespace std;
|
||||
|
||||
const int32_t LONG_SENTENCE_LEN = 512;
|
||||
|
||||
|
||||
void build_blending_indices(py::array_t<uint8_t>& dataset_index,
|
||||
py::array_t<int64_t>& dataset_sample_index,
|
||||
const py::array_t<double>& weights,
|
||||
const int32_t num_datasets,
|
||||
const int64_t size, const bool verbose) {
|
||||
/* Given multiple datasets and a weighting array, build samples
|
||||
such that they follow those weights.*/
|
||||
|
||||
if (verbose) {
|
||||
std::cout << "> building indices for blendable datasets ..." << std::endl;
|
||||
}
|
||||
|
||||
// Get the pointer access without the checks.
|
||||
auto dataset_index_ptr = dataset_index.mutable_unchecked<1>();
|
||||
auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>();
|
||||
auto weights_ptr = weights.unchecked<1>();
|
||||
|
||||
// Initialize buffer for number of samples used for each dataset.
|
||||
int64_t current_samples[num_datasets];
|
||||
for(int64_t i = 0; i < num_datasets; ++i) {
|
||||
current_samples[i] = 0;
|
||||
}
|
||||
|
||||
// For each sample:
|
||||
for(int64_t sample_idx = 0; sample_idx < size; ++sample_idx) {
|
||||
|
||||
// Determine where the max error in sampling is happening.
|
||||
auto sample_idx_double = std::max(static_cast<double>(sample_idx), 1.0);
|
||||
int64_t max_error_index = 0;
|
||||
double max_error = weights_ptr[0] * sample_idx_double -
|
||||
static_cast<double>(current_samples[0]);
|
||||
for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) {
|
||||
double error = weights_ptr[dataset_idx] * sample_idx_double -
|
||||
static_cast<double>(current_samples[dataset_idx]);
|
||||
if (error > max_error) {
|
||||
max_error = error;
|
||||
max_error_index = dataset_idx;
|
||||
}
|
||||
}
|
||||
|
||||
// Populate the indices.
|
||||
dataset_index_ptr[sample_idx] = static_cast<uint8_t>(max_error_index);
|
||||
dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index];
|
||||
|
||||
// Update the total samples.
|
||||
current_samples[max_error_index] += 1;
|
||||
|
||||
}
|
||||
|
||||
// print info
|
||||
if (verbose) {
|
||||
std::cout << " > sample ratios:" << std::endl;
|
||||
for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) {
|
||||
auto ratio = static_cast<double>(current_samples[dataset_idx]) /
|
||||
static_cast<double>(size);
|
||||
std::cout << " dataset " << dataset_idx << ", input: " <<
|
||||
weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
|
||||
const py::array_t<int32_t>& doc_idx_,
|
||||
const int32_t seq_length,
|
||||
const int32_t num_epochs,
|
||||
const int64_t tokens_per_epoch) {
|
||||
/* Sample index (sample_idx) is used for gpt2 like dataset for which
|
||||
the documents are flattened and the samples are built based on this
|
||||
1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2]
|
||||
where [..., 0] contains the index into `doc_idx` and [..., 1] is the
|
||||
starting offset in that document.*/
|
||||
|
||||
// Consistency checks.
|
||||
assert(seq_length > 1);
|
||||
assert(num_epochs > 0);
|
||||
assert(tokens_per_epoch > 1);
|
||||
|
||||
// Remove bound checks.
|
||||
auto sizes = sizes_.unchecked<1>();
|
||||
auto doc_idx = doc_idx_.unchecked<1>();
|
||||
|
||||
// Mapping and its length (1D).
|
||||
int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;
|
||||
int32_t* sample_idx = new int32_t[2*(num_samples+1)];
|
||||
|
||||
cout << " using:" << endl << std::flush;
|
||||
cout << " number of documents: " <<
|
||||
doc_idx_.shape(0) / num_epochs << endl << std::flush;
|
||||
cout << " number of epochs: " << num_epochs <<
|
||||
endl << std::flush;
|
||||
cout << " sequence length: " << seq_length <<
|
||||
endl << std::flush;
|
||||
cout << " total number of samples: " << num_samples <<
|
||||
endl << std::flush;
|
||||
|
||||
// Index into sample_idx.
|
||||
int64_t sample_index = 0;
|
||||
// Index into doc_idx.
|
||||
int64_t doc_idx_index = 0;
|
||||
// Beginning offset for each document.
|
||||
int32_t doc_offset = 0;
|
||||
// Start with first document and no offset.
|
||||
sample_idx[2 * sample_index] = doc_idx_index;
|
||||
sample_idx[2 * sample_index + 1] = doc_offset;
|
||||
++sample_index;
|
||||
|
||||
while (sample_index <= num_samples) {
|
||||
// Start with a fresh sequence.
|
||||
int32_t remaining_seq_length = seq_length + 1;
|
||||
while (remaining_seq_length != 0) {
|
||||
// Get the document length.
|
||||
auto doc_id = doc_idx[doc_idx_index];
|
||||
auto doc_length = sizes[doc_id] - doc_offset;
|
||||
// And add it to the current sequence.
|
||||
remaining_seq_length -= doc_length;
|
||||
// If we have more than a full sequence, adjust offset and set
|
||||
// remaining length to zero so we return from the while loop.
|
||||
// Note that -1 here is for the same reason we have -1 in
|
||||
// `_num_epochs` calculations.
|
||||
if (remaining_seq_length <= 0) {
|
||||
doc_offset += (remaining_seq_length + doc_length - 1);
|
||||
remaining_seq_length = 0;
|
||||
} else {
|
||||
// Otherwise, start from the beginning of the next document.
|
||||
++doc_idx_index;
|
||||
doc_offset = 0;
|
||||
}
|
||||
}
|
||||
// Record the sequence.
|
||||
sample_idx[2 * sample_index] = doc_idx_index;
|
||||
sample_idx[2 * sample_index + 1] = doc_offset;
|
||||
++sample_index;
|
||||
}
|
||||
|
||||
// Method to deallocate memory.
|
||||
py::capsule free_when_done(sample_idx, [](void *mem_) {
|
||||
int32_t *mem = reinterpret_cast<int32_t*>(mem_);
|
||||
delete[] mem;
|
||||
});
|
||||
|
||||
// Return the numpy array.
|
||||
const auto byte_size = sizeof(int32_t);
|
||||
return py::array(std::vector<int64_t>{num_samples+1, 2}, // shape
|
||||
{2*byte_size, byte_size}, // C-style contiguous strides
|
||||
sample_idx, // the data pointer
|
||||
free_when_done); // numpy array references
|
||||
|
||||
}
|
||||
|
||||
|
||||
inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
|
||||
const int32_t max_length,
|
||||
std::mt19937& rand32_gen) {
|
||||
/* Training sample length. */
|
||||
if (short_seq_ratio == 0) {
|
||||
return max_length;
|
||||
}
|
||||
const auto random_number = rand32_gen();
|
||||
if ((random_number % short_seq_ratio) == 0) {
|
||||
return 2 + random_number % (max_length - 1);
|
||||
}
|
||||
return max_length;
|
||||
}
|
||||
|
||||
|
||||
template<typename DocIdx>
|
||||
py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
|
||||
const py::array_t<int32_t>& sizes_,
|
||||
const int32_t num_epochs,
|
||||
const uint64_t max_num_samples,
|
||||
const int32_t max_seq_length,
|
||||
const double short_seq_prob,
|
||||
const int32_t seed,
|
||||
const bool verbose,
|
||||
const int32_t min_num_sent) {
|
||||
/* Build a mapping of (start-index, end-index, sequence-length) where
|
||||
start and end index are the indices of the sentences in the sample
|
||||
and sequence-length is the target sequence length.
|
||||
*/
|
||||
|
||||
// Consistency checks.
|
||||
assert(num_epochs > 0);
|
||||
assert(max_seq_length > 1);
|
||||
assert(short_seq_prob >= 0.0);
|
||||
assert(short_seq_prob <= 1.0);
|
||||
assert(seed > 0);
|
||||
|
||||
// Remove bound checks.
|
||||
auto docs = docs_.unchecked<1>();
|
||||
auto sizes = sizes_.unchecked<1>();
|
||||
|
||||
// For efficiency, convert probability to ratio. Note: rand() generates int.
|
||||
int32_t short_seq_ratio = 0;
|
||||
if (short_seq_prob > 0) {
|
||||
short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));
|
||||
}
|
||||
|
||||
if (verbose) {
|
||||
const auto sent_start_index = docs[0];
|
||||
const auto sent_end_index = docs[docs_.shape(0) - 1];
|
||||
const auto num_sentences = sent_end_index - sent_start_index;
|
||||
cout << " using:" << endl << std::flush;
|
||||
cout << " number of documents: " << docs_.shape(0) - 1 <<
|
||||
endl << std::flush;
|
||||
cout << " sentences range: [" << sent_start_index <<
|
||||
", " << sent_end_index << ")" << endl << std::flush;
|
||||
cout << " total number of sentences: " << num_sentences <<
|
||||
endl << std::flush;
|
||||
cout << " number of epochs: " << num_epochs <<
|
||||
endl << std::flush;
|
||||
cout << " maximum number of samples: " << max_num_samples <<
|
||||
endl << std::flush;
|
||||
cout << " maximum sequence length: " << max_seq_length <<
|
||||
endl << std::flush;
|
||||
cout << " short sequence probability: " << short_seq_prob <<
|
||||
endl << std::flush;
|
||||
cout << " short sequence ration (1/prob): " << short_seq_ratio <<
|
||||
endl << std::flush;
|
||||
cout << " seed: " << seed << endl <<
|
||||
std::flush;
|
||||
}
|
||||
|
||||
// Mapping and its length (1D).
|
||||
int64_t num_samples = -1;
|
||||
DocIdx* maps = NULL;
|
||||
|
||||
// Perform two iterations, in the first iteration get the size
|
||||
// and allocate memory and in the second iteration populate the map.
|
||||
bool second = false;
|
||||
for (int32_t iteration=0; iteration<2; ++iteration) {
|
||||
|
||||
// Set the seed so both iterations produce the same results.
|
||||
std::mt19937 rand32_gen(seed);
|
||||
|
||||
// Set the flag on second iteration.
|
||||
second = (iteration == 1);
|
||||
|
||||
// Counters:
|
||||
uint64_t empty_docs = 0;
|
||||
uint64_t one_sent_docs = 0;
|
||||
uint64_t long_sent_docs = 0;
|
||||
|
||||
// Current map index.
|
||||
uint64_t map_index = 0;
|
||||
|
||||
// For each epoch:
|
||||
for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
|
||||
if (map_index >= max_num_samples) {
|
||||
if (verbose && (!second)) {
|
||||
cout << " reached " << max_num_samples << " samples after "
|
||||
<< epoch << " epochs ..." << endl << std::flush;
|
||||
}
|
||||
break;
|
||||
}
|
||||
// For each document:
|
||||
for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {
|
||||
|
||||
// Document sentences are in [sent_index_first, sent_index_last)
|
||||
const auto sent_index_first = docs[doc];
|
||||
const auto sent_index_last = docs[doc + 1];
|
||||
|
||||
// At the beginning of the document previous index is the
|
||||
// start index.
|
||||
auto prev_start_index = sent_index_first;
|
||||
|
||||
// Remaining documents.
|
||||
auto num_remain_sent = sent_index_last - sent_index_first;
|
||||
|
||||
// Some bookkeeping
|
||||
if ((epoch == 0) && (!second)) {
|
||||
if (num_remain_sent == 0) {
|
||||
++empty_docs;
|
||||
}
|
||||
if (num_remain_sent == 1) {
|
||||
++one_sent_docs;
|
||||
}
|
||||
}
|
||||
|
||||
// Detect documents with long sentences.
|
||||
bool contains_long_sentence = false;
|
||||
if (num_remain_sent > 1) {
|
||||
for (auto sent_index=sent_index_first;
|
||||
sent_index < sent_index_last; ++sent_index) {
|
||||
if (sizes[sent_index] > LONG_SENTENCE_LEN){
|
||||
if ((epoch == 0) && (!second)) {
|
||||
++long_sent_docs;
|
||||
}
|
||||
contains_long_sentence = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we have more than two sentences.
|
||||
if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) {
|
||||
|
||||
// Set values.
|
||||
auto seq_len = int32_t{0};
|
||||
auto num_sent = int32_t{0};
|
||||
auto target_seq_len = get_target_sample_len(short_seq_ratio,
|
||||
max_seq_length,
|
||||
rand32_gen);
|
||||
|
||||
// Loop through sentences.
|
||||
for (auto sent_index=sent_index_first;
|
||||
sent_index < sent_index_last; ++sent_index) {
|
||||
|
||||
// Add the size and number of sentences.
|
||||
seq_len += sizes[sent_index];
|
||||
++num_sent;
|
||||
--num_remain_sent;
|
||||
|
||||
// If we have reached the target length.
|
||||
// and if not only one sentence is left in the document.
|
||||
// and if we have at least two sentences.
|
||||
// and if we have reached end of the document.
|
||||
if (((seq_len >= target_seq_len) &&
|
||||
(num_remain_sent > 1) &&
|
||||
(num_sent >= min_num_sent) ) || (num_remain_sent == 0)) {
|
||||
|
||||
// Check for overflow.
|
||||
if ((3 * map_index + 2) >
|
||||
std::numeric_limits<int64_t>::max()) {
|
||||
cout << "number of samples exceeded maximum "
|
||||
<< "allowed by type int64: "
|
||||
<< std::numeric_limits<int64_t>::max()
|
||||
<< endl;
|
||||
throw std::overflow_error("Number of samples");
|
||||
}
|
||||
|
||||
// Populate the map.
|
||||
if (second) {
|
||||
const auto map_index_0 = 3 * map_index;
|
||||
maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
|
||||
maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
|
||||
maps[map_index_0 + 2] = static_cast<DocIdx>(target_seq_len);
|
||||
}
|
||||
|
||||
// Update indices / counters.
|
||||
++map_index;
|
||||
prev_start_index = sent_index + 1;
|
||||
target_seq_len = get_target_sample_len(short_seq_ratio,
|
||||
max_seq_length,
|
||||
rand32_gen);
|
||||
seq_len = 0;
|
||||
num_sent = 0;
|
||||
}
|
||||
|
||||
} // for (auto sent_index=sent_index_first; ...
|
||||
} // if (num_remain_sent > 1) {
|
||||
} // for (int doc=0; doc < num_docs; ++doc) {
|
||||
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
|
||||
|
||||
if (!second) {
|
||||
if (verbose) {
|
||||
cout << " number of empty documents: " << empty_docs <<
|
||||
endl << std::flush;
|
||||
cout << " number of documents with one sentence: " <<
|
||||
one_sent_docs << endl << std::flush;
|
||||
cout << " number of documents with long sentences: " <<
|
||||
long_sent_docs << endl << std::flush;
|
||||
cout << " will create mapping for " << map_index <<
|
||||
" samples" << endl << std::flush;
|
||||
}
|
||||
assert(maps == NULL);
|
||||
assert(num_samples < 0);
|
||||
maps = new DocIdx[3*map_index];
|
||||
num_samples = static_cast<int64_t>(map_index);
|
||||
}
|
||||
|
||||
} // for (int iteration=0; iteration < 2; ++iteration) {
|
||||
|
||||
// Shuffle.
|
||||
// We need a 64 bit random number generator as we might have more
|
||||
// than 2 billion samples.
|
||||
std::mt19937_64 rand64_gen(seed + 1);
|
||||
for (auto i=(num_samples - 1); i > 0; --i) {
|
||||
const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
|
||||
const auto i0 = 3 * i;
|
||||
const auto j0 = 3 * j;
|
||||
// Swap values.
|
||||
swap(maps[i0], maps[j0]);
|
||||
swap(maps[i0 + 1], maps[j0 + 1]);
|
||||
swap(maps[i0 + 2], maps[j0 + 2]);
|
||||
}
|
||||
|
||||
// Method to deallocate memory.
|
||||
py::capsule free_when_done(maps, [](void *mem_) {
|
||||
DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
|
||||
delete[] mem;
|
||||
});
|
||||
|
||||
// Return the numpy array.
|
||||
const auto byte_size = sizeof(DocIdx);
|
||||
return py::array(std::vector<int64_t>{num_samples, 3}, // shape
|
||||
{3*byte_size, byte_size}, // C-style contiguous strides
|
||||
maps, // the data pointer
|
||||
free_when_done); // numpy array references
|
||||
|
||||
}
|
||||
|
||||
|
||||
py::array build_mapping(const py::array_t<int64_t>& docs_,
|
||||
const py::array_t<int>& sizes_,
|
||||
const int num_epochs,
|
||||
const uint64_t max_num_samples,
|
||||
const int max_seq_length,
|
||||
const double short_seq_prob,
|
||||
const int seed,
|
||||
const bool verbose,
|
||||
const int32_t min_num_sent) {
|
||||
|
||||
if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
|
||||
if (verbose) {
|
||||
cout << " using uint64 for data mapping..." << endl << std::flush;
|
||||
}
|
||||
return build_mapping_impl<uint64_t>(docs_, sizes_, num_epochs,
|
||||
max_num_samples, max_seq_length,
|
||||
short_seq_prob, seed, verbose,
|
||||
min_num_sent);
|
||||
} else {
|
||||
if (verbose) {
|
||||
cout << " using uint32 for data mapping..." << endl << std::flush;
|
||||
}
|
||||
return build_mapping_impl<uint32_t>(docs_, sizes_, num_epochs,
|
||||
max_num_samples, max_seq_length,
|
||||
short_seq_prob, seed, verbose,
|
||||
min_num_sent);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename DocIdx>
|
||||
py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
|
||||
const py::array_t<int32_t>& sizes_,
|
||||
const py::array_t<int32_t>& titles_sizes_,
|
||||
const int32_t num_epochs,
|
||||
const uint64_t max_num_samples,
|
||||
const int32_t max_seq_length,
|
||||
const int32_t seed,
|
||||
const bool verbose,
|
||||
const bool use_one_sent_blocks) {
|
||||
/* Build a mapping of (start-index, end-index, sequence-length) where
|
||||
start and end index are the indices of the sentences in the sample
|
||||
and sequence-length is the target sequence length.
|
||||
*/
|
||||
|
||||
// Consistency checks.
|
||||
assert(num_epochs > 0);
|
||||
assert(max_seq_length > 1);
|
||||
assert(seed > 0);
|
||||
|
||||
// Remove bound checks.
|
||||
auto docs = docs_.unchecked<1>();
|
||||
auto sizes = sizes_.unchecked<1>();
|
||||
auto titles_sizes = titles_sizes_.unchecked<1>();
|
||||
|
||||
if (verbose) {
|
||||
const auto sent_start_index = docs[0];
|
||||
const auto sent_end_index = docs[docs_.shape(0) - 1];
|
||||
const auto num_sentences = sent_end_index - sent_start_index;
|
||||
cout << " using:" << endl << std::flush;
|
||||
cout << " number of documents: " << docs_.shape(0) - 1 <<
|
||||
endl << std::flush;
|
||||
cout << " sentences range: [" << sent_start_index <<
|
||||
", " << sent_end_index << ")" << endl << std::flush;
|
||||
cout << " total number of sentences: " << num_sentences <<
|
||||
endl << std::flush;
|
||||
cout << " number of epochs: " << num_epochs <<
|
||||
endl << std::flush;
|
||||
cout << " maximum number of samples: " << max_num_samples <<
|
||||
endl << std::flush;
|
||||
cout << " maximum sequence length: " << max_seq_length <<
|
||||
endl << std::flush;
|
||||
cout << " seed: " << seed << endl <<
|
||||
std::flush;
|
||||
}
|
||||
|
||||
// Mapping and its length (1D).
|
||||
int64_t num_samples = -1;
|
||||
DocIdx* maps = NULL;
|
||||
|
||||
// Acceptable number of sentences per block.
|
||||
int min_num_sent = 2;
|
||||
if (use_one_sent_blocks) {
|
||||
min_num_sent = 1;
|
||||
}
|
||||
|
||||
// Perform two iterations, in the first iteration get the size
|
||||
// and allocate memory and in the second iteration populate the map.
|
||||
bool second = false;
|
||||
for (int32_t iteration=0; iteration<2; ++iteration) {
|
||||
|
||||
// Set the flag on second iteration.
|
||||
second = (iteration == 1);
|
||||
|
||||
// Current map index.
|
||||
uint64_t map_index = 0;
|
||||
|
||||
uint64_t empty_docs = 0;
|
||||
uint64_t one_sent_docs = 0;
|
||||
uint64_t long_sent_docs = 0;
|
||||
// For each epoch:
|
||||
for (int32_t epoch=0; epoch<num_epochs; ++epoch) {
|
||||
// assign every block a unique id
|
||||
int32_t block_id = 0;
|
||||
|
||||
if (map_index >= max_num_samples) {
|
||||
if (verbose && (!second)) {
|
||||
cout << " reached " << max_num_samples << " samples after "
|
||||
<< epoch << " epochs ..." << endl << std::flush;
|
||||
}
|
||||
break;
|
||||
}
|
||||
// For each document:
|
||||
for (int32_t doc=0; doc<(docs.shape(0) - 1); ++doc) {
|
||||
|
||||
// Document sentences are in [sent_index_first, sent_index_last)
|
||||
const auto sent_index_first = docs[doc];
|
||||
const auto sent_index_last = docs[doc + 1];
|
||||
const auto target_seq_len = max_seq_length - titles_sizes[doc];
|
||||
|
||||
// At the beginning of the document previous index is the
|
||||
// start index.
|
||||
auto prev_start_index = sent_index_first;
|
||||
|
||||
// Remaining documents.
|
||||
auto num_remain_sent = sent_index_last - sent_index_first;
|
||||
|
||||
// Some bookkeeping
|
||||
if ((epoch == 0) && (!second)) {
|
||||
if (num_remain_sent == 0) {
|
||||
++empty_docs;
|
||||
}
|
||||
if (num_remain_sent == 1) {
|
||||
++one_sent_docs;
|
||||
}
|
||||
}
|
||||
// Detect documents with long sentences.
|
||||
bool contains_long_sentence = false;
|
||||
if (num_remain_sent >= min_num_sent) {
|
||||
for (auto sent_index=sent_index_first;
|
||||
sent_index < sent_index_last; ++sent_index) {
|
||||
if (sizes[sent_index] > LONG_SENTENCE_LEN){
|
||||
if ((epoch == 0) && (!second)) {
|
||||
++long_sent_docs;
|
||||
}
|
||||
contains_long_sentence = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// If we have enough sentences and no long sentences.
|
||||
if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) {
|
||||
|
||||
// Set values.
|
||||
auto seq_len = int32_t{0};
|
||||
auto num_sent = int32_t{0};
|
||||
|
||||
// Loop through sentences.
|
||||
for (auto sent_index=sent_index_first;
|
||||
sent_index < sent_index_last; ++sent_index) {
|
||||
|
||||
// Add the size and number of sentences.
|
||||
seq_len += sizes[sent_index];
|
||||
++num_sent;
|
||||
--num_remain_sent;
|
||||
|
||||
// If we have reached the target length.
|
||||
// and there are an acceptable number of sentences left
|
||||
// and if we have at least the minimum number of sentences.
|
||||
// or if we have reached end of the document.
|
||||
if (((seq_len >= target_seq_len) &&
|
||||
(num_remain_sent >= min_num_sent) &&
|
||||
(num_sent >= min_num_sent) ) || (num_remain_sent == 0)) {
|
||||
|
||||
// Populate the map.
|
||||
if (second) {
|
||||
const auto map_index_0 = 4 * map_index;
|
||||
// Each sample has 4 items: the starting sentence index, ending sentence index,
|
||||
// the index of the document from which the block comes (used for fetching titles)
|
||||
// and the unique id of the block (used for creating block indexes)
|
||||
|
||||
maps[map_index_0] = static_cast<DocIdx>(prev_start_index);
|
||||
maps[map_index_0 + 1] = static_cast<DocIdx>(sent_index + 1);
|
||||
maps[map_index_0 + 2] = static_cast<DocIdx>(doc);
|
||||
maps[map_index_0 + 3] = static_cast<DocIdx>(block_id);
|
||||
}
|
||||
|
||||
// Update indices / counters.
|
||||
++map_index;
|
||||
++block_id;
|
||||
prev_start_index = sent_index + 1;
|
||||
seq_len = 0;
|
||||
num_sent = 0;
|
||||
}
|
||||
} // for (auto sent_index=sent_index_first; ...
|
||||
} // if (num_remain_sent > 1) {
|
||||
} // for (int doc=0; doc < num_docs; ++doc) {
|
||||
} // for (int epoch=0; epoch < num_epochs; ++epoch) {
|
||||
|
||||
if (!second) {
|
||||
if (verbose) {
|
||||
cout << " number of empty documents: " << empty_docs <<
|
||||
endl << std::flush;
|
||||
cout << " number of documents with one sentence: " <<
|
||||
one_sent_docs << endl << std::flush;
|
||||
cout << " number of documents with long sentences: " <<
|
||||
long_sent_docs << endl << std::flush;
|
||||
cout << " will create mapping for " << map_index <<
|
||||
" samples" << endl << std::flush;
|
||||
}
|
||||
assert(maps == NULL);
|
||||
assert(num_samples < 0);
|
||||
maps = new DocIdx[4*map_index];
|
||||
num_samples = static_cast<int64_t>(map_index);
|
||||
}
|
||||
|
||||
} // for (int iteration=0; iteration < 2; ++iteration) {
|
||||
|
||||
// Shuffle.
|
||||
// We need a 64 bit random number generator as we might have more
|
||||
// than 2 billion samples.
|
||||
std::mt19937_64 rand64_gen(seed + 1);
|
||||
for (auto i=(num_samples - 1); i > 0; --i) {
|
||||
const auto j = static_cast<int64_t>(rand64_gen() % (i + 1));
|
||||
const auto i0 = 4 * i;
|
||||
const auto j0 = 4 * j;
|
||||
// Swap values.
|
||||
swap(maps[i0], maps[j0]);
|
||||
swap(maps[i0 + 1], maps[j0 + 1]);
|
||||
swap(maps[i0 + 2], maps[j0 + 2]);
|
||||
swap(maps[i0 + 3], maps[j0 + 3]);
|
||||
}
|
||||
|
||||
// Method to deallocate memory.
|
||||
py::capsule free_when_done(maps, [](void *mem_) {
|
||||
DocIdx *mem = reinterpret_cast<DocIdx*>(mem_);
|
||||
delete[] mem;
|
||||
});
|
||||
|
||||
// Return the numpy array.
|
||||
const auto byte_size = sizeof(DocIdx);
|
||||
return py::array(std::vector<int64_t>{num_samples, 4}, // shape
|
||||
{4*byte_size, byte_size}, // C-style contiguous strides
|
||||
maps, // the data pointer
|
||||
free_when_done); // numpy array references
|
||||
|
||||
}
|
||||
|
||||
py::array build_blocks_mapping(const py::array_t<int64_t>& docs_,
|
||||
const py::array_t<int>& sizes_,
|
||||
const py::array_t<int>& titles_sizes_,
|
||||
const int num_epochs,
|
||||
const uint64_t max_num_samples,
|
||||
const int max_seq_length,
|
||||
const int seed,
|
||||
const bool verbose,
|
||||
const bool use_one_sent_blocks) {
|
||||
|
||||
if (sizes_.size() > std::numeric_limits<uint32_t>::max()) {
|
||||
if (verbose) {
|
||||
cout << " using uint64 for data mapping..." << endl << std::flush;
|
||||
}
|
||||
return build_blocks_mapping_impl<uint64_t>(docs_, sizes_, titles_sizes_,
|
||||
num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks);
|
||||
} else {
|
||||
if (verbose) {
|
||||
cout << " using uint32 for data mapping..." << endl << std::flush;
|
||||
}
|
||||
return build_blocks_mapping_impl<uint32_t>(docs_, sizes_, titles_sizes_,
|
||||
num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
PYBIND11_MODULE(helpers, m) {
|
||||
m.def("build_mapping", &build_mapping);
|
||||
m.def("build_blocks_mapping", &build_blocks_mapping);
|
||||
m.def("build_sample_idx", &build_sample_idx);
|
||||
m.def("build_blending_indices", &build_blending_indices);
|
||||
}
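The pybind11 bindings above hand the sample map back to Python as a plain numpy array of shape [num_samples, 4]. A minimal sketch of how such an array is read on the Python side; the array contents below are made up, the real array comes from helpers.build_blocks_mapping:

```python
import numpy as np

# Stand-in for the array returned by helpers.build_blocks_mapping.
# Column layout follows the comment in the C++ code above:
# [start sentence idx, end sentence idx (exclusive), doc idx, unique block id]
maps = np.array([[0, 3, 0, 0],
                 [3, 5, 0, 1]], dtype=np.int64)

for start_idx, end_idx, doc_idx, block_id in maps:
    print(f"block {block_id}: sentences [{start_idx}, {end_idx}) of doc {doc_idx}")
```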
|
@ -1,197 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from ascendspeed.core import parallel_state
|
||||
from ascendspeed import get_args, get_tokenizer, print_rank_0, mpu
|
||||
from deepspeed.accelerator import get_accelerator
|
||||
|
||||
def get_one_epoch_dataloader(dataset, micro_batch_size=None):
|
||||
"""Specifically one epoch to be used in an indexing job."""
|
||||
args = get_args()
|
||||
|
||||
world_size = parallel_state.get_data_parallel_world_size()
|
||||
rank = parallel_state.get_data_parallel_rank()
|
||||
if micro_batch_size is None:
|
||||
micro_batch_size = args.micro_batch_size
|
||||
global_batch_size = micro_batch_size * world_size
|
||||
num_workers = args.num_workers
|
||||
|
||||
sampler = torch.utils.data.SequentialSampler(dataset)
|
||||
# importantly, drop_last must be False to get all the data.
|
||||
assert False, 'DistributedBatchSampler deprecated, change the implementation'
|
||||
from ascendspeed.data.samplers import DistributedBatchSampler
|
||||
batch_sampler = DistributedBatchSampler(sampler,
|
||||
batch_size=global_batch_size,
|
||||
drop_last=False,
|
||||
rank=rank,
|
||||
world_size=world_size)
|
||||
|
||||
return torch.utils.data.DataLoader(dataset,
|
||||
batch_sampler=batch_sampler,
|
||||
num_workers=num_workers,
|
||||
pin_memory=True)
|
||||
|
||||
|
||||
def get_ict_batch(data_iterator):
|
||||
# Items and their type.
|
||||
keys = ['query_tokens', 'query_pad_mask',
|
||||
'block_tokens', 'block_pad_mask', 'block_data']
|
||||
datatype = torch.int64
|
||||
|
||||
# Broadcast data.
|
||||
if data_iterator is None:
|
||||
data = None
|
||||
else:
|
||||
data = next(data_iterator)
|
||||
data_b = mpu.broadcast_data(keys, data, datatype)
|
||||
|
||||
# Unpack.
|
||||
query_tokens = data_b['query_tokens'].long()
|
||||
query_pad_mask = data_b['query_pad_mask'].long()
|
||||
block_tokens = data_b['block_tokens'].long()
|
||||
block_pad_mask = data_b['block_pad_mask'].long()
|
||||
block_indices = data_b['block_data'].long()
|
||||
|
||||
return query_tokens, query_pad_mask,\
|
||||
block_tokens, block_pad_mask, block_indices
|
||||
|
||||
|
||||
def join_str_list(str_list):
|
||||
"""Join a list of strings, handling spaces appropriately"""
|
||||
result = ""
|
||||
for s in str_list:
|
||||
if s.startswith("##"):
|
||||
result += s[2:]
|
||||
else:
|
||||
result += " " + s
|
||||
return result
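A quick illustration of the behaviour above (values made up): "##"-prefixed pieces are glued to the previous token, everything else gets a leading space.

```python
# Illustrative call to join_str_list as defined above.
print(join_str_list(["he", "##llo", "world"]))  # -> " hello world" (note the leading space)
```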
|
||||
|
||||
|
||||
class BlockSampleData(object):
|
||||
"""A struct for fully describing a fixed-size block of data as used in REALM
|
||||
|
||||
:param start_idx: for first sentence of the block
|
||||
:param end_idx: for last sentence of the block (may be partially truncated in sample construction)
|
||||
:param doc_idx: the index of the document from which the block comes in the original indexed dataset
|
||||
:param block_idx: a unique integer identifier given to every block.
|
||||
"""
|
||||
def __init__(self, start_idx, end_idx, doc_idx, block_idx):
|
||||
self.start_idx = start_idx
|
||||
self.end_idx = end_idx
|
||||
self.doc_idx = doc_idx
|
||||
self.block_idx = block_idx
|
||||
|
||||
def as_array(self):
|
||||
return np.array([self.start_idx, self.end_idx, self.doc_idx, self.block_idx]).astype(np.int64)
|
||||
|
||||
def as_tuple(self):
|
||||
return self.start_idx, self.end_idx, self.doc_idx, self.block_idx
|
||||
|
||||
|
||||
class BlockSamplesMapping(object):
|
||||
def __init__(self, mapping_array):
|
||||
# make sure that the array is compatible with BlockSampleData
|
||||
assert mapping_array.shape[1] == 4
|
||||
self.mapping_array = mapping_array
|
||||
|
||||
def __len__(self):
|
||||
return self.mapping_array.shape[0]
|
||||
|
||||
def __getitem__(self, idx):
|
||||
"""Get the data associated with an indexed sample."""
|
||||
sample_data = BlockSampleData(*self.mapping_array[idx])
|
||||
return sample_data
|
||||
|
||||
|
||||
def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs,
|
||||
max_num_samples, max_seq_length, seed, name, use_one_sent_docs=False):
|
||||
"""Get samples mapping for a dataset over fixed size blocks. This function also requires
|
||||
a dataset of the titles for the source documents since their lengths must be taken into account.
|
||||
|
||||
:return: samples_mapping (BlockSamplesMapping)
|
||||
"""
|
||||
|
||||
if not num_epochs:
|
||||
if not max_num_samples:
|
||||
raise ValueError("Need to specify either max_num_samples "
|
||||
"or num_epochs")
|
||||
num_epochs = np.iinfo(np.int32).max - 1
|
||||
if not max_num_samples:
|
||||
max_num_samples = np.iinfo(np.int64).max - 1
|
||||
|
||||
# Filename of the index mapping
|
||||
indexmap_filename = data_prefix
|
||||
indexmap_filename += '_{}_indexmap'.format(name)
|
||||
if num_epochs != (np.iinfo(np.int32).max - 1):
|
||||
indexmap_filename += '_{}ep'.format(num_epochs)
|
||||
if max_num_samples != (np.iinfo(np.int64).max - 1):
|
||||
indexmap_filename += '_{}mns'.format(max_num_samples)
|
||||
indexmap_filename += '_{}msl'.format(max_seq_length)
|
||||
indexmap_filename += '_{}s'.format(seed)
|
||||
if use_one_sent_docs:
|
||||
indexmap_filename += '_1sentok'
|
||||
indexmap_filename += '.npy'
|
||||
|
||||
# Build the indexed mapping if not exist.
|
||||
if parallel_state.get_data_parallel_rank() == 0 and \
|
||||
not os.path.isfile(indexmap_filename):
|
||||
print(' > WARNING: could not find index map file {}, building '
|
||||
'the indices on rank 0 ...'.format(indexmap_filename))
|
||||
|
||||
# Make sure the types match the helpers input types.
|
||||
assert block_dataset.doc_idx.dtype == np.int64
|
||||
assert block_dataset.sizes.dtype == np.int32
|
||||
|
||||
# Build samples mapping
|
||||
verbose = torch.distributed.get_rank() == 0
|
||||
start_time = time.time()
|
||||
print_rank_0(' > building samples index mapping for {} ...'.format(
|
||||
name))
|
||||
|
||||
from ascendspeed.data import helpers
|
||||
mapping_array = helpers.build_blocks_mapping(
|
||||
block_dataset.doc_idx,
|
||||
block_dataset.sizes,
|
||||
title_dataset.sizes,
|
||||
num_epochs,
|
||||
max_num_samples,
|
||||
max_seq_length - 3, # account for added tokens
|
||||
seed,
|
||||
verbose,
|
||||
use_one_sent_docs)
|
||||
|
||||
|
||||
print_rank_0(' > done building samples index mapping')
|
||||
np.save(indexmap_filename, mapping_array, allow_pickle=True)
|
||||
print_rank_0(' > saved the index mapping in {}'.format(
|
||||
indexmap_filename))
|
||||
# Make sure all the ranks have built the mapping
|
||||
print_rank_0(' > elapsed time to build and save samples mapping '
|
||||
'(seconds): {:4f}'.format(
|
||||
time.time() - start_time))
|
||||
|
||||
# This should be a barrier, but the NCCL barrier assumes
# device_index=rank, which does not hold in the
# model-parallel case.
|
||||
counts = get_accelerator().LongTensor([1])
|
||||
torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group())
|
||||
assert counts[0].item() == torch.distributed.get_world_size(
|
||||
group=parallel_state.get_data_parallel_group())
|
||||
|
||||
# Load indexed dataset.
|
||||
print_rank_0(' > loading indexed mapping from {}'.format(
|
||||
indexmap_filename))
|
||||
start_time = time.time()
|
||||
|
||||
mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
|
||||
samples_mapping = BlockSamplesMapping(mapping_array)
|
||||
|
||||
print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
|
||||
time.time() - start_time))
|
||||
print_rank_0(' total number of samples: {}'.format(
|
||||
mapping_array.shape[0]))
|
||||
|
||||
return samples_mapping
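A hedged sketch of how a caller might use the function above; `block_dataset` and `title_dataset` are placeholders for indexed datasets exposing `doc_idx` and `sizes`, and the keyword values are illustrative only:

```python
# Hypothetical usage sketch (names and values are placeholders).
samples_mapping = get_block_samples_mapping(
    block_dataset, title_dataset,
    data_prefix='my-corpus', num_epochs=None, max_num_samples=10000,
    max_seq_length=288, seed=1234, name='train', use_one_sent_docs=False)

sample = samples_mapping[0]                      # a BlockSampleData instance
start_idx, end_idx, doc_idx, block_idx = sample.as_tuple()
```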
|
@ -1,224 +0,0 @@
|
||||
import itertools
|
||||
import os
|
||||
import pickle
|
||||
import shutil
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from ascendspeed import get_args
|
||||
from ascendspeed.core import parallel_state
|
||||
|
||||
|
||||
def detach(tensor):
|
||||
return tensor.detach().cpu().numpy()
|
||||
|
||||
|
||||
class OpenRetreivalDataStore(object):
|
||||
"""
|
||||
Serializable data structure for holding data for blocks --
|
||||
embeddings and necessary metadata for Retriever
|
||||
"""
|
||||
def __init__(self, embedding_path=None, load_from_path=True, rank=None):
|
||||
self.embed_data = dict()
|
||||
if embedding_path is None:
|
||||
args = get_args()
|
||||
embedding_path = args.embedding_path
|
||||
rank = args.rank
|
||||
self.embedding_path = embedding_path
|
||||
self.rank = rank
|
||||
|
||||
if load_from_path:
|
||||
self.load_from_file()
|
||||
|
||||
block_data_name = os.path.splitext(self.embedding_path)[0]
|
||||
self.temp_dir_name = block_data_name + '_tmp'
|
||||
|
||||
def state(self):
|
||||
return {
|
||||
'embed_data': self.embed_data,
|
||||
}
|
||||
|
||||
def clear(self):
|
||||
"""
|
||||
Clear the embedding data structures to save memory.
|
||||
The metadata ends up getting used, and is also much smaller in
|
||||
dimensionality so it isn't really worth clearing.
|
||||
"""
|
||||
self.embed_data = dict()
|
||||
|
||||
def load_from_file(self):
|
||||
"""Populate members from instance saved to file"""
|
||||
|
||||
if parallel_state.is_unitialized() or parallel_state.get_data_parallel_rank() == 0:
|
||||
print("\n> Unpickling BlockData", flush=True)
|
||||
state_dict = pickle.load(open(self.embedding_path, 'rb'))
|
||||
if parallel_state.is_unitialized() or parallel_state.get_data_parallel_rank() == 0:
|
||||
print(">> Finished unpickling BlockData\n", flush=True)
|
||||
|
||||
self.embed_data = state_dict['embed_data']
|
||||
|
||||
def add_block_data(self, row_id, block_embeds, allow_overwrite=False):
|
||||
"""
|
||||
Add data for set of blocks
|
||||
:param row_id: 1D array of unique int ids for the blocks
|
||||
:param block_embeds: 2D array of embeddings of the blocks
|
||||
In the case of retriever this will be [start_idx, end_idx, doc_idx]
|
||||
"""
|
||||
for idx, embed in zip(row_id, block_embeds):
|
||||
if not allow_overwrite and idx in self.embed_data:
|
||||
raise ValueError("Unexpectedly tried to overwrite block data")
|
||||
|
||||
self.embed_data[idx] = np.float16(embed)
|
||||
|
||||
def save_shard(self):
|
||||
"""
|
||||
Save the block data that was created in this process
|
||||
"""
|
||||
if not os.path.isdir(self.temp_dir_name):
|
||||
os.makedirs(self.temp_dir_name, exist_ok=True)
|
||||
|
||||
# save the data for each shard
|
||||
with open('{}/{}.pkl'.format(self.temp_dir_name, self.rank), 'wb') \
|
||||
as writer:
|
||||
pickle.dump(self.state(), writer)
|
||||
|
||||
def merge_shards_and_save(self):
|
||||
#Combine all the shards made using save_shard
|
||||
shard_names = os.listdir(self.temp_dir_name)
|
||||
seen_own_shard = False
|
||||
|
||||
for fname in os.listdir(self.temp_dir_name):
|
||||
shard_rank = int(os.path.splitext(fname)[0])
|
||||
if shard_rank == self.rank:
|
||||
seen_own_shard = True
|
||||
continue
|
||||
|
||||
with open('{}/{}'.format(self.temp_dir_name, fname), 'rb') as f:
|
||||
data = pickle.load(f)
|
||||
old_size = len(self.embed_data)
|
||||
shard_size = len(data['embed_data'])
|
||||
|
||||
# add the shard's data and check to make sure there
|
||||
# is no overlap
|
||||
self.embed_data.update(data['embed_data'])
|
||||
assert len(self.embed_data) == old_size + shard_size
|
||||
|
||||
assert seen_own_shard
|
||||
|
||||
# save the consolidated shards and remove temporary directory
|
||||
with open(self.embedding_path, 'wb') as final_file:
|
||||
pickle.dump(self.state(), final_file)
|
||||
shutil.rmtree(self.temp_dir_name, ignore_errors=True)
|
||||
|
||||
print("Finished merging {} shards for a total of {} embeds".format(
|
||||
len(shard_names), len(self.embed_data)), flush=True)
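A hedged sketch of the shard workflow implied by the class above, assuming each data-parallel rank holds its own store and the ranks are synchronized externally (e.g. with torch.distributed.barrier()); `row_ids`, `block_embeds` and `rank` are placeholders:

```python
# Hypothetical per-rank flow.
store = OpenRetreivalDataStore(embedding_path='evidence_embeds.pkl',
                               load_from_path=False, rank=rank)
store.add_block_data(row_ids, block_embeds)   # 1D int ids, 2D float embeddings
store.save_shard()                            # writes <path>_tmp/<rank>.pkl
# ... barrier across ranks ...
if rank == 0:
    store.merge_shards_and_save()             # consolidates all shards into embedding_path
```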
|
||||
|
||||
|
||||
class FaissMIPSIndex(object):
|
||||
"""
|
||||
Wrapper object for a BlockData which does similarity search via FAISS under the hood
|
||||
"""
|
||||
def __init__(self, embed_size, embed_data=None, use_gpu=False):
|
||||
self.embed_size = embed_size
|
||||
self.embed_data = embed_data
|
||||
self.use_gpu = use_gpu
|
||||
|
||||
self.mips_index = None
|
||||
self._set_mips_index()
|
||||
|
||||
def _set_mips_index(self):
|
||||
"""
|
||||
Create a Faiss Flat index with inner product as the metric
|
||||
to search against
|
||||
"""
|
||||
try:
|
||||
import faiss
|
||||
except ImportError:
|
||||
raise Exception("Error: Please install faiss to use FaissMIPSIndex")
|
||||
|
||||
if parallel_state.is_unitialized() or parallel_state.get_data_parallel_rank() == 0:
|
||||
print("\n> Building index", flush=True)
|
||||
|
||||
cpu_index = faiss.IndexFlatIP(self.embed_size)
|
||||
|
||||
if self.use_gpu:
|
||||
# create resources and config for GpuIndex
|
||||
config = faiss.GpuMultipleClonerOptions()
|
||||
config.shard = True
|
||||
config.useFloat16 = True
|
||||
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config)
|
||||
self.mips_index = faiss.IndexIDMap(gpu_index)
|
||||
if parallel_state.is_unitialized() or parallel_state.get_data_parallel_rank() == 0:
|
||||
print(">> Initialized index on GPU", flush=True)
|
||||
else:
|
||||
# CPU index supports IDs so wrap with IDMap
|
||||
self.mips_index = faiss.IndexIDMap(cpu_index)
|
||||
if parallel_state.is_unitialized() or parallel_state.get_data_parallel_rank() == 0:
|
||||
print(">> Initialized index on CPU", flush=True)
|
||||
|
||||
# if we were constructed with a BlockData, then automatically load it
|
||||
# when the FAISS structure is built
|
||||
if self.embed_data is not None:
|
||||
self.add_embed_data(self.embed_data)
|
||||
|
||||
def reset_index(self):
|
||||
"""Delete existing index and create a new"""
|
||||
del self.mips_index
|
||||
|
||||
# reset the block data so that _set_block_index will reload it as well
|
||||
if self.embed_data is not None:
|
||||
embed_data_path = self.embed_data.embedding_path
|
||||
del self.embed_data
|
||||
self.embed_data = OpenRetreivalDataStore(embed_data_path)
|
||||
|
||||
self._set_mips_index()
|
||||
|
||||
def update_index(self):
|
||||
"""Delete existing index and create a new"""
|
||||
del self.mips_index
|
||||
|
||||
# reset the block data so that _set_mips_index will reload it as well
|
||||
if self.embed_data is not None:
|
||||
self.embed_data.load_from_file()
|
||||
self._set_mips_index()
|
||||
|
||||
def add_embed_data(self, all_embed_data):
|
||||
"""Add the embedding of each block to the underlying FAISS index"""
|
||||
|
||||
# this assumes the embed_data is a dict : {int: np.array<float>}
|
||||
block_indices, block_embeds = zip(*all_embed_data.embed_data.items())
|
||||
|
||||
# the embeddings have to be entered in as float32 even though the math
|
||||
# internally is done with float16.
|
||||
embeds_arr = np.float32(np.array(block_embeds))
|
||||
indices_arr = np.array(block_indices)
|
||||
|
||||
# we no longer need the embedding data since it's in the index now
|
||||
all_embed_data.clear()
|
||||
|
||||
self.mips_index.add_with_ids(embeds_arr, indices_arr)
|
||||
|
||||
if parallel_state.is_unitialized() or parallel_state.get_data_parallel_rank() == 0:
|
||||
print(">>> Finished adding block data to index", flush=True)
|
||||
|
||||
def search_mips_index(self, query_embeds, top_k, reconstruct=True):
|
||||
"""
|
||||
Get the top-k blocks by the index distance metric.
|
||||
|
||||
:param reconstruct: if True: return a [num_queries x k x embed_dim]
|
||||
array of blocks
|
||||
if False: return [num_queries x k] array of
|
||||
distances, and another for indices
|
||||
"""
|
||||
query_embeds = np.float32(detach(query_embeds))
|
||||
|
||||
if reconstruct:
|
||||
# get the vectors themselves
|
||||
top_k_block_embeds = self.mips_index.search_and_reconstruct(\
|
||||
query_embeds, top_k)
|
||||
return top_k_block_embeds
|
||||
else:
|
||||
# get distances and indices of closest vectors
|
||||
distances, block_indices = self.mips_index.search(query_embeds, top_k)
|
||||
return distances, block_indices
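A hedged usage sketch for the wrapper above; the store and query tensor are placeholders, and `query_embeds` is assumed to be a torch tensor of shape [num_queries, embed_size], since search_mips_index detaches it internally:

```python
# Hypothetical usage (embed_size, store and query_embeds are placeholders).
index = FaissMIPSIndex(embed_size=128, embed_data=store, use_gpu=False)
distances, block_indices = index.search_mips_index(query_embeds, top_k=5,
                                                   reconstruct=False)
```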
|
@ -1,39 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""For backward compatibility, we need the class definitions to deserialize."""
|
||||
|
||||
class LossScaler:
|
||||
def __init__(self, scale=1):
|
||||
self.cur_scale = scale
|
||||
|
||||
class DynamicLossScaler:
|
||||
def __init__(self,
|
||||
init_scale=2**32,
|
||||
scale_factor=2.,
|
||||
scale_window=1000,
|
||||
min_scale=1,
|
||||
delayed_shift=1,
|
||||
consecutive_hysteresis=False):
|
||||
self.cur_scale = init_scale
|
||||
self.cur_iter = 0
|
||||
self.last_overflow_iter = -1
|
||||
self.scale_factor = scale_factor
|
||||
self.scale_window = scale_window
|
||||
self.min_scale = min_scale
|
||||
self.delayed_shift = delayed_shift
|
||||
self.cur_hysteresis = delayed_shift
|
||||
self.consecutive_hysteresis = consecutive_hysteresis
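These stubs exist only so that loss-scaler objects pickled into older checkpoints can still be deserialized; a minimal sketch of the idea (illustrative only):

```python
# Illustrative only: unpickling requires the class definition to be importable
# under the module path recorded in the checkpoint, even if the class itself
# no longer does any work.
import pickle

payload = pickle.dumps(DynamicLossScaler(init_scale=2 ** 16))
restored = pickle.loads(payload)   # succeeds because the class definition above exists
print(restored.cur_scale)          # -> 65536
```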
|
||||
|
@ -1,127 +0,0 @@
|
||||
import sys
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from ascendspeed import get_args
|
||||
from ascendspeed.core import parallel_state
|
||||
from ascendspeed.checkpointing import load_biencoder_checkpoint
|
||||
from ascendspeed.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset
|
||||
from ascendspeed.data.orqa_wiki_dataset import get_open_retrieval_batch
|
||||
from ascendspeed.data.biencoder_dataset_utils import get_one_epoch_dataloader
|
||||
from ascendspeed.data.realm_index import detach, OpenRetreivalDataStore
|
||||
from ascendspeed.model.biencoder_model import biencoder_model_provider
|
||||
from ascendspeed.training import get_model
|
||||
|
||||
|
||||
class IndexBuilder(object):
|
||||
"""
|
||||
Object for taking one pass over a dataset and creating a BlockData of its
|
||||
embeddings
|
||||
"""
|
||||
def __init__(self):
|
||||
args = get_args()
|
||||
self.model = None
|
||||
self.dataloader = None
|
||||
self.evidence_embedder_obj = None
|
||||
self.biencoder_shared_query_context_model = \
|
||||
args.biencoder_shared_query_context_model
|
||||
|
||||
# need to know whether we're using a REALM checkpoint (args.load)
|
||||
# or ICT checkpoint
|
||||
assert not (args.load and args.ict_load)
|
||||
#self.using_realm_chkpt = args.ict_load is None
|
||||
|
||||
self.log_interval = args.indexer_log_interval
|
||||
self.batch_size = args.indexer_batch_size
|
||||
|
||||
self.load_attributes()
|
||||
self.is_main_builder = parallel_state.get_data_parallel_rank() == 0
|
||||
self.num_total_builders = parallel_state.get_data_parallel_world_size()
|
||||
self.iteration = self.total_processed = 0
|
||||
|
||||
def load_attributes(self):
|
||||
"""
|
||||
Load the necessary attributes: model, dataloader and empty BlockData
|
||||
"""
|
||||
only_context_model = True
|
||||
if self.biencoder_shared_query_context_model:
|
||||
only_context_model = False
|
||||
|
||||
model = get_model(lambda: biencoder_model_provider(only_context_model \
|
||||
= only_context_model, biencoder_shared_query_context_model = \
|
||||
self.biencoder_shared_query_context_model))
|
||||
|
||||
self.model = load_biencoder_checkpoint(model,
|
||||
only_context_model=only_context_model)
|
||||
|
||||
assert len(self.model) == 1
|
||||
self.model[0].eval()
|
||||
|
||||
self.dataset = get_open_retrieval_wiki_dataset()
|
||||
self.dataloader = iter(get_one_epoch_dataloader(self.dataset, \
|
||||
self.batch_size))
|
||||
|
||||
self.evidence_embedder_obj = OpenRetreivalDataStore( \
|
||||
load_from_path=False)
|
||||
|
||||
def track_and_report_progress(self, batch_size):
|
||||
"""
|
||||
Utility function for tracking progress
|
||||
"""
|
||||
self.iteration += 1
|
||||
self.total_processed += batch_size * self.num_total_builders
|
||||
if self.is_main_builder and self.iteration % self.log_interval == 0:
|
||||
print('Batch {:10d} | Total {:10d}'.format(self.iteration,
|
||||
self.total_processed), flush=True)
|
||||
|
||||
def build_and_save_index(self):
|
||||
"""
|
||||
Goes through one epoch of the dataloader and adds all data to this
|
||||
instance's BlockData.
|
||||
|
||||
The copy of BlockData is saved as a shard, which when run in a
|
||||
distributed setting will be consolidated by the rank 0 process
|
||||
and saved as a final pickled BlockData.
|
||||
"""
|
||||
assert len(self.model) == 1
|
||||
unwrapped_model = self.model[0]
|
||||
while not hasattr(unwrapped_model, 'embed_text'):
|
||||
unwrapped_model = unwrapped_model.module
|
||||
|
||||
while True:
|
||||
try:
|
||||
# batch also has query_tokens and query_pad_data
|
||||
row_id, context_tokens, context_mask, context_types, \
|
||||
context_pad_mask = get_open_retrieval_batch( \
|
||||
self.dataloader)
|
||||
except (StopIteration, IndexError):
|
||||
break
|
||||
|
||||
# TODO: can we wrap this in torch.no_grad() to reduce memory usage?
|
||||
# detach, separate fields and add to BlockData
|
||||
assert context_mask.dtype == torch.bool
|
||||
context_logits = unwrapped_model.embed_text(
|
||||
unwrapped_model.context_model, context_tokens, context_mask,
|
||||
context_types)
|
||||
context_logits = detach(context_logits)
|
||||
row_id = detach(row_id)
|
||||
|
||||
self.evidence_embedder_obj.add_block_data(row_id, context_logits)
|
||||
self.track_and_report_progress(batch_size=len(row_id))
|
||||
|
||||
# This process signals to finalize its shard and then synchronize with
|
||||
# the other processes
|
||||
self.evidence_embedder_obj.save_shard()
|
||||
torch.distributed.barrier()
|
||||
del self.model
|
||||
|
||||
# rank 0 process builds the final copy
|
||||
if self.is_main_builder:
|
||||
self.evidence_embedder_obj.merge_shards_and_save()
|
||||
# make sure that every single piece of data was embedded
|
||||
assert len(self.evidence_embedder_obj.embed_data) == \
|
||||
len(self.dataset)
|
||||
self.evidence_embedder_obj.clear()
|
||||
|
||||
# complete building the final copy
|
||||
torch.distributed.barrier()
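A hedged sketch of how the builder above is typically driven from an indexing entry point, assuming distributed initialization and argument parsing have already happened (both are assumed here, not shown):

```python
# Hypothetical driver: one pass over the evidence dataset builds and saves the index.
builder = IndexBuilder()
builder.build_and_save_index()
```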
|
@ -100,7 +100,7 @@ def _compile_dependencies():
|
||||
if torch.distributed.get_rank() == 0:
|
||||
start_time = time.time()
|
||||
print('> compiling dataset index builder ...')
|
||||
from ascendspeed.data.dataset_utils import compile_helper
|
||||
from megatron.data.dataset_utils import compile_helper
|
||||
compile_helper()
|
||||
print('>>> done with dataset index builder. Compilation time: {:.3f} '
|
||||
'seconds'.format(time.time() - start_time), flush=True)
|
||||
|
ci/build.sh
@ -1,117 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright (c) 2020 Huawei Technologies Co., Ltd
|
||||
# Copyright (c) 2019, Facebook CORPORATION.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the BSD 3-Clause License (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://opensource.org/licenses/BSD-3-Clause
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
CUR_DIR=$(dirname $(readlink -f $0))
|
||||
SUPPORTED_PY_VERSION=(3.7 3.8 3.9)
|
||||
PY_VERSION='3.7' # Default supported python version is 3.7
|
||||
DEFAULT_SCRIPT_ARGS_NUM=1 # Default supported input parameters
|
||||
|
||||
# Parse arguments inside script
|
||||
function parse_script_args() {
|
||||
local args_num=0
|
||||
if [[ "x${1}" = "x" ]]; then
|
||||
# default: bash build.sh (python3.7)
|
||||
return 0
|
||||
fi
|
||||
|
||||
while true; do
|
||||
if [[ "x${1}" = "x" ]]; then
|
||||
break
|
||||
fi
|
||||
if [[ "$(echo "${1}"|cut -b1-|cut -b-2)" == "--" ]]; then
|
||||
args_num=$((args_num+1))
|
||||
fi
|
||||
if [[ ${args_num} -eq ${DEFAULT_SCRIPT_ARGS_NUM} ]]; then
|
||||
break
|
||||
fi
|
||||
shift
|
||||
done
|
||||
|
||||
# if num of args are not fully parsed, throw an error.
|
||||
if [[ ${args_num} -lt ${DEFAULT_SCRIPT_ARGS_NUM} ]]; then
|
||||
return 1
|
||||
fi
|
||||
|
||||
while true; do
|
||||
case "${1}" in
|
||||
--python=*)
|
||||
PY_VERSION=$(echo "${1}"|cut -d"=" -f2)
|
||||
args_num=$((args_num-1))
|
||||
shift
|
||||
;;
|
||||
--tocpu=*)
|
||||
export 'NPU_TOCPU'=${1:8}
|
||||
args_num=$((args_num-1))
|
||||
shift
|
||||
;;
|
||||
-*)
|
||||
echo "ERROR Unsupported parameters: ${1}"
|
||||
return 1
|
||||
;;
|
||||
*)
|
||||
if [ "x${1}" != "x" ]; then
|
||||
echo "ERROR Unsupported parameters: ${1}"
|
||||
return 1
|
||||
fi
|
||||
break
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# if some "--param=value" are not parsed correctly, throw an error.
|
||||
if [[ ${args_num} -ne 0 ]]; then
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
function check_python_version() {
|
||||
matched_py_version='false'
|
||||
for ver in ${SUPPORTED_PY_VERSION[*]}; do
|
||||
if [ "${PY_VERSION}" = "${ver}" ]; then
|
||||
matched_py_version='true'
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
if [ "${matched_py_version}" = 'false' ]; then
|
||||
echo "${PY_VERSION} is an unsupported python version, we suggest ${SUPPORTED_PY_VERSION[*]}"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
function main()
|
||||
{
|
||||
if ! parse_script_args "$@"; then
|
||||
echo "Failed to parse script args. Please check your inputs."
|
||||
exit 1
|
||||
fi
|
||||
check_python_version
|
||||
|
||||
cd ${CUR_DIR}/..
|
||||
# if you add or delete files in the project, uncomment the following line
|
||||
# make clean
|
||||
|
||||
python"${PY_VERSION}" setup.py build bdist_wheel
|
||||
if [ $? != 0 ]; then
|
||||
echo "Failed to compile the wheel file. Please check the source code by yourself."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
||||
}
|
||||
|
||||
main "$@"
|
@ -132,6 +132,9 @@ wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/t
|
||||
or wget https://gitee.com/ascend/pytorch/releases/download/v5.0.rc2-pytorch1.11.0/torch_npu-1.11.0.post1-cp37-cp37m-linux_x86_64.whl (X86)
|
||||
pip install torch_npu-1.11.0.post1-cp37-cp37m-linux_XXXXXX.whl
|
||||
|
||||
# install megatron-core
|
||||
pip3 install -e git+https://github.com/NVIDIA/Megatron-LM.git@23.05#egg=megatron-core
|
||||
|
||||
# install deepspeed and deepspeed_npu
|
||||
pip install deepspeed==0.9.2
|
||||
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
|
||||
|
@ -51,6 +51,9 @@ pip install torch-1.11.0-cp37-cp37m-linux_aarch64.whl
|
||||
pip install torch_npu-1.11.0.post4_XXXXXX-cp37-cp37m-linux_aarch64.whl
|
||||
pip install apex-0.1_ascend_XXXXXX-cp37-cp37m-linux_aarch64.whl
|
||||
|
||||
# install megatron-core
|
||||
pip3 install -e git+https://github.com/NVIDIA/Megatron-LM.git@23.05#egg=megatron-core
|
||||
|
||||
# install deepspeed and deepspeed_npu
|
||||
pip install deepspeed==0.9.2
|
||||
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
|
||||
@ -183,6 +186,11 @@ pip install torch-1.11.0-cp37-cp37m-linux_aarch64.whl
|
||||
pip install torch_npu-1.11.0.post4_XXXXXX-cp37-cp37m-linux_aarch64.whl
|
||||
pip install apex-0.1_ascend_XXXXXX-cp37-cp37m-linux_aarch64.whl
|
||||
|
||||
#install megatron
|
||||
git clone https://github.com/NVIDIA/Megatron-LM.git -b 23.05
|
||||
cd Megatron-LM
|
||||
pip3 install -e ./
|
||||
|
||||
# install deepspeed and deepspeed_npu
|
||||
pip install deepspeed==0.9.2
|
||||
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
|
||||
|
@ -47,23 +47,26 @@ mkdir ckpt
|
||||
```
|
||||
2.Install AscendSpeed requirement environment.
|
||||
```shell
|
||||
#python3.7
|
||||
# python3.7
|
||||
conda create -n test python=3.7
|
||||
conda activate test
|
||||
|
||||
#install torch and torch_npu
|
||||
# install torch and torch_npu
|
||||
pip install torch-1.11.0-cp37-cp37m-linux_aarch64.whl
|
||||
pip install torch_npu-1.11.0.post4_XXXXXX-cp37-cp37m-linux_aarch64.whl
|
||||
pip install apex-0.1_ascend_XXXXXX-cp37-cp37m-linux_aarch64.whl
|
||||
|
||||
#install deepspeed and deepspeed_npu
|
||||
pip install deepspeed_npu==0.9.2
|
||||
# install megatron-core
|
||||
pip3 install -e git+https://github.com/NVIDIA/Megatron-LM.git@23.05#egg=megatron-core
|
||||
|
||||
# install deepspeed and deepspeed_npu
|
||||
pip install deepspeed==0.9.2
|
||||
git clone https://gitee.com/ascend/DeepSpeed.git -b v0.9.2 deepspeed_npu
|
||||
cd deepspeed_npu
|
||||
pip3 install -e ./
|
||||
cd ..
|
||||
|
||||
#install other packages
|
||||
# install other packages
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
3.Download llama-65b checkpoint
|
||||
@ -71,7 +74,7 @@ pip install -r requirements.txt
|
||||
mkdir tokenizer
|
||||
cd ./tokenizer
|
||||
|
||||
#Make sure you have git-lfs installed (https://git-lfs.com)
|
||||
# make sure you have git-lfs installed (https://git-lfs.com)
|
||||
git lfs install
|
||||
git clone https://huggingface.co/decapoda-research/llama-65b-hf
|
||||
cd ..
|
||||
@ -91,12 +94,12 @@ python $SCRIPT_PATH \
|
||||
|
||||
5.Download dataset
|
||||
```shell
|
||||
#for llama, dowload alpaca dataset, like
|
||||
# for llama, download the alpaca dataset, e.g.
|
||||
wget http://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json
|
||||
|
||||
#download tokenizer configs nad (selective) weights from
|
||||
#http://huggingface.co/decapoda-research/llama-65b-hf
|
||||
#revise "LLaMATokenizer" as "LLaMTokenizer" in tokenizer_config.json
|
||||
# download tokenizer configs and (selective) weights from
|
||||
# http://huggingface.co/decapoda-research/llama-65b-hf
|
||||
# revise "LLaMATokenizer" as "LLaMTokenizer" in tokenizer_config.json
|
||||
mkdir dataset
|
||||
python tools/preprocess_data.py --input alpaca_data.json\
|
||||
--output-prefix dataset/alpaca\
|
||||
|
setup.py
@ -1,116 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Setup for pip package."""
|
||||
|
||||
import sys
|
||||
import setuptools
|
||||
|
||||
if sys.version_info < (3,):
|
||||
raise Exception("Python 2 is not supported by AscendSpeed.")
|
||||
|
||||
__description__ = 'AscendSpeed for LLMs of Ascend'
|
||||
__version__ = '0.0.1'
|
||||
__author__ = 'Ascend'
|
||||
__long_description__ = 'AscendSpeed for LLMs of Ascend'
|
||||
__url__ = 'https://gitee.com/ascend/AscendSpeed'
|
||||
__download_url__ = 'https://gitee.com/ascend/AscendSpeed/release'
|
||||
__keywords__ = 'Ascend, language, deep learning, NLP'
|
||||
__license__ = 'See https://gitee.com/ascend/AscendSpeed'
|
||||
__package_name__ = 'ascendspeed'
|
||||
__contact_names__ = 'Ascend'
|
||||
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
###############################################################################
|
||||
# Dependency Loading #
|
||||
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #
|
||||
|
||||
|
||||
def req_file(filename):
|
||||
with open(filename) as f:
|
||||
content = f.readlines()
|
||||
return [x.strip() for x in content]
|
||||
|
||||
|
||||
install_requires = req_file("requirements.txt")
|
||||
cmd_class={}
|
||||
exts = []
|
||||
try:
|
||||
from torch_npu.utils import cpp_extension
|
||||
from torch.utils.cpp_extension import BuildExtension
|
||||
import imp
|
||||
ext_ops = cpp_extension.NpuExtension(
|
||||
name="ascendspeed_ops_internel",
|
||||
sources=["ascendspeed/ops/FlashAttention.cpp"],
|
||||
extra_compile_args=[
|
||||
'-Wno-sign-compare',
|
||||
'-Wno-deprecated-declarations',
|
||||
'-Wno-return-type',
|
||||
'-D__FILENAME__=\"$(notdir $(abspath $<))\"',
|
||||
'-I'+imp.find_module('torch_npu')[1]+'/include/third_party/acl/inc'
|
||||
],
|
||||
)
|
||||
exts.append(ext_ops)
|
||||
cmd_class={"build_ext": BuildExtension.with_options(use_ninja=False)}
|
||||
except Exception:
|
||||
print('Can not find any torch_npu, ops setup failed')
|
||||
|
||||
setuptools.setup(
|
||||
package_data={'ascendspeed':['ascendspeed/data/Makefile']},
|
||||
data_files=[('', ['ascendspeed/data/Makefile'])],
|
||||
name=__package_name__,
|
||||
# Versions should comply with PEP440. For a discussion on single-sourcing
|
||||
# the version across setup.py and the project code, see
|
||||
# https://packaging.python.org/en/latest/single_source_version.html
|
||||
version=__version__,
|
||||
description=__description__,
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
# The project's main homepage.
|
||||
url=__url__,
|
||||
author=__contact_names__,
|
||||
maintainer=__contact_names__,
|
||||
# The licence under which the project is released
|
||||
license=__license__,
|
||||
classifiers=[
|
||||
'Intended Audience :: Developers',
|
||||
'Intended Audience :: Science/Research',
|
||||
'Intended Audience :: Information Technology',
|
||||
# Indicate what your project relates to
|
||||
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
||||
'Topic :: Software Development :: Libraries :: Python Modules',
|
||||
# Supported python versions
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
# Additional Setting
|
||||
'Environment :: Console',
|
||||
'Natural Language :: English',
|
||||
'Operating System :: OS Independent',
|
||||
],
|
||||
python_requires='>=3.7',
|
||||
packages=setuptools.find_packages(),
|
||||
install_requires=install_requires,
|
||||
# Add in any packaged data.
|
||||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
# PyPI package information.
|
||||
keywords=__keywords__,
|
||||
cmdclass=cmd_class,
|
||||
ext_modules=exts
|
||||
)
|