2022-04-11 16:44:08 +08:00
|
|
|
from itertools import chain
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2022-04-11 21:44:53 +08:00
|
|
|
from fastNLP.core.samplers import UnrepeatedRandomSampler, UnrepeatedSortedSampler, UnrepeatedSequentialSampler
|
2022-04-11 16:44:08 +08:00
|
|
|
|
|
|
|
|
|
|
|
class DatasetWithVaryLength:
    """Tiny list-backed dataset used as a fixture by the sampler tests.

    Sample ``i`` is simply the integer ``i``, so ``data`` holds
    ``0 .. num_of_data - 1``.

    Args:
        num_of_data: number of samples to generate (defaults to 100).
    """

    def __init__(self, num_of_data=100):
        # Unpack the range into a concrete list so it supports len() and indexing.
        self.data = [*range(num_of_data)]

    def __len__(self):
        """Return the number of samples held by the dataset."""
        return len(self.data)

    def __getitem__(self, item):
        """Return the sample(s) stored at ``item``, delegating to the backing list."""
        return self.data[item]
|
|
|
|
|
|
|
|
|
|
|
|
class TestUnrepeatedSampler:
    """Tests that ``UnrepeatedRandomSampler`` yields every index exactly once."""

    @pytest.mark.parametrize('shuffle', [True, False])
    def test_single(self, shuffle):
        """A non-distributed sampler must cover the dataset with no repeats."""
        num_of_data = 100
        data = DatasetWithVaryLength(num_of_data)
        sampler = UnrepeatedRandomSampler(data, shuffle)

        indexes = list(sampler)
        # Check the raw length before collapsing into a set: a set alone would
        # silently hide an index that was yielded more than once.
        assert len(indexes) == num_of_data
        assert set(indexes) == set(range(num_of_data))

    @pytest.mark.parametrize('num_replicas', [2, 3])
    @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100])
    @pytest.mark.parametrize('shuffle', [False, True])
    def test_multi(self, num_replicas, num_of_data, shuffle):
        """Samplers for all ranks together must cover the dataset exactly once."""
        if num_replicas > num_of_data:
            pytest.skip("num_replicas > num_of_data")

        data = DatasetWithVaryLength(num_of_data=num_of_data)
        samplers = []
        for rank in range(num_replicas):
            # One sampler per simulated rank, all built over the same dataset.
            sampler = UnrepeatedRandomSampler(dataset=data, shuffle=shuffle)
            sampler.set_distributed(num_replicas, rank=rank)
            samplers.append(sampler)

        indexes = list(chain.from_iterable(samplers))
        # Equal length plus equal set means no index was shared between ranks
        # and none was dropped.
        assert len(indexes) == num_of_data
        assert set(indexes) == set(range(num_of_data))
|
|
|
|
|
|
|
|
|
|
|
|
class TestUnrepeatedSortedSampler:
    """Tests for ``UnrepeatedSortedSampler`` in single and distributed use."""

    def test_single(self):
        """With ``length[i] == i`` the indices are expected in descending order."""
        sample_count = 100
        dataset = DatasetWithVaryLength(sample_count)
        sampler = UnrepeatedSortedSampler(dataset, length=dataset.data)

        expected = list(reversed(range(sample_count)))
        assert list(sampler) == expected

    @pytest.mark.parametrize('num_replicas', [2, 3])
    @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100])
    def test_multi(self, num_replicas, num_of_data):
        """Each rank stays sorted and the ranks jointly cover the dataset once."""
        if num_of_data < num_replicas:
            pytest.skip("num_replicas > num_of_data")

        dataset = DatasetWithVaryLength(num_of_data=num_of_data)
        per_rank = []
        for rank in range(num_replicas):
            shard = UnrepeatedSortedSampler(dataset=dataset, length=dataset.data)
            shard.set_distributed(num_replicas, rank=rank)
            per_rank.append(shard)

        # Make sure the ordering within each rank has not been scrambled:
        # every index must be no larger than the one before it.
        for shard in per_rank:
            drawn = list(shard)
            assert all(later <= earlier for earlier, later in zip(drawn, drawn[1:]))

        merged = list(chain.from_iterable(per_rank))
        assert len(merged) == num_of_data  # no overlap between different devices
        assert set(merged) == set(range(num_of_data))
|
2022-04-11 21:44:53 +08:00
|
|
|
|
|
|
|
|
|
|
|
class TestUnrepeatedSequentialSampler:
    """Tests for ``UnrepeatedSequentialSampler`` in single and distributed use."""

    def test_single(self):
        """A non-distributed sampler is expected to yield ``0 .. n-1`` in order."""
        sample_count = 100
        dataset = DatasetWithVaryLength(sample_count)
        sampler = UnrepeatedSequentialSampler(dataset, length=dataset.data)

        expected = list(range(sample_count))
        assert list(sampler) == expected

    @pytest.mark.parametrize('num_replicas', [2, 3])
    @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100])
    def test_multi(self, num_replicas, num_of_data):
        """Each rank stays ascending and the ranks jointly cover the dataset once."""
        if num_of_data < num_replicas:
            pytest.skip("num_replicas > num_of_data")

        dataset = DatasetWithVaryLength(num_of_data=num_of_data)
        per_rank = []
        for rank in range(num_replicas):
            shard = UnrepeatedSequentialSampler(dataset=dataset, length=dataset.data)
            shard.set_distributed(num_replicas, rank=rank)
            per_rank.append(shard)

        # Make sure the ordering within each rank has not been scrambled:
        # every index must be no smaller than the one before it.
        for shard in per_rank:
            drawn = list(shard)
            assert all(later >= earlier for earlier, later in zip(drawn, drawn[1:]))

        merged = list(chain.from_iterable(per_rank))
        assert len(merged) == num_of_data
        assert set(merged) == set(range(num_of_data))
|