增加ClassifyF1PreRecMetric的多卡测试用例

2024-12-04 05:07:44 +08:00 · 2022-04-12 16:06:25 +08:00 · 2022-04-12 16:06:25 +08:00 · 7a68f90b56
commit 7a68f90b56
parent ab3b66715e
4 changed files with 133 additions and 70 deletions
--- a/tests/core/metrics/test_accuracy_torch.py
+++ b/tests/core/metrics/test_accuracy_torch.py
@ -15,6 +15,7 @@ from sklearn.metrics import accuracy_score as sklearn_accuracy
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.metrics.accuracy import Accuracy
 from fastNLP.core.metrics.metric import Metric
+from .utils import find_free_network_port, setup_ddp, _assert_allclose

 set_start_method("spawn", force=True)

@ -23,42 +24,6 @@ NUM_PROCESSES = 2
 pool = None


-def setup_ddp(rank: int, world_size: int, master_port: int) -> None:
-    """Setup ddp environment."""
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(master_port)
-    print(torch.cuda.device_count())
-    if torch.distributed.is_available() and sys.platform not in ("win32", "cygwin"):
-        torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)
-
-
-def find_free_network_port() -> int:
-    """Finds a free port on localhost.
-
-    It is useful in single-node training when we don't want to connect to a real master node but have to set the
-    `MASTER_PORT` environment variable.
-    """
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("", 0))
-    s.listen(1)
-    port = s.getsockname()[1]
-    s.close()
-    return port
-
-
-def _assert_allclose(my_result: Union[float, np.ndarray], sklearn_result: Union[float, np.ndarray],
-                     atol: float = 1e-8) -> None:
-    """
-    测试对比结果，这里不用非得是必须数组且维度对应，一些其他情况例如 np.allclose(np.array([[1e10, ], ]), 1e10+1) 也是 True
-    :param my_result: 可以不限设备等
-    :param sklearn_result:
-    :param atol:
-    :return:
-    """
-    assert np.allclose(a=my_result, b=sklearn_result, atol=atol)
-
-
 def _test(local_rank: int,
          world_size: int,
          device: torch.device,
--- a/tests/core/metrics/test_classify_f1_pre_rec_metric_torch.py
+++ b/tests/core/metrics/test_classify_f1_pre_rec_metric_torch.py
@ -1,8 +1,32 @@
+from functools import partial
+import copy
+
 import pytest
 import torch
 import numpy as np
+from torch.multiprocessing import Pool, set_start_method

 from fastNLP.core.metrics import ClassifyFPreRecMetric
+from fastNLP.core.dataset import DataSet
+from .utils import find_free_network_port, setup_ddp
+
+set_start_method("spawn", force=True)
+
+
+def _test(local_rank: int, world_size: int, device: torch.device,
+          dataset: DataSet, metric_class, metric_kwargs, metric_result):
+    metric = metric_class(**metric_kwargs)
+    # dataset 也类似（每个进程有自己的一个）
+    dataset = copy.deepcopy(dataset)
+    metric.to(device)
+    # 把数据拆到每个 GPU 上，有点模仿 DistributedSampler 的感觉，但这里数据单位是一个 batch（即每个 i 取了一个 batch 到自己的 GPU 上）
+    for i in range(local_rank, len(dataset), world_size):
+        pred, tg = dataset[i]['pred'].to(device), dataset[i]['tg'].to(device)
+        metric.update(pred, tg)
+
+    my_result = metric.get_metric()
+    for keys in ['f', 'pre', 'rec']:
+        np.allclose(my_result[keys], metric_result[keys], atol=0.000001)


 class TestClassfiyFPreRecMetric:
@ -86,3 +110,68 @@ class TestClassfiyFPreRecMetric:
            tmp_d = {"p": "precision", "r": "recall", "f": "f1-score"}
            gk = tmp_d[keys[0]]
            np.allclose(result_dict[keys], ground_truth[gl][gk], atol=0.000001)
+
+    @pytest.mark.parametrize("f_type, f1_score,recall,pre",
+                             [('macro', 0.1882051282051282, 0.1619047619047619, 0.23928571428571427),
+                              ('micro', 0.21875, 0.21875, 0.21875)])
+    def test_case_2(self, f_type, f1_score, recall, pre):
+        dataset = DataSet({
+            'pred': [torch.tensor([[-0.4375, -0.1779, -1.0985, -1.1592, 0.4910],
+                                   [1.3410, 0.2889, -0.8667, -1.8580, 0.3029],
+                                   [0.7459, -1.1957, 0.3231, 0.0308, -0.1847],
+                                   [1.1439, -0.0057, 0.8203, 0.0312, -1.0051],
+                                   [-0.4870, 0.3215, -0.8290, 0.9221, 0.4683],
+                                   [0.9078, 1.0674, -0.5629, 0.3895, 0.8917],
+                                   [-0.7743, -0.4041, -0.9026, 0.2112, 1.0892],
+                                   [1.8232, -1.4188, -2.5615, -2.4187, 0.5907],
+                                   [-1.0592, 0.4164, -0.1192, 1.4238, -0.9258],
+                                   [-1.1137, 0.5773, 2.5778, 0.5398, -0.3323],
+                                   [-0.3868, -0.5165, 0.2286, -1.3876, 0.5561],
+                                   [-0.3304, 1.3619, -1.5744, 0.4902, -0.7661],
+                                   [1.8387, 0.5234, 0.4269, 1.3748, -1.2793],
+                                   [0.6692, 0.2571, 1.2425, -0.5894, -0.0184],
+                                   [0.4165, 0.4084, -0.1280, 1.4489, -2.3058],
+                                   [-0.5826, -0.5469, 1.5898, -0.2786, -0.9882]]),
+                     torch.tensor([
+                         [-1.5548, -2.2891, 0.2983, -1.2145, -0.1947],
+                         [-0.7222, 2.3543, -0.5801, -0.0640, -1.5614],
+                         [-1.4978, 1.9297, -1.3652, -0.2358, 2.5566],
+                         [0.1561, -0.0316, 0.9331, 1.0363, 2.3949],
+                         [0.2650, -0.8459, 1.3221, 0.1321, -1.1900],
+                         [0.0664, -1.2353, -0.5242, -1.4491, 1.3300],
+                         [-0.2744, 0.0941, 0.7157, 0.1404, 1.2046],
+                         [0.9341, -0.6652, 1.4512, 0.9608, -0.3623],
+                         [-1.1641, 0.0873, 0.1163, -0.2068, -0.7002],
+                         [1.4775, -2.0025, -0.5634, -0.1589, 0.0247],
+                         [1.0151, 1.0304, -0.1042, -0.6955, -0.0629],
+                         [-0.3119, -0.4558, 0.7757, 0.0758, -1.6297],
+                         [1.0654, 0.0313, -0.7716, 0.1194, 0.6913],
+                         [-0.8088, -0.6648, -0.5018, -0.0230, -0.8207],
+                         [-0.7753, -0.3508, 1.6163, 0.7158, 1.5207],
+                         [0.8692, 0.7718, -0.6734, 0.6515, 0.0641]
+                     ])],
+            'tg': [
+                torch.LongTensor([0, 2, 4, 1, 4, 0, 1, 3, 3, 3, 1, 3, 4, 4, 3, 4]),
+                torch.LongTensor([0, 2, 4, 4, 3, 4, 4, 3, 0, 3, 0, 0, 0, 1, 3, 1])
+            ]
+        })
+        metric_kwargs = {
+            'f_type': f_type,
+            'num_class': 5,
+            'only_gross': False,
+            'aggregate_when_get_metric': True
+        }
+        ground_truth = {'f': f1_score, 'pre': pre, 'rec': recall}
+
+        NUM_PROCESSES = 2
+        pool = Pool(processes=NUM_PROCESSES)
+        master_port = find_free_network_port()
+        pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, master_port) for rank in range(NUM_PROCESSES)])
+
+        pool.starmap(partial(_test, dataset=dataset,
+                             metric_class=ClassifyFPreRecMetric,
+                             metric_kwargs=metric_kwargs,
+                             metric_result=ground_truth),
+                     [(rank, NUM_PROCESSES, torch.device(f'cuda:{rank+4}')) for rank in range(NUM_PROCESSES)])
+        pool.close()
+        pool.join()
--- a/tests/core/metrics/test_span_f1_rec_acc_torch.py
+++ b/tests/core/metrics/test_span_f1_rec_acc_torch.py
@ -14,6 +14,7 @@ from torch.multiprocessing import Pool, set_start_method
 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.core.metrics import SpanFPreRecMetric
 from fastNLP.core.dataset import DataSet
+from .utils import find_free_network_port, setup_ddp

 set_start_method("spawn", force=True)

@ -41,40 +42,6 @@ NUM_PROCESSES = 2
 pool = None


-def setup_ddp(rank: int, world_size: int, master_port: int) -> None:
-    """Setup ddp environment."""
-
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(master_port)
-    if torch.distributed.is_available() and sys.platform not in ("win32", "cygwin"):
-        torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)
-
-
-def find_free_network_port() -> int:
-    """Finds a free port on localhost.
-
-    It is useful in single-node training when we don't want to connect to a real master node but have to set the
-    `MASTER_PORT` environment variable.
-    """
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.bind(("", 0))
-    s.listen(1)
-    port = s.getsockname()[1]
-    s.close()
-    return port
-
-
-# @pytest.fixture(scope='class', autouse=True)
-# def pre_process():
-#     global pool
-#     pool = Pool(processes=NUM_PROCESSES)
-#     master_port = find_free_network_port()
-#     pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, master_port) for rank in range(NUM_PROCESSES)])
-#     yield
-#     pool.close()
-#     pool.join()
-
-
 def _test(local_rank: int,
          world_size: int,
          device: torch.device,
--- a/tests/core/metrics/utils.py
+++ b/tests/core/metrics/utils.py
@ -0,0 +1,42 @@
+import os, sys
+import socket
+from typing import Union
+
+import torch
+from torch import distributed
+import numpy as np
+
+
+def setup_ddp(rank: int, world_size: int, master_port: int) -> None:
+    """Setup ddp environment."""
+
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(master_port)
+    if torch.distributed.is_available() and sys.platform not in ("win32", "cygwin"):
+        torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)
+
+
+def find_free_network_port() -> int:
+    """Finds a free port on localhost.
+
+    It is useful in single-node training when we don't want to connect to a real master node but have to set the
+    `MASTER_PORT` environment variable.
+    """
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.bind(("", 0))
+    s.listen(1)
+    port = s.getsockname()[1]
+    s.close()
+    return port
+
+
+def _assert_allclose(my_result: Union[float, np.ndarray], sklearn_result: Union[float, np.ndarray],
+                     atol: float = 1e-8) -> None:
+    """
+    测试对比结果，这里不用非得是必须数组且维度对应，一些其他情况例如 np.allclose(np.array([[1e10, ], ]), 1e10+1) 也是 True
+    :param my_result: 可以不限设备等
+    :param sklearn_result:
+    :param atol:
+    :return:
+    """
+    assert np.allclose(a=my_result, b=sklearn_result, atol=atol)