milvus/internal/querycoordv2/meta/replica_manager_test.go

497 lines
14 KiB
Go
Raw Normal View History

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package meta
import (
"testing"
"github.com/golang/protobuf/proto"
"github.com/samber/lo"
"github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/metastore"
"github.com/milvus-io/milvus/internal/metastore/kv/querycoord"
"github.com/milvus-io/milvus/internal/proto/querypb"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/pkg/kv"
"github.com/milvus-io/milvus/pkg/util/etcd"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
type collectionLoadConfig struct {
spawnConfig map[string]int
}
func (c *collectionLoadConfig) getTotalSpawn() int {
totalSpawn := 0
for _, spawnNum := range c.spawnConfig {
totalSpawn += spawnNum
}
return totalSpawn
}
// Old replica manager test suite.
type ReplicaManagerSuite struct {
suite.Suite
rgs map[string]typeutil.UniqueSet
collections map[int64]collectionLoadConfig
idAllocator func() (int64, error)
kv kv.MetaKv
catalog metastore.QueryCoordCatalog
mgr *ReplicaManager
}
func (suite *ReplicaManagerSuite) SetupSuite() {
paramtable.Init()
suite.rgs = map[string]typeutil.UniqueSet{
"RG1": typeutil.NewUniqueSet(1),
"RG2": typeutil.NewUniqueSet(2, 3),
"RG3": typeutil.NewUniqueSet(4, 5, 6),
}
suite.collections = map[int64]collectionLoadConfig{
100: {
spawnConfig: map[string]int{"RG1": 1},
},
101: {
spawnConfig: map[string]int{"RG2": 2},
},
102: {
spawnConfig: map[string]int{"RG3": 2},
},
103: {
spawnConfig: map[string]int{"RG1": 1, "RG2": 1, "RG3": 1},
},
}
}
func (suite *ReplicaManagerSuite) SetupTest() {
var err error
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.catalog = querycoord.NewCatalog(suite.kv)
suite.idAllocator = RandomIncrementIDAllocator()
suite.mgr = NewReplicaManager(suite.idAllocator, suite.catalog)
suite.spawnAll()
}
func (suite *ReplicaManagerSuite) TearDownTest() {
suite.kv.Close()
}
func (suite *ReplicaManagerSuite) TestSpawn() {
mgr := suite.mgr
mgr.idAllocator = ErrorIDAllocator()
_, err := mgr.Spawn(1, map[string]int{DefaultResourceGroupName: 1}, nil)
suite.Error(err)
replicas := mgr.GetByCollection(1)
suite.Len(replicas, 0)
mgr.idAllocator = suite.idAllocator
replicas, err = mgr.Spawn(1, map[string]int{DefaultResourceGroupName: 1}, []string{"channel1", "channel2"})
suite.NoError(err)
for _, replica := range replicas {
suite.Len(replica.replicaPB.GetChannelNodeInfos(), 0)
}
paramtable.Get().Save(paramtable.Get().QueryCoordCfg.Balancer.Key, ChannelLevelScoreBalancerName)
defer paramtable.Get().Reset(paramtable.Get().QueryCoordCfg.Balancer.Key)
replicas, err = mgr.Spawn(2, map[string]int{DefaultResourceGroupName: 1}, []string{"channel1", "channel2"})
suite.NoError(err)
for _, replica := range replicas {
suite.Len(replica.replicaPB.GetChannelNodeInfos(), 2)
}
}
func (suite *ReplicaManagerSuite) TestGet() {
mgr := suite.mgr
for collectionID, collectionCfg := range suite.collections {
replicas := mgr.GetByCollection(collectionID)
replicaNodes := make(map[int64][]int64)
nodes := make([]int64, 0)
for _, replica := range replicas {
suite.Equal(collectionID, replica.GetCollectionID())
suite.Equal(replica, mgr.Get(replica.GetID()))
suite.Equal(len(replica.replicaPB.GetNodes()), replica.RWNodesCount())
suite.Equal(replica.replicaPB.GetNodes(), replica.GetNodes())
replicaNodes[replica.GetID()] = replica.GetNodes()
nodes = append(nodes, replica.GetNodes()...)
}
expectedNodes := make([]int64, 0)
for rg := range collectionCfg.spawnConfig {
expectedNodes = append(expectedNodes, suite.rgs[rg].Collect()...)
}
suite.ElementsMatch(nodes, expectedNodes)
for replicaID, nodes := range replicaNodes {
for _, node := range nodes {
replica := mgr.GetByCollectionAndNode(collectionID, node)
suite.Equal(replicaID, replica.GetID())
}
}
}
}
func (suite *ReplicaManagerSuite) TestGetByNode() {
mgr := suite.mgr
randomNodeID := int64(11111)
testReplica1 := newReplica(&querypb.Replica{
CollectionID: 3002,
ID: 10086,
Nodes: []int64{randomNodeID},
ResourceGroup: DefaultResourceGroupName,
})
testReplica2 := newReplica(&querypb.Replica{
CollectionID: 3002,
ID: 10087,
Nodes: []int64{randomNodeID},
ResourceGroup: DefaultResourceGroupName,
})
mgr.Put(testReplica1, testReplica2)
replicas := mgr.GetByNode(randomNodeID)
suite.Len(replicas, 2)
}
func (suite *ReplicaManagerSuite) TestRecover() {
mgr := suite.mgr
// Clear data in memory, and then recover from meta store
suite.clearMemory()
mgr.Recover(lo.Keys(suite.collections))
suite.TestGet()
// Test recover from 2.1 meta store
replicaInfo := milvuspb.ReplicaInfo{
ReplicaID: 2100,
CollectionID: 1000,
NodeIds: []int64{1, 2, 3},
}
value, err := proto.Marshal(&replicaInfo)
suite.NoError(err)
suite.kv.Save(querycoord.ReplicaMetaPrefixV1+"/2100", string(value))
suite.clearMemory()
mgr.Recover(append(lo.Keys(suite.collections), 1000))
replica := mgr.Get(2100)
suite.NotNil(replica)
suite.EqualValues(1000, replica.GetCollectionID())
suite.EqualValues([]int64{1, 2, 3}, replica.GetNodes())
suite.Len(replica.GetNodes(), len(replica.GetNodes()))
for _, node := range replica.GetNodes() {
suite.True(replica.Contains(node))
}
}
func (suite *ReplicaManagerSuite) TestRemove() {
mgr := suite.mgr
for collection := range suite.collections {
err := mgr.RemoveCollection(collection)
suite.NoError(err)
replicas := mgr.GetByCollection(collection)
suite.Empty(replicas)
}
// Check whether the replicas are also removed from meta store
mgr.Recover(lo.Keys(suite.collections))
for collection := range suite.collections {
replicas := mgr.GetByCollection(collection)
suite.Empty(replicas)
}
}
func (suite *ReplicaManagerSuite) TestNodeManipulate() {
mgr := suite.mgr
// add node into rg.
rgs := map[string]typeutil.UniqueSet{
"RG1": typeutil.NewUniqueSet(1, 7),
"RG2": typeutil.NewUniqueSet(2, 3, 8),
"RG3": typeutil.NewUniqueSet(4, 5, 6, 9),
}
// Add node into rg.
for collectionID, cfg := range suite.collections {
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg := range cfg.spawnConfig {
rgsOfCollection[rg] = rgs[rg]
}
mgr.RecoverNodesInCollection(collectionID, rgsOfCollection)
for rg := range cfg.spawnConfig {
for _, node := range rgs[rg].Collect() {
replica := mgr.GetByCollectionAndNode(collectionID, node)
suite.Contains(replica.GetNodes(), node)
}
}
}
// Check these modifications are applied to meta store
suite.clearMemory()
mgr.Recover(lo.Keys(suite.collections))
for collectionID, cfg := range suite.collections {
for rg := range cfg.spawnConfig {
for _, node := range rgs[rg].Collect() {
replica := mgr.GetByCollectionAndNode(collectionID, node)
suite.Contains(replica.GetNodes(), node)
}
}
}
}
func (suite *ReplicaManagerSuite) spawnAll() {
mgr := suite.mgr
for id, cfg := range suite.collections {
replicas, err := mgr.Spawn(id, cfg.spawnConfig, nil)
suite.NoError(err)
totalSpawn := 0
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg, spawnNum := range cfg.spawnConfig {
totalSpawn += spawnNum
rgsOfCollection[rg] = suite.rgs[rg]
}
mgr.RecoverNodesInCollection(id, rgsOfCollection)
suite.Len(replicas, totalSpawn)
}
}
func (suite *ReplicaManagerSuite) TestResourceGroup() {
mgr := NewReplicaManager(suite.idAllocator, suite.catalog)
replicas1, err := mgr.Spawn(int64(1000), map[string]int{DefaultResourceGroupName: 1}, nil)
suite.NoError(err)
suite.NotNil(replicas1)
suite.Len(replicas1, 1)
replica2, err := mgr.Spawn(int64(2000), map[string]int{DefaultResourceGroupName: 1}, nil)
suite.NoError(err)
suite.NotNil(replica2)
suite.Len(replica2, 1)
replicas := mgr.GetByResourceGroup(DefaultResourceGroupName)
suite.Len(replicas, 2)
rgNames := mgr.GetResourceGroupByCollection(int64(1000))
suite.Len(rgNames, 1)
suite.True(rgNames.Contain(DefaultResourceGroupName))
}
func (suite *ReplicaManagerSuite) clearMemory() {
suite.mgr.replicas = make(map[int64]*Replica)
}
type ReplicaManagerV2Suite struct {
suite.Suite
rgs map[string]typeutil.UniqueSet
collections map[int64]collectionLoadConfig
kv kv.MetaKv
catalog metastore.QueryCoordCatalog
mgr *ReplicaManager
}
func (suite *ReplicaManagerV2Suite) SetupSuite() {
paramtable.Init()
suite.rgs = map[string]typeutil.UniqueSet{
"RG1": typeutil.NewUniqueSet(1),
"RG2": typeutil.NewUniqueSet(2, 3),
"RG3": typeutil.NewUniqueSet(4, 5, 6),
"RG4": typeutil.NewUniqueSet(7, 8, 9, 10),
"RG5": typeutil.NewUniqueSet(11, 12, 13, 14, 15),
}
suite.collections = map[int64]collectionLoadConfig{
1000: {
spawnConfig: map[string]int{"RG1": 1},
},
1001: {
spawnConfig: map[string]int{"RG2": 2},
},
1002: {
spawnConfig: map[string]int{"RG3": 2},
},
1003: {
spawnConfig: map[string]int{"RG1": 1, "RG2": 1, "RG3": 1},
},
1004: {
spawnConfig: map[string]int{"RG4": 2, "RG5": 3},
},
1005: {
spawnConfig: map[string]int{"RG4": 3, "RG5": 2},
},
}
var err error
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.catalog = querycoord.NewCatalog(suite.kv)
idAllocator := RandomIncrementIDAllocator()
suite.mgr = NewReplicaManager(idAllocator, suite.catalog)
}
func (suite *ReplicaManagerV2Suite) TearDownSuite() {
suite.kv.Close()
}
func (suite *ReplicaManagerV2Suite) TestSpawn() {
mgr := suite.mgr
for id, cfg := range suite.collections {
replicas, err := mgr.Spawn(id, cfg.spawnConfig, nil)
suite.NoError(err)
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg := range cfg.spawnConfig {
rgsOfCollection[rg] = suite.rgs[rg]
}
mgr.RecoverNodesInCollection(id, rgsOfCollection)
for rg := range cfg.spawnConfig {
for _, node := range suite.rgs[rg].Collect() {
replica := mgr.GetByCollectionAndNode(id, node)
suite.Contains(replica.GetNodes(), node)
}
}
suite.Len(replicas, cfg.getTotalSpawn())
replicas = mgr.GetByCollection(id)
suite.Len(replicas, cfg.getTotalSpawn())
}
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) testIfBalanced() {
// If balanced
for id := range suite.collections {
replicas := suite.mgr.GetByCollection(id)
rgToReplica := make(map[string][]*Replica, 0)
for _, r := range replicas {
rgToReplica[r.GetResourceGroup()] = append(rgToReplica[r.GetResourceGroup()], r)
}
for _, replicas := range rgToReplica {
maximumNodes := -1
minimumNodes := -1
nodes := make([]int64, 0)
for _, r := range replicas {
availableNodes := suite.rgs[r.GetResourceGroup()]
if maximumNodes == -1 || r.RWNodesCount() > maximumNodes {
maximumNodes = r.RWNodesCount()
}
if minimumNodes == -1 || r.RWNodesCount() < minimumNodes {
minimumNodes = r.RWNodesCount()
}
nodes = append(nodes, r.GetNodes()...)
r.RangeOverRONodes(func(node int64) bool {
if availableNodes.Contain(node) {
nodes = append(nodes, node)
}
return true
})
}
suite.ElementsMatch(nodes, suite.rgs[replicas[0].GetResourceGroup()].Collect())
suite.True(maximumNodes-minimumNodes <= 1)
}
}
}
func (suite *ReplicaManagerV2Suite) TestTransferReplica() {
// param error
err := suite.mgr.TransferReplica(10086, "RG4", "RG5", 1)
suite.Error(err)
err = suite.mgr.TransferReplica(1005, "RG4", "RG5", 0)
suite.Error(err)
err = suite.mgr.TransferReplica(1005, "RG4", "RG4", 1)
suite.Error(err)
err = suite.mgr.TransferReplica(1005, "RG4", "RG5", 1)
suite.NoError(err)
suite.recoverReplica(2, true)
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) TestTransferReplicaAndAddNode() {
suite.mgr.TransferReplica(1005, "RG4", "RG5", 1)
suite.recoverReplica(1, false)
suite.rgs["RG5"].Insert(16, 17, 18)
suite.recoverReplica(2, true)
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) TestTransferNode() {
suite.rgs["RG4"].Remove(7)
suite.rgs["RG5"].Insert(7)
suite.recoverReplica(2, true)
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) recoverReplica(k int, clearOutbound bool) {
// need at least two times to recover the replicas.
// transfer node between replicas need set to outbound and then set to incoming.
for i := 0; i < k; i++ {
// do a recover
for id, cfg := range suite.collections {
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg := range cfg.spawnConfig {
rgsOfCollection[rg] = suite.rgs[rg]
}
suite.mgr.RecoverNodesInCollection(id, rgsOfCollection)
}
// clear all outbound nodes
if clearOutbound {
for id := range suite.collections {
replicas := suite.mgr.GetByCollection(id)
for _, r := range replicas {
outboundNodes := r.GetRONodes()
suite.mgr.RemoveNode(r.GetID(), outboundNodes...)
}
}
}
}
}
func TestReplicaManager(t *testing.T) {
suite.Run(t, new(ReplicaManagerSuite))
suite.Run(t, new(ReplicaManagerV2Suite))
}