milvus/internal/querycoordv2/meta/resource_manager_test.go
Bingyi Sun a3e22786ed
Move meta store to kv catalog (#25915)
Signed-off-by: sunby <sunbingyi1992@gmail.com>
2023-07-31 13:57:04 +08:00

431 lines
14 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package meta
import (
"testing"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus/internal/kv"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/metastore/kv/querycoord"
"github.com/milvus-io/milvus/internal/metastore/mocks"
"github.com/milvus-io/milvus/internal/proto/querypb"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/pkg/util/etcd"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
// ResourceManagerSuite is the testify suite that exercises ResourceManager.
type ResourceManagerSuite struct {
suite.Suite
// kv is the meta KV store assigned in SetupTest and closed in TearDownSuite.
kv kv.MetaKv
// manager is the ResourceManager under test; SetupTest replaces it before
// every test method.
manager *ResourceManager
}
// SetupSuite is the suite-level setup hook; it initializes the shared Params
// that SetupTest later relies on through GenerateEtcdConfig.
func (suite *ResourceManagerSuite) SetupSuite() {
Params.Init()
}
// SetupTest rebuilds the fixture for each test method: it creates an etcd
// client from the generated etcd configuration, wraps it in an etcd-backed KV
// rooted at the configured meta root path, and constructs a ResourceManager on
// top of a catalog over that KV together with a new node manager.
func (suite *ResourceManagerSuite) SetupTest() {
	etcdCfg := GenerateEtcdConfig()
	client, err := etcd.GetEtcdClient(
		etcdCfg.UseEmbedEtcd.GetAsBool(),
		etcdCfg.EtcdUseSSL.GetAsBool(),
		etcdCfg.Endpoints.GetAsStrings(),
		etcdCfg.EtcdTLSCert.GetValue(),
		etcdCfg.EtcdTLSKey.GetValue(),
		etcdCfg.EtcdTLSCACert.GetValue(),
		etcdCfg.EtcdTLSMinVersion.GetValue(),
	)
	// Nothing below can work without a client, so abort the test right away.
	suite.Require().NoError(err)

	rootPath := etcdCfg.MetaRootPath.GetValue()
	suite.kv = etcdkv.NewEtcdKV(client, rootPath)

	catalog := querycoord.NewCatalog(suite.kv)
	nodeMgr := session.NewNodeManager()
	suite.manager = NewResourceManager(catalog, nodeMgr)
}
// TestManipulateResourceGroup covers adding and removing resource groups:
// a new group can be added, adding it a second time fails, removing it
// succeeds (even when it no longer exists), and removing the default group is
// rejected with ErrDeleteDefaultRG.
func (suite *ResourceManagerSuite) TestManipulateResourceGroup() {
	// test add rg
	err := suite.manager.AddResourceGroup("rg1")
	suite.NoError(err)
	suite.True(suite.manager.ContainResourceGroup("rg1"))
	// Two groups are expected: rg1 plus one that the manager already held
	// before the call above (presumably the default group).
	suite.Len(suite.manager.ListResourceGroups(), 2)

	// test add duplicate rg
	err = suite.manager.AddResourceGroup("rg1")
	suite.Error(err)

	// test delete rg
	err = suite.manager.RemoveResourceGroup("rg1")
	suite.NoError(err)

	// test delete rg which doesn't exist
	err = suite.manager.RemoveResourceGroup("rg1")
	suite.NoError(err)

	// test delete default rg
	err = suite.manager.RemoveResourceGroup(DefaultResourceGroupName)
	// ErrorIs takes (err, target): the returned error goes first so that a
	// wrapped ErrDeleteDefaultRG is still matched.
	suite.ErrorIs(err, ErrDeleteDefaultRG)
}
// TestManipulateNode covers assigning, unassigning and transferring nodes
// between resource groups, including the error paths for unknown nodes,
// unknown groups, nodes that are already assigned, and transfers that ask for
// more nodes than the source group holds.
func (suite *ResourceManagerSuite) TestManipulateNode() {
	suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
	err := suite.manager.AddResourceGroup("rg1")
	suite.NoError(err)

	// test add node to rg
	err = suite.manager.AssignNode("rg1", 1)
	suite.NoError(err)

	// test add non-exist node to rg
	err = suite.manager.AssignNode("rg1", 2)
	suite.ErrorIs(err, merr.ErrNodeNotFound)

	// test add node to non-exist rg
	err = suite.manager.AssignNode("rg2", 1)
	suite.ErrorIs(err, merr.ErrResourceGroupNotFound)

	// test remove node from rg
	err = suite.manager.UnassignNode("rg1", 1)
	suite.NoError(err)

	// test remove non-exist node from rg
	err = suite.manager.UnassignNode("rg1", 2)
	suite.NoError(err)

	// test remove node from non-exist rg
	err = suite.manager.UnassignNode("rg2", 1)
	suite.ErrorIs(err, merr.ErrResourceGroupNotFound)

	// add node which already assign to rg to another rg
	err = suite.manager.AddResourceGroup("rg2")
	suite.NoError(err)
	err = suite.manager.AssignNode("rg1", 1)
	suite.NoError(err)
	err = suite.manager.AssignNode("rg2", 1)
	suite.ErrorIs(err, ErrNodeAlreadyAssign)

	// transfer node between rgs
	_, err = suite.manager.TransferNode("rg1", "rg2", 1)
	suite.NoError(err)

	// transfer meet non exist rg
	_, err = suite.manager.TransferNode("rgggg", "rg2", 1)
	suite.ErrorIs(err, merr.ErrResourceGroupNotFound)

	// transfer more nodes than the source rg can provide
	_, err = suite.manager.TransferNode("rg1", "rg2", 5)
	suite.ErrorIs(err, ErrNodeNotEnough)

	suite.manager.nodeMgr.Add(session.NewNodeInfo(11, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(12, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(13, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(14, "localhost"))
	// The capacity assertions below only hold if every assignment succeeded,
	// so surface a failure at the call that caused it.
	suite.NoError(suite.manager.AssignNode("rg1", 11))
	suite.NoError(suite.manager.AssignNode("rg1", 12))
	suite.NoError(suite.manager.AssignNode("rg1", 13))
	suite.NoError(suite.manager.AssignNode("rg1", 14))

	rg1, err := suite.manager.GetResourceGroup("rg1")
	suite.NoError(err)
	rg2, err := suite.manager.GetResourceGroup("rg2")
	suite.NoError(err)
	// Equal takes (expected, actual).
	suite.Equal(4, rg1.GetCapacity())
	suite.Equal(1, rg2.GetCapacity())

	// rg1 and rg2 are re-read without another lookup: the assertions below
	// expect the transfer to be visible through the values fetched above.
	_, err = suite.manager.TransferNode("rg1", "rg2", 3)
	suite.NoError(err)
	suite.Equal(1, rg1.GetCapacity())
	suite.Equal(4, rg2.GetCapacity())
}
// TestHandleNodeUp checks how node up/down events affect a user-created
// resource group: a node that is already a member stays in place, a node going
// down leaves the group's capacity untouched while shrinking its node list,
// and an unrelated node coming up is not added to that group.
func (suite *ResourceManagerSuite) TestHandleNodeUp() {
	suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(100, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(101, "localhost"))
	err := suite.manager.AddResourceGroup("rg1")
	suite.NoError(err)

	// The capacity/len assertions right below require all three assignments
	// to have succeeded, so check them where they happen.
	suite.NoError(suite.manager.AssignNode("rg1", 1))
	suite.NoError(suite.manager.AssignNode("rg1", 2))
	suite.NoError(suite.manager.AssignNode("rg1", 3))

	// test query node id not change, expect assign back to origin rg
	rg, err := suite.manager.GetResourceGroup("rg1")
	suite.NoError(err)
	// Equal takes (expected, actual).
	suite.Equal(3, rg.GetCapacity())
	suite.Len(rg.GetNodes(), 3)
	suite.manager.HandleNodeUp(1)
	suite.Equal(3, rg.GetCapacity())
	suite.Len(rg.GetNodes(), 3)

	// A node going down keeps the capacity but removes the node itself.
	suite.manager.HandleNodeDown(2)
	rg, err = suite.manager.GetResourceGroup("rg1")
	suite.NoError(err)
	suite.Equal(3, rg.GetCapacity())
	suite.Len(rg.GetNodes(), 2)

	defaultRG, err := suite.manager.GetResourceGroup(DefaultResourceGroupName)
	suite.NoError(err)
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())

	// Node 101 was never assigned to rg1, so bringing it up must not change rg1
	// nor the default group's capacity.
	suite.manager.HandleNodeUp(101)
	rg, err = suite.manager.GetResourceGroup("rg1")
	suite.NoError(err)
	suite.Equal(3, rg.GetCapacity())
	suite.Len(rg.GetNodes(), 2)
	suite.False(suite.manager.ContainsNode("rg1", 101))
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())
}
// TestRecover populates several resource groups, wipes the manager's
// in-memory group map, and then verifies that Recover brings the groups back
// with the expected capacities and node membership.
func (suite *ResourceManagerSuite) TestRecover() {
	for _, nodeID := range []int64{1, 2, 3, 4} {
		suite.manager.nodeMgr.Add(session.NewNodeInfo(nodeID, "localhost"))
	}

	suite.NoError(suite.manager.AddResourceGroup("rg1"))
	suite.NoError(suite.manager.AddResourceGroup("rg2"))

	// Nodes enter through the default group and are then moved into rg1/rg2.
	suite.manager.AssignNode(DefaultResourceGroupName, 1)
	suite.manager.TransferNode(DefaultResourceGroupName, "rg1", 1)
	suite.manager.AssignNode(DefaultResourceGroupName, 2)
	suite.manager.TransferNode(DefaultResourceGroupName, "rg2", 1)
	suite.manager.AssignNode(DefaultResourceGroupName, 3)
	suite.manager.AssignNode(DefaultResourceGroupName, 4)

	suite.manager.HandleNodeDown(2)
	suite.manager.HandleNodeDown(3)

	// clear resource manager in hack way
	delete(suite.manager.groups, "rg1")
	delete(suite.manager.groups, "rg2")
	delete(suite.manager.groups, DefaultResourceGroupName)
	suite.manager.Recover()

	firstRG, err := suite.manager.GetResourceGroup("rg1")
	suite.NoError(err)
	suite.Equal(1, firstRG.GetCapacity())
	suite.True(suite.manager.ContainsNode("rg1", 1))

	// Node 2 went down before the wipe, so rg2 keeps its capacity without it.
	secondRG, err := suite.manager.GetResourceGroup("rg2")
	suite.NoError(err)
	suite.Equal(1, secondRG.GetCapacity())
	suite.False(suite.manager.ContainsNode("rg2", 2))

	// Of the two nodes left in the default group, only node 4 is still up.
	defaultRG, err := suite.manager.GetResourceGroup(DefaultResourceGroupName)
	suite.NoError(err)
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())
	suite.False(suite.manager.ContainsNode(DefaultResourceGroupName, 3))
	suite.True(suite.manager.ContainsNode(DefaultResourceGroupName, 4))
}
// TestCheckOutboundNodes builds a replica that lists four nodes while its
// resource group only holds three of them, and expects the extra node to be
// the single one reported by CheckOutboundNodes.
func (suite *ResourceManagerSuite) TestCheckOutboundNodes() {
	for _, nodeID := range []int64{1, 2, 3} {
		suite.manager.nodeMgr.Add(session.NewNodeInfo(nodeID, "localhost"))
	}
	suite.NoError(suite.manager.AddResourceGroup("rg"))
	suite.manager.AssignNode("rg", 1)
	suite.manager.AssignNode("rg", 2)
	suite.manager.AssignNode("rg", 3)

	// Node 4 is referenced by the replica but was never assigned to "rg".
	replicaInfo := &querypb.Replica{
		ID:            1,
		CollectionID:  1,
		Nodes:         []int64{1, 2, 3, 4},
		ResourceGroup: "rg",
	}
	replica := NewReplica(replicaInfo, typeutil.NewUniqueSet(1, 2, 3, 4))

	outbound := suite.manager.CheckOutboundNodes(replica)
	suite.Len(outbound, 1)
	suite.True(outbound.Contain(4))
}
// TestCheckResourceGroup verifies that CheckLackOfNode reports how many nodes
// a group is missing after nodes go down or disappear from the node manager,
// and that FindResourceGroupByNode resolves a remaining node to its group.
func (suite *ResourceManagerSuite) TestCheckResourceGroup() {
	suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
	err := suite.manager.AddResourceGroup("rg")
	suite.NoError(err)
	suite.manager.AssignNode("rg", 1)
	suite.manager.AssignNode("rg", 2)
	suite.manager.AssignNode("rg", 3)

	// One node reported down: the group lacks one node.
	suite.manager.HandleNodeDown(1)
	lackNodes := suite.manager.CheckLackOfNode("rg")
	// Equal takes (expected, actual).
	suite.Equal(1, lackNodes)

	// Node 2 vanishes from the node manager without a down event; the status
	// check is run explicitly before the group is queried again.
	suite.manager.nodeMgr.Remove(2)
	suite.manager.checkRGNodeStatus("rg")
	lackNodes = suite.manager.CheckLackOfNode("rg")
	suite.Equal(2, lackNodes)

	rg, err := suite.manager.FindResourceGroupByNode(3)
	suite.NoError(err)
	suite.Equal("rg", rg)
}
// TestGetOutboundNode builds a replica in "rg" that also lists a node
// belonging to "rg1", and expects GetOutgoingNodeNumByReplica to report exactly
// one outgoing node, attributed to "rg1".
func (suite *ResourceManagerSuite) TestGetOutboundNode() {
	suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
	// Check group creation like the other tests in this suite do, so a setup
	// failure is reported here rather than as a confusing count mismatch below.
	suite.NoError(suite.manager.AddResourceGroup("rg"))
	suite.NoError(suite.manager.AddResourceGroup("rg1"))
	suite.manager.AssignNode("rg", 1)
	suite.manager.AssignNode("rg", 2)
	suite.manager.AssignNode("rg1", 3)

	replica := NewReplica(
		&querypb.Replica{
			ID:            1,
			CollectionID:  100,
			ResourceGroup: "rg",
			Nodes:         []int64{1, 2, 3},
		},
		typeutil.NewUniqueSet(1, 2, 3),
	)

	outgoingNodes := suite.manager.GetOutgoingNodeNumByReplica(replica)
	suite.NotNil(outgoingNodes)
	suite.Len(outgoingNodes, 1)
	// NotNil on a map element of integer type can never fail; assert that the
	// key is actually present instead.
	suite.Contains(outgoingNodes, "rg1")
	// Equal takes (expected, actual).
	suite.Equal(int32(1), outgoingNodes["rg1"])
}
// TestAutoRecover checks AutoRecoverResourceGroup: a group that lost its node
// is refilled while other nodes are available, nothing is recovered when every
// node is gone, and a node that comes back up ends in "rg" rather than in the
// default group.
func (suite *ResourceManagerSuite) TestAutoRecover() {
	suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
	suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
	err := suite.manager.AddResourceGroup("rg")
	suite.NoError(err)
	suite.manager.AssignNode(DefaultResourceGroupName, 1)
	suite.manager.AssignNode(DefaultResourceGroupName, 2)
	suite.manager.AssignNode("rg", 3)

	suite.manager.HandleNodeDown(3)
	lackNodes := suite.manager.CheckLackOfNode("rg")
	// Equal takes (expected, actual).
	suite.Equal(1, lackNodes)
	suite.manager.AutoRecoverResourceGroup("rg")
	lackNodes = suite.manager.CheckLackOfNode("rg")
	suite.Equal(0, lackNodes)

	// test auto recover behavior when all node down
	suite.manager.nodeMgr.Remove(1)
	suite.manager.nodeMgr.Remove(2)
	suite.manager.AutoRecoverResourceGroup("rg")
	// Check the lookup error as well: a failed GetNodes would otherwise hand
	// back an empty result and let the zero-length assertions pass vacuously.
	nodes, err := suite.manager.GetNodes("rg")
	suite.NoError(err)
	suite.Len(nodes, 0)
	nodes, err = suite.manager.GetNodes(DefaultResourceGroupName)
	suite.NoError(err)
	suite.Len(nodes, 0)

	// Node 1 comes back; after recovery it is expected in "rg", not in the
	// default group.
	suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
	suite.manager.HandleNodeUp(1)
	suite.manager.AutoRecoverResourceGroup("rg")
	nodes, err = suite.manager.GetNodes("rg")
	suite.NoError(err)
	suite.Len(nodes, 1)
	nodes, err = suite.manager.GetNodes(DefaultResourceGroupName)
	suite.NoError(err)
	suite.Len(nodes, 0)
}
// TestDefaultResourceGroup verifies that the default resource group keeps a
// capacity of DefaultResourceGroupCapacity while its node list grows and
// shrinks as nodes come up and are removed from the node manager.
func (suite *ResourceManagerSuite) TestDefaultResourceGroup() {
	for i := 0; i < 10; i++ {
		suite.manager.nodeMgr.Add(session.NewNodeInfo(int64(i), "localhost"))
	}

	// Registering nodes with the node manager alone does not place them in the
	// default group; that only happens on HandleNodeUp.
	defaultRG, err := suite.manager.GetResourceGroup(DefaultResourceGroupName)
	suite.NoError(err)
	// Equal takes (expected, actual).
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())
	suite.Len(defaultRG.GetNodes(), 0)

	suite.manager.HandleNodeUp(1)
	suite.manager.HandleNodeUp(2)
	suite.manager.HandleNodeUp(3)
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())
	suite.Len(defaultRG.GetNodes(), 3)

	// shutdown node 1 and 2
	suite.manager.nodeMgr.Remove(1)
	suite.manager.nodeMgr.Remove(2)

	defaultRG, err = suite.manager.GetResourceGroup(DefaultResourceGroupName)
	suite.NoError(err)
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())
	suite.Len(defaultRG.GetNodes(), 1)

	suite.manager.HandleNodeUp(4)
	suite.manager.HandleNodeUp(5)
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())
	suite.Len(defaultRG.GetNodes(), 3)

	suite.manager.HandleNodeUp(7)
	suite.manager.HandleNodeUp(8)
	suite.manager.HandleNodeUp(9)
	suite.Equal(DefaultResourceGroupCapacity, defaultRG.GetCapacity())
	suite.Len(defaultRG.GetNodes(), 6)
}
// TestStoreFailed swaps the etcd-backed catalog for a mock whose
// SaveResourceGroup and RemoveResourceGroup always fail, and checks that every
// mutating ResourceManager operation reports that store error to the caller.
func (suite *ResourceManagerSuite) TestStoreFailed() {
	catalog := mocks.NewQueryCoordCatalog(suite.T())
	nodes := session.NewNodeManager()
	mgr := NewResourceManager(catalog, nodes)

	for _, nodeID := range []int64{1, 2, 3} {
		nodes.Add(session.NewNodeInfo(nodeID, "localhost"))
	}

	errStore := errors.New("store error")
	catalog.EXPECT().SaveResourceGroup(mock.Anything, mock.Anything).Return(errStore)
	catalog.EXPECT().RemoveResourceGroup(mock.Anything).Return(errStore)

	suite.ErrorIs(mgr.AddResourceGroup("rg"), errStore)

	// AddResourceGroup failed above, so plant the group directly in the
	// in-memory map to reach the remaining code paths.
	mgr.groups["rg"] = &ResourceGroup{
		nodes:    typeutil.NewUniqueSet(),
		capacity: 0,
	}

	suite.ErrorIs(mgr.RemoveResourceGroup("rg"), errStore)
	suite.ErrorIs(mgr.AssignNode("rg", 1), errStore)

	// Same trick for the node: attach it in memory, bypassing the store.
	mgr.groups["rg"].assignNode(1, 1)
	suite.ErrorIs(mgr.UnassignNode("rg", 1), errStore)

	_, transferErr := mgr.TransferNode("rg", DefaultResourceGroupName, 1)
	suite.ErrorIs(transferErr, errStore)

	_, upErr := mgr.HandleNodeUp(2)
	suite.ErrorIs(upErr, errStore)

	_, downErr := mgr.HandleNodeDown(1)
	suite.ErrorIs(downErr, errStore)
}
// TearDownSuite closes the KV store held by the suite once all tests are done.
//
// suite.kv is only assigned in SetupTest, after the etcd client has been
// created successfully. If SetupTest aborts before that point, the field is
// still nil and calling Close on it would panic, hiding the original failure,
// so the nil case is skipped.
func (suite *ResourceManagerSuite) TearDownSuite() {
	if suite.kv == nil {
		return
	}
	suite.kv.Close()
}
func TestResourceManager(t *testing.T) {
suite.Run(t, new(ResourceManagerSuite))
}