2023-01-30 10:19:48 +08:00
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package meta
import (
2024-04-15 08:13:19 +08:00
"fmt"
2023-01-30 10:19:48 +08:00
"sync"
2023-02-26 11:31:49 +08:00
"github.com/cockroachdb/errors"
2024-04-15 08:13:19 +08:00
"github.com/golang/protobuf/proto"
2023-04-06 19:14:32 +08:00
"github.com/samber/lo"
"go.uber.org/zap"
2023-02-26 11:31:49 +08:00
2024-04-15 08:13:19 +08:00
"github.com/milvus-io/milvus-proto/go-api/v2/rgpb"
2023-07-31 13:57:04 +08:00
"github.com/milvus-io/milvus/internal/metastore"
2023-01-30 10:19:48 +08:00
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
2023-04-06 19:14:32 +08:00
"github.com/milvus-io/milvus/pkg/log"
2023-07-17 14:59:34 +08:00
"github.com/milvus-io/milvus/pkg/util/merr"
2024-04-15 08:13:19 +08:00
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/syncutil"
2023-04-06 19:14:32 +08:00
"github.com/milvus-io/milvus/pkg/util/typeutil"
2023-01-30 10:19:48 +08:00
)
2024-04-15 08:13:19 +08:00
// ErrNodeNotEnough is the sentinel error returned when a node transfer cannot
// be performed because no source resource group can be selected or the source
// resource group holds no node.
var ErrNodeNotEnough = errors . New ( "nodes not enough" )
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
type ResourceManager struct {
incomingNode typeutil . UniqueSet // incomingNode is a temporary set for incoming hangup node,
// after node is assigned to resource group, it will be removed from this set.
groups map [ string ] * ResourceGroup // primary index from resource group name to resource group
nodeIDMap map [ int64 ] string // secondary index from node id to resource group
2023-04-06 19:14:32 +08:00
2024-04-15 08:13:19 +08:00
catalog metastore . QueryCoordCatalog
nodeMgr * session . NodeManager // TODO: ResourceManager is watch node status with service discovery, so it can handle node up and down as fast as possible.
// All function can get latest online node without checking with node manager.
// so node manager is a redundant type here.
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
rwmutex sync . RWMutex
rgChangedNotifier * syncutil . VersionedNotifier // used to notify that resource group has been changed.
// resource_observer will listen this notifier to do a resource group recovery.
nodeChangedNotifier * syncutil . VersionedNotifier // used to notify that node distribution in resource group has been changed.
// replica_observer will listen this notifier to do a replica recovery.
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// NewResourceManager is used to create a ResourceManager instance.
func NewResourceManager ( catalog metastore . QueryCoordCatalog , nodeMgr * session . NodeManager ) * ResourceManager {
groups := make ( map [ string ] * ResourceGroup )
// Always create a default resource group to keep compatibility.
groups [ DefaultResourceGroupName ] = NewResourceGroup ( DefaultResourceGroupName , newResourceGroupConfig ( 0 , defaultResourceGroupCapacity ) )
return & ResourceManager {
incomingNode : typeutil . NewUniqueSet ( ) ,
groups : groups ,
nodeIDMap : make ( map [ int64 ] string ) ,
catalog : catalog ,
nodeMgr : nodeMgr ,
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
rwmutex : sync . RWMutex { } ,
rgChangedNotifier : syncutil . NewVersionedNotifier ( ) ,
nodeChangedNotifier : syncutil . NewVersionedNotifier ( ) ,
2023-01-30 10:19:48 +08:00
}
}
2024-04-15 08:13:19 +08:00
// Recover recover resource group from meta, other interface of ResourceManager can be only called after recover is done.
func ( rm * ResourceManager ) Recover ( ) error {
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
rgs , err := rm . catalog . GetResourceGroups ( )
if err != nil {
return errors . Wrap ( err , "failed to recover resource group from store" )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// Resource group meta upgrade to latest version.
upgrades := make ( [ ] * querypb . ResourceGroup , 0 )
for _ , meta := range rgs {
needUpgrade := meta . Config == nil
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
rg := NewResourceGroupFromMeta ( meta )
rm . groups [ rg . GetName ( ) ] = rg
for _ , node := range rg . GetNodes ( ) {
if _ , ok := rm . nodeIDMap [ node ] ; ok {
// unreachable code, should never happen.
panic ( fmt . Sprintf ( "dirty meta, node has been assign to multi resource group, %s, %s" , rm . nodeIDMap [ node ] , rg . GetName ( ) ) )
}
rm . nodeIDMap [ node ] = rg . GetName ( )
}
log . Info ( "Recover resource group" ,
zap . String ( "rgName" , rg . GetName ( ) ) ,
zap . Int64s ( "nodes" , rm . groups [ rg . GetName ( ) ] . GetNodes ( ) ) ,
zap . Any ( "config" , rg . GetConfig ( ) ) ,
)
if needUpgrade {
upgrades = append ( upgrades , rg . GetMeta ( ) )
}
}
if len ( upgrades ) > 0 {
log . Info ( "upgrade resource group meta into latest" , zap . Int ( "num" , len ( upgrades ) ) )
return rm . catalog . SaveResourceGroup ( upgrades ... )
}
2023-01-30 10:19:48 +08:00
return nil
}
2024-04-15 08:13:19 +08:00
// AddResourceGroup create a new ResourceGroup.
// Do no changed with node, all node will be reassign to new resource group by auto recover.
func ( rm * ResourceManager ) AddResourceGroup ( rgName string , cfg * rgpb . ResourceGroupConfig ) error {
if len ( rgName ) == 0 {
return merr . WrapErrParameterMissing ( "resource group name couldn't be empty" )
}
if cfg == nil {
// Use default config if not set, compatible with old client.
cfg = newResourceGroupConfig ( 0 , 0 )
2023-01-30 10:19:48 +08:00
}
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
2024-04-15 08:13:19 +08:00
if rm . groups [ rgName ] != nil {
// Idempotent promise.
// If resource group already exist, check if configuration is the same,
if proto . Equal ( rm . groups [ rgName ] . GetConfig ( ) , cfg ) {
return nil
}
return merr . WrapErrResourceGroupAlreadyExist ( rgName )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
maxResourceGroup := paramtable . Get ( ) . QuotaConfig . MaxResourceGroupNumOfQueryNode . GetAsInt ( )
if len ( rm . groups ) >= maxResourceGroup {
return merr . WrapErrResourceGroupReachLimit ( rgName , maxResourceGroup )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
if err := rm . validateResourceGroupConfig ( rgName , cfg ) ; err != nil {
return err
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
rg := NewResourceGroup ( rgName , cfg )
if err := rm . catalog . SaveResourceGroup ( rg . GetMeta ( ) ) ; err != nil {
log . Warn ( "failed to add resource group" ,
2023-01-30 10:19:48 +08:00
zap . String ( "rgName" , rgName ) ,
2024-04-15 08:13:19 +08:00
zap . Any ( "config" , cfg ) ,
2023-01-30 10:19:48 +08:00
zap . Error ( err ) ,
)
2024-04-15 08:13:19 +08:00
return merr . WrapErrResourceGroupServiceAvailable ( )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
rm . groups [ rgName ] = rg
2023-01-30 10:19:48 +08:00
log . Info ( "add resource group" ,
zap . String ( "rgName" , rgName ) ,
2024-04-15 08:13:19 +08:00
zap . Any ( "config" , cfg ) ,
2023-01-30 10:19:48 +08:00
)
2024-04-15 08:13:19 +08:00
// notify that resource group config has been changed.
rm . rgChangedNotifier . NotifyAll ( )
2023-01-30 10:19:48 +08:00
return nil
}
2024-04-15 08:13:19 +08:00
// UpdateResourceGroups update resource group configuration.
// Only change the configuration, no change with node. all node will be reassign by auto recover.
func ( rm * ResourceManager ) UpdateResourceGroups ( rgs map [ string ] * rgpb . ResourceGroupConfig ) error {
if len ( rgs ) == 0 {
return nil
}
2023-01-30 10:19:48 +08:00
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
2024-04-15 08:13:19 +08:00
return rm . updateResourceGroups ( rgs )
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// updateResourceGroups update resource group configuration.
func ( rm * ResourceManager ) updateResourceGroups ( rgs map [ string ] * rgpb . ResourceGroupConfig ) error {
modifiedRG := make ( [ ] * ResourceGroup , 0 , len ( rgs ) )
updates := make ( [ ] * querypb . ResourceGroup , 0 , len ( rgs ) )
for rgName , cfg := range rgs {
if _ , ok := rm . groups [ rgName ] ; ! ok {
return merr . WrapErrResourceGroupNotFound ( rgName )
}
if err := rm . validateResourceGroupConfig ( rgName , cfg ) ; err != nil {
return err
}
// Update with copy on write.
mrg := rm . groups [ rgName ] . CopyForWrite ( )
mrg . UpdateConfig ( cfg )
rg := mrg . ToResourceGroup ( )
updates = append ( updates , rg . GetMeta ( ) )
modifiedRG = append ( modifiedRG , rg )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
if err := rm . catalog . SaveResourceGroup ( updates ... ) ; err != nil {
for rgName , cfg := range rgs {
log . Warn ( "failed to update resource group" ,
zap . String ( "rgName" , rgName ) ,
zap . Any ( "config" , cfg ) ,
zap . Error ( err ) ,
)
}
return merr . WrapErrResourceGroupServiceAvailable ( )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// Commit updates to memory.
for _ , rg := range modifiedRG {
log . Info ( "update resource group" ,
zap . String ( "rgName" , rg . GetName ( ) ) ,
zap . Any ( "config" , rg . GetConfig ( ) ) ,
2023-01-30 10:19:48 +08:00
)
2024-04-15 08:13:19 +08:00
rm . groups [ rg . GetName ( ) ] = rg
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// notify that resource group config has been changed.
rm . rgChangedNotifier . NotifyAll ( )
2023-01-30 10:19:48 +08:00
return nil
}
2024-04-15 08:13:19 +08:00
// go:deprecated TransferNode transfer node from source resource group to target resource group.
// Deprecated, use Declarative API `UpdateResourceGroups` instead.
func ( rm * ResourceManager ) TransferNode ( sourceRGName string , targetRGName string , nodeNum int ) error {
2024-04-17 15:27:19 +08:00
if sourceRGName == targetRGName {
return merr . WrapErrParameterInvalidMsg ( "source resource group and target resource group should not be the same, resource group: %s" , sourceRGName )
}
if nodeNum <= 0 {
return merr . WrapErrParameterInvalid ( "NumNode > 0" , fmt . Sprintf ( "invalid NumNode %d" , nodeNum ) )
}
2023-01-30 10:19:48 +08:00
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
2024-04-15 08:13:19 +08:00
if rm . groups [ sourceRGName ] == nil {
return merr . WrapErrResourceGroupNotFound ( sourceRGName )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
if rm . groups [ targetRGName ] == nil {
return merr . WrapErrResourceGroupNotFound ( targetRGName )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
sourceRG := rm . groups [ sourceRGName ]
targetRG := rm . groups [ targetRGName ]
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// Check if source resource group has enough node to transfer.
if len ( sourceRG . GetNodes ( ) ) < nodeNum {
return merr . WrapErrResourceGroupNodeNotEnough ( sourceRGName , len ( sourceRG . GetNodes ( ) ) , nodeNum )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// Compatible with old version.
sourceCfg := sourceRG . GetConfigCloned ( )
targetCfg := targetRG . GetConfigCloned ( )
sourceCfg . Requests . NodeNum -= int32 ( nodeNum )
if sourceCfg . Requests . NodeNum < 0 {
sourceCfg . Requests . NodeNum = 0
2023-02-16 10:48:34 +08:00
}
2024-04-25 19:25:24 +08:00
// Special case for compatibility with old version.
if sourceRGName != DefaultResourceGroupName {
sourceCfg . Limits . NodeNum -= int32 ( nodeNum )
if sourceCfg . Limits . NodeNum < 0 {
sourceCfg . Limits . NodeNum = 0
}
}
2024-04-15 08:13:19 +08:00
targetCfg . Requests . NodeNum += int32 ( nodeNum )
if targetCfg . Requests . NodeNum > targetCfg . Limits . NodeNum {
targetCfg . Limits . NodeNum = targetCfg . Requests . NodeNum
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
return rm . updateResourceGroups ( map [ string ] * rgpb . ResourceGroupConfig {
sourceRGName : sourceCfg ,
targetRGName : targetCfg ,
} )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// RemoveResourceGroup remove resource group.
func ( rm * ResourceManager ) RemoveResourceGroup ( rgName string ) error {
2023-01-30 10:19:48 +08:00
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
if rm . groups [ rgName ] == nil {
2024-04-15 08:13:19 +08:00
// Idempotent promise: delete a non-exist rg should be ok
return nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// validateResourceGroupIsDeletable will check if rg is deletable.
if err := rm . validateResourceGroupIsDeletable ( rgName ) ; err != nil {
return err
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// Nodes may be still assign to these group,
// recover the resource group from redundant status before remove it.
if rm . groups [ rgName ] . NodeNum ( ) > 0 {
if err := rm . recoverRedundantNodeRG ( rgName ) ; err != nil {
log . Info ( "failed to recover redundant node resource group before remove it" ,
zap . String ( "rgName" , rgName ) ,
zap . Error ( err ) ,
)
return err
2023-01-30 10:19:48 +08:00
}
2023-02-16 10:48:34 +08:00
}
2024-04-15 08:13:19 +08:00
// Remove it from meta storage.
if err := rm . catalog . RemoveResourceGroup ( rgName ) ; err != nil {
log . Info ( "failed to remove resource group" ,
2023-01-30 10:19:48 +08:00
zap . String ( "rgName" , rgName ) ,
zap . Error ( err ) ,
)
2024-04-15 08:13:19 +08:00
return merr . WrapErrResourceGroupServiceAvailable ( )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// After recovering, all node assigned to these rg has been removed.
// no secondary index need to be removed.
delete ( rm . groups , rgName )
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
log . Info ( "remove resource group" ,
2023-01-30 10:19:48 +08:00
zap . String ( "rgName" , rgName ) ,
)
2024-04-15 08:13:19 +08:00
// notify that resource group has been changed.
rm . rgChangedNotifier . NotifyAll ( )
2023-01-30 10:19:48 +08:00
return nil
}
2024-04-05 04:57:16 +08:00
// GetNodesOfMultiRG return nodes of multi rg, it can be used to get a consistent view of nodes of multi rg.
func ( rm * ResourceManager ) GetNodesOfMultiRG ( rgName [ ] string ) ( map [ string ] typeutil . UniqueSet , error ) {
2023-01-30 10:19:48 +08:00
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
2024-04-15 08:13:19 +08:00
2024-04-05 04:57:16 +08:00
ret := make ( map [ string ] typeutil . UniqueSet )
for _ , name := range rgName {
if rm . groups [ name ] == nil {
return nil , merr . WrapErrResourceGroupNotFound ( name )
}
ret [ name ] = typeutil . NewUniqueSet ( rm . groups [ name ] . GetNodes ( ) ... )
2023-01-30 10:19:48 +08:00
}
2024-04-05 04:57:16 +08:00
return ret , nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// GetNodes return nodes of given resource group.
2024-04-05 04:57:16 +08:00
func ( rm * ResourceManager ) GetNodes ( rgName string ) ( [ ] int64 , error ) {
2023-01-30 10:19:48 +08:00
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
2024-04-05 04:57:16 +08:00
if rm . groups [ rgName ] == nil {
return nil , merr . WrapErrResourceGroupNotFound ( rgName )
2023-01-30 10:19:48 +08:00
}
2024-04-05 04:57:16 +08:00
return rm . groups [ rgName ] . GetNodes ( ) , nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// GetOutgoingNodeNumByReplica return outgoing node num on each rg from this replica.
2023-01-30 10:19:48 +08:00
func ( rm * ResourceManager ) GetOutgoingNodeNumByReplica ( replica * Replica ) map [ string ] int32 {
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
if rm . groups [ replica . GetResourceGroup ( ) ] == nil {
return nil
}
rg := rm . groups [ replica . GetResourceGroup ( ) ]
2024-04-15 08:13:19 +08:00
2023-01-30 10:19:48 +08:00
ret := make ( map [ string ] int32 )
2024-04-05 04:57:16 +08:00
replica . RangeOverRONodes ( func ( node int64 ) bool {
2024-04-15 08:13:19 +08:00
// if rgOfNode is not equal to rg of replica, outgoing node found.
if rgOfNode := rm . getResourceGroupByNodeID ( node ) ; rgOfNode != nil && rgOfNode . GetName ( ) != rg . GetName ( ) {
ret [ rgOfNode . GetName ( ) ] ++
2023-01-30 10:19:48 +08:00
}
2024-04-05 04:57:16 +08:00
return true
} )
2023-01-30 10:19:48 +08:00
return ret
}
2024-04-15 08:13:19 +08:00
// getResourceGroupByNodeID resolves a node id to its resource group through the
// secondary index. It returns nil when the node is not indexed. It does not
// take rwmutex itself.
func (rm *ResourceManager) getResourceGroupByNodeID(nodeID int64) *ResourceGroup {
	rgName, indexed := rm.nodeIDMap[nodeID]
	if !indexed {
		return nil
	}
	return rm.groups[rgName]
}
// ContainsNode return whether given node is in given resource group.
2023-01-30 10:19:48 +08:00
func ( rm * ResourceManager ) ContainsNode ( rgName string , node int64 ) bool {
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
if rm . groups [ rgName ] == nil {
return false
}
2024-04-15 08:13:19 +08:00
return rm . groups [ rgName ] . ContainNode ( node )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// ContainResourceGroup reports whether a resource group with the given name exists.
2023-01-30 10:19:48 +08:00
func (rm *ResourceManager) ContainResourceGroup(rgName string) bool {
	rm.rwmutex.RLock()
	defer rm.rwmutex.RUnlock()

	// A missing key and a nil entry are both treated as "not present".
	if rg := rm.groups[rgName]; rg == nil {
		return false
	}
	return true
}
2024-04-15 08:13:19 +08:00
// GetResourceGroup return resource group snapshot by name.
func ( rm * ResourceManager ) GetResourceGroup ( rgName string ) * ResourceGroup {
2023-01-30 10:19:48 +08:00
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
if rm . groups [ rgName ] == nil {
2024-04-15 08:13:19 +08:00
return nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
return rm . groups [ rgName ] . Snapshot ( )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// ListResourceGroups return all resource groups names.
2023-01-30 10:19:48 +08:00
func ( rm * ResourceManager ) ListResourceGroups ( ) [ ] string {
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
return lo . Keys ( rm . groups )
}
2024-04-15 08:13:19 +08:00
// MeetRequirement return whether resource group meet requirement.
// Return error with reason if not meet requirement.
func ( rm * ResourceManager ) MeetRequirement ( rgName string ) error {
2023-01-30 10:19:48 +08:00
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
2024-04-15 08:13:19 +08:00
if rm . groups [ rgName ] == nil {
return nil
}
return rm . groups [ rgName ] . MeetRequirement ( )
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// CheckIncomingNodeNum return incoming node num.
func ( rm * ResourceManager ) CheckIncomingNodeNum ( ) int {
rm . rwmutex . RLock ( )
defer rm . rwmutex . RUnlock ( )
return rm . incomingNode . Len ( )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// HandleNodeUp handle node when new node is incoming.
func ( rm * ResourceManager ) HandleNodeUp ( node int64 ) {
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
rm . incomingNode . Insert ( node )
// Trigger assign incoming node right away.
// error can be ignored here, because `AssignPendingIncomingNode`` will retry assign node.
rgName , err := rm . assignIncomingNodeWithNodeCheck ( node )
log . Info ( "HandleNodeUp: add node to resource group" ,
zap . String ( "rgName" , rgName ) ,
zap . Int64 ( "node" , node ) ,
zap . Error ( err ) ,
)
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// HandleNodeDown handle the node when node is leave.
func ( rm * ResourceManager ) HandleNodeDown ( node int64 ) {
2023-01-30 10:19:48 +08:00
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
2024-04-15 08:13:19 +08:00
rm . incomingNode . Remove ( node )
2024-04-28 20:25:25 +08:00
// for stopping query node becomes offline, node change won't be triggered,
// cause when it becomes stopping, it already remove from resource manager
// then `unassignNode` will do nothing
2024-04-15 08:13:19 +08:00
rgName , err := rm . unassignNode ( node )
2024-04-28 20:25:25 +08:00
// trigger node changes, expected to remove ro node from replica immediately
rm . nodeChangedNotifier . NotifyAll ( )
2024-04-15 08:13:19 +08:00
log . Info ( "HandleNodeDown: remove node from resource group" ,
zap . String ( "rgName" , rgName ) ,
zap . Int64 ( "node" , node ) ,
zap . Error ( err ) ,
)
}
2023-01-30 10:19:48 +08:00
2024-05-20 10:21:38 +08:00
// HandleNodeStopping handles a node that is stopping: it is dropped from the
// incoming set and unassigned from its resource group. The outcome is only
// logged; no notifier is triggered here.
func (rm *ResourceManager) HandleNodeStopping(node int64) {
	rm.rwmutex.Lock()
	defer rm.rwmutex.Unlock()

	rm.incomingNode.Remove(node)

	previousRG, unassignErr := rm.unassignNode(node)
	log.Info("HandleNodeStopping: remove node from resource group",
		zap.String("rgName", previousRG),
		zap.Int64("node", node),
		zap.Error(unassignErr),
	)
}
2024-04-15 08:13:19 +08:00
// ListenResourceGroupChanged returns a listener on the resource-group-changed
// notifier, created with syncutil.VersionedListenAtEarliest.
func (rm *ResourceManager) ListenResourceGroupChanged() *syncutil.VersionedListener {
	listener := rm.rgChangedNotifier.Listen(syncutil.VersionedListenAtEarliest)
	return listener
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// ListenNodeChanged returns a listener on the node-changed notifier, created
// with syncutil.VersionedListenAtEarliest.
func (rm *ResourceManager) ListenNodeChanged() *syncutil.VersionedListener {
	listener := rm.nodeChangedNotifier.Listen(syncutil.VersionedListenAtEarliest)
	return listener
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// AssignPendingIncomingNode assign incoming node to resource group.
func ( rm * ResourceManager ) AssignPendingIncomingNode ( ) {
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
for node := range rm . incomingNode {
rgName , err := rm . assignIncomingNodeWithNodeCheck ( node )
log . Info ( "Pending HandleNodeUp: add node to resource group" ,
zap . String ( "rgName" , rgName ) ,
2023-02-23 14:15:45 +08:00
zap . Int64 ( "node" , node ) ,
zap . Error ( err ) ,
)
}
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// AutoRecoverResourceGroup auto recover rg, return recover used node num
func ( rm * ResourceManager ) AutoRecoverResourceGroup ( rgName string ) error {
2023-01-30 10:19:48 +08:00
rm . rwmutex . Lock ( )
defer rm . rwmutex . Unlock ( )
2024-04-15 08:13:19 +08:00
rg := rm . groups [ rgName ]
if rg == nil {
return nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
if rg . MissingNumOfNodes ( ) > 0 {
return rm . recoverMissingNodeRG ( rgName )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// DefaultResourceGroup is the backup resource group of redundant recovery,
// So after all other resource group is reach the `limits`, rest redundant node will be transfer to DefaultResourceGroup.
if rg . RedundantNumOfNodes ( ) > 0 {
return rm . recoverRedundantNodeRG ( rgName )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
return nil
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// recoverMissingNodeRG recover resource group by transfer node from other resource group.
func ( rm * ResourceManager ) recoverMissingNodeRG ( rgName string ) error {
for rm . groups [ rgName ] . MissingNumOfNodes ( ) > 0 {
rg := rm . groups [ rgName ]
sourceRG := rm . selectMissingRecoverSourceRG ( rg )
if sourceRG == nil {
log . Warn ( "fail to select source resource group" , zap . String ( "rgName" , rg . GetName ( ) ) )
return ErrNodeNotEnough
}
nodeID , err := rm . transferOneNodeFromRGToRG ( sourceRG , rg )
if err != nil {
log . Warn ( "failed to recover missing node by transfer node from other resource group" ,
zap . String ( "sourceRG" , sourceRG . GetName ( ) ) ,
zap . String ( "targetRG" , rg . GetName ( ) ) ,
zap . Error ( err ) )
return err
}
log . Info ( "recover missing node by transfer node from other resource group" ,
zap . String ( "sourceRG" , sourceRG . GetName ( ) ) ,
zap . String ( "targetRG" , rg . GetName ( ) ) ,
zap . Int64 ( "nodeID" , nodeID ) ,
)
2023-02-16 10:48:34 +08:00
}
2024-04-15 08:13:19 +08:00
return nil
}
2023-02-16 10:48:34 +08:00
2024-04-15 08:13:19 +08:00
// selectMissingRecoverSourceRG picks the resource group that should give up a
// node to rg, or nil when no other group qualifies.
func (rm *ResourceManager) selectMissingRecoverSourceRG(rg *ResourceGroup) *ResourceGroup {
	isOther := func(candidate *ResourceGroup) bool {
		return rg.GetName() != candidate.GetName()
	}

	// First choice: the most redundant resource group, `len(nodes) > limits`.
	mostRedundant := rm.findMaxRGWithGivenFilter(
		func(candidate *ResourceGroup) bool {
			return isOther(candidate) && candidate.RedundantNumOfNodes() > 0
		},
		func(candidate *ResourceGroup) int {
			return candidate.RedundantNumOfNodes()
		},
	)
	if mostRedundant != nil {
		return mostRedundant
	}

	// Second choice: the most oversized resource group, `len(nodes) > requests`.
	// Groups configured as `TransferFrom` of rg get a boosted score so they are
	// selected at high priority.
	oversizedScore := func(candidate *ResourceGroup) int {
		score := candidate.OversizedNumOfNodes()
		if rg.HasFrom(candidate.GetName()) {
			score *= resourceGroupTransferBoost
		}
		return score
	}
	return rm.findMaxRGWithGivenFilter(
		func(candidate *ResourceGroup) bool {
			return isOther(candidate) && candidate.OversizedNumOfNodes() > 0
		},
		oversizedScore,
	)
}
// recoverRedundantNodeRG recover resource group by transfer node to other resource group.
func ( rm * ResourceManager ) recoverRedundantNodeRG ( rgName string ) error {
for rm . groups [ rgName ] . RedundantNumOfNodes ( ) > 0 {
rg := rm . groups [ rgName ]
targetRG := rm . selectRedundantRecoverTargetRG ( rg )
if targetRG == nil {
log . Info ( "failed to select redundant recover target resource group, please check resource group configuration if as expected." ,
zap . String ( "rgName" , rg . GetName ( ) ) )
return errors . New ( "all resource group reach limits" )
}
2023-03-16 14:27:55 +08:00
2024-04-15 08:13:19 +08:00
nodeID , err := rm . transferOneNodeFromRGToRG ( rg , targetRG )
if err != nil {
log . Warn ( "failed to recover redundant node by transfer node to other resource group" ,
zap . String ( "sourceRG" , rg . GetName ( ) ) ,
zap . String ( "targetRG" , targetRG . GetName ( ) ) ,
zap . Error ( err ) )
return err
}
log . Info ( "recover redundant node by transfer node to other resource group" ,
zap . String ( "sourceRG" , rg . GetName ( ) ) ,
zap . String ( "targetRG" , targetRG . GetName ( ) ) ,
zap . Int64 ( "nodeID" , nodeID ) ,
2023-03-16 14:27:55 +08:00
)
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
return nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// selectRedundantRecoverTargetRG select target resource group for recover redundant resource group.
func ( rm * ResourceManager ) selectRedundantRecoverTargetRG ( rg * ResourceGroup ) * ResourceGroup {
// First, Transfer node to most missing resource group first.
if missingRG := rm . findMaxRGWithGivenFilter (
func ( targetRG * ResourceGroup ) bool {
return rg . GetName ( ) != targetRG . GetName ( ) && targetRG . MissingNumOfNodes ( ) > 0
} ,
func ( targetRG * ResourceGroup ) int {
return targetRG . MissingNumOfNodes ( )
} ,
) ; missingRG != nil {
return missingRG
}
// Second, Transfer node to max reachLimit resource group.
// `TransferTo` configured resource group at high priority.
if selectRG := rm . findMaxRGWithGivenFilter (
func ( targetRG * ResourceGroup ) bool {
return rg . GetName ( ) != targetRG . GetName ( ) && targetRG . ReachLimitNumOfNodes ( ) > 0
} ,
func ( targetRG * ResourceGroup ) int {
if rg . HasTo ( targetRG . GetName ( ) ) {
// give a boost if targetRG is configured as `TransferTo` to set as high priority to select.
return targetRG . ReachLimitNumOfNodes ( ) * resourceGroupTransferBoost
}
return targetRG . ReachLimitNumOfNodes ( )
} ,
) ; selectRG != nil {
return selectRG
2023-02-14 16:16:34 +08:00
}
2024-04-15 08:13:19 +08:00
// Finally, Always transfer node to default resource group.
if rg . GetName ( ) != DefaultResourceGroupName {
return rm . groups [ DefaultResourceGroupName ]
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
return nil
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// transferOneNodeFromRGToRG transfer one node from source resource group to target resource group.
func ( rm * ResourceManager ) transferOneNodeFromRGToRG ( sourceRG * ResourceGroup , targetRG * ResourceGroup ) ( int64 , error ) {
if sourceRG . NodeNum ( ) == 0 {
return - 1 , ErrNodeNotEnough
2023-02-16 10:48:34 +08:00
}
2024-04-15 08:13:19 +08:00
// TODO: select node by some load strategy, such as segment loaded.
node := sourceRG . GetNodes ( ) [ 0 ]
if err := rm . transferNode ( targetRG . GetName ( ) , node ) ; err != nil {
return - 1 , err
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
return node , nil
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// assignIncomingNodeWithNodeCheck assign node to resource group with node status check.
func ( rm * ResourceManager ) assignIncomingNodeWithNodeCheck ( node int64 ) ( string , error ) {
// node is on stopping or stopped, remove it from incoming node set.
if rm . nodeMgr . Get ( node ) == nil {
rm . incomingNode . Remove ( node )
return "" , errors . New ( "node is not online" )
2023-02-16 10:48:34 +08:00
}
2024-04-15 08:13:19 +08:00
if ok , _ := rm . nodeMgr . IsStoppingNode ( node ) ; ok {
rm . incomingNode . Remove ( node )
return "" , errors . New ( "node has been stopped" )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
rgName , err := rm . assignIncomingNode ( node )
if err != nil {
return "" , err
}
// node assignment is finished, remove the node from incoming node set.
rm . incomingNode . Remove ( node )
return rgName , nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// assignIncomingNode assigns the node to a resource group and returns the name
// of that group. A node that is already indexed under a group keeps it.
func (rm *ResourceManager) assignIncomingNode(node int64) (string, error) {
	// If node already assign to rg.
	if current := rm.getResourceGroupByNodeID(node); current != nil {
		log.Info("HandleNodeUp: node already assign to resource group",
			zap.String("rgName", current.GetName()),
			zap.Int64("node", node),
		)
		return current.GetName(), nil
	}

	// select a resource group to assign incoming node.
	target := rm.mustSelectAssignIncomingNodeTargetRG()
	err := rm.transferNode(target.GetName(), node)
	if err != nil {
		return "", errors.Wrap(err, "at finally assign to default resource group")
	}
	return target.GetName(), nil
}
// mustSelectAssignIncomingNodeTargetRG selects the resource group that an
// incoming node should be assigned to. The default resource group is the
// fallback when no other group qualifies.
func (rm *ResourceManager) mustSelectAssignIncomingNodeTargetRG() *ResourceGroup {
	// Criteria in priority order:
	//   1. the group with the most missing nodes;
	//   2. the group with the largest ReachLimitNumOfNodes.
	criteria := []func(rg *ResourceGroup) int{
		func(rg *ResourceGroup) int { return rg.MissingNumOfNodes() },
		func(rg *ResourceGroup) int { return rg.ReachLimitNumOfNodes() },
	}
	for i := range criteria {
		score := criteria[i]
		positive := func(rg *ResourceGroup) bool { return score(rg) > 0 }
		if target := rm.findMaxRGWithGivenFilter(positive, score); target != nil {
			return target
		}
	}

	// Finally, add node to default rg.
	return rm.groups[DefaultResourceGroupName]
}
// findMaxRGWithGivenFilter returns the resource group with the largest attr
// value among those accepted by filter, or nil when none is accepted.
//
// A nil filter accepts every resource group; attr must not be nil. attr is
// evaluated exactly once per accepted group and the best score is cached,
// instead of being recomputed for the current maximum on every comparison.
// Groups are visited in map iteration order, so ties are broken arbitrarily.
// Not efficient, but it's ok for low nodes and low resource group.
func (rm *ResourceManager) findMaxRGWithGivenFilter(filter func(rg *ResourceGroup) bool, attr func(rg *ResourceGroup) int) *ResourceGroup {
	var (
		maxRG   *ResourceGroup
		maxAttr int
	)
	for _, rg := range rm.groups {
		if filter != nil && !filter(rg) {
			continue
		}
		// Strictly greater keeps the first-seen group on ties, as before.
		if score := attr(rg); maxRG == nil || score > maxAttr {
			maxRG, maxAttr = rg, score
		}
	}
	return maxRG
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// transferNode transfer given node to given resource group.
// if given node is assigned in given resource group, do nothing.
// if given node is assigned to other resource group, it will be unassigned first.
func ( rm * ResourceManager ) transferNode ( rgName string , node int64 ) error {
2023-01-30 10:19:48 +08:00
if rm . groups [ rgName ] == nil {
2024-04-15 08:13:19 +08:00
return merr . WrapErrResourceGroupNotFound ( rgName )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
updates := make ( [ ] * querypb . ResourceGroup , 0 , 2 )
modifiedRG := make ( [ ] * ResourceGroup , 0 , 2 )
originalRG := "_"
// Check if node is already assign to rg.
if rg := rm . getResourceGroupByNodeID ( node ) ; rg != nil {
if rg . GetName ( ) == rgName {
// node is already assign to rg.
log . Info ( "node already assign to resource group" ,
zap . String ( "rgName" , rgName ) ,
zap . Int64 ( "node" , node ) ,
)
return nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// Apply update.
mrg := rg . CopyForWrite ( )
mrg . UnassignNode ( node )
rg := mrg . ToResourceGroup ( )
updates = append ( updates , rg . GetMeta ( ) )
modifiedRG = append ( modifiedRG , rg )
originalRG = rg . GetName ( )
}
// assign the node to rg.
mrg := rm . groups [ rgName ] . CopyForWrite ( )
mrg . AssignNode ( node )
rg := mrg . ToResourceGroup ( )
updates = append ( updates , rg . GetMeta ( ) )
modifiedRG = append ( modifiedRG , rg )
// Commit updates to meta storage.
if err := rm . catalog . SaveResourceGroup ( updates ... ) ; err != nil {
log . Warn ( "failed to transfer node to resource group" ,
zap . String ( "rgName" , rgName ) ,
zap . String ( "originalRG" , originalRG ) ,
zap . Int64 ( "node" , node ) ,
zap . Error ( err ) ,
2023-03-16 14:27:55 +08:00
)
2024-04-15 08:13:19 +08:00
return merr . WrapErrResourceGroupServiceAvailable ( )
}
2023-03-16 14:27:55 +08:00
2024-04-15 08:13:19 +08:00
// Commit updates to memory.
for _ , rg := range modifiedRG {
rm . groups [ rg . GetName ( ) ] = rg
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
rm . nodeIDMap [ node ] = rgName
log . Info ( "transfer node to resource group" ,
zap . String ( "rgName" , rgName ) ,
zap . String ( "originalRG" , originalRG ) ,
zap . Int64 ( "node" , node ) ,
)
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// notify that node distribution has been changed.
rm . nodeChangedNotifier . NotifyAll ( )
return nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// unassignNode remove a node from resource group where it belongs to.
func ( rm * ResourceManager ) unassignNode ( node int64 ) ( string , error ) {
if rg := rm . getResourceGroupByNodeID ( node ) ; rg != nil {
mrg := rg . CopyForWrite ( )
mrg . UnassignNode ( node )
rg := mrg . ToResourceGroup ( )
if err := rm . catalog . SaveResourceGroup ( rg . GetMeta ( ) ) ; err != nil {
2024-06-24 21:38:02 +08:00
log . Fatal ( "unassign node from resource group" ,
2024-04-15 08:13:19 +08:00
zap . String ( "rgName" , rg . GetName ( ) ) ,
zap . Int64 ( "node" , node ) ,
zap . Error ( err ) ,
)
2023-01-30 10:19:48 +08:00
}
2023-02-16 10:48:34 +08:00
2024-04-15 08:13:19 +08:00
// Commit updates to memory.
rm . groups [ rg . GetName ( ) ] = rg
delete ( rm . nodeIDMap , node )
log . Info ( "unassign node to resource group" ,
2023-01-30 10:19:48 +08:00
zap . String ( "rgName" , rg . GetName ( ) ) ,
2024-04-15 08:13:19 +08:00
zap . Int64 ( "node" , node ) ,
2023-01-30 10:19:48 +08:00
)
2024-04-15 08:13:19 +08:00
// notify that node distribution has been changed.
rm . nodeChangedNotifier . NotifyAll ( )
return rg . GetName ( ) , nil
}
2024-05-20 10:21:38 +08:00
return "" , errors . Errorf ( "node %d not found in any resource group" , node )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// validateResourceGroupConfig validate resource group config.
// validateResourceGroupConfig must be called after lock, because it will check with other resource group.
func ( rm * ResourceManager ) validateResourceGroupConfig ( rgName string , cfg * rgpb . ResourceGroupConfig ) error {
if cfg . GetLimits ( ) == nil || cfg . GetRequests ( ) == nil {
return merr . WrapErrResourceGroupIllegalConfig ( rgName , cfg , "requests or limits is required" )
}
if cfg . GetRequests ( ) . GetNodeNum ( ) < 0 || cfg . GetLimits ( ) . GetNodeNum ( ) < 0 {
return merr . WrapErrResourceGroupIllegalConfig ( rgName , cfg , "node num in `requests` or `limits` should not less than 0" )
}
if cfg . GetLimits ( ) . GetNodeNum ( ) < cfg . GetRequests ( ) . GetNodeNum ( ) {
return merr . WrapErrResourceGroupIllegalConfig ( rgName , cfg , "limits node num should not less than requests node num" )
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
for _ , transferCfg := range cfg . GetTransferFrom ( ) {
if transferCfg . GetResourceGroup ( ) == rgName {
return merr . WrapErrResourceGroupIllegalConfig ( rgName , cfg , fmt . Sprintf ( "resource group in `TransferFrom` %s should not be itself" , rgName ) )
}
if rm . groups [ transferCfg . GetResourceGroup ( ) ] == nil {
return merr . WrapErrResourceGroupIllegalConfig ( rgName , cfg , fmt . Sprintf ( "resource group in `TransferFrom` %s not exist" , transferCfg . GetResourceGroup ( ) ) )
2023-01-30 10:19:48 +08:00
}
}
2024-04-15 08:13:19 +08:00
for _ , transferCfg := range cfg . GetTransferTo ( ) {
if transferCfg . GetResourceGroup ( ) == rgName {
return merr . WrapErrResourceGroupIllegalConfig ( rgName , cfg , fmt . Sprintf ( "resource group in `TransferTo` %s should not be itself" , rgName ) )
}
if rm . groups [ transferCfg . GetResourceGroup ( ) ] == nil {
return merr . WrapErrResourceGroupIllegalConfig ( rgName , cfg , fmt . Sprintf ( "resource group in `TransferTo` %s not exist" , transferCfg . GetResourceGroup ( ) ) )
}
}
return nil
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// validateResourceGroupIsDeletable validate a resource group is deletable.
func ( rm * ResourceManager ) validateResourceGroupIsDeletable ( rgName string ) error {
// default rg is not deletable.
if rgName == DefaultResourceGroupName {
return merr . WrapErrParameterInvalid ( "not default resource group" , rgName , "default resource group is not deletable" )
2023-01-30 10:19:48 +08:00
}
2024-04-15 08:13:19 +08:00
// If rg is not empty, it's not deletable.
if rm . groups [ rgName ] . GetConfig ( ) . GetLimits ( ) . GetNodeNum ( ) != 0 {
return merr . WrapErrParameterInvalid ( "not empty resource group" , rgName , "resource group's limits node num is not 0" )
}
2023-01-30 10:19:48 +08:00
2024-04-15 08:13:19 +08:00
// If rg is used by other rg, it's not deletable.
for _ , rg := range rm . groups {
for _ , transferCfg := range rg . GetConfig ( ) . GetTransferFrom ( ) {
if transferCfg . GetResourceGroup ( ) == rgName {
return merr . WrapErrParameterInvalid ( "not `TransferFrom` of resource group" , rgName , fmt . Sprintf ( "resource group %s is used by %s's `TransferFrom`, remove that configuration first" , rgName , rg . name ) )
}
}
for _ , transferCfg := range rg . GetConfig ( ) . GetTransferTo ( ) {
if transferCfg . GetResourceGroup ( ) == rgName {
return merr . WrapErrParameterInvalid ( "not `TransferTo` of resource group" , rgName , fmt . Sprintf ( "resource group %s is used by %s's `TransferTo`, remove that configuration first" , rgName , rg . name ) )
}
}
}
return nil
2023-01-30 10:19:48 +08:00
}