milvus/internal/indexnode/task_test.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package indexnode

import (
"context"
"testing"
"github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/proto/etcdpb"
"github.com/milvus-io/milvus/internal/proto/indexpb"
"github.com/milvus-io/milvus/internal/proto/workerpb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/util/metautil"
"github.com/milvus-io/milvus/pkg/util/metric"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/timerecord"
)
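
// IndexBuildTaskSuite exercises the index build task on indexnode: it
// serializes insert data into a local chunk manager, then drives the task
// through PreExecute, Execute, and PostExecute.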
type IndexBuildTaskSuite struct {
	suite.Suite
	schema       *schemapb.CollectionSchema
	collectionID int64
	partitionID  int64
	segmentID    int64
	dataPath     string
	numRows      int
	dim          int
}
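
// SetupSuite initializes the param table and fixes the collection,
// partition, and segment identifiers shared by the suite.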
func (suite *IndexBuildTaskSuite) SetupSuite() {
	paramtable.Init()
	suite.collectionID = 1000
	suite.partitionID = 1001
	suite.segmentID = 1002
	suite.dataPath = "/tmp/milvus/data/1000/1001/1002/3/1"
	suite.numRows = 100
	suite.dim = 128
}
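
// SetupTest builds a minimal collection schema with an int64 primary key
// and a 128-dimensional float vector field.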
func (suite *IndexBuildTaskSuite) SetupTest() {
	suite.schema = &schemapb.CollectionSchema{
		Name:        "test",
		Description: "test",
		AutoID:      false,
		Fields: []*schemapb.FieldSchema{
			{FieldID: common.RowIDField, Name: common.RowIDFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: common.TimeStampField, Name: common.TimeStampFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 100, Name: "pk", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 101, Name: "ts", DataType: schemapb.DataType_Int64},
			{FieldID: 102, Name: "vec", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "128"}}},
		},
	}
}
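
// serializeData encodes one generated batch of insert data into binlog
// blobs using the suite schema.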
func (suite *IndexBuildTaskSuite) serializeData() ([]*storage.Blob, error) {
	insertCodec := storage.NewInsertCodecWithSchema(&etcdpb.CollectionMeta{
		Schema: suite.schema,
	})
	return insertCodec.Serialize(suite.partitionID, suite.segmentID, &storage.InsertData{
		Data: map[storage.FieldID]storage.FieldData{
			0:   &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			1:   &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			100: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			101: &storage.Int64FieldData{Data: generateLongs(suite.numRows)},
			102: &storage.FloatVectorFieldData{Data: generateFloats(suite.numRows * suite.dim), Dim: suite.dim},
		},
		Infos: []storage.BlobInfo{{Length: suite.numRows}},
	})
}
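
// TestBuildMemoryIndex stages insert data in a local chunk manager and
// verifies that a FLAT index build task completes its full lifecycle
// without error.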
func (suite *IndexBuildTaskSuite) TestBuildMemoryIndex() {
	ctx, cancel := context.WithCancel(context.Background())
	req := &workerpb.CreateJobRequest{
		BuildID:      1,
		IndexVersion: 1,
		DataPaths:    []string{suite.dataPath},
		IndexID:      0,
		IndexName:    "",
		IndexParams:  []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "FLAT"}, {Key: common.MetricTypeKey, Value: metric.L2}},
		TypeParams:   []*commonpb.KeyValuePair{{Key: "dim", Value: "128"}},
		NumRows:      int64(suite.numRows),
		StorageConfig: &indexpb.StorageConfig{
			RootPath:    "/tmp/milvus/data",
			StorageType: "local",
		},
		CollectionID: 1,
		PartitionID:  2,
		SegmentID:    3,
		FieldID:      102,
		FieldName:    "vec",
		FieldType:    schemapb.DataType_FloatVector,
	}

	// Stage the serialized insert data where the task expects to read it.
	cm, err := NewChunkMgrFactory().NewChunkManager(ctx, req.GetStorageConfig())
	suite.NoError(err)
	blobs, err := suite.serializeData()
	suite.NoError(err)
	err = cm.Write(ctx, suite.dataPath, blobs[0].Value)
	suite.NoError(err)

	// Drive the task through its full lifecycle.
	t := newIndexBuildTask(ctx, cancel, req, cm, NewIndexNode(context.Background(), dependency.NewDefaultFactory(true)))
	err = t.PreExecute(context.Background())
	suite.NoError(err)
	err = t.Execute(context.Background())
	suite.NoError(err)
	err = t.PostExecute(context.Background())
	suite.NoError(err)
}
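
// TestIndexBuildTask is the go test entry point for IndexBuildTaskSuite.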
func TestIndexBuildTask(t *testing.T) {
	suite.Run(t, new(IndexBuildTaskSuite))
}
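
// AnalyzeTaskSuite exercises PreExecute of the analyze task, which reads
// segment insert logs through the chunk manager.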
type AnalyzeTaskSuite struct {
	suite.Suite
	schema       *schemapb.CollectionSchema
	collectionID int64
	partitionID  int64
	segmentID    int64
	fieldID      int64
	taskID       int64
}
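
// SetupSuite initializes the param table and fixes the identifiers used
// to build the analyze request.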
func (suite *AnalyzeTaskSuite) SetupSuite() {
	paramtable.Init()
	suite.collectionID = 1000
	suite.partitionID = 1001
	suite.segmentID = 1002
	suite.fieldID = 102
	suite.taskID = 1004
}
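
// SetupTest builds a minimal collection schema with a one-dimensional
// float vector field.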
func (suite *AnalyzeTaskSuite) SetupTest() {
	suite.schema = &schemapb.CollectionSchema{
		Name:        "test",
		Description: "test",
		AutoID:      false,
		Fields: []*schemapb.FieldSchema{
			{FieldID: common.RowIDField, Name: common.RowIDFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: common.TimeStampField, Name: common.TimeStampFieldName, DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 100, Name: "pk", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
			{FieldID: 101, Name: "ts", DataType: schemapb.DataType_Int64},
			{FieldID: 102, Name: "vec", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "1"}}},
		},
	}
}
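
// serializeData encodes a fixed three-row insert batch into binlog blobs
// using the suite schema.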
func (suite *AnalyzeTaskSuite) serializeData() ([]*storage.Blob, error) {
	insertCodec := storage.NewInsertCodecWithSchema(&etcdpb.CollectionMeta{
		Schema: suite.schema,
	})
	return insertCodec.Serialize(suite.partitionID, suite.segmentID, &storage.InsertData{
		Data: map[storage.FieldID]storage.FieldData{
			0:   &storage.Int64FieldData{Data: []int64{0, 1, 2}},
			1:   &storage.Int64FieldData{Data: []int64{1, 2, 3}},
			100: &storage.Int64FieldData{Data: []int64{0, 1, 2}},
			101: &storage.Int64FieldData{Data: []int64{0, 1, 2}},
			102: &storage.FloatVectorFieldData{Data: []float32{1, 2, 3}, Dim: 1},
		},
		Infos: []storage.BlobInfo{{Length: 3}},
	})
}
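
// TestAnalyze writes a serialized insert log to the binlog path derived
// from the request and verifies that the analyze task passes PreExecute.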
func (suite *AnalyzeTaskSuite) TestAnalyze() {
	ctx, cancel := context.WithCancel(context.Background())
	req := &workerpb.AnalyzeRequest{
		ClusterID:    "test",
		TaskID:       1,
		CollectionID: suite.collectionID,
		PartitionID:  suite.partitionID,
		FieldID:      suite.fieldID,
		FieldName:    "vec",
		FieldType:    schemapb.DataType_FloatVector,
		SegmentStats: map[int64]*indexpb.SegmentStats{
			suite.segmentID: {
				ID:      suite.segmentID,
				NumRows: 1024,
				LogIDs:  []int64{1},
			},
		},
		Version: 1,
		StorageConfig: &indexpb.StorageConfig{
			RootPath:    "/tmp/milvus/data",
			StorageType: "local",
		},
		Dim: 1,
	}

	// Write the serialized insert log to the binlog path the task derives
	// from the request.
	cm, err := NewChunkMgrFactory().NewChunkManager(ctx, req.GetStorageConfig())
	suite.NoError(err)
	blobs, err := suite.serializeData()
	suite.NoError(err)
	dataPath := metautil.BuildInsertLogPath(cm.RootPath(), suite.collectionID, suite.partitionID, suite.segmentID,
		suite.fieldID, 1)
	err = cm.Write(ctx, dataPath, blobs[0].Value)
	suite.NoError(err)

	t := &analyzeTask{
		ident:    "",
		cancel:   cancel,
		ctx:      ctx,
		req:      req,
		tr:       timerecord.NewTimeRecorder("test-analyzeTask"),
		queueDur: 0,
		node:     NewIndexNode(context.Background(), dependency.NewDefaultFactory(true)),
	}
	err = t.PreExecute(context.Background())
	suite.NoError(err)
}
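
// TestAnalyzeTaskSuite is the go test entry point for AnalyzeTaskSuite.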
func TestAnalyzeTaskSuite(t *testing.T) {
	suite.Run(t, new(AnalyzeTaskSuite))
}