milvus/internal/datanode/compactor_test.go
Buqian Zheng 8a1017a152
enhance: add helpers to parse sparse float vector in JSON (#32543)
issue: #29419

added helper functions to parse the JSON representation of sparse float
vectors; they will be used by both the restful server and the import utils.

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
2024-04-25 14:47:24 +08:00
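
For reference only (an illustration, not part of this file): the JSON forms such helpers typically accept look like {"indices": [1, 100], "values": [0.5, 2.0]} or the equivalent {"1": 0.5, "100": 2.0} map form; the authoritative list of accepted formats lives with the typeutil helpers themselves.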


// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package datanode
import (
"context"
"fmt"
"math"
"testing"
"time"
"github.com/cockroachdb/errors"
"github.com/samber/lo"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/datanode/allocator"
"github.com/milvus-io/milvus/internal/datanode/broker"
"github.com/milvus-io/milvus/internal/datanode/io"
"github.com/milvus-io/milvus/internal/datanode/metacache"
"github.com/milvus-io/milvus/internal/datanode/syncmgr"
memkv "github.com/milvus-io/milvus/internal/kv/mem"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/etcdpb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/timerecord"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
var compactTestDir = "/tmp/milvus_test/compact"
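// TestCompactionTaskInnerMethods covers the helpers behind compactionTask:
// interface2FieldData conversion, deltalog merging, binlog merging under
// different TTL and binlog-size settings, entity expiration checks, and the
// error paths of getNumRows and uploadRemainLog.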
func TestCompactionTaskInnerMethods(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
cm := storage.NewLocalChunkManager(storage.RootPath(compactTestDir))
defer cm.RemoveWithPrefix(ctx, cm.RootPath())
t.Run("Test.interface2FieldData", func(t *testing.T) {
tests := []struct {
isvalid bool
tp schemapb.DataType
content []interface{}
description string
}{
{true, schemapb.DataType_Bool, []interface{}{true, false}, "valid bool"},
{true, schemapb.DataType_Int8, []interface{}{int8(1), int8(2)}, "valid int8"},
{true, schemapb.DataType_Int16, []interface{}{int16(1), int16(2)}, "valid int16"},
{true, schemapb.DataType_Int32, []interface{}{int32(1), int32(2)}, "valid int32"},
{true, schemapb.DataType_Int64, []interface{}{int64(1), int64(2)}, "valid int64"},
{true, schemapb.DataType_Float, []interface{}{float32(1), float32(2)}, "valid float32"},
{true, schemapb.DataType_Double, []interface{}{float64(1), float64(2)}, "valid float64"},
{true, schemapb.DataType_VarChar, []interface{}{"test1", "test2"}, "valid varChar"},
{true, schemapb.DataType_JSON, []interface{}{[]byte("{\"key\":\"value\"}"), []byte("{\"hello\":\"world\"}")}, "valid json"},
{true, schemapb.DataType_Array, []interface{}{
&schemapb.ScalarField{
Data: &schemapb.ScalarField_IntData{
IntData: &schemapb.IntArray{
Data: []int32{1, 2},
},
},
},
&schemapb.ScalarField{
Data: &schemapb.ScalarField_IntData{
IntData: &schemapb.IntArray{
Data: []int32{3, 4},
},
},
},
}, "valid array"},
{true, schemapb.DataType_FloatVector, []interface{}{[]float32{1.0, 2.0}}, "valid floatvector"},
{true, schemapb.DataType_BinaryVector, []interface{}{[]byte{255}}, "valid binaryvector"},
{true, schemapb.DataType_Float16Vector, []interface{}{[]byte{255, 255, 255, 255}}, "valid float16vector"},
{true, schemapb.DataType_BFloat16Vector, []interface{}{[]byte{255, 255, 255, 255}}, "valid bfloat16vector"},
{false, schemapb.DataType_Bool, []interface{}{1, 2}, "invalid bool"},
{false, schemapb.DataType_Int8, []interface{}{nil, nil}, "invalid int8"},
{false, schemapb.DataType_Int16, []interface{}{nil, nil}, "invalid int16"},
{false, schemapb.DataType_Int32, []interface{}{nil, nil}, "invalid int32"},
{false, schemapb.DataType_Int64, []interface{}{nil, nil}, "invalid int64"},
{false, schemapb.DataType_Float, []interface{}{nil, nil}, "invalid float32"},
{false, schemapb.DataType_Double, []interface{}{nil, nil}, "invalid float64"},
{false, schemapb.DataType_VarChar, []interface{}{nil, nil}, "invalid varChar"},
{false, schemapb.DataType_JSON, []interface{}{nil, nil}, "invalid json"},
{false, schemapb.DataType_FloatVector, []interface{}{nil, nil}, "invalid floatvector"},
{false, schemapb.DataType_BinaryVector, []interface{}{nil, nil}, "invalid binaryvector"},
{false, schemapb.DataType_Float16Vector, []interface{}{nil, nil}, "invalid float16vector"},
{false, schemapb.DataType_BFloat16Vector, []interface{}{nil, nil}, "invalid bfloat16vector"},
{false, schemapb.DataType_SparseFloatVector, []interface{}{nil, nil}, "invalid sparsefloatvector"},
{false, schemapb.DataType_SparseFloatVector, []interface{}{[]byte{255}, []byte{15}}, "invalid sparsefloatvector"},
{true, schemapb.DataType_SparseFloatVector, []interface{}{
typeutil.CreateSparseFloatRow([]uint32{1, 2}, []float32{1.0, 2.0}),
typeutil.CreateSparseFloatRow([]uint32{3, 4}, []float32{1.0, 2.0}),
}, "valid sparsefloatvector"},
}
// make sure every data type that is not explicitly handled above is rejected with an error
for typeName, typeValue := range schemapb.DataType_value {
tests = append(tests, struct {
isvalid bool
tp schemapb.DataType
content []interface{}
description string
}{false, schemapb.DataType(typeValue), []interface{}{nil, nil}, "invalid " + typeName})
}
for _, test := range tests {
t.Run(test.description, func(t *testing.T) {
if test.isvalid {
fd, err := interface2FieldData(test.tp, test.content, 2)
assert.NoError(t, err)
assert.Equal(t, 2, fd.RowNum())
} else {
fd, err := interface2FieldData(test.tp, test.content, 2)
assert.True(t, errors.Is(err, errTransferType) || errors.Is(err, errUnknownDataType))
assert.Nil(t, fd)
}
})
}
})
t.Run("Test mergeDeltalogs", func(t *testing.T) {
t.Run("One segment", func(t *testing.T) {
invalidBlobs := map[UniqueID][]*Blob{
1: {},
}
blobs, err := getInt64DeltaBlobs(
100,
[]UniqueID{
1,
2,
3,
4,
5,
1,
2,
},
[]Timestamp{
20000,
20001,
20002,
30000,
50000,
50000,
10000,
})
require.NoError(t, err)
validBlobs := map[UniqueID][]*Blob{
100: blobs,
}
tests := []struct {
isvalid bool
dBlobs map[UniqueID][]*Blob
description string
}{
{false, invalidBlobs, "invalid dBlobs"},
{true, validBlobs, "valid blobs"},
}
for _, test := range tests {
task := &compactionTask{
done: make(chan struct{}, 1),
}
t.Run(test.description, func(t *testing.T) {
pk2ts, err := task.mergeDeltalogs(test.dBlobs)
if test.isvalid {
assert.NoError(t, err)
assert.Equal(t, 5, len(pk2ts))
assert.EqualValues(t, 20001, pk2ts[UniqueID(2)])
} else {
assert.Error(t, err)
assert.Nil(t, pk2ts)
}
})
}
})
t.Run("Multiple segments", func(t *testing.T) {
tests := []struct {
segIDA UniqueID
dataApk []UniqueID
dataAts []Timestamp
segIDB UniqueID
dataBpk []UniqueID
dataBts []Timestamp
segIDC UniqueID
dataCpk []UniqueID
dataCts []Timestamp
expectedpk2ts int
description string
}{
{
0, nil, nil,
100,
[]UniqueID{1, 2, 3},
[]Timestamp{20000, 30000, 20005},
200,
[]UniqueID{4, 5, 6},
[]Timestamp{50000, 50001, 50002},
6, "2 segments",
},
{
300,
[]UniqueID{10, 20},
[]Timestamp{20001, 40001},
100,
[]UniqueID{1, 2, 3},
[]Timestamp{20000, 30000, 20005},
200,
[]UniqueID{4, 5, 6},
[]Timestamp{50000, 50001, 50002},
8, "3 segments",
},
}
for _, test := range tests {
t.Run(test.description, func(t *testing.T) {
dBlobs := make(map[UniqueID][]*Blob)
if test.segIDA != UniqueID(0) {
d, err := getInt64DeltaBlobs(test.segIDA, test.dataApk, test.dataAts)
require.NoError(t, err)
dBlobs[test.segIDA] = d
}
if test.segIDB != UniqueID(0) {
d, err := getInt64DeltaBlobs(test.segIDB, test.dataBpk, test.dataBts)
require.NoError(t, err)
dBlobs[test.segIDB] = d
}
if test.segIDC != UniqueID(0) {
d, err := getInt64DeltaBlobs(test.segIDC, test.dataCpk, test.dataCts)
require.NoError(t, err)
dBlobs[test.segIDC] = d
}
task := &compactionTask{
done: make(chan struct{}, 1),
}
pk2ts, err := task.mergeDeltalogs(dBlobs)
assert.NoError(t, err)
assert.Equal(t, test.expectedpk2ts, len(pk2ts))
})
}
})
})
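// The merge subtests below upload insert binlogs via uploadInsertLog and feed
// the resulting paths into compactionTask.merge under different entity-TTL and
// BinLogMaxSize settings.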
t.Run("Test merge", func(t *testing.T) {
collectionID := int64(1)
meta := NewMetaFactory().GetCollectionMeta(collectionID, "test", schemapb.DataType_Int64)
broker := broker.NewMockBroker(t)
broker.EXPECT().DescribeCollection(mock.Anything, mock.Anything, mock.Anything).
Return(&milvuspb.DescribeCollectionResponse{
Schema: meta.GetSchema(),
}, nil).Maybe()
metaCache := metacache.NewMockMetaCache(t)
metaCache.EXPECT().Schema().Return(meta.GetSchema()).Maybe()
metaCache.EXPECT().GetSegmentByID(mock.Anything).RunAndReturn(func(id int64, filters ...metacache.SegmentFilter) (*metacache.SegmentInfo, bool) {
segment := metacache.NewSegmentInfo(&datapb.SegmentInfo{
CollectionID: 1,
PartitionID: 0,
ID: id,
NumOfRows: 10,
}, nil)
return segment, true
})
alloc := allocator.NewMockAllocator(t)
alloc.EXPECT().GetGenerator(mock.Anything, mock.Anything).Call.Return(validGeneratorFn, nil)
alloc.EXPECT().AllocOne().Return(0, nil)
t.Run("Merge without expiration", func(t *testing.T) {
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
paramtable.Get().Save(Params.CommonCfg.EntityExpirationTTL.Key, "0")
iData := genInsertDataWithExpiredTS()
iCodec := storage.NewInsertCodecWithSchema(meta)
var allPaths [][]string
inpath, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), 0, 1, iData, iCodec)
assert.NoError(t, err)
assert.Equal(t, 12, len(inpath))
binlogNum := len(inpath[0].GetBinlogs())
assert.Equal(t, 1, binlogNum)
for idx := 0; idx < binlogNum; idx++ {
var ps []string
for _, path := range inpath {
ps = append(ps, path.GetBinlogs()[idx].GetLogPath())
}
allPaths = append(allPaths, ps)
}
dm := map[interface{}]Timestamp{
1: 10000,
}
ct := &compactionTask{
metaCache: metaCache,
binlogIO: mockbIO,
Allocator: alloc,
done: make(chan struct{}, 1),
plan: &datapb.CompactionPlan{
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{SegmentID: 1},
},
},
}
inPaths, statsPaths, numOfRow, err := ct.merge(context.Background(), allPaths, 2, 0, meta, dm)
assert.NoError(t, err)
assert.Equal(t, int64(2), numOfRow)
assert.Equal(t, 1, len(inPaths[0].GetBinlogs()))
assert.Equal(t, 1, len(statsPaths))
assert.NotEqual(t, -1, inPaths[0].GetBinlogs()[0].GetTimestampFrom())
assert.NotEqual(t, -1, inPaths[0].GetBinlogs()[0].GetTimestampTo())
})
t.Run("Merge without expiration2", func(t *testing.T) {
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iCodec := storage.NewInsertCodecWithSchema(meta)
paramtable.Get().Save(Params.CommonCfg.EntityExpirationTTL.Key, "0")
BinLogMaxSize := Params.DataNodeCfg.BinLogMaxSize.GetValue()
defer func() {
Params.Save(Params.DataNodeCfg.BinLogMaxSize.Key, BinLogMaxSize)
}()
paramtable.Get().Save(Params.DataNodeCfg.BinLogMaxSize.Key, "64")
iData := genInsertDataWithExpiredTS()
meta := NewMetaFactory().GetCollectionMeta(1, "test", schemapb.DataType_Int64)
var allPaths [][]string
inpath, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), 0, 1, iData, iCodec)
assert.NoError(t, err)
assert.Equal(t, 12, len(inpath))
binlogNum := len(inpath[0].GetBinlogs())
assert.Equal(t, 1, binlogNum)
for idx := 0; idx < binlogNum; idx++ {
var ps []string
for _, path := range inpath {
ps = append(ps, path.GetBinlogs()[idx].GetLogPath())
}
allPaths = append(allPaths, ps)
}
dm := map[interface{}]Timestamp{}
ct := &compactionTask{
metaCache: metaCache,
binlogIO: mockbIO,
Allocator: alloc,
done: make(chan struct{}, 1),
plan: &datapb.CompactionPlan{
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{SegmentID: 1},
},
},
}
inPaths, statsPaths, numOfRow, err := ct.merge(context.Background(), allPaths, 2, 0, meta, dm)
assert.NoError(t, err)
assert.Equal(t, int64(2), numOfRow)
assert.Equal(t, 1, len(inPaths[0].GetBinlogs()))
assert.Equal(t, 1, len(statsPaths))
assert.Equal(t, 1, len(statsPaths[0].GetBinlogs()))
assert.NotEqual(t, -1, inPaths[0].GetBinlogs()[0].GetTimestampFrom())
assert.NotEqual(t, -1, inPaths[0].GetBinlogs()[0].GetTimestampTo())
})
// set Params.DataNodeCfg.BinLogMaxSize to 1 to generate multiple binlogs, each containing only one row
t.Run("merge_with_more_than_100rows", func(t *testing.T) {
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iCodec := storage.NewInsertCodecWithSchema(meta)
paramtable.Get().Save(Params.CommonCfg.EntityExpirationTTL.Key, "0")
BinLogMaxSize := Params.DataNodeCfg.BinLogMaxSize.GetAsInt()
defer func() {
paramtable.Get().Save(Params.DataNodeCfg.BinLogMaxSize.Key, fmt.Sprintf("%d", BinLogMaxSize))
}()
paramtable.Get().Save(Params.DataNodeCfg.BinLogMaxSize.Key, "1")
iData := genInsertData(101)
var allPaths [][]string
inpath, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), 0, 1, iData, iCodec)
assert.NoError(t, err)
assert.Equal(t, 12, len(inpath))
binlogNum := len(inpath[0].GetBinlogs())
assert.Equal(t, 1, binlogNum)
for idx := 0; idx < binlogNum; idx++ {
var ps []string
for _, path := range inpath {
ps = append(ps, path.GetBinlogs()[idx].GetLogPath())
}
allPaths = append(allPaths, ps)
}
dm := map[interface{}]Timestamp{
1: 10000,
}
ct := &compactionTask{
metaCache: metaCache,
binlogIO: mockbIO,
Allocator: alloc,
done: make(chan struct{}, 1),
plan: &datapb.CompactionPlan{
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{SegmentID: 1},
},
},
}
inPaths, statsPaths, numOfRow, err := ct.merge(context.Background(), allPaths, 2, 0, meta, dm)
assert.NoError(t, err)
assert.Equal(t, int64(101), numOfRow)
assert.Equal(t, 2, len(inPaths[0].GetBinlogs()))
assert.Equal(t, 1, len(statsPaths))
for _, inpath := range inPaths {
assert.NotEqual(t, -1, inpath.GetBinlogs()[0].GetTimestampFrom())
assert.NotEqual(t, -1, inpath.GetBinlogs()[0].GetTimestampTo())
}
})
t.Run("Merge with expiration", func(t *testing.T) {
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iCodec := storage.NewInsertCodecWithSchema(meta)
iData := genInsertDataWithExpiredTS()
meta := NewMetaFactory().GetCollectionMeta(1, "test", schemapb.DataType_Int64)
var allPaths [][]string
inpath, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), 0, 1, iData, iCodec)
assert.NoError(t, err)
assert.Equal(t, 12, len(inpath))
binlogNum := len(inpath[0].GetBinlogs())
assert.Equal(t, 1, binlogNum)
for idx := 0; idx < binlogNum; idx++ {
var ps []string
for _, path := range inpath {
ps = append(ps, path.GetBinlogs()[idx].GetLogPath())
}
allPaths = append(allPaths, ps)
}
dm := map[interface{}]Timestamp{
1: 10000,
}
// 10 days in seconds
ct := &compactionTask{
metaCache: metaCache,
binlogIO: mockbIO,
Allocator: alloc,
plan: &datapb.CompactionPlan{
CollectionTtl: 864000,
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{SegmentID: 1},
},
},
done: make(chan struct{}, 1),
}
inPaths, statsPaths, numOfRow, err := ct.merge(context.Background(), allPaths, 2, 0, meta, dm)
assert.NoError(t, err)
assert.Equal(t, int64(0), numOfRow)
assert.Equal(t, 0, len(inPaths))
assert.Equal(t, 0, len(statsPaths))
})
t.Run("merge_with_rownum_zero", func(t *testing.T) {
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iData := genInsertDataWithExpiredTS()
iCodec := storage.NewInsertCodecWithSchema(meta)
meta := NewMetaFactory().GetCollectionMeta(1, "test", schemapb.DataType_Int64)
metaCache := metacache.NewMockMetaCache(t)
metaCache.EXPECT().Schema().Return(meta.GetSchema()).Maybe()
metaCache.EXPECT().GetSegmentByID(mock.Anything).RunAndReturn(func(id int64, filters ...metacache.SegmentFilter) (*metacache.SegmentInfo, bool) {
segment := metacache.NewSegmentInfo(&datapb.SegmentInfo{
CollectionID: 1,
PartitionID: 0,
ID: id,
NumOfRows: 0,
}, nil)
return segment, true
})
var allPaths [][]string
inpath, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), 0, 1, iData, iCodec)
assert.NoError(t, err)
assert.Equal(t, 12, len(inpath))
binlogNum := len(inpath[0].GetBinlogs())
assert.Equal(t, 1, binlogNum)
for idx := 0; idx < binlogNum; idx++ {
var ps []string
for _, path := range inpath {
ps = append(ps, path.GetBinlogs()[idx].GetLogPath())
}
allPaths = append(allPaths, ps)
}
dm := map[interface{}]Timestamp{
1: 10000,
}
ct := &compactionTask{
metaCache: metaCache,
binlogIO: mockbIO,
Allocator: alloc,
done: make(chan struct{}, 1),
plan: &datapb.CompactionPlan{
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{SegmentID: 1},
},
},
}
_, _, _, err = ct.merge(context.Background(), allPaths, 2, 0, &etcdpb.CollectionMeta{
Schema: meta.GetSchema(),
}, dm)
assert.Error(t, err)
})
t.Run("Merge with meta error", func(t *testing.T) {
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iCodec := storage.NewInsertCodecWithSchema(meta)
paramtable.Get().Save(Params.CommonCfg.EntityExpirationTTL.Key, "0")
iData := genInsertDataWithExpiredTS()
meta := NewMetaFactory().GetCollectionMeta(1, "test", schemapb.DataType_Int64)
var allPaths [][]string
inpath, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), 0, 1, iData, iCodec)
assert.NoError(t, err)
assert.Equal(t, 12, len(inpath))
binlogNum := len(inpath[0].GetBinlogs())
assert.Equal(t, 1, binlogNum)
for idx := 0; idx < binlogNum; idx++ {
var ps []string
for _, path := range inpath {
ps = append(ps, path.GetBinlogs()[idx].GetLogPath())
}
allPaths = append(allPaths, ps)
}
dm := map[interface{}]Timestamp{
1: 10000,
}
ct := &compactionTask{
metaCache: metaCache,
binlogIO: mockbIO,
Allocator: alloc,
done: make(chan struct{}, 1),
plan: &datapb.CompactionPlan{
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{SegmentID: 1},
},
},
}
_, _, _, err = ct.merge(context.Background(), allPaths, 2, 0, &etcdpb.CollectionMeta{
Schema: &schemapb.CollectionSchema{Fields: []*schemapb.FieldSchema{
{DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{
{Key: common.DimKey, Value: "64"},
}},
}},
}, dm)
assert.Error(t, err)
})
t.Run("Merge with meta type param error", func(t *testing.T) {
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iCodec := storage.NewInsertCodecWithSchema(meta)
paramtable.Get().Save(Params.CommonCfg.EntityExpirationTTL.Key, "0")
iData := genInsertDataWithExpiredTS()
meta := NewMetaFactory().GetCollectionMeta(1, "test", schemapb.DataType_Int64)
var allPaths [][]string
inpath, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), 0, 1, iData, iCodec)
assert.NoError(t, err)
assert.Equal(t, 12, len(inpath))
binlogNum := len(inpath[0].GetBinlogs())
assert.Equal(t, 1, binlogNum)
for idx := 0; idx < binlogNum; idx++ {
var ps []string
for _, path := range inpath {
ps = append(ps, path.GetBinlogs()[idx].GetLogPath())
}
allPaths = append(allPaths, ps)
}
dm := map[interface{}]Timestamp{
1: 10000,
}
ct := &compactionTask{
metaCache: metaCache,
binlogIO: mockbIO,
Allocator: alloc,
done: make(chan struct{}, 1),
}
_, _, _, err = ct.merge(context.Background(), allPaths, 2, 0, &etcdpb.CollectionMeta{
Schema: &schemapb.CollectionSchema{Fields: []*schemapb.FieldSchema{
{DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{
{Key: common.DimKey, Value: "bad_dim"},
}},
}},
}, dm)
assert.Error(t, err)
})
})
t.Run("Test isExpiredEntity", func(t *testing.T) {
t.Run("When CompactionEntityExpiration is set math.MaxInt64", func(t *testing.T) {
ct := &compactionTask{
plan: &datapb.CompactionPlan{
CollectionTtl: math.MaxInt64,
},
done: make(chan struct{}, 1),
}
res := ct.isExpiredEntity(0, genTimestamp())
assert.Equal(t, false, res)
res = ct.isExpiredEntity(math.MaxInt64, genTimestamp())
assert.Equal(t, false, res)
res = ct.isExpiredEntity(0, math.MaxInt64)
assert.Equal(t, true, res)
res = ct.isExpiredEntity(math.MaxInt64, math.MaxInt64)
assert.Equal(t, false, res)
res = ct.isExpiredEntity(math.MaxInt64, 0)
assert.Equal(t, false, res)
})
t.Run("When CompactionEntityExpiration is set MAX_ENTITY_EXPIRATION = 0", func(t *testing.T) {
// 0 means expiration is not enabled
ct := &compactionTask{
plan: &datapb.CompactionPlan{
CollectionTtl: 0,
},
done: make(chan struct{}, 1),
}
res := ct.isExpiredEntity(0, genTimestamp())
assert.Equal(t, false, res)
res = ct.isExpiredEntity(math.MaxInt64, genTimestamp())
assert.Equal(t, false, res)
res = ct.isExpiredEntity(0, math.MaxInt64)
assert.Equal(t, false, res)
res = ct.isExpiredEntity(math.MaxInt64, math.MaxInt64)
assert.Equal(t, false, res)
res = ct.isExpiredEntity(math.MaxInt64, 0)
assert.Equal(t, false, res)
})
t.Run("When CompactionEntityExpiration is set 10 days", func(t *testing.T) {
// 10 days in seconds
ct := &compactionTask{
plan: &datapb.CompactionPlan{
CollectionTtl: 864000,
},
done: make(chan struct{}, 1),
}
res := ct.isExpiredEntity(0, genTimestamp())
assert.Equal(t, true, res)
res = ct.isExpiredEntity(math.MaxInt64, genTimestamp())
assert.Equal(t, false, res)
res = ct.isExpiredEntity(0, math.MaxInt64)
assert.Equal(t, true, res)
res = ct.isExpiredEntity(math.MaxInt64, math.MaxInt64)
assert.Equal(t, false, res)
res = ct.isExpiredEntity(math.MaxInt64, 0)
assert.Equal(t, false, res)
})
})
t.Run("Test getNumRows error", func(t *testing.T) {
metaCache := metacache.NewMockMetaCache(t)
metaCache.EXPECT().GetSegmentByID(mock.Anything).Return(nil, false)
ct := &compactionTask{
metaCache: metaCache,
plan: &datapb.CompactionPlan{
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{
SegmentID: 1,
},
},
},
done: make(chan struct{}, 1),
}
_, err := ct.getNumRows()
assert.Error(t, err, "segment not found")
})
t.Run("Test uploadRemainLog error", func(t *testing.T) {
f := &MetaFactory{}
t.Run("upload failed", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
alloc := allocator.NewMockAllocator(t)
alloc.EXPECT().AllocOne().Call.Return(int64(11111), nil)
meta := f.GetCollectionMeta(UniqueID(10001), "test_upload_remain_log", schemapb.DataType_Int64)
stats, err := storage.NewPrimaryKeyStats(106, int64(schemapb.DataType_Int64), 10)
require.NoError(t, err)
ct := &compactionTask{
binlogIO: io.NewBinlogIO(&mockCm{errSave: true}, getOrCreateIOPool()),
Allocator: alloc,
done: make(chan struct{}, 1),
}
_, _, err = ct.uploadRemainLog(ctx, 1, 2, meta, stats, 10, nil)
assert.Error(t, err)
})
})
}
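// getInt64DeltaBlobs serializes one delta log blob for segment segID, pairing
// each int64 primary key in pks with the delete timestamp at the same index in
// tss.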
func getInt64DeltaBlobs(segID UniqueID, pks []UniqueID, tss []Timestamp) ([]*Blob, error) {
primaryKeys := make([]storage.PrimaryKey, len(pks))
for index, v := range pks {
primaryKeys[index] = storage.NewInt64PrimaryKey(v)
}
deltaData := &DeleteData{
Pks: primaryKeys,
Tss: tss,
RowCount: int64(len(pks)),
}
dCodec := storage.NewDeleteCodec()
blob, err := dCodec.Serialize(1, 10, segID, deltaData)
return []*Blob{blob}, err
}
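// TestCompactorInterfaceMethods exercises compactionTask.compact end to end:
// rejecting an empty plan, surfacing allocator failures, and merging two
// segments (including deletions and duplicate primary keys) into a single new
// segment.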
func TestCompactorInterfaceMethods(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
cm := storage.NewLocalChunkManager(storage.RootPath(compactTestDir))
defer cm.RemoveWithPrefix(ctx, cm.RootPath())
notEmptySegmentBinlogs := []*datapb.CompactionSegmentBinlogs{{
SegmentID: 100,
FieldBinlogs: nil,
Field2StatslogPaths: nil,
Deltalogs: nil,
}}
paramtable.Get().Save(Params.CommonCfg.EntityExpirationTTL.Key, "0") // Turn off auto expiration
t.Run("Test compact invalid empty segment binlogs", func(t *testing.T) {
plan := &datapb.CompactionPlan{
PlanID: 999,
SegmentBinlogs: nil,
TimeoutInSeconds: 10,
Type: datapb.CompactionType_MixCompaction,
}
ctx, cancel := context.WithCancel(context.Background())
emptyTask := &compactionTask{
ctx: ctx,
cancel: cancel,
tr: timerecord.NewTimeRecorder("test"),
done: make(chan struct{}, 1),
plan: plan,
}
_, err := emptyTask.compact()
assert.Error(t, err)
assert.ErrorIs(t, err, errIllegalCompactionPlan)
emptyTask.complete()
emptyTask.stop()
})
t.Run("Test compact invalid AllocOnce failed", func(t *testing.T) {
mockAlloc := allocator.NewMockAllocator(t)
mockAlloc.EXPECT().AllocOne().Call.Return(int64(0), errors.New("mock allocone error")).Once()
plan := &datapb.CompactionPlan{
PlanID: 999,
SegmentBinlogs: notEmptySegmentBinlogs,
TimeoutInSeconds: 10,
Type: datapb.CompactionType_MixCompaction,
}
task := &compactionTask{
ctx: context.Background(),
tr: timerecord.NewTimeRecorder("test"),
Allocator: mockAlloc,
plan: plan,
}
_, err := task.compact()
assert.Error(t, err)
})
t.Run("Test typeII compact valid", func(t *testing.T) {
alloc := allocator.NewMockAllocator(t)
alloc.EXPECT().GetGenerator(mock.Anything, mock.Anything).Call.Return(validGeneratorFn, nil)
alloc.EXPECT().AllocOne().Call.Return(int64(19530), nil)
type testCase struct {
pkType schemapb.DataType
iData1 storage.FieldData
iData2 storage.FieldData
pks1 [2]storage.PrimaryKey
pks2 [2]storage.PrimaryKey
colID UniqueID
parID UniqueID
segID1 UniqueID
segID2 UniqueID
}
cases := []testCase{
{
pkType: schemapb.DataType_Int64,
iData1: &storage.Int64FieldData{Data: []UniqueID{1}},
iData2: &storage.Int64FieldData{Data: []UniqueID{9}},
pks1: [2]storage.PrimaryKey{storage.NewInt64PrimaryKey(1), storage.NewInt64PrimaryKey(2)},
pks2: [2]storage.PrimaryKey{storage.NewInt64PrimaryKey(9), storage.NewInt64PrimaryKey(10)},
colID: 1,
parID: 10,
segID1: 100,
segID2: 101,
},
{
pkType: schemapb.DataType_VarChar,
iData1: &storage.StringFieldData{Data: []string{"aaaa"}},
iData2: &storage.StringFieldData{Data: []string{"milvus"}},
pks1: [2]storage.PrimaryKey{storage.NewVarCharPrimaryKey("aaaa"), storage.NewVarCharPrimaryKey("bbbb")},
pks2: [2]storage.PrimaryKey{storage.NewVarCharPrimaryKey("milvus"), storage.NewVarCharPrimaryKey("mmmm")},
colID: 2,
parID: 11,
segID1: 102,
segID2: 103,
},
}
for _, c := range cases {
collName := "test_compact_coll_name"
meta := NewMetaFactory().GetCollectionMeta(c.colID, collName, c.pkType)
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iCodec := storage.NewInsertCodecWithSchema(meta)
mockKv := memkv.NewMemoryKV()
metaCache := metacache.NewMockMetaCache(t)
metaCache.EXPECT().Collection().Return(c.colID)
metaCache.EXPECT().Schema().Return(meta.GetSchema())
syncMgr := syncmgr.NewMockSyncManager(t)
syncMgr.EXPECT().Block(mock.Anything).Return()
bfs := metacache.NewBloomFilterSet()
bfs.UpdatePKRange(c.iData1)
seg1 := metacache.NewSegmentInfo(&datapb.SegmentInfo{
CollectionID: c.colID,
PartitionID: c.parID,
ID: c.segID1,
NumOfRows: 2,
}, bfs)
bfs = metacache.NewBloomFilterSet()
bfs.UpdatePKRange(c.iData2)
seg2 := metacache.NewSegmentInfo(&datapb.SegmentInfo{
CollectionID: c.colID,
PartitionID: c.parID,
ID: c.segID2,
NumOfRows: 2,
}, bfs)
metaCache.EXPECT().GetSegmentByID(mock.Anything).RunAndReturn(func(id int64, filters ...metacache.SegmentFilter) (*metacache.SegmentInfo, bool) {
switch id {
case c.segID1:
return seg1, true
case c.segID2:
return seg2, true
default:
return nil, false
}
})
iData1 := genInsertDataWithPKs(c.pks1, c.pkType)
dData1 := &DeleteData{
Pks: []storage.PrimaryKey{c.pks1[0]},
Tss: []Timestamp{20000},
RowCount: 1,
}
iData2 := genInsertDataWithPKs(c.pks2, c.pkType)
dData2 := &DeleteData{
Pks: []storage.PrimaryKey{c.pks2[0]},
Tss: []Timestamp{30000},
RowCount: 1,
}
stats1, err := storage.NewPrimaryKeyStats(1, int64(c.pkType), 1)
require.NoError(t, err)
iPaths1, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), c.parID, c.segID1, iData1, iCodec)
require.NoError(t, err)
sPaths1, err := uploadStatsLog(context.Background(), mockbIO, alloc, meta.GetID(), c.parID, c.segID1, stats1, 2, iCodec)
require.NoError(t, err)
dPaths1, err := uploadDeltaLog(context.TODO(), mockbIO, alloc, meta.GetID(), c.parID, c.segID1, dData1)
require.NoError(t, err)
require.Equal(t, 12, len(iPaths1))
stats2, err := storage.NewPrimaryKeyStats(1, int64(c.pkType), 1)
require.NoError(t, err)
iPaths2, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), c.parID, c.segID2, iData2, iCodec)
require.NoError(t, err)
sPaths2, err := uploadStatsLog(context.Background(), mockbIO, alloc, meta.GetID(), c.parID, c.segID2, stats2, 2, iCodec)
require.NoError(t, err)
dPaths2, err := uploadDeltaLog(context.TODO(), mockbIO, alloc, meta.GetID(), c.parID, c.segID2, dData2)
require.NoError(t, err)
require.Equal(t, 12, len(iPaths2))
plan := &datapb.CompactionPlan{
PlanID: 10080,
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{
SegmentID: c.segID1,
FieldBinlogs: lo.Values(iPaths1),
Field2StatslogPaths: lo.Values(sPaths1),
Deltalogs: dPaths1,
},
{
SegmentID: c.segID2,
FieldBinlogs: lo.Values(iPaths2),
Field2StatslogPaths: lo.Values(sPaths2),
Deltalogs: dPaths2,
},
},
StartTime: 0,
TimeoutInSeconds: 10,
Type: datapb.CompactionType_MergeCompaction,
Channel: "channelname",
}
task := newCompactionTask(context.TODO(), mockbIO, metaCache, syncMgr, alloc, plan)
result, err := task.compact()
assert.NoError(t, err)
assert.NotNil(t, result)
assert.Equal(t, plan.GetPlanID(), result.GetPlanID())
assert.Equal(t, 1, len(result.GetSegments()))
segment := result.GetSegments()[0]
assert.EqualValues(t, 19530, segment.GetSegmentID())
assert.EqualValues(t, 2, segment.GetNumOfRows())
assert.NotEmpty(t, segment.InsertLogs)
assert.NotEmpty(t, segment.Field2StatslogPaths)
// Run the compaction again as a new plan after clearing the in-memory KV
err = mockKv.RemoveWithPrefix("/")
require.NoError(t, err)
plan.PlanID++
result, err = task.compact()
assert.NoError(t, err)
assert.NotNil(t, result)
assert.Equal(t, plan.GetPlanID(), result.GetPlanID())
assert.Equal(t, 1, len(result.GetSegments()))
segment = result.GetSegments()[0]
assert.EqualValues(t, 19530, segment.GetSegmentID())
assert.EqualValues(t, 2, segment.GetNumOfRows())
assert.NotEmpty(t, segment.InsertLogs)
assert.NotEmpty(t, segment.Field2StatslogPaths)
}
})
t.Run("Test typeII compact 2 segments with the same pk", func(t *testing.T) {
// Test merge compaction of two segments that share the same pks, with one deletion of pk=1.
// The merged segment 19530 should contain only 2 rows, both with pk=2;
// the pk=1 rows from both segments are removed by the deletion.
var collID, partID, segID1, segID2 UniqueID = 1, 10, 200, 201
alloc := allocator.NewMockAllocator(t)
alloc.EXPECT().AllocOne().Call.Return(int64(19530), nil)
alloc.EXPECT().GetGenerator(mock.Anything, mock.Anything).Call.Return(validGeneratorFn, nil)
meta := NewMetaFactory().GetCollectionMeta(collID, "test_compact_coll_name", schemapb.DataType_Int64)
mockbIO := io.NewBinlogIO(cm, getOrCreateIOPool())
iCodec := storage.NewInsertCodecWithSchema(meta)
metaCache := metacache.NewMockMetaCache(t)
metaCache.EXPECT().Collection().Return(collID)
metaCache.EXPECT().Schema().Return(meta.GetSchema())
syncMgr := syncmgr.NewMockSyncManager(t)
syncMgr.EXPECT().Block(mock.Anything).Return()
bfs := metacache.NewBloomFilterSet()
bfs.UpdatePKRange(&storage.Int64FieldData{Data: []UniqueID{1}})
seg1 := metacache.NewSegmentInfo(&datapb.SegmentInfo{
CollectionID: collID,
PartitionID: partID,
ID: segID1,
NumOfRows: 2,
}, bfs)
bfs = metacache.NewBloomFilterSet()
bfs.UpdatePKRange(&storage.Int64FieldData{Data: []UniqueID{1}})
seg2 := metacache.NewSegmentInfo(&datapb.SegmentInfo{
CollectionID: collID,
PartitionID: partID,
ID: segID2,
NumOfRows: 2,
}, bfs)
metaCache.EXPECT().GetSegmentByID(mock.Anything).RunAndReturn(func(id int64, filters ...metacache.SegmentFilter) (*metacache.SegmentInfo, bool) {
switch id {
case segID1:
return seg1, true
case segID2:
return seg2, true
default:
return nil, false
}
})
// the same pks for segment I and segment II
pks := [2]storage.PrimaryKey{storage.NewInt64PrimaryKey(1), storage.NewInt64PrimaryKey(2)}
iData1 := genInsertDataWithPKs(pks, schemapb.DataType_Int64)
iData2 := genInsertDataWithPKs(pks, schemapb.DataType_Int64)
pk1 := storage.NewInt64PrimaryKey(1)
dData1 := &DeleteData{
Pks: []storage.PrimaryKey{pk1},
Tss: []Timestamp{20000},
RowCount: 1,
}
// empty dData2
dData2 := &DeleteData{
Pks: []storage.PrimaryKey{},
Tss: []Timestamp{},
RowCount: 0,
}
stats1, err := storage.NewPrimaryKeyStats(1, int64(schemapb.DataType_Int64), 1)
require.NoError(t, err)
iPaths1, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), partID, segID1, iData1, iCodec)
require.NoError(t, err)
sPaths1, err := uploadStatsLog(context.Background(), mockbIO, alloc, meta.GetID(), partID, segID1, stats1, 1, iCodec)
require.NoError(t, err)
dPaths1, err := uploadDeltaLog(context.TODO(), mockbIO, alloc, meta.GetID(), partID, segID1, dData1)
require.NoError(t, err)
require.Equal(t, 12, len(iPaths1))
stats2, err := storage.NewPrimaryKeyStats(1, int64(schemapb.DataType_Int64), 1)
require.NoError(t, err)
iPaths2, err := uploadInsertLog(context.Background(), mockbIO, alloc, meta.GetID(), partID, segID2, iData2, iCodec)
require.NoError(t, err)
sPaths2, err := uploadStatsLog(context.Background(), mockbIO, alloc, meta.GetID(), partID, segID2, stats2, 1, iCodec)
require.NoError(t, err)
dPaths2, err := uploadDeltaLog(context.TODO(), mockbIO, alloc, meta.GetID(), partID, segID2, dData2)
require.NoError(t, err)
require.Equal(t, 12, len(iPaths2))
plan := &datapb.CompactionPlan{
PlanID: 20080,
SegmentBinlogs: []*datapb.CompactionSegmentBinlogs{
{
SegmentID: segID1,
FieldBinlogs: lo.Values(iPaths1),
Field2StatslogPaths: lo.Values(sPaths1),
Deltalogs: dPaths1,
},
{
SegmentID: segID2,
FieldBinlogs: lo.Values(iPaths2),
Field2StatslogPaths: lo.Values(sPaths2),
Deltalogs: dPaths2,
},
},
StartTime: 0,
TimeoutInSeconds: 10,
Type: datapb.CompactionType_MergeCompaction,
Channel: "channelname",
}
task := newCompactionTask(context.TODO(), mockbIO, metaCache, syncMgr, alloc, plan)
result, err := task.compact()
assert.NoError(t, err)
assert.NotNil(t, result)
assert.Equal(t, plan.GetPlanID(), result.GetPlanID())
assert.Equal(t, 1, len(result.GetSegments()))
segment := result.GetSegments()[0]
assert.EqualValues(t, 19530, segment.GetSegmentID())
assert.EqualValues(t, 2, segment.GetNumOfRows())
assert.NotEmpty(t, segment.InsertLogs)
assert.NotEmpty(t, segment.Field2StatslogPaths)
})
}
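// TestInjectDone verifies that injectDone unblocks every segment referenced by
// the plan exactly once, even when it is called more than once.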
func TestInjectDone(t *testing.T) {
syncMgr := syncmgr.NewMockSyncManager(t)
segmentIDs := []int64{100, 200, 300}
task := &compactionTask{
plan: &datapb.CompactionPlan{
SegmentBinlogs: lo.Map(segmentIDs, func(id int64, _ int) *datapb.CompactionSegmentBinlogs {
return &datapb.CompactionSegmentBinlogs{SegmentID: id}
}),
},
syncMgr: syncMgr,
}
for _, segmentID := range segmentIDs {
syncMgr.EXPECT().Unblock(segmentID).Return().Once()
}
task.injectDone()
task.injectDone()
}
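// The helper below is an illustrative sketch only and is not referenced by the
// tests above: it shows how the sparse float vector rows used in the
// interface2FieldData cases are assembled from parallel index/value slices via
// typeutil.CreateSparseFloatRow; the binary layout of each returned row is
// owned by the typeutil package.
func exampleSparseFloatVectorRows() [][]byte {
	// Each call packs one sparse row; the index and value slices must have the
	// same length.
	return [][]byte{
		typeutil.CreateSparseFloatRow([]uint32{1, 2}, []float32{1.0, 2.0}),
		typeutil.CreateSparseFloatRow([]uint32{3, 4}, []float32{1.0, 2.0}),
	}
}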