// Licensed to the LF AI & Data foundation under one
|
|
|
|
// or more contributor license agreements. See the NOTICE file
|
|
|
|
// distributed with this work for additional information
|
|
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
|
|
// to you under the Apache License, Version 2.0 (the
|
|
|
|
// "License"); you may not use this file except in compliance
|
2021-04-19 11:32:24 +08:00
|
|
|
// with the License. You may obtain a copy of the License at
|
|
|
|
//
|
2021-12-23 12:01:36 +08:00
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
2021-04-19 11:32:24 +08:00
|
|
|
//
|
2021-12-23 12:01:36 +08:00
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2021-04-19 11:32:24 +08:00
|
|
|
|
2020-12-09 20:07:27 +08:00
|
|
|
package storage
|
|
|
|
|
|
|
|
import (
|
2021-10-11 17:28:30 +08:00
|
|
|
"encoding/binary"
|
2021-01-28 17:25:43 +08:00
|
|
|
"encoding/json"
|
2020-12-09 20:07:27 +08:00
|
|
|
"fmt"
|
2021-09-28 14:30:02 +08:00
|
|
|
"math"
|
2020-12-23 11:34:35 +08:00
|
|
|
"sort"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
2020-12-09 20:07:27 +08:00
|
|
|
|
2024-02-02 10:47:04 +08:00
|
|
|
"github.com/samber/lo"
|
|
|
|
|
2023-06-09 01:28:37 +08:00
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
2021-04-22 14:45:57 +08:00
|
|
|
"github.com/milvus-io/milvus/internal/proto/etcdpb"
|
2023-04-06 19:14:32 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/common"
|
2023-11-25 15:10:25 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/util/merr"
|
2024-03-04 19:31:09 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/util/metautil"
|
2023-04-06 19:14:32 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
2020-12-09 20:07:27 +08:00
|
|
|
)
|
|
|
|
|
2020-12-23 18:06:04 +08:00
|
|
|
const (
|
2021-10-13 18:59:26 +08:00
|
|
|
// Ts is blob key "ts"
|
|
|
|
Ts = "ts"
|
|
|
|
// DDL is blob key "ddl"
|
|
|
|
DDL = "ddl"
|
|
|
|
// IndexParamsKey is blob key "indexParams"
|
2021-10-09 19:27:02 +08:00
|
|
|
IndexParamsKey = "indexParams"
|
2020-12-23 18:06:04 +08:00
|
|
|
)
|
|
|
|
|
2021-09-30 17:57:01 +08:00
|
|
|
// When the blob of an index file is too large, we can split the blob into several rows.
// Fortunately, the blob has no other semantics that differ from other binlog types,
// so we simply assemble these rows back into a whole blob when deserializing the index binlog.
// num rows = math.Ceil(len(blob) / maxLengthPerRowOfIndexFile)
// Past-version index files contain only a single string row, which is a subset case of
// splitting into several rows, so splitting the index file won't introduce incompatibility
// with past versions.
|
|
|
|
const maxLengthPerRowOfIndexFile = 4 * 1024 * 1024
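// Illustrative sketch of the row-count rule above; indexBlobRowNum is a hypothetical helper name.
//
//	func indexBlobRowNum(blob []byte) int {
//		// num rows = math.Ceil(len(blob) / maxLengthPerRowOfIndexFile)
//		return int(math.Ceil(float64(len(blob)) / float64(maxLengthPerRowOfIndexFile)))
//	}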
|
|
|
|
|
2020-12-09 20:07:27 +08:00
|
|
|
type (
|
2021-10-26 22:32:44 +08:00
|
|
|
// UniqueID is type alias of typeutil.UniqueID
|
|
|
|
UniqueID = typeutil.UniqueID
|
|
|
|
|
2023-06-21 14:00:42 +08:00
|
|
|
// FieldID represent the identity number of field in collection and its type is UniqueID
|
2021-10-26 22:32:44 +08:00
|
|
|
FieldID = typeutil.UniqueID
|
|
|
|
|
|
|
|
// Timestamp is type alias of typeutil.Timestamp
|
2020-12-09 20:07:27 +08:00
|
|
|
Timestamp = typeutil.Timestamp
|
|
|
|
)
|
|
|
|
|
2021-11-18 19:15:27 +08:00
|
|
|
// InvalidUniqueID is used when the UniqueID is not set (like in return with err)
|
2021-07-16 17:19:55 +08:00
|
|
|
const InvalidUniqueID = UniqueID(-1)
|
|
|
|
|
2021-11-18 19:15:27 +08:00
|
|
|
// Blob is a pack of key&value
|
2020-12-09 20:07:27 +08:00
|
|
|
type Blob struct {
|
2022-12-01 20:33:17 +08:00
|
|
|
Key string
|
|
|
|
Value []byte
|
|
|
|
Size int64
|
|
|
|
RowNum int64
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
|
|
|
|
2021-11-18 19:15:27 +08:00
|
|
|
// BlobList implements sort.Interface for a list of Blob
|
2020-12-23 11:34:35 +08:00
|
|
|
type BlobList []*Blob
|
|
|
|
|
2021-11-18 19:15:27 +08:00
|
|
|
// Len implements Len in sort.Interface
|
2020-12-23 11:34:35 +08:00
|
|
|
func (s BlobList) Len() int {
|
|
|
|
return len(s)
|
|
|
|
}
|
|
|
|
|
2021-12-20 22:47:07 +08:00
|
|
|
// Less implements Less in sort.Interface
|
2020-12-23 11:34:35 +08:00
|
|
|
func (s BlobList) Less(i, j int) bool {
|
2024-03-04 19:31:09 +08:00
|
|
|
_, _, _, _, iLog, ok := metautil.ParseInsertLogPath(s[i].Key)
|
|
|
|
if !ok {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
_, _, _, _, jLog, ok := metautil.ParseInsertLogPath(s[j].Key)
|
|
|
|
if !ok {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
return iLog < jLog
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
|
|
|
|
2021-12-20 22:47:07 +08:00
|
|
|
// Swap implements Swap in sort.Interface
|
2020-12-23 11:34:35 +08:00
|
|
|
func (s BlobList) Swap(i, j int) {
|
|
|
|
s[i], s[j] = s[j], s[i]
|
2020-12-09 20:07:27 +08:00
|
|
|
}
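// Usage sketch: because BlobList implements sort.Interface, a slice of field binlog blobs
// can be ordered by the log index parsed from their keys. This assumes the keys follow the
// insert-log path layout understood by metautil.ParseInsertLogPath; keys that fail to parse
// keep their relative order.
//
//	blobs := []*Blob{ /* field binlog blobs keyed by insert-log paths */ }
//	sort.Sort(BlobList(blobs))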
|
|
|
|
|
2021-11-18 19:15:27 +08:00
|
|
|
// GetKey returns the key of blob
|
2020-12-22 08:14:36 +08:00
|
|
|
func (b Blob) GetKey() string {
|
2020-12-23 11:34:35 +08:00
|
|
|
return b.Key
|
2020-12-22 08:14:36 +08:00
|
|
|
}
|
|
|
|
|
2021-11-18 19:15:27 +08:00
|
|
|
// GetValue returns the value of blob
|
2020-12-22 08:14:36 +08:00
|
|
|
func (b Blob) GetValue() []byte {
|
2020-12-23 11:34:35 +08:00
|
|
|
return b.Value
|
2020-12-22 08:14:36 +08:00
|
|
|
}
|
|
|
|
|
2021-09-15 10:47:48 +08:00
|
|
|
// InsertCodec serializes and deserializes the insert data
|
2020-12-09 20:07:27 +08:00
|
|
|
// Blob key example:
|
2020-12-18 15:21:25 +08:00
|
|
|
// ${tenant}/insert_log/${collection_id}/${partition_id}/${segment_id}/${field_id}/${log_idx}
|
2020-12-09 20:07:27 +08:00
|
|
|
type InsertCodec struct {
|
2021-11-22 17:27:14 +08:00
|
|
|
Schema *etcdpb.CollectionMeta
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
|
2023-03-27 00:42:00 +08:00
|
|
|
// NewInsertCodec creates an InsertCodec
|
|
|
|
func NewInsertCodec() *InsertCodec {
|
|
|
|
return &InsertCodec{}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewInsertCodecWithSchema creates an InsertCodec with provided collection meta
|
|
|
|
func NewInsertCodecWithSchema(schema *etcdpb.CollectionMeta) *InsertCodec {
|
2020-12-23 18:06:04 +08:00
|
|
|
return &InsertCodec{Schema: schema}
|
|
|
|
}
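// Usage sketch: building a codec from a minimal, hypothetical collection meta. The field IDs,
// names and types below are example values only.
//
//	meta := &etcdpb.CollectionMeta{
//		ID: 1,
//		Schema: &schemapb.CollectionSchema{
//			Fields: []*schemapb.FieldSchema{
//				{FieldID: common.TimeStampField, Name: "ts", DataType: schemapb.DataType_Int64},
//				{FieldID: 100, Name: "pk", IsPrimaryKey: true, DataType: schemapb.DataType_Int64},
//			},
//		},
//	}
//	codec := NewInsertCodecWithSchema(meta)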
|
|
|
|
|
2023-05-29 10:21:28 +08:00
|
|
|
// Serialize Pk stats log
|
|
|
|
func (insertCodec *InsertCodec) SerializePkStats(stats *PrimaryKeyStats, rowNum int64) (*Blob, error) {
|
|
|
|
if stats == nil || stats.BF == nil {
|
|
|
|
return nil, fmt.Errorf("sericalize empty pk stats")
|
|
|
|
}
|
|
|
|
|
2023-09-21 09:45:27 +08:00
|
|
|
// Serialize by pk stats
|
2023-05-29 10:21:28 +08:00
|
|
|
blobKey := fmt.Sprintf("%d", stats.FieldID)
|
|
|
|
statsWriter := &StatsWriter{}
|
|
|
|
err := statsWriter.Generate(stats)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
buffer := statsWriter.GetBuffer()
|
|
|
|
return &Blob{
|
|
|
|
Key: blobKey,
|
|
|
|
Value: buffer,
|
|
|
|
RowNum: rowNum,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Serialize Pk stats list to one blob
|
|
|
|
func (insertCodec *InsertCodec) SerializePkStatsList(stats []*PrimaryKeyStats, rowNum int64) (*Blob, error) {
|
|
|
|
if len(stats) == 0 {
|
2023-11-25 15:10:25 +08:00
|
|
|
return nil, merr.WrapErrServiceInternal("shall not serialize zero length statslog list")
|
2023-05-29 10:21:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
blobKey := fmt.Sprintf("%d", stats[0].FieldID)
|
|
|
|
statsWriter := &StatsWriter{}
|
|
|
|
err := statsWriter.GenerateList(stats)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
buffer := statsWriter.GetBuffer()
|
|
|
|
return &Blob{
|
|
|
|
Key: blobKey,
|
|
|
|
Value: buffer,
|
|
|
|
RowNum: rowNum,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Serialize Pk stats log by insert data
|
|
|
|
func (insertCodec *InsertCodec) SerializePkStatsByData(data *InsertData) (*Blob, error) {
|
|
|
|
timeFieldData, ok := data.Data[common.TimeStampField]
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("data doesn't contains timestamp field")
|
|
|
|
}
|
|
|
|
if timeFieldData.RowNum() <= 0 {
|
|
|
|
return nil, fmt.Errorf("there's no data in InsertData")
|
|
|
|
}
|
|
|
|
rowNum := int64(timeFieldData.RowNum())
|
|
|
|
|
|
|
|
for _, field := range insertCodec.Schema.Schema.Fields {
|
|
|
|
// only generate stats for the primary key field
|
|
|
|
if !field.GetIsPrimaryKey() {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
singleData := data.Data[field.FieldID]
|
|
|
|
blobKey := fmt.Sprintf("%d", field.FieldID)
|
|
|
|
statsWriter := &StatsWriter{}
|
|
|
|
err := statsWriter.GenerateByData(field.FieldID, field.DataType, singleData)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
buffer := statsWriter.GetBuffer()
|
|
|
|
return &Blob{
|
|
|
|
Key: blobKey,
|
|
|
|
Value: buffer,
|
|
|
|
RowNum: rowNum,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
return nil, fmt.Errorf("there is no pk field")
|
|
|
|
}
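// Usage sketch: serializing a primary-key stats blob straight from insert data, assuming
// insertData is an *InsertData that contains both the timestamp field and the schema's
// primary-key field.
//
//	statsBlob, err := codec.SerializePkStatsByData(insertData)
//	if err != nil {
//		// handle error
//	}
//	_ = statsBlob.GetKey() // the blob key is the primary-key field ID rendered as a string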
|
|
|
|
|
2024-03-14 05:32:54 +08:00
|
|
|
// Serialize transforms insert data to blob. It will sort insert data by timestamp.
|
2021-12-23 22:05:51 +08:00
|
|
|
// From schema, it gets all fields.
|
2021-12-20 19:08:38 +08:00
|
|
|
// For each field, it will create a binlog writer, and write an event to the binlog.
|
2021-09-17 11:05:49 +08:00
|
|
|
// It returns binlog buffer in the end.
|
2023-05-29 10:21:28 +08:00
|
|
|
func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID UniqueID, data *InsertData) ([]*Blob, error) {
|
2021-10-22 12:51:11 +08:00
|
|
|
blobs := make([]*Blob, 0)
|
2020-12-09 20:07:27 +08:00
|
|
|
var writer *InsertBinlogWriter
|
2024-02-20 14:38:51 +08:00
|
|
|
if insertCodec.Schema == nil {
|
|
|
|
return nil, fmt.Errorf("schema is not set")
|
|
|
|
}
|
2022-04-07 22:05:32 +08:00
|
|
|
timeFieldData, ok := data.Data[common.TimeStampField]
|
2020-12-18 15:21:25 +08:00
|
|
|
if !ok {
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, fmt.Errorf("data doesn't contains timestamp field")
|
2020-12-18 15:21:25 +08:00
|
|
|
}
|
2021-11-26 17:43:17 +08:00
|
|
|
if timeFieldData.RowNum() <= 0 {
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, fmt.Errorf("there's no data in InsertData")
|
2021-11-26 17:43:17 +08:00
|
|
|
}
|
2022-12-01 20:33:17 +08:00
|
|
|
rowNum := int64(timeFieldData.RowNum())
|
2021-11-26 17:43:17 +08:00
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
ts := timeFieldData.(*Int64FieldData).Data
|
2023-07-18 10:41:20 +08:00
|
|
|
var startTs, endTs Timestamp
|
|
|
|
startTs, endTs = math.MaxUint64, 0
|
|
|
|
for _, t := range ts {
|
|
|
|
if uint64(t) > endTs {
|
|
|
|
endTs = uint64(t)
|
|
|
|
}
|
|
|
|
|
|
|
|
if uint64(t) < startTs {
|
|
|
|
startTs = uint64(t)
|
|
|
|
}
|
|
|
|
}
|
2021-05-14 10:59:49 +08:00
|
|
|
|
2022-08-11 14:06:38 +08:00
|
|
|
// sort insert data by rowID
|
2021-05-14 10:59:49 +08:00
|
|
|
dataSorter := &DataSorter{
|
|
|
|
InsertCodec: insertCodec,
|
|
|
|
InsertData: data,
|
|
|
|
}
|
|
|
|
sort.Sort(dataSorter)
|
2020-12-09 20:07:27 +08:00
|
|
|
|
2020-12-11 11:29:07 +08:00
|
|
|
for _, field := range insertCodec.Schema.Schema.Fields {
|
|
|
|
singleData := data.Data[field.FieldID]
|
2021-05-20 18:38:45 +08:00
|
|
|
|
|
|
|
// encode fields
|
2021-04-19 10:36:19 +08:00
|
|
|
writer = NewInsertBinlogWriter(field.DataType, insertCodec.Schema.ID, partitionID, segmentID, field.FieldID)
|
2022-09-09 22:12:34 +08:00
|
|
|
var eventWriter *insertEventWriter
|
|
|
|
var err error
|
|
|
|
if typeutil.IsVectorType(field.DataType) {
|
|
|
|
switch field.DataType {
|
|
|
|
case schemapb.DataType_FloatVector:
|
|
|
|
eventWriter, err = writer.NextInsertEventWriter(singleData.(*FloatVectorFieldData).Dim)
|
|
|
|
case schemapb.DataType_BinaryVector:
|
|
|
|
eventWriter, err = writer.NextInsertEventWriter(singleData.(*BinaryVectorFieldData).Dim)
|
2023-09-08 10:03:16 +08:00
|
|
|
case schemapb.DataType_Float16Vector:
|
|
|
|
eventWriter, err = writer.NextInsertEventWriter(singleData.(*Float16VectorFieldData).Dim)
|
2024-01-11 15:48:51 +08:00
|
|
|
case schemapb.DataType_BFloat16Vector:
|
|
|
|
eventWriter, err = writer.NextInsertEventWriter(singleData.(*BFloat16VectorFieldData).Dim)
|
2024-03-14 05:32:54 +08:00
|
|
|
case schemapb.DataType_SparseFloatVector:
|
|
|
|
eventWriter, err = writer.NextInsertEventWriter()
|
2022-09-09 22:12:34 +08:00
|
|
|
default:
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, fmt.Errorf("undefined data type %d", field.DataType)
|
2022-09-09 22:12:34 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
eventWriter, err = writer.NextInsertEventWriter()
|
|
|
|
}
|
2020-12-11 11:29:07 +08:00
|
|
|
if err != nil {
|
2022-01-07 18:27:23 +08:00
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2020-12-11 11:29:07 +08:00
|
|
|
}
|
2021-05-20 18:38:45 +08:00
|
|
|
|
2023-07-18 10:41:20 +08:00
|
|
|
eventWriter.SetEventTimestamp(startTs, endTs)
|
2020-12-11 11:29:07 +08:00
|
|
|
switch field.DataType {
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Bool:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddBoolToPayload(singleData.(*BoolFieldData).Data)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*BoolFieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int8:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddInt8ToPayload(singleData.(*Int8FieldData).Data)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*Int8FieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int16:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddInt16ToPayload(singleData.(*Int16FieldData).Data)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*Int16FieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int32:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddInt32ToPayload(singleData.(*Int32FieldData).Data)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*Int32FieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int64:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddInt64ToPayload(singleData.(*Int64FieldData).Data)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*Int64FieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Float:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddFloatToPayload(singleData.(*FloatFieldData).Data)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*FloatFieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Double:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddDoubleToPayload(singleData.(*DoubleFieldData).Data)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*DoubleFieldData).GetMemorySize()))
|
2022-03-25 14:27:25 +08:00
|
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
2020-12-23 11:34:35 +08:00
|
|
|
for _, singleString := range singleData.(*StringFieldData).Data {
|
2020-12-09 20:07:27 +08:00
|
|
|
err = eventWriter.AddOneStringToPayload(singleString)
|
2020-12-11 11:29:07 +08:00
|
|
|
if err != nil {
|
2022-01-07 18:27:23 +08:00
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2020-12-11 11:29:07 +08:00
|
|
|
}
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*StringFieldData).GetMemorySize()))
|
2023-04-20 11:32:31 +08:00
|
|
|
case schemapb.DataType_Array:
|
|
|
|
for _, singleArray := range singleData.(*ArrayFieldData).Data {
|
|
|
|
err = eventWriter.AddOneArrayToPayload(singleArray)
|
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2023-04-20 11:32:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*ArrayFieldData).GetMemorySize()))
|
|
|
|
case schemapb.DataType_JSON:
|
|
|
|
for _, singleJSON := range singleData.(*JSONFieldData).Data {
|
|
|
|
err = eventWriter.AddOneJSONToPayload(singleJSON)
|
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2023-04-20 11:32:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*JSONFieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_BinaryVector:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddBinaryVectorToPayload(singleData.(*BinaryVectorFieldData).Data, singleData.(*BinaryVectorFieldData).Dim)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*BinaryVectorFieldData).GetMemorySize()))
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_FloatVector:
|
2020-12-23 11:34:35 +08:00
|
|
|
err = eventWriter.AddFloatVectorToPayload(singleData.(*FloatVectorFieldData).Data, singleData.(*FloatVectorFieldData).Dim)
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2022-01-07 18:27:23 +08:00
|
|
|
}
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*FloatVectorFieldData).GetMemorySize()))
|
2023-09-08 10:03:16 +08:00
|
|
|
case schemapb.DataType_Float16Vector:
|
|
|
|
err = eventWriter.AddFloat16VectorToPayload(singleData.(*Float16VectorFieldData).Data, singleData.(*Float16VectorFieldData).Dim)
|
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*Float16VectorFieldData).GetMemorySize()))
|
2024-01-11 15:48:51 +08:00
|
|
|
case schemapb.DataType_BFloat16Vector:
|
|
|
|
err = eventWriter.AddBFloat16VectorToPayload(singleData.(*BFloat16VectorFieldData).Data, singleData.(*BFloat16VectorFieldData).Dim)
|
2024-03-14 05:32:54 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*BFloat16VectorFieldData).GetMemorySize()))
|
|
|
|
case schemapb.DataType_SparseFloatVector:
|
|
|
|
err = eventWriter.AddSparseFloatVectorToPayload(singleData.(*SparseFloatVectorFieldData))
|
2024-01-11 15:48:51 +08:00
|
|
|
if err != nil {
|
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
|
|
|
return nil, err
|
|
|
|
}
|
2024-03-14 05:32:54 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", singleData.(*SparseFloatVectorFieldData).GetMemorySize()))
|
2020-12-11 18:14:19 +08:00
|
|
|
default:
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, fmt.Errorf("undefined data type %d", field.DataType)
|
2020-12-11 11:29:07 +08:00
|
|
|
}
|
|
|
|
if err != nil {
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2023-07-18 10:41:20 +08:00
|
|
|
writer.SetEventTimeStamp(startTs, endTs)
|
2020-12-09 20:07:27 +08:00
|
|
|
|
2021-12-09 12:37:06 +08:00
|
|
|
err = writer.Finish()
|
2020-12-09 20:07:27 +08:00
|
|
|
if err != nil {
|
2022-01-07 18:27:23 +08:00
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
|
2020-12-10 15:50:09 +08:00
|
|
|
buffer, err := writer.GetBuffer()
|
|
|
|
if err != nil {
|
2022-01-07 18:27:23 +08:00
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil, err
|
2020-12-10 15:50:09 +08:00
|
|
|
}
|
2020-12-18 15:21:25 +08:00
|
|
|
blobKey := fmt.Sprintf("%d", field.FieldID)
|
2020-12-09 20:07:27 +08:00
|
|
|
blobs = append(blobs, &Blob{
|
2022-12-01 20:33:17 +08:00
|
|
|
Key: blobKey,
|
|
|
|
Value: buffer,
|
|
|
|
RowNum: rowNum,
|
2020-12-09 20:07:27 +08:00
|
|
|
})
|
2021-12-09 12:37:06 +08:00
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
|
2023-05-29 10:21:28 +08:00
|
|
|
return blobs, nil
|
2020-12-09 20:07:27 +08:00
|
|
|
}
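// Usage sketch: serializing one segment's insert data with the codec and two-field schema from
// the sketch above. The partition/segment IDs and payloads are hypothetical example values.
//
//	insertData := &InsertData{Data: map[FieldID]FieldData{
//		common.TimeStampField: &Int64FieldData{Data: []int64{1, 2}},
//		100:                   &Int64FieldData{Data: []int64{10, 20}},
//	}}
//	blobs, err := codec.Serialize(10 /*partitionID*/, 20 /*segmentID*/, insertData)
//	if err != nil {
//		// handle error
//	}
//	_ = blobs // one Blob per schema field, keyed by the field ID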
|
2021-07-07 19:10:07 +08:00
|
|
|
|
2021-09-29 09:52:12 +08:00
|
|
|
func (insertCodec *InsertCodec) DeserializeAll(blobs []*Blob) (
|
|
|
|
collectionID UniqueID,
|
|
|
|
partitionID UniqueID,
|
|
|
|
segmentID UniqueID,
|
|
|
|
data *InsertData,
|
|
|
|
err error,
|
|
|
|
) {
|
2020-12-09 20:07:27 +08:00
|
|
|
if len(blobs) == 0 {
|
2021-09-29 09:52:12 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, nil, fmt.Errorf("blobs is empty")
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
|
|
|
|
var blobList BlobList = blobs
|
|
|
|
sort.Sort(blobList)
|
|
|
|
|
2022-06-08 11:46:06 +08:00
|
|
|
data = &InsertData{
|
2022-04-25 15:57:47 +08:00
|
|
|
Data: make(map[FieldID]FieldData),
|
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
if collectionID, partitionID, segmentID, err = insertCodec.DeserializeInto(blobs, 0, data); err != nil {
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
func (insertCodec *InsertCodec) DeserializeInto(fieldBinlogs []*Blob, rowNum int, insertData *InsertData) (
|
|
|
|
collectionID UniqueID,
|
|
|
|
partitionID UniqueID,
|
|
|
|
segmentID UniqueID,
|
|
|
|
err error,
|
|
|
|
) {
|
|
|
|
for _, blob := range fieldBinlogs {
|
2020-12-23 11:34:35 +08:00
|
|
|
binlogReader, err := NewBinlogReader(blob.Value)
|
2020-12-09 20:07:27 +08:00
|
|
|
if err != nil {
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
|
2020-12-11 11:29:07 +08:00
|
|
|
// read collectionID, partitionID and segmentID
|
2022-06-08 11:46:06 +08:00
|
|
|
collectionID, partitionID, segmentID = binlogReader.CollectionID, binlogReader.PartitionID, binlogReader.SegmentID
|
2020-12-11 11:29:07 +08:00
|
|
|
|
|
|
|
dataType := binlogReader.PayloadDataType
|
|
|
|
fieldID := binlogReader.FieldID
|
2021-06-16 12:03:57 +08:00
|
|
|
totalLength := 0
|
2022-06-08 11:46:06 +08:00
|
|
|
dim := 0
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
for {
|
2020-12-09 20:07:27 +08:00
|
|
|
eventReader, err := binlogReader.NextEventReader()
|
|
|
|
if err != nil {
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
if eventReader == nil {
|
|
|
|
break
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
switch dataType {
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Bool:
|
2020-12-23 11:34:35 +08:00
|
|
|
singleData, err := eventReader.GetBoolFromPayload()
|
2020-12-09 20:07:27 +08:00
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &BoolFieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]bool, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
boolFieldData := insertData.Data[fieldID].(*BoolFieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
boolFieldData.Data = append(boolFieldData.Data, singleData...)
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(singleData)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = boolFieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int8:
|
2020-12-23 11:34:35 +08:00
|
|
|
singleData, err := eventReader.GetInt8FromPayload()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &Int8FieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]int8, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
int8FieldData := insertData.Data[fieldID].(*Int8FieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
int8FieldData.Data = append(int8FieldData.Data, singleData...)
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(singleData)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = int8FieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int16:
|
2020-12-23 11:34:35 +08:00
|
|
|
singleData, err := eventReader.GetInt16FromPayload()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &Int16FieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]int16, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
int16FieldData := insertData.Data[fieldID].(*Int16FieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
int16FieldData.Data = append(int16FieldData.Data, singleData...)
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(singleData)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = int16FieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int32:
|
2020-12-23 11:34:35 +08:00
|
|
|
singleData, err := eventReader.GetInt32FromPayload()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &Int32FieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]int32, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
int32FieldData := insertData.Data[fieldID].(*Int32FieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
int32FieldData.Data = append(int32FieldData.Data, singleData...)
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(singleData)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = int32FieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int64:
|
2020-12-23 11:34:35 +08:00
|
|
|
singleData, err := eventReader.GetInt64FromPayload()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &Int64FieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]int64, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
int64FieldData := insertData.Data[fieldID].(*Int64FieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
int64FieldData.Data = append(int64FieldData.Data, singleData...)
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(singleData)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = int64FieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Float:
|
2020-12-23 11:34:35 +08:00
|
|
|
singleData, err := eventReader.GetFloatFromPayload()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &FloatFieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]float32, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
floatFieldData := insertData.Data[fieldID].(*FloatFieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
floatFieldData.Data = append(floatFieldData.Data, singleData...)
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(singleData)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = floatFieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Double:
|
2020-12-23 11:34:35 +08:00
|
|
|
singleData, err := eventReader.GetDoubleFromPayload()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &DoubleFieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]float64, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
doubleFieldData := insertData.Data[fieldID].(*DoubleFieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
doubleFieldData.Data = append(doubleFieldData.Data, singleData...)
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(singleData)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = doubleFieldData
|
|
|
|
|
2022-03-25 14:27:25 +08:00
|
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
2022-03-30 15:21:28 +08:00
|
|
|
stringPayload, err := eventReader.GetStringFromPayload()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2022-03-30 15:21:28 +08:00
|
|
|
}
|
2022-04-02 17:43:29 +08:00
|
|
|
|
2022-06-08 11:46:06 +08:00
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &StringFieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]string, 0, rowNum),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
stringFieldData := insertData.Data[fieldID].(*StringFieldData)
|
|
|
|
|
2022-04-02 17:43:29 +08:00
|
|
|
stringFieldData.Data = append(stringFieldData.Data, stringPayload...)
|
|
|
|
stringFieldData.DataType = dataType
|
2022-04-02 17:43:29 +08:00
|
|
|
totalLength += len(stringPayload)
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Data[fieldID] = stringFieldData
|
|
|
|
|
2023-04-20 11:32:31 +08:00
|
|
|
case schemapb.DataType_Array:
|
|
|
|
arrayPayload, err := eventReader.GetArrayFromPayload()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &ArrayFieldData{
|
|
|
|
Data: make([]*schemapb.ScalarField, 0, rowNum),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
arrayFieldData := insertData.Data[fieldID].(*ArrayFieldData)
|
|
|
|
|
|
|
|
arrayFieldData.Data = append(arrayFieldData.Data, arrayPayload...)
|
|
|
|
totalLength += len(arrayPayload)
|
|
|
|
insertData.Data[fieldID] = arrayFieldData
|
|
|
|
|
|
|
|
case schemapb.DataType_JSON:
|
|
|
|
jsonPayload, err := eventReader.GetJSONFromPayload()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &JSONFieldData{
|
|
|
|
Data: make([][]byte, 0, rowNum),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
jsonFieldData := insertData.Data[fieldID].(*JSONFieldData)
|
|
|
|
|
|
|
|
jsonFieldData.Data = append(jsonFieldData.Data, jsonPayload...)
|
|
|
|
totalLength += len(jsonPayload)
|
|
|
|
insertData.Data[fieldID] = jsonFieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_BinaryVector:
|
2020-12-23 11:34:35 +08:00
|
|
|
var singleData []byte
|
2022-06-08 11:46:06 +08:00
|
|
|
singleData, dim, err = eventReader.GetBinaryVectorFromPayload()
|
2020-12-23 11:34:35 +08:00
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &BinaryVectorFieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]byte, 0, rowNum*dim),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
binaryVectorFieldData := insertData.Data[fieldID].(*BinaryVectorFieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
binaryVectorFieldData.Data = append(binaryVectorFieldData.Data, singleData...)
|
|
|
|
length, err := eventReader.GetPayloadLengthFromReader()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2021-06-16 12:03:57 +08:00
|
|
|
totalLength += length
|
2022-06-08 11:46:06 +08:00
|
|
|
binaryVectorFieldData.Dim = dim
|
|
|
|
insertData.Data[fieldID] = binaryVectorFieldData
|
|
|
|
|
2023-09-08 10:03:16 +08:00
|
|
|
case schemapb.DataType_Float16Vector:
|
|
|
|
var singleData []byte
|
|
|
|
singleData, dim, err = eventReader.GetFloat16VectorFromPayload()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &Float16VectorFieldData{
|
|
|
|
Data: make([]byte, 0, rowNum*dim),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
float16VectorFieldData := insertData.Data[fieldID].(*Float16VectorFieldData)
|
|
|
|
|
|
|
|
float16VectorFieldData.Data = append(float16VectorFieldData.Data, singleData...)
|
|
|
|
length, err := eventReader.GetPayloadLengthFromReader()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
totalLength += length
|
|
|
|
float16VectorFieldData.Dim = dim
|
|
|
|
insertData.Data[fieldID] = float16VectorFieldData
|
|
|
|
|
2024-01-11 15:48:51 +08:00
|
|
|
case schemapb.DataType_BFloat16Vector:
|
|
|
|
var singleData []byte
|
|
|
|
singleData, dim, err = eventReader.GetBFloat16VectorFromPayload()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &BFloat16VectorFieldData{
|
|
|
|
Data: make([]byte, 0, rowNum*dim),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bfloat16VectorFieldData := insertData.Data[fieldID].(*BFloat16VectorFieldData)
|
|
|
|
|
|
|
|
bfloat16VectorFieldData.Data = append(bfloat16VectorFieldData.Data, singleData...)
|
|
|
|
length, err := eventReader.GetPayloadLengthFromReader()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
totalLength += length
|
|
|
|
bfloat16VectorFieldData.Dim = dim
|
|
|
|
insertData.Data[fieldID] = bfloat16VectorFieldData
|
|
|
|
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_FloatVector:
|
2020-12-23 11:34:35 +08:00
|
|
|
var singleData []float32
|
2022-06-08 11:46:06 +08:00
|
|
|
singleData, dim, err = eventReader.GetFloatVectorFromPayload()
|
2020-12-23 11:34:35 +08:00
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &FloatVectorFieldData{
|
2023-01-28 11:09:52 +08:00
|
|
|
Data: make([]float32, 0, rowNum*dim),
|
2022-06-08 11:46:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
floatVectorFieldData := insertData.Data[fieldID].(*FloatVectorFieldData)
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
floatVectorFieldData.Data = append(floatVectorFieldData.Data, singleData...)
|
|
|
|
length, err := eventReader.GetPayloadLengthFromReader()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
2020-12-23 11:34:35 +08:00
|
|
|
}
|
2021-06-16 12:03:57 +08:00
|
|
|
totalLength += length
|
2022-06-08 11:46:06 +08:00
|
|
|
floatVectorFieldData.Dim = dim
|
|
|
|
insertData.Data[fieldID] = floatVectorFieldData
|
|
|
|
|
2024-03-14 05:32:54 +08:00
|
|
|
case schemapb.DataType_SparseFloatVector:
|
|
|
|
sparseData, _, err := eventReader.GetSparseFloatVectorFromPayload()
|
|
|
|
if err != nil {
|
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, err
|
|
|
|
}
|
|
|
|
if insertData.Data[fieldID] == nil {
|
|
|
|
insertData.Data[fieldID] = &SparseFloatVectorFieldData{}
|
|
|
|
}
|
|
|
|
vec := insertData.Data[fieldID].(*SparseFloatVectorFieldData)
|
|
|
|
vec.AppendAllRows(sparseData)
|
|
|
|
|
|
|
|
totalLength += sparseData.RowNum()
|
|
|
|
insertData.Data[fieldID] = vec
|
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
default:
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2022-06-08 11:46:06 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, InvalidUniqueID, fmt.Errorf("undefined data type %d", dataType)
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
|
|
|
|
if rowNum <= 0 {
|
|
|
|
rowNum = totalLength
|
|
|
|
}
|
|
|
|
|
2022-04-07 22:05:32 +08:00
|
|
|
if fieldID == common.TimeStampField {
|
2021-06-16 12:03:57 +08:00
|
|
|
blobInfo := BlobInfo{
|
|
|
|
Length: totalLength,
|
|
|
|
}
|
2022-06-08 11:46:06 +08:00
|
|
|
insertData.Infos = append(insertData.Infos, blobInfo)
|
2021-06-16 12:03:57 +08:00
|
|
|
}
|
2021-12-08 21:11:39 +08:00
|
|
|
binlogReader.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
|
2022-06-08 11:46:06 +08:00
|
|
|
return collectionID, partitionID, segmentID, nil
|
2021-09-29 09:52:12 +08:00
|
|
|
}
|
|
|
|
|
2023-04-20 11:32:31 +08:00
|
|
|
// func deserializeEntity[T any, U any](
|
|
|
|
// eventReader *EventReader,
|
|
|
|
// binlogReader *BinlogReader,
|
|
|
|
// insertData *InsertData,
|
|
|
|
// getPayloadFunc func() (U, error),
|
|
|
|
// fillDataFunc func() FieldData,
|
|
|
|
// ) error {
|
|
|
|
// fieldID := binlogReader.FieldID
|
|
|
|
// stringPayload, err := getPayloadFunc()
|
|
|
|
// if err != nil {
|
|
|
|
// eventReader.Close()
|
|
|
|
// binlogReader.Close()
|
|
|
|
// return err
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// if insertData.Data[fieldID] == nil {
|
|
|
|
// insertData.Data[fieldID] = fillDataFunc()
|
|
|
|
// }
|
|
|
|
// stringFieldData := insertData.Data[fieldID].(*T)
|
|
|
|
//
|
|
|
|
// stringFieldData.Data = append(stringFieldData.Data, stringPayload...)
|
|
|
|
// totalLength += len(stringPayload)
|
|
|
|
// insertData.Data[fieldID] = stringFieldData
|
|
|
|
// }
|
|
|
|
|
2021-09-29 09:52:12 +08:00
|
|
|
// Deserialize transfers blobs back to insert data.
// From the schema, it gets all fields.
// For each field, it will create a binlog reader and read all events into the buffer.
// It returns the original @InsertData in the end.
|
|
|
|
func (insertCodec *InsertCodec) Deserialize(blobs []*Blob) (partitionID UniqueID, segmentID UniqueID, data *InsertData, err error) {
|
|
|
|
_, partitionID, segmentID, data, err = insertCodec.DeserializeAll(blobs)
|
|
|
|
return partitionID, segmentID, data, err
|
2020-12-09 20:07:27 +08:00
|
|
|
}
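// Round-trip sketch: the blobs produced by the Serialize sketch above can be fed straight back;
// Deserialize simply discards the collection ID that DeserializeAll returns.
//
//	partitionID, segmentID, recovered, err := codec.Deserialize(blobs)
//	if err != nil {
//		// handle error
//	}
//	_, _, _ = partitionID, segmentID, recovered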
|
|
|
|
|
2022-04-02 17:43:29 +08:00
|
|
|
type DeleteLog struct {
|
|
|
|
Pk PrimaryKey `json:"pk"`
|
|
|
|
Ts uint64 `json:"ts"`
|
|
|
|
PkType int64 `json:"pkType"`
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewDeleteLog(pk PrimaryKey, ts Timestamp) *DeleteLog {
|
|
|
|
pkType := pk.Type()
|
|
|
|
|
|
|
|
return &DeleteLog{
|
|
|
|
Pk: pk,
|
|
|
|
Ts: ts,
|
|
|
|
PkType: int64(pkType),
|
|
|
|
}
|
|
|
|
}
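// Sketch of the on-disk form: each deleted row is stored by DeleteCodec.Serialize as one
// JSON-encoded DeleteLog string. The pk value here is a hypothetical example.
//
//	dl := NewDeleteLog(&Int64PrimaryKey{Value: 7}, 100)
//	payload, _ := json.Marshal(dl) // a {"pk":...,"ts":...,"pkType":...} object, one per deleted row
//	_ = payload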
|
|
|
|
|
|
|
|
func (dl *DeleteLog) UnmarshalJSON(data []byte) error {
|
|
|
|
var messageMap map[string]*json.RawMessage
|
|
|
|
err := json.Unmarshal(data, &messageMap)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
err = json.Unmarshal(*messageMap["pkType"], &dl.PkType)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
switch schemapb.DataType(dl.PkType) {
|
|
|
|
case schemapb.DataType_Int64:
|
|
|
|
dl.Pk = &Int64PrimaryKey{}
|
|
|
|
case schemapb.DataType_VarChar:
|
|
|
|
dl.Pk = &VarCharPrimaryKey{}
|
|
|
|
}
|
|
|
|
|
|
|
|
err = json.Unmarshal(*messageMap["pk"], dl.Pk)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
err = json.Unmarshal(*messageMap["ts"], &dl.Ts)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-09-28 14:30:02 +08:00
|
|
|
// DeleteData saves each entity delete message represented as <primarykey,timestamp> map.
|
|
|
|
// timestamp represents the time when this instance was deleted
|
|
|
|
type DeleteData struct {
|
2022-04-02 17:43:29 +08:00
|
|
|
Pks []PrimaryKey // primary keys
|
|
|
|
Tss []Timestamp // timestamps
|
2021-11-09 15:01:17 +08:00
|
|
|
RowCount int64
|
2024-02-02 10:47:04 +08:00
|
|
|
memSize int64
|
2021-11-09 15:01:17 +08:00
|
|
|
}
|
|
|
|
|
2023-11-21 18:24:22 +08:00
|
|
|
func NewDeleteData(pks []PrimaryKey, tss []Timestamp) *DeleteData {
|
|
|
|
return &DeleteData{
|
|
|
|
Pks: pks,
|
|
|
|
Tss: tss,
|
|
|
|
RowCount: int64(len(pks)),
|
2024-02-02 10:47:04 +08:00
|
|
|
memSize: lo.SumBy(pks, func(pk PrimaryKey) int64 { return pk.Size() }) + int64(len(tss)*8),
|
2023-11-21 18:24:22 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-09 15:01:17 +08:00
|
|
|
// Append appends 1 pk&ts pair to DeleteData
|
2022-04-02 17:43:29 +08:00
|
|
|
func (data *DeleteData) Append(pk PrimaryKey, ts Timestamp) {
|
2021-11-09 15:01:17 +08:00
|
|
|
data.Pks = append(data.Pks, pk)
|
|
|
|
data.Tss = append(data.Tss, ts)
|
|
|
|
data.RowCount++
|
2024-02-02 10:47:04 +08:00
|
|
|
data.memSize += pk.Size() + int64(8)
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
|
|
|
|
2023-11-21 18:24:22 +08:00
|
|
|
// AppendBatch appends a batch of pk&ts pairs to DeleteData
|
|
|
|
func (data *DeleteData) AppendBatch(pks []PrimaryKey, tss []Timestamp) {
|
|
|
|
data.Pks = append(data.Pks, pks...)
|
|
|
|
data.Tss = append(data.Tss, tss...)
|
|
|
|
data.RowCount += int64(len(pks))
|
2024-02-02 10:47:04 +08:00
|
|
|
data.memSize += lo.SumBy(pks, func(pk PrimaryKey) int64 { return pk.Size() }) + int64(len(tss)*8)
|
2023-11-21 18:24:22 +08:00
|
|
|
}
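// Usage sketch: accumulating deletes and checking the tracked memory size, which grows by
// pk.Size() plus 8 bytes of timestamp per appended row. The pk values are example values.
//
//	dd := NewDeleteData(nil, nil)
//	dd.Append(&Int64PrimaryKey{Value: 7}, 100)
//	dd.Append(&Int64PrimaryKey{Value: 8}, 101)
//	_ = dd.Size() // RowCount is now 2; Size covers the two pks plus 2*8 bytes of timestamps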
|
|
|
|
|
2023-11-07 01:44:18 +08:00
|
|
|
func (data *DeleteData) Merge(other *DeleteData) {
|
|
|
|
data.Pks = append(data.Pks, other.Pks...)
|
|
|
|
data.Tss = append(data.Tss, other.Tss...)
|
|
|
|
data.RowCount += other.RowCount
|
2024-02-02 10:47:04 +08:00
|
|
|
data.memSize += other.Size()
|
2023-11-07 01:44:18 +08:00
|
|
|
|
|
|
|
other.Pks = nil
|
|
|
|
other.Tss = nil
|
|
|
|
other.RowCount = 0
|
2024-02-02 10:47:04 +08:00
|
|
|
other.memSize = 0
|
2023-11-07 01:44:18 +08:00
|
|
|
}
|
|
|
|
|
2023-11-30 14:30:28 +08:00
|
|
|
func (data *DeleteData) Size() int64 {
|
2024-02-02 10:47:04 +08:00
|
|
|
return data.memSize
|
2023-11-30 14:30:28 +08:00
|
|
|
}
|
|
|
|
|
2021-10-13 18:57:44 +08:00
|
|
|
// DeleteCodec serializes and deserializes the delete data
|
2023-09-21 09:45:27 +08:00
|
|
|
type DeleteCodec struct{}
|
2021-09-28 14:30:02 +08:00
|
|
|
|
2021-10-13 18:57:44 +08:00
|
|
|
// NewDeleteCodec returns a DeleteCodec
|
2021-10-24 09:59:10 +08:00
|
|
|
func NewDeleteCodec() *DeleteCodec {
|
|
|
|
return &DeleteCodec{}
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Serialize transfers delete data to a blob.
// For each delete message, it will save a JSON-serialized DeleteLog string to the binlog.
|
2021-10-24 09:59:10 +08:00
|
|
|
func (deleteCodec *DeleteCodec) Serialize(collectionID UniqueID, partitionID UniqueID, segmentID UniqueID, data *DeleteData) (*Blob, error) {
|
|
|
|
binlogWriter := NewDeleteBinlogWriter(schemapb.DataType_String, collectionID, partitionID, segmentID)
|
2021-09-28 14:30:02 +08:00
|
|
|
eventWriter, err := binlogWriter.NextDeleteEventWriter()
|
2022-01-07 18:27:23 +08:00
|
|
|
if err != nil {
|
|
|
|
binlogWriter.Close()
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
defer binlogWriter.Close()
|
|
|
|
defer eventWriter.Close()
|
2022-04-02 17:43:29 +08:00
|
|
|
length := len(data.Pks)
|
|
|
|
if length != len(data.Tss) {
|
2021-12-22 09:59:04 +08:00
|
|
|
return nil, fmt.Errorf("the length of pks, and TimeStamps is not equal")
|
2021-11-09 15:01:17 +08:00
|
|
|
}
|
2022-04-02 17:43:29 +08:00
|
|
|
|
2021-10-11 17:28:30 +08:00
|
|
|
sizeTotal := 0
|
2021-11-09 15:01:17 +08:00
|
|
|
var startTs, endTs Timestamp
|
|
|
|
startTs, endTs = math.MaxUint64, 0
|
|
|
|
for i := 0; i < length; i++ {
|
|
|
|
ts := data.Tss[i]
|
|
|
|
if ts < startTs {
|
|
|
|
startTs = ts
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
2021-11-09 15:01:17 +08:00
|
|
|
if ts > endTs {
|
|
|
|
endTs = ts
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
2022-04-02 17:43:29 +08:00
|
|
|
|
|
|
|
deleteLog := NewDeleteLog(data.Pks[i], ts)
|
|
|
|
serializedPayload, err := json.Marshal(deleteLog)
|
2021-09-28 14:30:02 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2022-04-02 17:43:29 +08:00
|
|
|
err = eventWriter.AddOneStringToPayload(string(serializedPayload))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
sizeTotal += binary.Size(serializedPayload)
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
2021-11-09 15:01:17 +08:00
|
|
|
eventWriter.SetEventTimestamp(startTs, endTs)
|
|
|
|
binlogWriter.SetEventTimeStamp(startTs, endTs)
|
2021-10-11 17:28:30 +08:00
|
|
|
|
|
|
|
// https://github.com/milvus-io/milvus/issues/9620
|
|
|
|
// It's a little complicated to count the memory size of a map.
|
|
|
|
// See: https://stackoverflow.com/questions/31847549/computing-the-memory-footprint-or-byte-length-of-a-map
|
|
|
|
// Since the implementation of the Go map may differ across versions, we'd better not use this magic method.
|
2021-10-11 21:02:37 +08:00
|
|
|
binlogWriter.AddExtra(originalSizeKey, fmt.Sprintf("%v", sizeTotal))
|
2021-10-11 17:28:30 +08:00
|
|
|
|
2021-12-09 12:37:06 +08:00
|
|
|
err = binlogWriter.Finish()
|
2021-09-28 14:30:02 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
buffer, err := binlogWriter.GetBuffer()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
blob := &Blob{
|
|
|
|
Value: buffer,
|
|
|
|
}
|
|
|
|
return blob, nil
|
|
|
|
}
|
|
|
|
|
2021-10-19 10:28:35 +08:00
|
|
|
// Deserialize deserializes the deltalog blobs into DeleteData
|
|
|
|
func (deleteCodec *DeleteCodec) Deserialize(blobs []*Blob) (partitionID UniqueID, segmentID UniqueID, data *DeleteData, err error) {
|
|
|
|
if len(blobs) == 0 {
|
2021-09-28 14:30:02 +08:00
|
|
|
return InvalidUniqueID, InvalidUniqueID, nil, fmt.Errorf("blobs is empty")
|
|
|
|
}
|
2021-10-19 10:28:35 +08:00
|
|
|
|
|
|
|
var pid, sid UniqueID
|
2021-11-09 15:01:17 +08:00
|
|
|
result := &DeleteData{}
|
2024-02-21 17:10:52 +08:00
|
|
|
|
|
|
|
deserializeBlob := func(blob *Blob) error {
|
2021-10-19 10:28:35 +08:00
|
|
|
binlogReader, err := NewBinlogReader(blob.Value)
|
2021-09-28 14:30:02 +08:00
|
|
|
if err != nil {
|
2024-02-21 17:10:52 +08:00
|
|
|
return err
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
2024-02-21 17:10:52 +08:00
|
|
|
defer binlogReader.Close()
|
2021-10-19 10:28:35 +08:00
|
|
|
|
|
|
|
pid, sid = binlogReader.PartitionID, binlogReader.SegmentID
|
|
|
|
eventReader, err := binlogReader.NextEventReader()
|
|
|
|
if err != nil {
|
2024-02-21 17:10:52 +08:00
|
|
|
return err
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
2024-02-21 17:10:52 +08:00
|
|
|
defer eventReader.Close()
|
2021-10-19 10:28:35 +08:00
|
|
|
|
2024-02-21 17:10:52 +08:00
|
|
|
rr, err := eventReader.GetArrowRecordReader()
|
2021-09-28 14:30:02 +08:00
|
|
|
if err != nil {
|
2024-02-21 17:10:52 +08:00
|
|
|
return err
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
2024-02-21 17:10:52 +08:00
|
|
|
defer rr.Release()
|
2024-02-06 17:04:25 +08:00
|
|
|
|
2024-02-21 17:10:52 +08:00
|
|
|
for rr.Next() {
|
|
|
|
rec := rr.Record()
|
|
|
|
defer rec.Release()
|
|
|
|
column := rec.Column(0)
|
|
|
|
for i := 0; i < column.Len(); i++ {
|
2024-02-06 17:04:25 +08:00
|
|
|
deleteLog := &DeleteLog{}
|
2024-02-21 17:10:52 +08:00
|
|
|
strVal := column.ValueStr(i)
|
|
|
|
if err = json.Unmarshal([]byte(strVal), deleteLog); err != nil {
|
2024-02-06 17:04:25 +08:00
|
|
|
// compatible with versions that only support int64 type primary keys
|
|
|
|
// compatible with fmt.Sprintf("%d,%d", pk, ts)
|
|
|
|
// compatible error info (unmarshal err invalid character ',' after top-level value)
|
2024-02-21 17:10:52 +08:00
|
|
|
splits := strings.Split(strVal, ",")
|
2024-02-06 17:04:25 +08:00
|
|
|
if len(splits) != 2 {
|
2024-02-21 17:10:52 +08:00
|
|
|
return fmt.Errorf("the format of delta log is incorrect, %v can not be split", strVal)
|
2024-02-06 17:04:25 +08:00
|
|
|
}
|
|
|
|
pk, err := strconv.ParseInt(splits[0], 10, 64)
|
|
|
|
if err != nil {
|
2024-02-21 17:10:52 +08:00
|
|
|
return err
|
2024-02-06 17:04:25 +08:00
|
|
|
}
|
|
|
|
deleteLog.Pk = &Int64PrimaryKey{
|
|
|
|
Value: pk,
|
|
|
|
}
|
|
|
|
deleteLog.PkType = int64(schemapb.DataType_Int64)
|
|
|
|
deleteLog.Ts, err = strconv.ParseUint(splits[1], 10, 64)
|
|
|
|
if err != nil {
|
2024-02-21 17:10:52 +08:00
|
|
|
return err
|
2024-02-06 17:04:25 +08:00
|
|
|
}
|
|
|
|
}
|
2021-10-19 10:28:35 +08:00
|
|
|
|
2024-02-06 17:04:25 +08:00
|
|
|
result.Append(deleteLog.Pk, deleteLog.Ts)
|
|
|
|
}
|
2021-10-19 10:28:35 +08:00
|
|
|
}
|
2024-02-21 17:10:52 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, blob := range blobs {
|
|
|
|
if err := deserializeBlob(blob); err != nil {
|
|
|
|
return InvalidUniqueID, InvalidUniqueID, nil, err
|
|
|
|
}
|
2021-09-28 14:30:02 +08:00
|
|
|
}
|
|
|
|
|
2021-10-19 10:28:35 +08:00
|
|
|
return pid, sid, result, nil
|
2021-09-28 14:30:02 +08:00
|
|
|
}
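// Usage sketch: writing a delta log and reading it back, assuming dd is the DeleteData built in
// the sketch above. The collection/partition/segment IDs are hypothetical example values.
//
//	dCodec := NewDeleteCodec()
//	blob, err := dCodec.Serialize(1 /*collectionID*/, 2 /*partitionID*/, 3 /*segmentID*/, dd)
//	if err != nil {
//		// handle error
//	}
//	_, _, recovered, err := dCodec.Deserialize([]*Blob{blob})
//	_ = recovered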
|
|
|
|
|
2021-12-02 18:57:32 +08:00
|
|
|
// DataDefinitionCodec serializes and deserializes the data definition
|
2020-12-09 20:07:27 +08:00
|
|
|
// Blob key example:
|
2020-12-23 18:06:04 +08:00
|
|
|
// ${tenant}/data_definition_log/${collection_id}/ts/${log_idx}
|
|
|
|
// ${tenant}/data_definition_log/${collection_id}/ddl/${log_idx}
|
2020-12-09 20:07:27 +08:00
|
|
|
type DataDefinitionCodec struct {
|
2021-11-22 17:27:14 +08:00
|
|
|
collectionID int64
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
|
2021-12-02 18:57:32 +08:00
|
|
|
// NewDataDefinitionCodec is constructor for DataDefinitionCodec
|
2020-12-23 18:06:04 +08:00
|
|
|
func NewDataDefinitionCodec(collectionID int64) *DataDefinitionCodec {
|
|
|
|
return &DataDefinitionCodec{collectionID: collectionID}
|
|
|
|
}
|
|
|
|
|
2021-09-17 11:05:49 +08:00
|
|
|
// Serialize transfers @ts and @ddRequests to blobs.
// From the schema, it gets all fields.
// For each field, it will create a binlog writer and write a specific event according
// to the dataDefinition type.
|
|
|
|
// It returns blobs in the end.
|
2020-12-11 11:29:07 +08:00
|
|
|
func (dataDefinitionCodec *DataDefinitionCodec) Serialize(ts []Timestamp, ddRequests []string, eventTypes []EventTypeCode) ([]*Blob, error) {
|
2021-04-19 10:36:19 +08:00
|
|
|
writer := NewDDLBinlogWriter(schemapb.DataType_Int64, dataDefinitionCodec.collectionID)
|
2022-01-07 18:27:23 +08:00
|
|
|
eventWriter, err := writer.NextCreateCollectionEventWriter()
|
|
|
|
if err != nil {
|
|
|
|
writer.Close()
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
defer writer.Close()
|
|
|
|
defer eventWriter.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
|
|
|
|
var blobs []*Blob
|
|
|
|
|
2020-12-23 18:06:04 +08:00
|
|
|
var int64Ts []int64
|
|
|
|
for _, singleTs := range ts {
|
|
|
|
int64Ts = append(int64Ts, int64(singleTs))
|
|
|
|
}
|
|
|
|
err = eventWriter.AddInt64ToPayload(int64Ts)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-07-07 19:10:07 +08:00
|
|
|
eventWriter.SetEventTimestamp(ts[0], ts[len(ts)-1])
|
|
|
|
writer.SetEventTimeStamp(ts[0], ts[len(ts)-1])
|
2021-10-11 17:28:30 +08:00
|
|
|
|
|
|
|
// https://github.com/milvus-io/milvus/issues/9620
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", binary.Size(int64Ts)))
|
2021-10-11 17:28:30 +08:00
|
|
|
|
2021-12-09 12:37:06 +08:00
|
|
|
err = writer.Finish()
|
2020-12-23 18:06:04 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
buffer, err := writer.GetBuffer()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
blobs = append(blobs, &Blob{
|
|
|
|
Key: Ts,
|
|
|
|
Value: buffer,
|
|
|
|
})
|
2021-12-09 12:37:06 +08:00
|
|
|
eventWriter.Close()
|
|
|
|
writer.Close()
|
2020-12-23 18:06:04 +08:00
|
|
|
|
2021-04-19 10:36:19 +08:00
|
|
|
writer = NewDDLBinlogWriter(schemapb.DataType_String, dataDefinitionCodec.collectionID)
|
2020-12-23 18:06:04 +08:00
|
|
|
|
2021-10-11 17:28:30 +08:00
|
|
|
sizeTotal := 0
|
2020-12-09 20:07:27 +08:00
|
|
|
for pos, req := range ddRequests {
|
2021-10-11 17:28:30 +08:00
|
|
|
sizeTotal += len(req)
|
2020-12-09 20:07:27 +08:00
|
|
|
switch eventTypes[pos] {
|
|
|
|
case CreateCollectionEventType:
|
|
|
|
eventWriter, err := writer.NextCreateCollectionEventWriter()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
err = eventWriter.AddOneStringToPayload(req)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-07-07 19:10:07 +08:00
|
|
|
eventWriter.SetEventTimestamp(ts[pos], ts[pos])
|
2020-12-09 20:07:27 +08:00
|
|
|
case DropCollectionEventType:
|
|
|
|
eventWriter, err := writer.NextDropCollectionEventWriter()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
err = eventWriter.AddOneStringToPayload(req)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-07-07 19:10:07 +08:00
|
|
|
eventWriter.SetEventTimestamp(ts[pos], ts[pos])
|
2020-12-09 20:07:27 +08:00
|
|
|
case CreatePartitionEventType:
|
|
|
|
eventWriter, err := writer.NextCreatePartitionEventWriter()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
err = eventWriter.AddOneStringToPayload(req)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-07-07 19:10:07 +08:00
|
|
|
eventWriter.SetEventTimestamp(ts[pos], ts[pos])
|
2020-12-09 20:07:27 +08:00
|
|
|
case DropPartitionEventType:
|
|
|
|
eventWriter, err := writer.NextDropPartitionEventWriter()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
err = eventWriter.AddOneStringToPayload(req)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-07-07 19:10:07 +08:00
|
|
|
eventWriter.SetEventTimestamp(ts[pos], ts[pos])
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
}
|
2021-07-07 19:10:07 +08:00
|
|
|
writer.SetEventTimeStamp(ts[0], ts[len(ts)-1])
|
2021-10-11 17:28:30 +08:00
|
|
|
|
|
|
|
// https://github.com/milvus-io/milvus/issues/9620
|
2021-10-11 21:02:37 +08:00
|
|
|
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", sizeTotal))
|
2021-10-11 17:28:30 +08:00
|
|
|
|
2021-12-09 12:37:06 +08:00
|
|
|
err = writer.Finish()
|
2020-12-09 20:07:27 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2020-12-10 15:50:09 +08:00
|
|
|
buffer, err = writer.GetBuffer()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2020-12-09 20:07:27 +08:00
|
|
|
blobs = append(blobs, &Blob{
|
2020-12-23 18:06:04 +08:00
|
|
|
Key: DDL,
|
2020-12-23 11:34:35 +08:00
|
|
|
Value: buffer,
|
2020-12-09 20:07:27 +08:00
|
|
|
})
|
|
|
|
|
|
|
|
return blobs, nil
|
|
|
|
}
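// Usage sketch: serializing a pair of DDL requests with their timestamps. The collection ID and
// request payloads are hypothetical example values.
//
//	ddCodec := NewDataDefinitionCodec(1)
//	ddBlobs, err := ddCodec.Serialize(
//		[]Timestamp{100, 200},
//		[]string{"create collection ...", "create partition ..."},
//		[]EventTypeCode{CreateCollectionEventType, CreatePartitionEventType},
//	)
//	if err != nil {
//		// handle error
//	}
//	_ = ddBlobs // two blobs: one keyed Ts, one keyed DDL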
|
|
|
|
|
2021-09-17 11:05:49 +08:00
|
|
|
// Deserialize transfers blobs back to data definition data.
// From the schema, it gets all fields.
// It will sort blobs by blob key, since the blob log index increases over time.
// For each field, it will create a binlog reader and read all events into the buffer.
// It returns the original @ts and @ddRequests in the end.
|
2020-12-10 09:57:14 +08:00
|
|
|
func (dataDefinitionCodec *DataDefinitionCodec) Deserialize(blobs []*Blob) (ts []Timestamp, ddRequests []string, err error) {
|
2020-12-09 20:07:27 +08:00
|
|
|
if len(blobs) == 0 {
|
2021-07-07 19:10:07 +08:00
|
|
|
return nil, nil, fmt.Errorf("blobs is empty")
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
var requestsStrings []string
|
|
|
|
var resultTs []Timestamp
|
2020-12-23 11:34:35 +08:00
|
|
|
|
|
|
|
var blobList BlobList = blobs
|
|
|
|
sort.Sort(blobList)
|
|
|
|
|
|
|
|
for _, blob := range blobList {
|
|
|
|
binlogReader, err := NewBinlogReader(blob.Value)
|
2020-12-09 20:07:27 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, nil, err
|
|
|
|
}
|
2020-12-18 15:21:25 +08:00
|
|
|
dataType := binlogReader.PayloadDataType
|
2020-12-11 11:29:07 +08:00
|
|
|
|
2020-12-23 11:34:35 +08:00
|
|
|
for {
|
2020-12-09 20:07:27 +08:00
|
|
|
eventReader, err := binlogReader.NextEventReader()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
binlogReader.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
return nil, nil, err
|
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
if eventReader == nil {
|
|
|
|
break
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2020-12-23 11:34:35 +08:00
|
|
|
switch dataType {
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_Int64:
|
2020-12-23 11:34:35 +08:00
|
|
|
int64Ts, err := eventReader.GetInt64FromPayload()
|
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2020-12-23 11:34:35 +08:00
|
|
|
return nil, nil, err
|
|
|
|
}
|
|
|
|
for _, singleTs := range int64Ts {
|
|
|
|
resultTs = append(resultTs, Timestamp(singleTs))
|
|
|
|
}
|
2021-03-12 14:22:09 +08:00
|
|
|
case schemapb.DataType_String:
|
2022-03-30 15:21:28 +08:00
|
|
|
stringPayload, err := eventReader.GetStringFromPayload()
|
2020-12-09 20:07:27 +08:00
|
|
|
if err != nil {
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
|
|
|
binlogReader.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
return nil, nil, err
|
|
|
|
}
|
2023-06-07 19:34:36 +08:00
|
|
|
requestsStrings = append(requestsStrings, stringPayload...)
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2021-12-08 21:11:39 +08:00
|
|
|
eventReader.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
2021-12-08 21:11:39 +08:00
|
|
|
binlogReader.Close()
|
2020-12-09 20:07:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return resultTs, requestsStrings, nil
|
|
|
|
}
|