milvus/internal/storage/data_codec.go
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
package storage

import (
	"encoding/json"
	"fmt"
	"sort"
	"strconv"
	"strings"

	"github.com/milvus-io/milvus/internal/proto/etcdpb"
	"github.com/milvus-io/milvus/internal/proto/schemapb"
	"github.com/milvus-io/milvus/internal/rootcoord"
	"github.com/milvus-io/milvus/internal/util/typeutil"
)

const (
	// Ts is the blob key of the timestamp binlog produced by DataDefinitionCodec.
	Ts = "ts"
	// DDL is the blob key of the data-definition binlog produced by DataDefinitionCodec.
	DDL = "ddl"
	// IndexParamsFile is the blob key of the serialized index parameters.
	IndexParamsFile = "indexParams"
)

type (
	// UniqueID is an alias of typeutil.UniqueID.
	UniqueID = typeutil.UniqueID
	// FieldID is the unique ID of a field in a collection schema.
	FieldID = typeutil.UniqueID
	// Timestamp is an alias of typeutil.Timestamp.
	Timestamp = typeutil.Timestamp
)

// InvalidUniqueID is the default invalid value of a UniqueID.
const InvalidUniqueID = UniqueID(-1)

// Blob is a pack of key and value.
type Blob struct {
	Key   string
	Value []byte
}

// BlobList implements sort.Interface so that blobs can be ordered by log index.
type BlobList []*Blob

// Len returns the number of blobs in the list.
func (s BlobList) Len() int {
	return len(s)
}

// Less reports whether the blob at index i has a smaller log index than the
// blob at index j. The log index is the last "/"-separated segment of the key.
func (s BlobList) Less(i, j int) bool {
	leftValues := strings.Split(s[i].Key, "/")
	rightValues := strings.Split(s[j].Key, "/")
	left, _ := strconv.ParseInt(leftValues[len(leftValues)-1], 10, 64)
	right, _ := strconv.ParseInt(rightValues[len(rightValues)-1], 10, 64)
	return left < right
}

// Swap swaps the blobs at indexes i and j.
func (s BlobList) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

// GetKey returns the key of the blob.
func (b Blob) GetKey() string {
	return b.Key
}

// GetValue returns the value of the blob.
func (b Blob) GetValue() []byte {
	return b.Value
}

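// exampleSortBlobs is an illustrative sketch, not part of the original file:
// it shows that BlobList orders keys numerically by the trailing log index,
// so ".../2" sorts before ".../10" even though lexicographic order disagrees.
// The keys are hypothetical instances of the blob-key layout documented below.
func exampleSortBlobs() []*Blob {
	blobs := BlobList{
		{Key: "1/insert_log/2/3/4/100/10"},
		{Key: "1/insert_log/2/3/4/100/2"},
	}
	sort.Sort(blobs)
	// blobs[0] now ends in log index 2, blobs[1] in log index 10.
	return blobs
}
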
// FieldData is implemented by the typed per-field containers below.
type FieldData interface{}

type BoolFieldData struct {
	NumRows []int64
	Data    []bool
}
type Int8FieldData struct {
	NumRows []int64
	Data    []int8
}
type Int16FieldData struct {
	NumRows []int64
	Data    []int16
}
type Int32FieldData struct {
	NumRows []int64
	Data    []int32
}
type Int64FieldData struct {
	NumRows []int64
	Data    []int64
}
type FloatFieldData struct {
	NumRows []int64
	Data    []float32
}
type DoubleFieldData struct {
	NumRows []int64
	Data    []float64
}
type StringFieldData struct {
	NumRows []int64
	Data    []string
}
type BinaryVectorFieldData struct {
	NumRows []int64
	Data    []byte
	Dim     int
}
type FloatVectorFieldData struct {
	NumRows []int64
	Data    []float32
	Dim     int
}

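// exampleFloatVectorField is an illustrative sketch, not part of the original
// file: vector data is stored flattened, so a batch of two 4-dimensional
// vectors occupies 2*4 contiguous float32 values, while NumRows records the
// row count of each appended batch. All values here are hypothetical.
func exampleFloatVectorField() *FloatVectorFieldData {
	return &FloatVectorFieldData{
		NumRows: []int64{2},
		Dim:     4,
		Data: []float32{
			0.1, 0.2, 0.3, 0.4, // row 0
			0.5, 0.6, 0.7, 0.8, // row 1
		},
	}
}
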
// system field id:
// 0: unique row id
// 1: timestamp
// 100: first user field id
// 101: second user field id
// 102: ...
// TODO: fill it

// BlobInfo holds the info for each blob.
type BlobInfo struct {
	Length int
}

// InsertData example row_schema: {float_field, int_field, float_vector_field, string_field}
// Data {<0, row_id>, <1, timestamp>, <100, float_field>, <101, int_field>, <102, float_vector_field>, <103, string_field>}
type InsertData struct {
	Data  map[FieldID]FieldData // field id to field data
	Infos []BlobInfo
}

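// exampleInsertData is an illustrative sketch, not part of the original file:
// it assembles an InsertData for a minimal schema following the field-id
// convention above (the system timestamp field plus one user field starting
// at 100). The user field ID, row count and values are hypothetical.
func exampleInsertData() *InsertData {
	return &InsertData{
		Data: map[FieldID]FieldData{
			rootcoord.TimeStampField: &Int64FieldData{NumRows: []int64{2}, Data: []int64{100, 101}},
			100:                      &FloatFieldData{NumRows: []int64{2}, Data: []float32{1.5, 2.5}},
		},
	}
}
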
// InsertCodec serializes and deserializes the insert data.
// Blob key example:
// ${tenant}/insert_log/${collection_id}/${partition_id}/${segment_id}/${field_id}/${log_idx}
type InsertCodec struct {
	Schema          *etcdpb.CollectionMeta
	readerCloseFunc []func() error
}

// NewInsertCodec creates an InsertCodec for the given collection meta.
func NewInsertCodec(schema *etcdpb.CollectionMeta) *InsertCodec {
	return &InsertCodec{Schema: schema}
}

// Serialize transforms insert data into blobs. It first sorts the insert data
// by timestamp. Then, for each field in the schema, it creates a binlog
// writer, writes an insert event to the binlog, and collects the resulting
// buffer. It returns the field blobs and the statistics blobs in the end.
func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID UniqueID, data *InsertData) ([]*Blob, []*Blob, error) {
	var blobs []*Blob
	var statsBlobs []*Blob
	var writer *InsertBinlogWriter
	timeFieldData, ok := data.Data[rootcoord.TimeStampField]
	if !ok {
		return nil, nil, fmt.Errorf("data doesn't contain timestamp field")
	}
	ts := timeFieldData.(*Int64FieldData).Data
	startTs := ts[0]
	endTs := ts[len(ts)-1]
	dataSorter := &DataSorter{
		InsertCodec: insertCodec,
		InsertData:  data,
	}
	sort.Sort(dataSorter)
	for _, field := range insertCodec.Schema.Schema.Fields {
		singleData := data.Data[field.FieldID]
		// encode fields
		writer = NewInsertBinlogWriter(field.DataType, insertCodec.Schema.ID, partitionID, segmentID, field.FieldID)
		eventWriter, err := writer.NextInsertEventWriter()
		if err != nil {
			return nil, nil, err
		}
		eventWriter.SetEventTimestamp(typeutil.Timestamp(startTs), typeutil.Timestamp(endTs))
		switch field.DataType {
		case schemapb.DataType_Bool:
			err = eventWriter.AddBoolToPayload(singleData.(*BoolFieldData).Data)
		case schemapb.DataType_Int8:
			err = eventWriter.AddInt8ToPayload(singleData.(*Int8FieldData).Data)
		case schemapb.DataType_Int16:
			err = eventWriter.AddInt16ToPayload(singleData.(*Int16FieldData).Data)
		case schemapb.DataType_Int32:
			err = eventWriter.AddInt32ToPayload(singleData.(*Int32FieldData).Data)
		case schemapb.DataType_Int64:
			err = eventWriter.AddInt64ToPayload(singleData.(*Int64FieldData).Data)
		case schemapb.DataType_Float:
			err = eventWriter.AddFloatToPayload(singleData.(*FloatFieldData).Data)
		case schemapb.DataType_Double:
			err = eventWriter.AddDoubleToPayload(singleData.(*DoubleFieldData).Data)
		case schemapb.DataType_String:
			for _, singleString := range singleData.(*StringFieldData).Data {
				err = eventWriter.AddOneStringToPayload(singleString)
				if err != nil {
					return nil, nil, err
				}
			}
		case schemapb.DataType_BinaryVector:
			err = eventWriter.AddBinaryVectorToPayload(singleData.(*BinaryVectorFieldData).Data, singleData.(*BinaryVectorFieldData).Dim)
		case schemapb.DataType_FloatVector:
			err = eventWriter.AddFloatVectorToPayload(singleData.(*FloatVectorFieldData).Data, singleData.(*FloatVectorFieldData).Dim)
		default:
			return nil, nil, fmt.Errorf("undefined data type %d", field.DataType)
		}
		if err != nil {
			return nil, nil, err
		}
		writer.SetEventTimeStamp(typeutil.Timestamp(startTs), typeutil.Timestamp(endTs))
		err = writer.Close()
		if err != nil {
			return nil, nil, err
		}
		buffer, err := writer.GetBuffer()
		if err != nil {
			return nil, nil, err
		}
		blobKey := fmt.Sprintf("%d", field.FieldID)
		blobs = append(blobs, &Blob{
			Key:   blobKey,
			Value: buffer,
		})
		// stats fields
		statsWriter := &StatsWriter{}
		switch field.DataType {
		case schemapb.DataType_Int64:
			err = statsWriter.StatsInt64(singleData.(*Int64FieldData).Data)
		}
		if err != nil {
			return nil, nil, err
		}
		statsBuffer := statsWriter.GetBuffer()
		statsBlobs = append(statsBlobs, &Blob{
			Key:   blobKey,
			Value: statsBuffer,
		})
	}
	return blobs, statsBlobs, nil
}

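// exampleSerializeInsert is an illustrative sketch, not part of the original
// file: a typical Serialize call, yielding one field blob plus one stats blob
// per schema field. The meta, IDs and data are assumed to be prepared by the
// caller and to match the collection schema.
func exampleSerializeInsert(meta *etcdpb.CollectionMeta, partitionID, segmentID UniqueID, data *InsertData) ([]*Blob, []*Blob, error) {
	codec := NewInsertCodec(meta)
	return codec.Serialize(partitionID, segmentID, data)
}
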
// Deserialize transforms blobs back into insert data.
// It sorts the blobs by log index, then for each blob it creates a binlog
// reader and reads every event into the result buffers.
// It returns the original partition ID, segment ID and @InsertData in the end.
func (insertCodec *InsertCodec) Deserialize(blobs []*Blob) (partitionID UniqueID, segmentID UniqueID, data *InsertData, err error) {
	if len(blobs) == 0 {
		return InvalidUniqueID, InvalidUniqueID, nil, fmt.Errorf("blobs is empty")
	}
	readerClose := func(reader *BinlogReader) func() error {
		return func() error { return reader.Close() }
	}
	var blobList BlobList = blobs
	sort.Sort(blobList)
	var pID UniqueID
	var sID UniqueID
	resultData := &InsertData{}
	resultData.Data = make(map[FieldID]FieldData)
	for _, blob := range blobList {
		binlogReader, err := NewBinlogReader(blob.Value)
		if err != nil {
			return InvalidUniqueID, InvalidUniqueID, nil, err
		}
		// read partitionID and SegmentID
		pID, sID = binlogReader.PartitionID, binlogReader.SegmentID
		dataType := binlogReader.PayloadDataType
		fieldID := binlogReader.FieldID
		totalLength := 0
		for {
			eventReader, err := binlogReader.NextEventReader()
			if err != nil {
				return InvalidUniqueID, InvalidUniqueID, nil, err
			}
			if eventReader == nil {
				break
			}
			switch dataType {
			case schemapb.DataType_Bool:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &BoolFieldData{}
				}
				boolFieldData := resultData.Data[fieldID].(*BoolFieldData)
				singleData, err := eventReader.GetBoolFromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				boolFieldData.Data = append(boolFieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				boolFieldData.NumRows = append(boolFieldData.NumRows, int64(length))
				resultData.Data[fieldID] = boolFieldData
			case schemapb.DataType_Int8:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &Int8FieldData{}
				}
				int8FieldData := resultData.Data[fieldID].(*Int8FieldData)
				singleData, err := eventReader.GetInt8FromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				int8FieldData.Data = append(int8FieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				int8FieldData.NumRows = append(int8FieldData.NumRows, int64(length))
				resultData.Data[fieldID] = int8FieldData
			case schemapb.DataType_Int16:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &Int16FieldData{}
				}
				int16FieldData := resultData.Data[fieldID].(*Int16FieldData)
				singleData, err := eventReader.GetInt16FromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				int16FieldData.Data = append(int16FieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				int16FieldData.NumRows = append(int16FieldData.NumRows, int64(length))
				resultData.Data[fieldID] = int16FieldData
			case schemapb.DataType_Int32:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &Int32FieldData{}
				}
				int32FieldData := resultData.Data[fieldID].(*Int32FieldData)
				singleData, err := eventReader.GetInt32FromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				int32FieldData.Data = append(int32FieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				int32FieldData.NumRows = append(int32FieldData.NumRows, int64(length))
				resultData.Data[fieldID] = int32FieldData
			case schemapb.DataType_Int64:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &Int64FieldData{}
				}
				int64FieldData := resultData.Data[fieldID].(*Int64FieldData)
				singleData, err := eventReader.GetInt64FromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				int64FieldData.Data = append(int64FieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				int64FieldData.NumRows = append(int64FieldData.NumRows, int64(length))
				resultData.Data[fieldID] = int64FieldData
			case schemapb.DataType_Float:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &FloatFieldData{}
				}
				floatFieldData := resultData.Data[fieldID].(*FloatFieldData)
				singleData, err := eventReader.GetFloatFromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				floatFieldData.Data = append(floatFieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				floatFieldData.NumRows = append(floatFieldData.NumRows, int64(length))
				resultData.Data[fieldID] = floatFieldData
			case schemapb.DataType_Double:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &DoubleFieldData{}
				}
				doubleFieldData := resultData.Data[fieldID].(*DoubleFieldData)
				singleData, err := eventReader.GetDoubleFromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				doubleFieldData.Data = append(doubleFieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				doubleFieldData.NumRows = append(doubleFieldData.NumRows, int64(length))
				resultData.Data[fieldID] = doubleFieldData
			case schemapb.DataType_String:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &StringFieldData{}
				}
				stringFieldData := resultData.Data[fieldID].(*StringFieldData)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				stringFieldData.NumRows = append(stringFieldData.NumRows, int64(length))
				for i := 0; i < length; i++ {
					singleString, err := eventReader.GetOneStringFromPayload(i)
					if err != nil {
						return InvalidUniqueID, InvalidUniqueID, nil, err
					}
					stringFieldData.Data = append(stringFieldData.Data, singleString)
				}
				resultData.Data[fieldID] = stringFieldData
			case schemapb.DataType_BinaryVector:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &BinaryVectorFieldData{}
				}
				binaryVectorFieldData := resultData.Data[fieldID].(*BinaryVectorFieldData)
				var singleData []byte
				singleData, binaryVectorFieldData.Dim, err = eventReader.GetBinaryVectorFromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				binaryVectorFieldData.Data = append(binaryVectorFieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				binaryVectorFieldData.NumRows = append(binaryVectorFieldData.NumRows, int64(length))
				resultData.Data[fieldID] = binaryVectorFieldData
			case schemapb.DataType_FloatVector:
				if resultData.Data[fieldID] == nil {
					resultData.Data[fieldID] = &FloatVectorFieldData{}
				}
				floatVectorFieldData := resultData.Data[fieldID].(*FloatVectorFieldData)
				var singleData []float32
				singleData, floatVectorFieldData.Dim, err = eventReader.GetFloatVectorFromPayload()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				floatVectorFieldData.Data = append(floatVectorFieldData.Data, singleData...)
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return InvalidUniqueID, InvalidUniqueID, nil, err
				}
				totalLength += length
				floatVectorFieldData.NumRows = append(floatVectorFieldData.NumRows, int64(length))
				resultData.Data[fieldID] = floatVectorFieldData
			default:
				return InvalidUniqueID, InvalidUniqueID, nil, fmt.Errorf("undefined data type %d", dataType)
			}
		}
		if fieldID == rootcoord.TimeStampField {
			blobInfo := BlobInfo{
				Length: totalLength,
			}
			resultData.Infos = append(resultData.Infos, blobInfo)
		}
		insertCodec.readerCloseFunc = append(insertCodec.readerCloseFunc, readerClose(binlogReader))
	}
	return pID, sID, resultData, nil
}

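// exampleDeserializeInsert is an illustrative sketch, not part of the original
// file: it restores InsertData from field blobs and releases the underlying
// readers with Close once the data has been read.
func exampleDeserializeInsert(meta *etcdpb.CollectionMeta, blobs []*Blob) (*InsertData, error) {
	codec := NewInsertCodec(meta)
	_, _, data, err := codec.Deserialize(blobs)
	if err != nil {
		return nil, err
	}
	if err := codec.Close(); err != nil {
		return nil, err
	}
	return data, nil
}
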
// Close releases all binlog readers opened by Deserialize.
func (insertCodec *InsertCodec) Close() error {
	for _, closeFunc := range insertCodec.readerCloseFunc {
		err := closeFunc()
		if err != nil {
			return err
		}
	}
	return nil
}

// DataDefinitionCodec serializes and deserializes the data-definition (DDL) log.
// Blob key example:
// ${tenant}/data_definition_log/${collection_id}/ts/${log_idx}
// ${tenant}/data_definition_log/${collection_id}/ddl/${log_idx}
type DataDefinitionCodec struct {
	collectionID    int64
	readerCloseFunc []func() error
}

// NewDataDefinitionCodec creates a DataDefinitionCodec for the given collection.
func NewDataDefinitionCodec(collectionID int64) *DataDefinitionCodec {
	return &DataDefinitionCodec{collectionID: collectionID}
}

// Serialize transforms @ts and @ddRequests into blobs.
// It first writes all timestamps into one int64 binlog. Then, for each
// request, it writes a specific event to a string binlog according to the
// request's data-definition event type.
// It returns the blobs in the end.
func (dataDefinitionCodec *DataDefinitionCodec) Serialize(ts []Timestamp, ddRequests []string, eventTypes []EventTypeCode) ([]*Blob, error) {
	writer := NewDDLBinlogWriter(schemapb.DataType_Int64, dataDefinitionCodec.collectionID)
	var blobs []*Blob
	eventWriter, err := writer.NextCreateCollectionEventWriter()
	if err != nil {
		return nil, err
	}
	var int64Ts []int64
	for _, singleTs := range ts {
		int64Ts = append(int64Ts, int64(singleTs))
	}
	err = eventWriter.AddInt64ToPayload(int64Ts)
	if err != nil {
		return nil, err
	}
	eventWriter.SetEventTimestamp(ts[0], ts[len(ts)-1])
	writer.SetEventTimeStamp(ts[0], ts[len(ts)-1])
	err = writer.Close()
	if err != nil {
		return nil, err
	}
	buffer, err := writer.GetBuffer()
	if err != nil {
		return nil, err
	}
	blobs = append(blobs, &Blob{
		Key:   Ts,
		Value: buffer,
	})
	writer = NewDDLBinlogWriter(schemapb.DataType_String, dataDefinitionCodec.collectionID)
	for pos, req := range ddRequests {
		switch eventTypes[pos] {
		case CreateCollectionEventType:
			eventWriter, err := writer.NextCreateCollectionEventWriter()
			if err != nil {
				return nil, err
			}
			err = eventWriter.AddOneStringToPayload(req)
			if err != nil {
				return nil, err
			}
			eventWriter.SetEventTimestamp(ts[pos], ts[pos])
		case DropCollectionEventType:
			eventWriter, err := writer.NextDropCollectionEventWriter()
			if err != nil {
				return nil, err
			}
			err = eventWriter.AddOneStringToPayload(req)
			if err != nil {
				return nil, err
			}
			eventWriter.SetEventTimestamp(ts[pos], ts[pos])
		case CreatePartitionEventType:
			eventWriter, err := writer.NextCreatePartitionEventWriter()
			if err != nil {
				return nil, err
			}
			err = eventWriter.AddOneStringToPayload(req)
			if err != nil {
				return nil, err
			}
			eventWriter.SetEventTimestamp(ts[pos], ts[pos])
		case DropPartitionEventType:
			eventWriter, err := writer.NextDropPartitionEventWriter()
			if err != nil {
				return nil, err
			}
			err = eventWriter.AddOneStringToPayload(req)
			if err != nil {
				return nil, err
			}
			eventWriter.SetEventTimestamp(ts[pos], ts[pos])
		}
	}
	writer.SetEventTimeStamp(ts[0], ts[len(ts)-1])
	err = writer.Close()
	if err != nil {
		return nil, err
	}
	buffer, err = writer.GetBuffer()
	if err != nil {
		return nil, err
	}
	blobs = append(blobs, &Blob{
		Key:   DDL,
		Value: buffer,
	})
	return blobs, nil
}

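// exampleSerializeDDL is an illustrative sketch, not part of the original
// file: each DDL request string is paired with its timestamp and event type,
// and Serialize produces exactly two blobs, keyed Ts and DDL.
func exampleSerializeDDL(collectionID int64, ts []Timestamp, reqs []string, types []EventTypeCode) ([]*Blob, error) {
	codec := NewDataDefinitionCodec(collectionID)
	return codec.Serialize(ts, reqs, types)
}
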
// Deserialize transforms blobs back into data-definition data.
// It sorts the blobs by key, since the blob log index increases with time.
// For each blob, it creates a binlog reader and reads every event into the
// buffers. It returns the original @ts and @ddRequests in the end.
func (dataDefinitionCodec *DataDefinitionCodec) Deserialize(blobs []*Blob) (ts []Timestamp, ddRequests []string, err error) {
	if len(blobs) == 0 {
		return nil, nil, fmt.Errorf("blobs is empty")
	}
	readerClose := func(reader *BinlogReader) func() error {
		return func() error { return reader.Close() }
	}
	var requestsStrings []string
	var resultTs []Timestamp
	var blobList BlobList = blobs
	sort.Sort(blobList)
	for _, blob := range blobList {
		binlogReader, err := NewBinlogReader(blob.Value)
		if err != nil {
			return nil, nil, err
		}
		dataType := binlogReader.PayloadDataType
		for {
			eventReader, err := binlogReader.NextEventReader()
			if err != nil {
				return nil, nil, err
			}
			if eventReader == nil {
				break
			}
			switch dataType {
			case schemapb.DataType_Int64:
				int64Ts, err := eventReader.GetInt64FromPayload()
				if err != nil {
					return nil, nil, err
				}
				for _, singleTs := range int64Ts {
					resultTs = append(resultTs, Timestamp(singleTs))
				}
			case schemapb.DataType_String:
				length, err := eventReader.GetPayloadLengthFromReader()
				if err != nil {
					return nil, nil, err
				}
				for i := 0; i < length; i++ {
					singleString, err := eventReader.GetOneStringFromPayload(i)
					if err != nil {
						return nil, nil, err
					}
					requestsStrings = append(requestsStrings, singleString)
				}
			}
		}
		dataDefinitionCodec.readerCloseFunc = append(dataDefinitionCodec.readerCloseFunc, readerClose(binlogReader))
	}
	return resultTs, requestsStrings, nil
}

// Close releases all binlog readers opened by Deserialize.
func (dataDefinitionCodec *DataDefinitionCodec) Close() error {
	for _, closeFunc := range dataDefinitionCodec.readerCloseFunc {
		err := closeFunc()
		if err != nil {
			return err
		}
	}
	return nil
}

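// exampleDDLRoundTrip is an illustrative sketch, not part of the original
// file: serializing timestamps and requests and reading them back should
// yield the original values, since Deserialize re-sorts blobs by log index.
func exampleDDLRoundTrip(collectionID int64, ts []Timestamp, reqs []string, types []EventTypeCode) ([]Timestamp, []string, error) {
	codec := NewDataDefinitionCodec(collectionID)
	blobs, err := codec.Serialize(ts, reqs, types)
	if err != nil {
		return nil, nil, err
	}
	gotTs, gotReqs, err := codec.Deserialize(blobs)
	if err != nil {
		return nil, nil, err
	}
	return gotTs, gotReqs, codec.Close()
}
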
//type IndexCodec struct {
//	Base
//	readerCloseFunc []func() error
//}
//
////func (builder *IndexBuilder) Build(fieldData FieldData, typeParams map[string]string, indexParams map[string]string) ([]*Blob, error) {}
//func (indexCodec *IndexCodec) Serialize(indexSlices []*Blob) ([]*Blob, error) {}
//
//// TODO: describe inputs and return
//func (indexCodec *IndexCodec) Deserialize(blobs []*Blob) ([]*Blob, error) {}

// IndexCodec serializes and deserializes index blobs together with their parameters.
type IndexCodec struct {
}

// NewIndexCodec creates an IndexCodec.
func NewIndexCodec() *IndexCodec {
	return &IndexCodec{}
}

// Serialize appends one extra blob, keyed IndexParamsFile, holding the
// JSON-encoded index params, index name and index ID; the index blobs
// themselves pass through unchanged.
func (indexCodec *IndexCodec) Serialize(blobs []*Blob, params map[string]string, indexName string, indexID UniqueID) ([]*Blob, error) {
	paramsBytes, err := json.Marshal(struct {
		Params    map[string]string
		IndexName string
		IndexID   UniqueID
	}{
		Params:    params,
		IndexName: indexName,
		IndexID:   indexID,
	})
	if err != nil {
		return nil, err
	}
	blobs = append(blobs, &Blob{Key: IndexParamsFile, Value: paramsBytes})
	return blobs, nil
}

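// exampleSerializeIndex is an illustrative sketch, not part of the original
// file: the index params map, index name and index ID here are hypothetical;
// Serialize merely attaches them, JSON-encoded, as one additional blob
// alongside the index blobs.
func exampleSerializeIndex(indexBlobs []*Blob) ([]*Blob, error) {
	params := map[string]string{"index_type": "IVF_FLAT", "nlist": "128"}
	return NewIndexCodec().Serialize(indexBlobs, params, "demo_index", UniqueID(1))
}
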
// Deserialize locates the IndexParamsFile blob, removes it from the list, and
// decodes the index params, index name and index ID from it. It returns the
// remaining index blobs together with the decoded values.
func (indexCodec *IndexCodec) Deserialize(blobs []*Blob) ([]*Blob, map[string]string, string, UniqueID, error) {
	var file *Blob
	for i := 0; i < len(blobs); i++ {
		if blobs[i].Key != IndexParamsFile {
			continue
		}
		file = blobs[i]
		blobs = append(blobs[:i], blobs[i+1:]...)
		break
	}
	if file == nil {
		return nil, nil, "", InvalidUniqueID, fmt.Errorf("cannot find index params blob")
	}
	info := struct {
		Params    map[string]string
		IndexName string
		IndexID   UniqueID
	}{}
	if err := json.Unmarshal(file.Value, &info); err != nil {
		return nil, nil, "", InvalidUniqueID, fmt.Errorf("json unmarshal error: %s", err.Error())
	}
	return blobs, info.Params, info.IndexName, info.IndexID, nil
}

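// exampleIndexRoundTrip is an illustrative sketch, not part of the original
// file: Deserialize strips the IndexParamsFile blob back out, so the returned
// blobs, params, name and ID match what was passed to Serialize. The params,
// name and ID used here are hypothetical.
func exampleIndexRoundTrip(indexBlobs []*Blob) ([]*Blob, map[string]string, string, UniqueID, error) {
	params := map[string]string{"index_type": "IVF_FLAT"}
	blobs, err := NewIndexCodec().Serialize(indexBlobs, params, "demo_index", UniqueID(1))
	if err != nil {
		return nil, nil, "", InvalidUniqueID, err
	}
	return NewIndexCodec().Deserialize(blobs)
}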