milvus/internal/datanode/writebuffer/insert_buffer.go
congqixia 512ea6be5f
enhance: Avoid merging insert data when buffering insert msgs (#33562)
See also #33561

This PR:
- Uses zero copy when buffering insert messages
- Makes `storage.InsertCodec` support serializing multiple insert data
chunks into the same batch of binlog files

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

2024-06-13 11:15:56 +08:00
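As a rough illustration of what "zero copy" means here (a minimal standalone sketch, not the actual Milvus types: Chunk, mergeBuffer, and appendBuffer are hypothetical stand-ins for storage.InsertData and the write buffer):

package main

import "fmt"

// Chunk stands in for storage.InsertData.
type Chunk struct{ rows []int64 }

// mergeBuffer copies every incoming row into one growing chunk
// (the pre-change behavior, cf. the storage.MergeInsertData call
// left commented out in buffer() below).
type mergeBuffer struct{ merged Chunk }

func (b *mergeBuffer) Buffer(c *Chunk) {
	b.merged.rows = append(b.merged.rows, c.rows...) // copies row data
}

// appendBuffer only records a reference to each chunk (the new behavior);
// chunks are merged later, at serialization time.
type appendBuffer struct{ chunks []*Chunk }

func (b *appendBuffer) Buffer(c *Chunk) {
	b.chunks = append(b.chunks, c) // no row data copied
}

func main() {
	in := []*Chunk{{rows: []int64{1, 2}}, {rows: []int64{3}}}
	ab := &appendBuffer{}
	for _, c := range in {
		ab.Buffer(c)
	}
	fmt.Println(len(ab.chunks)) // 2: both chunks kept as-is
}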

package writebuffer

import (
	"math"

	"github.com/cockroachdb/errors"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/util/paramtable"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

const (
	noLimit int64 = -1
)
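
// BufferBase tracks the statistics shared by write buffers: buffered row count
// and memory size (with optional limits), the covered timestamp range, and the
// earliest start / latest end message positions.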
type BufferBase struct {
	rows      int64
	rowLimit  int64
	size      int64
	sizeLimit int64

	TimestampFrom typeutil.Timestamp
	TimestampTo   typeutil.Timestamp

	startPos *msgpb.MsgPosition
	endPos   *msgpb.MsgPosition
}
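
// UpdateStatistics accumulates the entry count and memory size of a newly
// buffered batch and widens the timestamp and position ranges accordingly.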
func (b *BufferBase) UpdateStatistics(entryNum, size int64, tr TimeRange, startPos, endPos *msgpb.MsgPosition) {
	b.rows += entryNum
	b.size += size

	if tr.timestampMin < b.TimestampFrom {
		b.TimestampFrom = tr.timestampMin
	}
	if tr.timestampMax > b.TimestampTo {
		b.TimestampTo = tr.timestampMax
	}

	if b.startPos == nil || startPos.Timestamp < b.startPos.Timestamp {
		b.startPos = startPos
	}
	if b.endPos == nil || endPos.Timestamp > b.endPos.Timestamp {
		b.endPos = endPos
	}
}
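
// IsFull reports whether the buffered rows or size reached the configured limit.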
func (b *BufferBase) IsFull() bool {
	return (b.rowLimit != noLimit && b.rows >= b.rowLimit) ||
		(b.sizeLimit != noLimit && b.size >= b.sizeLimit)
}
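
// IsEmpty reports whether nothing has been buffered yet.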
func (b *BufferBase) IsEmpty() bool {
	return b.rows == 0 && b.size == 0
}
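
// MinTimestamp returns the timestamp of the earliest buffered start position,
// or math.MaxUint64 if no position has been buffered yet.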
func (b *BufferBase) MinTimestamp() typeutil.Timestamp {
	if b.startPos == nil {
		return math.MaxUint64
	}
	return b.startPos.GetTimestamp()
}
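
// GetTimeRange returns the timestamp range covered by the buffered data.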
func (b *BufferBase) GetTimeRange() *TimeRange {
	return NewTimeRange(b.TimestampFrom, b.TimestampTo)
}
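
// InsertBuffer buffers insert data chunks for a collection until they are
// yielded for serialization into binlog files.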
type InsertBuffer struct {
	BufferBase
	collSchema *schemapb.CollectionSchema

	buffers []*storage.InsertData
}
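
// NewInsertBuffer creates an InsertBuffer for the given collection schema,
// using the configured flush insert buffer size as the size limit.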
func NewInsertBuffer(sch *schemapb.CollectionSchema) (*InsertBuffer, error) {
	estSize, err := typeutil.EstimateSizePerRecord(sch)
	if err != nil {
		log.Warn("failed to estimate size per record", zap.Error(err))
		return nil, err
	}

	if estSize == 0 {
		return nil, errors.New("invalid schema: estimated size per record is zero")
	}

	sizeLimit := paramtable.Get().DataNodeCfg.FlushInsertBufferSize.GetAsInt64()

	ib := &InsertBuffer{
		BufferBase: BufferBase{
			rowLimit:      noLimit,
			sizeLimit:     sizeLimit,
			TimestampFrom: math.MaxUint64,
			TimestampTo:   0,
		},
		collSchema: sch,
	}

	return ib, nil
}
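
// buffer appends one insert data chunk without copying; statistics are
// updated by the exported Buffer method.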
func (ib *InsertBuffer) buffer(inData *storage.InsertData, tr TimeRange, startPos, endPos *msgpb.MsgPosition) {
	// zero copy: keep a reference to the incoming chunk instead of merging it
	// into one large InsertData as before:
	// buffer := ib.currentBuffer()
	// storage.MergeInsertData(buffer.buffer, inData)
	ib.buffers = append(ib.buffers, inData)
}
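
// Yield hands over the buffered chunks and resets the buffer.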
func (ib *InsertBuffer) Yield() []*storage.InsertData {
	result := ib.buffers
	// set buffers to nil so that the fragmented chunks can be GCed once consumed
	ib.buffers = nil
	return result
}
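
// Buffer appends all chunks of the grouped insert data without copying and
// returns the total memory size buffered by this call.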
func (ib *InsertBuffer) Buffer(inData *inData, startPos, endPos *msgpb.MsgPosition) int64 {
	bufferedSize := int64(0)
	for idx, data := range inData.data {
		tsData := inData.tsField[idx]
		tr := ib.getTimestampRange(tsData)
		ib.buffer(data, tr, startPos, endPos)

		// update buffer statistics (row count, size, timestamp and position range);
		// this is done here only, so each chunk is counted exactly once
		ib.UpdateStatistics(int64(data.GetRowNum()), int64(data.GetMemorySize()), tr, startPos, endPos)
		bufferedSize += int64(data.GetMemorySize())
	}
	return bufferedSize
}
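
// getTimestampRange scans a timestamp field column and returns its min/max range.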
func (ib *InsertBuffer) getTimestampRange(tsData *storage.Int64FieldData) TimeRange {
	tr := TimeRange{
		timestampMin: math.MaxUint64,
		timestampMax: 0,
	}
	for _, data := range tsData.Data {
		if uint64(data) < tr.timestampMin {
			tr.timestampMin = typeutil.Timestamp(data)
		}
		if uint64(data) > tr.timestampMax {
			tr.timestampMax = typeutil.Timestamp(data)
		}
	}
	return tr
}