2021-12-28 09:44:06 +08:00
|
|
|
// Licensed to the LF AI & Data foundation under one
|
|
|
|
// or more contributor license agreements. See the NOTICE file
|
|
|
|
// distributed with this work for additional information
|
|
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
|
|
// to you under the Apache License, Version 2.0 (the
|
|
|
|
// "License"); you may not use this file except in compliance
|
2021-05-20 18:38:45 +08:00
|
|
|
// with the License. You may obtain a copy of the License at
|
|
|
|
//
|
2021-12-28 09:44:06 +08:00
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
2021-05-20 18:38:45 +08:00
|
|
|
//
|
2021-12-28 09:44:06 +08:00
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2021-05-20 18:38:45 +08:00
|
|
|
|
|
|
|
package storage
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/json"
|
2023-05-29 10:21:28 +08:00
|
|
|
"fmt"
|
2021-10-13 10:22:33 +08:00
|
|
|
|
|
|
|
"github.com/bits-and-blooms/bloom/v3"
|
2023-09-21 09:45:27 +08:00
|
|
|
|
2023-06-09 01:28:37 +08:00
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
2023-04-06 19:14:32 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/common"
|
2023-05-29 10:21:28 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/log"
|
2023-06-20 16:40:42 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/util/merr"
|
2021-10-13 10:22:33 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
// Bloom filter tuning parameters for primary key statistics.
const (
	// BloomFilterSize is a default bloom filter element count.
	// NOTE(review): not referenced in this part of the file; verify its use against callers.
	// TODO silverxia maybe need set from config
	BloomFilterSize uint = 100000

	// MaxBloomFalsePositive is the false-positive rate handed to
	// bloom.NewWithEstimates when a stats bloom filter is created.
	MaxBloomFalsePositive float64 = 0.005
)
|
|
|
|
|
2022-03-25 14:27:25 +08:00
|
|
|
// PrimaryKeyStats contains statistics data for pk column
|
|
|
|
type PrimaryKeyStats struct {
|
2021-10-13 10:22:33 +08:00
|
|
|
FieldID int64 `json:"fieldID"`
|
2022-03-25 14:27:25 +08:00
|
|
|
Max int64 `json:"max"` // useless, will delete
|
2023-09-21 09:45:27 +08:00
|
|
|
Min int64 `json:"min"` // useless, will delete
|
2021-10-13 10:22:33 +08:00
|
|
|
BF *bloom.BloomFilter `json:"bf"`
|
2022-03-25 14:27:25 +08:00
|
|
|
PkType int64 `json:"pkType"`
|
|
|
|
MaxPk PrimaryKey `json:"maxPk"`
|
|
|
|
MinPk PrimaryKey `json:"minPk"`
|
|
|
|
}
|
|
|
|
|
|
|
|
// UnmarshalJSON unmarshal bytes to PrimaryKeyStats
|
|
|
|
func (stats *PrimaryKeyStats) UnmarshalJSON(data []byte) error {
|
|
|
|
var messageMap map[string]*json.RawMessage
|
|
|
|
err := json.Unmarshal(data, &messageMap)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
err = json.Unmarshal(*messageMap["fieldID"], &stats.FieldID)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
stats.PkType = int64(schemapb.DataType_Int64)
|
2022-03-30 10:15:28 +08:00
|
|
|
if value, ok := messageMap["pkType"]; ok && value != nil {
|
|
|
|
var typeValue int64
|
|
|
|
err = json.Unmarshal(*value, &typeValue)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// valid pkType
|
|
|
|
if typeValue > 0 {
|
|
|
|
stats.PkType = typeValue
|
|
|
|
}
|
2022-03-25 14:27:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
switch schemapb.DataType(stats.PkType) {
|
|
|
|
case schemapb.DataType_Int64:
|
|
|
|
stats.MaxPk = &Int64PrimaryKey{}
|
|
|
|
stats.MinPk = &Int64PrimaryKey{}
|
|
|
|
|
|
|
|
// Compatible with versions that only support int64 type primary keys
|
|
|
|
err = json.Unmarshal(*messageMap["max"], &stats.Max)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
err = stats.MaxPk.SetValue(stats.Max)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
err = json.Unmarshal(*messageMap["min"], &stats.Min)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
err = stats.MinPk.SetValue(stats.Min)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
case schemapb.DataType_VarChar:
|
2022-04-02 17:43:29 +08:00
|
|
|
stats.MaxPk = &VarCharPrimaryKey{}
|
|
|
|
stats.MinPk = &VarCharPrimaryKey{}
|
2023-05-29 10:21:28 +08:00
|
|
|
default:
|
|
|
|
return fmt.Errorf("Invalid PK Data Type")
|
2022-03-25 14:27:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if maxPkMessage, ok := messageMap["maxPk"]; ok && maxPkMessage != nil {
|
|
|
|
err = json.Unmarshal(*maxPkMessage, stats.MaxPk)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if minPkMessage, ok := messageMap["minPk"]; ok && minPkMessage != nil {
|
|
|
|
err = json.Unmarshal(*minPkMessage, stats.MinPk)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if bfMessage, ok := messageMap["bf"]; ok && bfMessage != nil {
|
2022-10-31 17:41:34 +08:00
|
|
|
stats.BF = &bloom.BloomFilter{}
|
2022-03-25 14:27:25 +08:00
|
|
|
err = stats.BF.UnmarshalJSON(*bfMessage)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-05-29 10:21:28 +08:00
|
|
|
func (stats *PrimaryKeyStats) UpdateByMsgs(msgs FieldData) {
|
|
|
|
switch schemapb.DataType(stats.PkType) {
|
2022-04-02 17:43:29 +08:00
|
|
|
case schemapb.DataType_Int64:
|
|
|
|
data := msgs.(*Int64FieldData).Data
|
2022-03-25 14:27:25 +08:00
|
|
|
if len(data) < 1 {
|
|
|
|
// return error: msgs must has one element at least
|
2023-05-29 10:21:28 +08:00
|
|
|
return
|
2022-03-25 14:27:25 +08:00
|
|
|
}
|
|
|
|
|
2021-10-13 10:22:33 +08:00
|
|
|
b := make([]byte, 8)
|
2022-03-25 14:27:25 +08:00
|
|
|
for _, int64Value := range data {
|
2022-04-02 17:43:29 +08:00
|
|
|
pk := NewInt64PrimaryKey(int64Value)
|
2023-05-29 10:21:28 +08:00
|
|
|
stats.UpdateMinMax(pk)
|
2022-03-25 14:27:25 +08:00
|
|
|
common.Endian.PutUint64(b, uint64(int64Value))
|
2021-10-13 10:22:33 +08:00
|
|
|
stats.BF.Add(b)
|
|
|
|
}
|
2022-04-02 17:43:29 +08:00
|
|
|
case schemapb.DataType_VarChar:
|
|
|
|
data := msgs.(*StringFieldData).Data
|
2022-03-25 14:27:25 +08:00
|
|
|
if len(data) < 1 {
|
|
|
|
// return error: msgs must has one element at least
|
2023-05-29 10:21:28 +08:00
|
|
|
return
|
2022-03-25 14:27:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, str := range data {
|
2022-04-02 17:43:29 +08:00
|
|
|
pk := NewVarCharPrimaryKey(str)
|
2023-05-29 10:21:28 +08:00
|
|
|
stats.UpdateMinMax(pk)
|
2022-04-02 17:43:29 +08:00
|
|
|
stats.BF.AddString(str)
|
2022-03-25 14:27:25 +08:00
|
|
|
}
|
|
|
|
default:
|
2023-09-21 09:45:27 +08:00
|
|
|
// TODO::
|
2021-05-20 18:38:45 +08:00
|
|
|
}
|
2023-05-29 10:21:28 +08:00
|
|
|
}
|
2022-03-25 14:27:25 +08:00
|
|
|
|
2023-05-29 10:21:28 +08:00
|
|
|
func (stats *PrimaryKeyStats) Update(pk PrimaryKey) {
|
|
|
|
stats.UpdateMinMax(pk)
|
|
|
|
switch schemapb.DataType(stats.PkType) {
|
|
|
|
case schemapb.DataType_Int64:
|
|
|
|
data := pk.GetValue().(int64)
|
|
|
|
b := make([]byte, 8)
|
|
|
|
common.Endian.PutUint64(b, uint64(data))
|
|
|
|
stats.BF.Add(b)
|
|
|
|
case schemapb.DataType_VarChar:
|
|
|
|
data := pk.GetValue().(string)
|
|
|
|
stats.BF.AddString(data)
|
|
|
|
default:
|
|
|
|
log.Warn("Update pk stats with invalid data type")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// updatePk update minPk and maxPk value
|
|
|
|
func (stats *PrimaryKeyStats) UpdateMinMax(pk PrimaryKey) {
|
|
|
|
if stats.MinPk == nil {
|
|
|
|
stats.MinPk = pk
|
|
|
|
} else if stats.MinPk.GT(pk) {
|
|
|
|
stats.MinPk = pk
|
|
|
|
}
|
|
|
|
|
|
|
|
if stats.MaxPk == nil {
|
|
|
|
stats.MaxPk = pk
|
|
|
|
} else if stats.MaxPk.LT(pk) {
|
|
|
|
stats.MaxPk = pk
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewPrimaryKeyStats(fieldID, pkType, rowNum int64) *PrimaryKeyStats {
|
|
|
|
return &PrimaryKeyStats{
|
|
|
|
FieldID: fieldID,
|
|
|
|
PkType: pkType,
|
|
|
|
BF: bloom.NewWithEstimates(uint(rowNum), MaxBloomFalsePositive),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// StatsWriter serializes primary key statistics and keeps the encoded bytes.
type StatsWriter struct {
	// buffer holds the JSON produced by the last successful Generate or
	// GenerateList call; it is nil until one succeeds.
	buffer []byte
}
|
|
|
|
|
|
|
|
// GetBuffer returns buffer
|
|
|
|
func (sw *StatsWriter) GetBuffer() []byte {
|
|
|
|
return sw.buffer
|
|
|
|
}
|
|
|
|
|
|
|
|
// GenerateList writes Stats slice to buffer
|
|
|
|
func (sw *StatsWriter) GenerateList(stats []*PrimaryKeyStats) error {
|
2021-05-20 18:38:45 +08:00
|
|
|
b, err := json.Marshal(stats)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sw.buffer = b
|
2023-05-29 10:21:28 +08:00
|
|
|
return nil
|
|
|
|
}
|
2021-05-20 18:38:45 +08:00
|
|
|
|
2023-05-29 10:21:28 +08:00
|
|
|
// Generate writes Stats to buffer
|
|
|
|
func (sw *StatsWriter) Generate(stats *PrimaryKeyStats) error {
|
|
|
|
b, err := json.Marshal(stats)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sw.buffer = b
|
2021-05-20 18:38:45 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-05-29 10:21:28 +08:00
|
|
|
// GenerateByData writes Int64Stats or StringStats from @msgs with @fieldID to @buffer
|
|
|
|
func (sw *StatsWriter) GenerateByData(fieldID int64, pkType schemapb.DataType, msgs FieldData) error {
|
|
|
|
stats := &PrimaryKeyStats{
|
|
|
|
FieldID: fieldID,
|
|
|
|
PkType: int64(pkType),
|
|
|
|
BF: bloom.NewWithEstimates(uint(msgs.RowNum()), MaxBloomFalsePositive),
|
|
|
|
}
|
|
|
|
|
|
|
|
stats.UpdateByMsgs(msgs)
|
|
|
|
return sw.Generate(stats)
|
|
|
|
}
|
|
|
|
|
2021-12-17 18:47:56 +08:00
|
|
|
// StatsReader decodes primary key statistics from a JSON byte buffer.
type StatsReader struct {
	// buffer is the raw JSON to decode; it is assigned through SetBuffer.
	buffer []byte
}
|
|
|
|
|
2021-12-07 14:53:13 +08:00
|
|
|
// SetBuffer sets buffer
|
2021-05-20 18:38:45 +08:00
|
|
|
func (sr *StatsReader) SetBuffer(buffer []byte) {
|
|
|
|
sr.buffer = buffer
|
|
|
|
}
|
|
|
|
|
2022-03-25 14:27:25 +08:00
|
|
|
// GetInt64Stats returns buffer as PrimaryKeyStats
|
|
|
|
func (sr *StatsReader) GetPrimaryKeyStats() (*PrimaryKeyStats, error) {
|
|
|
|
stats := &PrimaryKeyStats{}
|
2021-10-13 10:22:33 +08:00
|
|
|
err := json.Unmarshal(sr.buffer, &stats)
|
|
|
|
if err != nil {
|
2023-06-20 16:40:42 +08:00
|
|
|
return nil, merr.WrapErrParameterInvalid(
|
|
|
|
"valid JSON",
|
|
|
|
string(sr.buffer),
|
|
|
|
err.Error())
|
2021-10-13 10:22:33 +08:00
|
|
|
}
|
2022-03-25 14:27:25 +08:00
|
|
|
|
2021-10-13 10:22:33 +08:00
|
|
|
return stats, nil
|
|
|
|
}
|
|
|
|
|
2023-05-29 10:21:28 +08:00
|
|
|
// GetInt64Stats returns buffer as PrimaryKeyStats
|
|
|
|
func (sr *StatsReader) GetPrimaryKeyStatsList() ([]*PrimaryKeyStats, error) {
|
|
|
|
stats := []*PrimaryKeyStats{}
|
|
|
|
err := json.Unmarshal(sr.buffer, &stats)
|
|
|
|
if err != nil {
|
2023-06-20 16:40:42 +08:00
|
|
|
return nil, merr.WrapErrParameterInvalid(
|
|
|
|
"valid JSON",
|
|
|
|
string(sr.buffer),
|
|
|
|
err.Error())
|
2023-05-29 10:21:28 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return stats, nil
|
|
|
|
}
|
|
|
|
|
2022-03-25 14:27:25 +08:00
|
|
|
// DeserializeStats deserialize @blobs as []*PrimaryKeyStats
|
|
|
|
func DeserializeStats(blobs []*Blob) ([]*PrimaryKeyStats, error) {
|
|
|
|
results := make([]*PrimaryKeyStats, 0, len(blobs))
|
2021-10-20 14:26:35 +08:00
|
|
|
for _, blob := range blobs {
|
2023-06-25 11:32:43 +08:00
|
|
|
if len(blob.Value) == 0 {
|
2021-10-13 10:22:33 +08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
sr := &StatsReader{}
|
|
|
|
sr.SetBuffer(blob.Value)
|
2022-03-25 14:27:25 +08:00
|
|
|
stats, err := sr.GetPrimaryKeyStats()
|
2021-10-13 10:22:33 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2021-10-20 14:26:35 +08:00
|
|
|
results = append(results, stats)
|
2021-10-13 10:22:33 +08:00
|
|
|
}
|
|
|
|
return results, nil
|
2021-05-20 18:38:45 +08:00
|
|
|
}
|
2023-05-29 10:21:28 +08:00
|
|
|
|
|
|
|
func DeserializeStatsList(blob *Blob) ([]*PrimaryKeyStats, error) {
|
2023-06-25 11:32:43 +08:00
|
|
|
if len(blob.Value) == 0 {
|
2023-05-29 10:21:28 +08:00
|
|
|
return []*PrimaryKeyStats{}, nil
|
|
|
|
}
|
|
|
|
sr := &StatsReader{}
|
|
|
|
sr.SetBuffer(blob.Value)
|
|
|
|
stats, err := sr.GetPrimaryKeyStatsList()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return stats, nil
|
|
|
|
}
|