mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-11-30 10:59:32 +08:00
537a817be9
Cherry-pick from master pr: #35380 Related to #35379 Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
251 lines
6.7 KiB
Go
251 lines
6.7 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package storage
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/cockroachdb/errors"
|
|
"github.com/samber/lo"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/internal/util/bloomfilter"
|
|
"github.com/milvus-io/milvus/pkg/common"
|
|
)
|
|
|
|
// PkStatistics contains pk field statistic information for one segment:
// a bloom filter over all pk values plus the min/max pk, which together
// allow cheap "definitely not present" checks before touching segment data.
type PkStatistics struct {
	PkFilter bloomfilter.BloomFilterInterface // bloom filter of pk inside a segment
	MinPK    PrimaryKey                       // minimal pk value, shortcut for checking whether a pk is inside this segment
	MaxPK    PrimaryKey                       // maximal pk value, same above
}
|
|
|
|
// update set pk min/max value if input value is beyond former range.
|
|
func (st *PkStatistics) UpdateMinMax(pk PrimaryKey) error {
|
|
if st == nil {
|
|
return errors.New("nil pk statistics")
|
|
}
|
|
if st.MinPK == nil {
|
|
st.MinPK = pk
|
|
} else if st.MinPK.GT(pk) {
|
|
st.MinPK = pk
|
|
}
|
|
|
|
if st.MaxPK == nil {
|
|
st.MaxPK = pk
|
|
} else if st.MaxPK.LT(pk) {
|
|
st.MaxPK = pk
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (st *PkStatistics) UpdatePKRange(ids FieldData) error {
|
|
switch pks := ids.(type) {
|
|
case *Int64FieldData:
|
|
buf := make([]byte, 8)
|
|
for _, pk := range pks.Data {
|
|
id := NewInt64PrimaryKey(pk)
|
|
err := st.UpdateMinMax(id)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
common.Endian.PutUint64(buf, uint64(pk))
|
|
st.PkFilter.Add(buf)
|
|
}
|
|
case *StringFieldData:
|
|
for _, pk := range pks.Data {
|
|
id := NewVarCharPrimaryKey(pk)
|
|
err := st.UpdateMinMax(id)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
st.PkFilter.AddString(pk)
|
|
}
|
|
default:
|
|
return fmt.Errorf("invalid data type for primary key: %T", ids)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (st *PkStatistics) PkExist(pk PrimaryKey) bool {
|
|
// empty pkStatics
|
|
if st.MinPK == nil || st.MaxPK == nil || st.PkFilter == nil {
|
|
return false
|
|
}
|
|
// check pk range first, ugly but key it for now
|
|
if st.MinPK.GT(pk) || st.MaxPK.LT(pk) {
|
|
return false
|
|
}
|
|
|
|
// if in range, check bloom filter
|
|
switch pk.Type() {
|
|
case schemapb.DataType_Int64:
|
|
buf := make([]byte, 8)
|
|
int64Pk := pk.(*Int64PrimaryKey)
|
|
common.Endian.PutUint64(buf, uint64(int64Pk.Value))
|
|
return st.PkFilter.Test(buf)
|
|
case schemapb.DataType_VarChar:
|
|
varCharPk := pk.(*VarCharPrimaryKey)
|
|
return st.PkFilter.TestString(varCharPk.Value)
|
|
default:
|
|
// TODO::
|
|
}
|
|
// no idea, just make it as false positive
|
|
return true
|
|
}
|
|
|
|
// Locations returns a list of hash locations representing a data item.
|
|
func Locations(pk PrimaryKey, k uint, bfType bloomfilter.BFType) []uint64 {
|
|
switch pk.Type() {
|
|
case schemapb.DataType_Int64:
|
|
buf := make([]byte, 8)
|
|
int64Pk := pk.(*Int64PrimaryKey)
|
|
common.Endian.PutUint64(buf, uint64(int64Pk.Value))
|
|
return bloomfilter.Locations(buf, k, bfType)
|
|
case schemapb.DataType_VarChar:
|
|
varCharPk := pk.(*VarCharPrimaryKey)
|
|
return bloomfilter.Locations([]byte(varCharPk.Value), k, bfType)
|
|
default:
|
|
// TODO::
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (st *PkStatistics) TestLocationCache(lc *LocationsCache) bool {
|
|
// empty pkStatics
|
|
if st.MinPK == nil || st.MaxPK == nil || st.PkFilter == nil {
|
|
return false
|
|
}
|
|
|
|
// check bf first, TestLocation just do some bitset compute, cost is cheaper
|
|
if !st.PkFilter.TestLocations(lc.Locations(st.PkFilter.K(), st.PkFilter.Type())) {
|
|
return false
|
|
}
|
|
|
|
// check pk range after
|
|
return st.MinPK.LE(lc.pk) && st.MaxPK.GE(lc.pk)
|
|
}
|
|
|
|
func (st *PkStatistics) BatchPkExist(lc *BatchLocationsCache, hits []bool) []bool {
|
|
// empty pkStatics
|
|
if st.MinPK == nil || st.MaxPK == nil || st.PkFilter == nil {
|
|
return hits
|
|
}
|
|
|
|
// check bf first, TestLocation just do some bitset compute, cost is cheaper
|
|
locations := lc.Locations(st.PkFilter.K(), st.PkFilter.Type())
|
|
ret := st.PkFilter.BatchTestLocations(locations, hits)
|
|
|
|
// todo: a bit ugly, hits[i]'s value will depends on multi bf in single segment,
|
|
// hits array will be removed after we merge bf in segment
|
|
pks := lc.PKs()
|
|
for i := range ret {
|
|
if !hits[i] {
|
|
hits[i] = ret[i] && st.MinPK.LE(pks[i]) && st.MaxPK.GE(pks[i])
|
|
}
|
|
}
|
|
|
|
return hits
|
|
}
|
|
|
|
// LocationsCache is a helper struct caching pk bloom filter locations.
// Note that this helper is not concurrent safe and shall be used in same goroutine.
type LocationsCache struct {
	pk               PrimaryKey // the primary key the cached locations belong to
	basicBFLocations []uint64   // lazily computed locations for a basic bloom filter
	blockBFLocations []uint64   // single cached hash for a blocked bloom filter
}
|
|
|
|
func (lc *LocationsCache) GetPk() PrimaryKey {
|
|
return lc.pk
|
|
}
|
|
|
|
func (lc *LocationsCache) Locations(k uint, bfType bloomfilter.BFType) []uint64 {
|
|
switch bfType {
|
|
case bloomfilter.BasicBF:
|
|
if int(k) > len(lc.basicBFLocations) {
|
|
lc.basicBFLocations = Locations(lc.pk, k, bfType)
|
|
}
|
|
return lc.basicBFLocations[:k]
|
|
case bloomfilter.BlockedBF:
|
|
// for block bf, we only need cache the hash result, which is a uint and only compute once for any k value
|
|
if len(lc.blockBFLocations) != 1 {
|
|
lc.blockBFLocations = Locations(lc.pk, 1, bfType)
|
|
}
|
|
return lc.blockBFLocations
|
|
default:
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func NewLocationsCache(pk PrimaryKey) *LocationsCache {
|
|
return &LocationsCache{
|
|
pk: pk,
|
|
}
|
|
}
|
|
|
|
// BatchLocationsCache caches bloom filter hash locations for a batch of
// primary keys. Like LocationsCache, it is not concurrent safe and shall be
// used from a single goroutine.
type BatchLocationsCache struct {
	pks []PrimaryKey // the primary keys the cached locations belong to
	k   uint         // largest k the basic locations were computed for

	// for block bf
	blockLocations [][]uint64

	// for basic bf
	basicLocations [][]uint64
}
|
|
|
|
func (lc *BatchLocationsCache) PKs() []PrimaryKey {
|
|
return lc.pks
|
|
}
|
|
|
|
func (lc *BatchLocationsCache) Size() int {
|
|
return len(lc.pks)
|
|
}
|
|
|
|
// Locations lazily computes and caches, per pk, the bloom filter hash
// locations for a filter with k hash functions of type bfType.
// Not concurrent safe; callers must use it from a single goroutine.
func (lc *BatchLocationsCache) Locations(k uint, bfType bloomfilter.BFType) [][]uint64 {
	switch bfType {
	case bloomfilter.BasicBF:
		// recompute only when a caller asks for more locations than ever cached;
		// lc.k tracks the largest k seen so far
		if k > lc.k {
			lc.k = k
			lc.basicLocations = lo.Map(lc.pks, func(pk PrimaryKey, _ int) []uint64 {
				return Locations(pk, lc.k, bfType)
			})
		}

		// serve a length-k prefix of each cached location list
		return lo.Map(lc.basicLocations, func(locations []uint64, _ int) []uint64 { return locations[:k] })
	case bloomfilter.BlockedBF:
		// for block bf, we only need cache the hash result, which is a uint and only compute once for any k value
		if len(lc.blockLocations) != len(lc.pks) {
			// NOTE(review): lc.k may still be zero here (unlike LocationsCache,
			// which passes 1) — presumably the blocked-bf hash ignores k; confirm.
			lc.blockLocations = lo.Map(lc.pks, func(pk PrimaryKey, _ int) []uint64 {
				return Locations(pk, lc.k, bfType)
			})
		}

		return lc.blockLocations
	default:
		return nil
	}
}
|
|
|
|
func NewBatchLocationsCache(pks []PrimaryKey) *BatchLocationsCache {
|
|
return &BatchLocationsCache{
|
|
pks: pks,
|
|
}
|
|
}
|