Add support to build binary index

Signed-off-by: dragondriver <jiquan.long@zilliz.com>
This commit is contained in:
dragondriver 2020-12-28 10:00:02 +08:00 committed by yefu.chen
parent e8ec0424d8
commit 11cef6e978
7 changed files with 263 additions and 83 deletions

View File

@ -68,7 +68,7 @@ IndexWrapper::parse() {
if (!config_.contains(milvus::knowhere::meta::DIM)) {
// should raise exception here?
throw "dim must be specific in type params or index params!";
PanicInfo("dim must be specific in type params or index params!");
} else {
auto dim = config_[milvus::knowhere::meta::DIM].get<std::string>();
config_[milvus::knowhere::meta::DIM] = std::stoi(dim);
@ -130,10 +130,22 @@ IndexWrapper::dim() {
/*
 * brief Train the wrapped index on the dataset and add its vectors without explicit ids.
 *       INDEX_FAISS_BIN_IVFFLAT is rejected up front via PanicInfo.
 */
void
IndexWrapper::BuildWithoutIds(const knowhere::DatasetPtr& dataset) {
    const auto index_type = index_->index_type();

    // Binary IVF-flat has no id-less build path in this wrapper yet; fail before touching the index.
    const bool id_less_build_unsupported = (index_type == milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT);
    if (id_less_build_unsupported) {
        PanicInfo(std::string(index_type) + " doesn't support build without ids yet!");
    }

    // Two-step build: train first, then append the raw vectors.
    index_->Train(dataset, config_);
    index_->AddWithoutIds(dataset, config_);
}
/*
 * brief Build the wrapped index from a dataset that carries explicit row ids.
 *       The dataset is asserted to contain a milvus::knowhere::meta::IDS entry
 *       before it is handed to the index.
 */
void
IndexWrapper::BuildWithIds(const knowhere::DatasetPtr& dataset) {
// Refuse datasets that do not carry an IDS entry.
Assert(dataset->data().find(milvus::knowhere::meta::IDS) != dataset->data().end());
// index_->Train(dataset, config_);
// index_->Add(dataset, config_);
// NOTE(review): BuildAll presumably covers the Train + Add steps commented out above - confirm against knowhere.
index_->BuildAll(dataset, config_);
}
/*
* brief Return serialized binary set
*/

View File

@ -45,6 +45,10 @@ class IndexWrapper {
std::optional<T>
get_config_by_name(std::string name);
public:
void
BuildWithIds(const knowhere::DatasetPtr& dataset);
private:
knowhere::VecIndexPtr index_ = nullptr;
std::string type_params_;

View File

@ -28,7 +28,7 @@ DeleteIndex(CIndex index) {
}
void
BuildFloatVecIndex(CIndex index, int64_t float_value_num, const float* vectors) {
BuildFloatVecIndexWithoutIds(CIndex index, int64_t float_value_num, const float* vectors) {
auto cIndex = (milvus::indexbuilder::IndexWrapper*)index;
auto dim = cIndex->dim();
auto row_nums = float_value_num / dim;
@ -36,6 +36,15 @@ BuildFloatVecIndex(CIndex index, int64_t float_value_num, const float* vectors)
cIndex->BuildWithoutIds(ds);
}
// C entry point: builds the index behind `index` from `data_size` elements of packed
// binary vector data, without explicit row ids.
// NOTE(review): BuildWithoutIds panics for INDEX_FAISS_BIN_IVFFLAT, so this call cannot
// succeed for that index type - confirm how the failure is meant to reach the caller.
void
BuildBinaryVecIndexWithoutIds(CIndex index, int64_t data_size, const uint8_t* vectors) {
    auto wrapper = (milvus::indexbuilder::IndexWrapper*)index;
    auto dim = wrapper->dim();

    // Each uint8_t contributes 8 bits; the row count is the total bit count divided by dim.
    // Presumably dim is expressed in bits for binary vectors - TODO confirm.
    auto total_bits = data_size * 8;
    auto row_nums = total_bits / dim;

    auto binary_dataset = milvus::knowhere::GenDataset(row_nums, dim, vectors);
    wrapper->BuildWithoutIds(binary_dataset);
}
char*
SerializeToSlicedBuffer(CIndex index, int32_t* buffer_size) {
auto cIndex = (milvus::indexbuilder::IndexWrapper*)index;

View File

@ -41,7 +41,10 @@ void
DeleteIndex(CIndex index);
void
BuildFloatVecIndex(CIndex index, int64_t float_value_num, const float* vectors);
BuildFloatVecIndexWithoutIds(CIndex index, int64_t float_value_num, const float* vectors);
void
BuildBinaryVecIndexWithoutIds(CIndex index, int64_t data_size, const uint8_t* vectors);
char*
SerializeToSlicedBuffer(CIndex index, int32_t* buffer_size);

View File

@ -10,7 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <tuple>
#include <random>
#include <map>
#include <gtest/gtest.h>
#include "pb/index_cgo_msg.pb.h"
@ -24,7 +24,7 @@
namespace indexcgo = milvus::proto::indexcgo;
constexpr int64_t DIM = 4;
constexpr int64_t DIM = 8;
constexpr int64_t NB = 10000;
constexpr int64_t NQ = 10;
constexpr int64_t K = 4;
@ -32,8 +32,8 @@ constexpr auto METRIC_TYPE = milvus::knowhere::Metric::L2;
namespace {
auto
generate_conf(const milvus::knowhere::IndexType& type) {
if (type == milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
generate_conf(const milvus::knowhere::IndexType& index_type, const milvus::knowhere::MetricType& metric_type) {
if (index_type == milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ) {
return milvus::knowhere::Config{
{milvus::knowhere::meta::DIM, DIM},
{milvus::knowhere::meta::TOPK, K},
@ -41,19 +41,36 @@ generate_conf(const milvus::knowhere::IndexType& type) {
// {milvus::knowhere::IndexParams::nprobe, 4},
{milvus::knowhere::IndexParams::m, 4},
{milvus::knowhere::IndexParams::nbits, 8},
{milvus::knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
{milvus::knowhere::Metric::TYPE, metric_type},
{milvus::knowhere::INDEX_FILE_SLICE_SIZE_IN_MEGABYTE, 4},
};
} else if (index_type == milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT) {
return milvus::knowhere::Config{
{milvus::knowhere::meta::DIM, DIM},
{milvus::knowhere::meta::TOPK, K},
{milvus::knowhere::IndexParams::nlist, 100},
// {milvus::knowhere::IndexParams::nprobe, 4},
{milvus::knowhere::IndexParams::m, 4},
{milvus::knowhere::IndexParams::nbits, 8},
{milvus::knowhere::Metric::TYPE, metric_type},
{milvus::knowhere::INDEX_FILE_SLICE_SIZE_IN_MEGABYTE, 4},
};
} else if (index_type == milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP) {
return milvus::knowhere::Config{
{milvus::knowhere::meta::DIM, DIM},
{milvus::knowhere::meta::TOPK, K},
{milvus::knowhere::Metric::TYPE, metric_type},
};
}
return milvus::knowhere::Config();
}
auto
generate_params() {
generate_params(const milvus::knowhere::IndexType& index_type, const milvus::knowhere::MetricType& metric_type) {
indexcgo::TypeParams type_params;
indexcgo::IndexParams index_params;
auto configs = generate_conf(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ);
auto configs = generate_conf(index_type, metric_type);
for (auto& [key, value] : configs.items()) {
auto param = index_params.add_params();
auto value_str = value.is_string() ? value.get<std::string>() : value.dump();
@ -61,106 +78,231 @@ generate_params() {
param->set_value(value_str);
}
auto param = index_params.add_params();
param->set_key("index_type");
param->set_value(std::string(index_type));
return std::make_tuple(type_params, index_params);
}
// Generates N rows of test data through milvus::segcore::DataGen using a single-field schema:
// a DIM-wide binary vector field when is_binary is set, otherwise a DIM-wide float vector field.
auto
GenDataset(int64_t N, milvus::knowhere::MetricType metric_type, bool is_binary) {
    auto schema = std::make_shared<milvus::Schema>();
    const auto faiss_metric_type = milvus::knowhere::GetMetricType(metric_type);

    // Only the field name and data type differ between the two flavours.
    if (is_binary) {
        schema->AddField("fakebinvec", milvus::engine::DataType::VECTOR_BINARY, DIM, faiss_metric_type);
    } else {
        schema->AddField("fakevec", milvus::engine::DataType::VECTOR_FLOAT, DIM, faiss_metric_type);
    }

    return milvus::segcore::DataGen(schema, N);
}
} // namespace
TEST(IndexWrapperTest, Constructor) {
auto [type_params, index_params] = generate_params();
using Param = std::pair<milvus::knowhere::IndexType, milvus::knowhere::MetricType>;
// Value-parameterized fixture shared by the IndexWrapper tests.
// Each parameter is an (index type, metric type) pair. SetUp() derives from it:
//   - the serialized type/index params used to construct an IndexWrapper,
//   - a generated base dataset of NB rows (float or binary, depending on the index type).
class IndexWrapperTest : public ::testing::TestWithParam<Param> {
 protected:
    void
    SetUp() override {
        auto param = GetParam();
        index_type = param.first;
        metric_type = param.second;
        std::tie(type_params, index_params) = generate_params(index_type, metric_type);

        // Every index type the fixture is instantiated with must be listed here.
        // The lookup uses at() on purpose: operator[] would silently default-insert
        // `false` for an unlisted type and feed float data to a binary index.
        const std::map<std::string, bool> is_binary_map = {
            {milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ, false},
            {milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT, true},
            {milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP, true},
        };
        is_binary = is_binary_map.at(index_type);

        bool ok;
        ok = type_params.SerializeToString(&type_params_str);
        assert(ok);
        ok = index_params.SerializeToString(&index_params_str);
        assert(ok);

        auto dataset = GenDataset(NB, metric_type, is_binary);
        if (!is_binary) {
            // Float index: plain dataset without ids.
            xb_data = dataset.get_col<float>(0);
            xb_dataset = milvus::knowhere::GenDataset(NB, DIM, xb_data.data());
        } else if (index_type == milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT) {
            // Binary IVF-flat: attach ids 0..NB-1 to the dataset.
            xb_bin_data = dataset.get_col<uint8_t>(0);
            ids.resize(NB);
            std::iota(ids.begin(), ids.end(), 0);
            xb_dataset = milvus::knowhere::GenDatasetWithIds(NB, DIM, xb_bin_data.data(), ids.data());
        } else {
            // Remaining binary index types: dataset without ids.
            xb_bin_data = dataset.get_col<uint8_t>(0);
            xb_dataset = milvus::knowhere::GenDataset(NB, DIM, xb_bin_data.data());
        }
    }

    void
    TearDown() override {
    }

 protected:
    std::string index_type, metric_type;
    indexcgo::TypeParams type_params;
    indexcgo::IndexParams index_params;
    std::string type_params_str, index_params_str;
    bool is_binary;
    // xb_dataset refers to the buffers below (built from their .data() pointers),
    // so they are kept as members.
    milvus::knowhere::DatasetPtr xb_dataset;
    std::vector<float> xb_data;
    std::vector<uint8_t> xb_bin_data;
    std::vector<milvus::knowhere::IDType> ids;
};
// Builds an IVFPQ index directly through knowhere (no IndexWrapper involved):
// training and id-less insertion of NB generated float vectors must not throw.
TEST(PQ, Build) {
    const auto index_type = milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ;
    const auto metric_type = milvus::knowhere::Metric::L2;
    auto conf = generate_conf(index_type, metric_type);
    auto index = milvus::knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);

    auto generated = GenDataset(NB, metric_type, false);
    auto raw_vectors = generated.get_col<float>(0);
    auto xb_dataset = milvus::knowhere::GenDataset(NB, DIM, raw_vectors.data());

    ASSERT_NO_THROW(index->Train(xb_dataset, conf));
    ASSERT_NO_THROW(index->AddWithoutIds(xb_dataset, conf));
}
// Builds a binary IVF-flat index directly through knowhere:
// BuildAll over NB generated binary vectors with ids 0..NB-1 must not throw.
TEST(BINFLAT, Build) {
    const auto index_type = milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT;
    const auto metric_type = milvus::knowhere::Metric::JACCARD;
    auto conf = generate_conf(index_type, metric_type);
    auto index = milvus::knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);

    auto generated = GenDataset(NB, metric_type, true);
    auto raw_vectors = generated.get_col<uint8_t>(0);

    // Sequential row ids: 0, 1, ..., NB - 1.
    std::vector<milvus::knowhere::IDType> ids(NB);
    std::iota(ids.begin(), ids.end(), 0);

    auto xb_dataset = milvus::knowhere::GenDatasetWithIds(NB, DIM, raw_vectors.data(), ids.data());
    ASSERT_NO_THROW(index->BuildAll(xb_dataset, conf));
}
// Builds a binary id-map index directly through knowhere:
// BuildAll over NB generated binary vectors with ids 0..NB-1 must not throw.
TEST(BINIDMAP, Build) {
    const auto index_type = milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP;
    const auto metric_type = milvus::knowhere::Metric::JACCARD;
    auto conf = generate_conf(index_type, metric_type);
    auto index = milvus::knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type);

    auto generated = GenDataset(NB, metric_type, true);
    auto raw_vectors = generated.get_col<uint8_t>(0);

    // Sequential row ids: 0, 1, ..., NB - 1.
    std::vector<milvus::knowhere::IDType> ids(NB);
    std::iota(ids.begin(), ids.end(), 0);

    auto xb_dataset = milvus::knowhere::GenDatasetWithIds(NB, DIM, raw_vectors.data(), ids.data());
    ASSERT_NO_THROW(index->BuildAll(xb_dataset, conf));
}
// Builds an IVFPQ index through IndexWrapper from serialized params:
// the id-less build over NB generated float vectors must not throw.
TEST(PQWrapper, Build) {
    const auto index_type = milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ;
    const auto metric_type = milvus::knowhere::Metric::L2;

    // Serialize both protobuf messages; the wrapper is constructed from their bytes.
    auto [type_params, index_params] = generate_params(index_type, metric_type);
    std::string type_params_str;
    std::string index_params_str;
    bool ok = type_params.SerializeToString(&type_params_str);
    assert(ok);
    ok = index_params.SerializeToString(&index_params_str);
    assert(ok);

    auto generated = GenDataset(NB, metric_type, false);
    auto raw_vectors = generated.get_col<float>(0);
    auto xb_dataset = milvus::knowhere::GenDataset(NB, DIM, raw_vectors.data());

    auto index =
        std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
    ASSERT_NO_THROW(index->BuildWithoutIds(xb_dataset));
}
// Builds a binary IVF-flat index through IndexWrapper from serialized params:
// the id-less build is expected to throw, the build with ids must not.
TEST(BinFlatWrapper, Build) {
    const auto index_type = milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT;
    const auto metric_type = milvus::knowhere::Metric::JACCARD;

    // Serialize both protobuf messages; the wrapper is constructed from their bytes.
    auto [type_params, index_params] = generate_params(index_type, metric_type);
    std::string type_params_str;
    std::string index_params_str;
    bool ok = type_params.SerializeToString(&type_params_str);
    assert(ok);
    ok = index_params.SerializeToString(&index_params_str);
    assert(ok);

    auto generated = GenDataset(NB, metric_type, true);
    auto raw_vectors = generated.get_col<uint8_t>(0);

    // Sequential row ids: 0, 1, ..., NB - 1.
    std::vector<milvus::knowhere::IDType> ids(NB);
    std::iota(ids.begin(), ids.end(), 0);
    auto xb_dataset = milvus::knowhere::GenDatasetWithIds(NB, DIM, raw_vectors.data(), ids.data());

    auto index =
        std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
    ASSERT_ANY_THROW(index->BuildWithoutIds(xb_dataset));
    ASSERT_NO_THROW(index->BuildWithIds(xb_dataset));
}
// Builds a binary id-map index through IndexWrapper from serialized params:
// both the id-less build and the build with ids must not throw.
TEST(BinIdMapWrapper, Build) {
    const auto index_type = milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP;
    const auto metric_type = milvus::knowhere::Metric::JACCARD;

    // Serialize both protobuf messages; the wrapper is constructed from their bytes.
    auto [type_params, index_params] = generate_params(index_type, metric_type);
    std::string type_params_str;
    std::string index_params_str;
    bool ok = type_params.SerializeToString(&type_params_str);
    assert(ok);
    ok = index_params.SerializeToString(&index_params_str);
    assert(ok);

    auto generated = GenDataset(NB, metric_type, true);
    auto raw_vectors = generated.get_col<uint8_t>(0);

    // Sequential row ids: 0, 1, ..., NB - 1.
    std::vector<milvus::knowhere::IDType> ids(NB);
    std::iota(ids.begin(), ids.end(), 0);
    auto xb_dataset = milvus::knowhere::GenDatasetWithIds(NB, DIM, raw_vectors.data(), ids.data());

    auto index =
        std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
    ASSERT_NO_THROW(index->BuildWithoutIds(xb_dataset));
    ASSERT_NO_THROW(index->BuildWithIds(xb_dataset));
}
// Runs every TEST_P(IndexWrapperTest, ...) once per (index type, metric type) pair listed below:
// IVFPQ with L2, and the two binary index types with JACCARD.
INSTANTIATE_TEST_CASE_P(IndexTypeParameters,
IndexWrapperTest,
::testing::Values(std::pair(milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ,
milvus::knowhere::Metric::L2),
std::pair(milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT,
milvus::knowhere::Metric::JACCARD),
std::pair(milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP,
milvus::knowhere::Metric::JACCARD)));
// Constructs an IndexWrapper from the fixture's serialized params.
// There are no assertions: the test only exercises the constructor.
TEST_P(IndexWrapperTest, Constructor) {
auto index =
std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
}
TEST(IndexWrapperTest, Dim) {
auto [type_params, index_params] = generate_params();
std::string type_params_str, index_params_str;
bool ok;
ok = type_params.SerializeToString(&type_params_str);
assert(ok);
ok = index_params.SerializeToString(&index_params_str);
assert(ok);
// Checks that a wrapper built from the fixture's serialized params reports the DIM
// value that generate_conf put into the config.
TEST_P(IndexWrapperTest, Dim) {
auto index =
std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
ASSERT_EQ(index->dim(), DIM);
}
TEST(IndexWrapperTest, BuildWithoutIds) {
auto [type_params, index_params] = generate_params();
std::string type_params_str, index_params_str;
bool ok;
ok = type_params.SerializeToString(&type_params_str);
assert(ok);
ok = index_params.SerializeToString(&index_params_str);
assert(ok);
TEST_P(IndexWrapperTest, BuildWithoutIds) {
auto index =
std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
auto schema = std::make_shared<milvus::Schema>();
schema->AddField("fakevec", milvus::engine::DataType::VECTOR_FLOAT, DIM, faiss::MetricType::METRIC_L2);
auto dataset = milvus::segcore::DataGen(schema, NB);
auto xb_data = dataset.get_col<float>(0);
index->BuildWithoutIds(milvus::knowhere::GenDataset(NB, DIM, xb_data.data()));
if (index_type == milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT) {
ASSERT_ANY_THROW(index->BuildWithoutIds(xb_dataset));
} else {
ASSERT_NO_THROW(index->BuildWithoutIds(xb_dataset));
}
}
// Serialize/Load round trip on a raw knowhere IVFPQ index (no IndexWrapper involved):
// train and fill an index with NB generated float vectors, serialize it, then load the
// binary set into a second index of the same type.
// There are no assertions: the test only exercises the calls.
TEST(IndexWrapperTest, Load) {
auto type = milvus::knowhere::IndexEnum::INDEX_FAISS_IVFPQ;
auto index = milvus::knowhere::VecIndexFactory::GetInstance().CreateVecIndex(type);
auto conf = generate_conf(type);
// Single float-vector field, DIM wide, L2 metric.
auto schema = std::make_shared<milvus::Schema>();
schema->AddField("fakevec", milvus::engine::DataType::VECTOR_FLOAT, DIM, faiss::MetricType::METRIC_L2);
auto dataset = milvus::segcore::DataGen(schema, NB);
auto xb_data = dataset.get_col<float>(0);
auto ds = milvus::knowhere::GenDataset(NB, DIM, xb_data.data());
index->Train(ds, conf);
index->AddWithoutIds(ds, conf);
// Alternative build path with explicit ids, kept disabled:
// std::vector<int64_t> ids(NB);
// std::iota(ids.begin(), ids.end(), 0); // range(0, NB)
// auto ds = milvus::knowhere::GenDatasetWithIds(NB, DIM, xb_data.data(), ids.data());
// index->Train(ds, conf);
// index->Add(ds, conf);
auto binary_set = index->Serialize(conf);
// Load the serialized form into a fresh index of the same type.
auto copy_index = milvus::knowhere::VecIndexFactory::GetInstance().CreateVecIndex(type);
copy_index->Load(binary_set);
}
TEST(IndexWrapperTest, Codec) {
auto [type_params, index_params] = generate_params();
std::string type_params_str, index_params_str;
bool ok;
ok = type_params.SerializeToString(&type_params_str);
assert(ok);
ok = index_params.SerializeToString(&index_params_str);
assert(ok);
TEST_P(IndexWrapperTest, Codec) {
auto index =
std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
auto schema = std::make_shared<milvus::Schema>();
schema->AddField("fakevec", milvus::engine::DataType::VECTOR_FLOAT, DIM, faiss::MetricType::METRIC_L2);
auto dataset = milvus::segcore::DataGen(schema, NB);
auto xb_data = dataset.get_col<float>(0);
index->BuildWithoutIds(milvus::knowhere::GenDataset(NB, DIM, xb_data.data()));
if (index_type == milvus::knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT) {
ASSERT_ANY_THROW(index->BuildWithoutIds(xb_dataset));
ASSERT_NO_THROW(index->BuildWithIds(xb_dataset));
} else {
ASSERT_NO_THROW(index->BuildWithoutIds(xb_dataset));
}
auto binary = index->Serialize();
auto copy_index =
std::make_unique<milvus::indexbuilder::IndexWrapper>(type_params_str.c_str(), index_params_str.c_str());
copy_index->Load(binary.data, binary.size);
ASSERT_NO_THROW(copy_index->Load(binary.data, binary.size));
ASSERT_EQ(copy_index->dim(), copy_index->dim());
auto copy_binary = copy_index->Serialize();
ASSERT_EQ(binary.size, copy_binary.size);

View File

@ -27,7 +27,8 @@ type Blob = storage.Blob
type Index interface {
Serialize() ([]*Blob, error)
Load([]*Blob) error
BuildFloatVecIndex(vectors []float32) error
BuildFloatVecIndexWithoutIds(vectors []float32) error
BuildBinaryVecIndexWithoutIds(vectors []byte) error
Delete() error
}
@ -78,12 +79,21 @@ func (index *CIndex) Load(blobs []*Blob) error {
return nil
}
func (index *CIndex) BuildFloatVecIndex(vectors []float32) error {
func (index *CIndex) BuildFloatVecIndexWithoutIds(vectors []float32) error {
/*
void
BuildFloatVecIndex(CIndex index, int64_t float_value_num, const float* vectors);
BuildFloatVecIndexWithoutIds(CIndex index, int64_t float_value_num, const float* vectors);
*/
C.BuildFloatVecIndex(index.indexPtr, (C.int64_t)(len(vectors)), (*C.float)(&vectors[0]))
C.BuildFloatVecIndexWithoutIds(index.indexPtr, (C.int64_t)(len(vectors)), (*C.float)(&vectors[0]))
return nil
}
// BuildBinaryVecIndexWithoutIds hands the raw binary vector bytes to the C function
// BuildBinaryVecIndexWithoutIds, passing len(vectors) as data_size and a pointer to the
// first byte. It always returns nil: the C function returns void, so no failure is
// reported through the error value.
// NOTE(review): &vectors[0] panics on an empty slice - confirm callers never pass one,
// or add a length check.
func (index *CIndex) BuildBinaryVecIndexWithoutIds(vectors []byte) error {
/*
void
BuildBinaryVecIndexWithoutIds(CIndex index, int64_t data_size, const uint8_t* vectors);
*/
C.BuildBinaryVecIndexWithoutIds(index.indexPtr, (C.int64_t)(len(vectors)), (*C.uint8_t)(&vectors[0]))
return nil
}

View File

@ -227,7 +227,7 @@ func (it *IndexBuildTask) Execute() error {
return errors.New("we expect FloatVectorFieldData or BinaryVectorFieldData")
}
err = it.index.BuildFloatVecIndex(floatVectorFieldData.Data)
err = it.index.BuildFloatVecIndexWithoutIds(floatVectorFieldData.Data)
if err != nil {
return err
}