Add cache for thirdparty downloaded files

Signed-off-by: XuanYang-cn <xuan.yang@zilliz.com>
XuanYang-cn 2020-12-08 18:51:07 +08:00 committed by yefu.chen
parent 6fddb992f4
commit e6f726e73a
40 changed files with 798 additions and 367 deletions

View File

@ -35,10 +35,6 @@ message( STATUS "Build version = ${MILVUS_VERSION}" )
get_last_commit_id( LAST_COMMIT_ID )
message( STATUS "LAST_COMMIT_ID = ${LAST_COMMIT_ID}" )
#configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/src/version.h.in
# ${CMAKE_CURRENT_SOURCE_DIR}/src/version.h @ONLY )
# unset(CMAKE_EXPORT_COMPILE_COMMANDS CACHE)
set( CMAKE_EXPORT_COMPILE_COMMANDS ON )
# **************************** Project ****************************

View File

@ -1,8 +1,9 @@
set(COMMON_SRC
Schema.cpp
)
set(COMMON_SRC
Schema.cpp
Types.cpp
)
add_library(milvus_common
${COMMON_SRC}
)
add_library(milvus_common
${COMMON_SRC}
)
target_link_libraries(milvus_common milvus_proto)

View File

@ -10,18 +10,13 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include "utils/Types.h"
#include "common/Types.h"
#include "utils/Status.h"
#include "utils/EasyAssert.h"
#include <string>
#include <stdexcept>
namespace milvus {
using Timestamp = uint64_t; // TODO: use TiKV-like timestamp
using engine::DataType;
using engine::FieldElementType;
inline int
field_sizeof(DataType data_type, int dim = 1) {
switch (data_type) {
@ -89,7 +84,13 @@ field_is_vector(DataType datatype) {
struct FieldMeta {
public:
FieldMeta(std::string_view name, DataType type, int dim = 1) : name_(name), type_(type), dim_(dim) {
FieldMeta(std::string_view name, DataType type) : name_(name), type_(type) {
Assert(!is_vector());
}
FieldMeta(std::string_view name, DataType type, int64_t dim, MetricType metric_type)
: name_(name), type_(type), vector_info_(VectorInfo{dim, metric_type}) {
Assert(is_vector());
}
bool
@ -98,14 +99,11 @@ struct FieldMeta {
return type_ == DataType::VECTOR_BINARY || type_ == DataType::VECTOR_FLOAT;
}
void
set_dim(int dim) {
dim_ = dim;
}
int
int64_t
get_dim() const {
return dim_;
Assert(is_vector());
Assert(vector_info_.has_value());
return vector_info_->dim_;
}
const std::string&
@ -120,12 +118,20 @@ struct FieldMeta {
int
get_sizeof() const {
return field_sizeof(type_, dim_);
if (is_vector()) {
return field_sizeof(type_, get_dim());
} else {
return field_sizeof(type_, 1);
}
}
private:
struct VectorInfo {
int64_t dim_;
MetricType metric_type_;
};
std::string name_;
DataType type_ = DataType::NONE;
int dim_ = 1;
std::optional<VectorInfo> vector_info_;
};
} // namespace milvus
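For reference, a minimal usage sketch of the two FieldMeta constructors above; the include path and exact byte sizes are assumptions, while the constructor signatures and get_dim() behaviour come from the diff itself:

#include "common/Schema.h"   // assumed include; FieldMeta is defined in the header shown above
#include <cassert>

void field_meta_example() {
    using namespace milvus;
    FieldMeta age("age", DataType::INT32);                                        // scalar field: no dim, no metric
    FieldMeta vec("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);  // vector field: dim + metric type
    assert(vec.get_dim() == 16);   // get_dim() now asserts the field is a vector
    // vec.get_sizeof() should equal field_sizeof(VECTOR_FLOAT, 16); age.get_sizeof() uses dim == 1.
}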

View File

@ -11,35 +11,50 @@
#include "common/Schema.h"
#include <google/protobuf/text_format.h>
#include <boost/lexical_cast.hpp>
namespace milvus {
using std::string;
static std::map<string, string>
RepeatedKeyValToMap(const google::protobuf::RepeatedPtrField<proto::common::KeyValuePair>& kvs) {
std::map<string, string> mapping;
for (auto& kv : kvs) {
AssertInfo(!mapping.count(kv.key()), "repeat key(" + kv.key() + ") in protobuf");
mapping.emplace(kv.key(), kv.value());
}
return mapping;
}
std::shared_ptr<Schema>
Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
auto schema = std::make_shared<Schema>();
schema->set_auto_id(schema_proto.autoid());
for (const milvus::proto::schema::FieldSchema& child : schema_proto.fields()) {
const auto& type_params = child.type_params();
int64_t dim = -1;
auto data_type = DataType(child.data_type());
for (const auto& type_param : type_params) {
if (type_param.key() == "dim") {
dim = strtoll(type_param.value().c_str(), nullptr, 10);
}
}
if (field_is_vector(data_type)) {
AssertInfo(dim != -1, "dim not found");
} else {
AssertInfo(dim == 1 || dim == -1, "Invalid dim field. Should be 1 or not exists");
dim = 1;
}
if (child.is_primary_key()) {
AssertInfo(!schema->primary_key_offset_opt_.has_value(), "repetitive primary key");
schema->primary_key_offset_opt_ = schema->size();
}
schema->AddField(child.name(), data_type, dim);
if (field_is_vector(data_type)) {
auto type_map = RepeatedKeyValToMap(child.type_params());
auto index_map = RepeatedKeyValToMap(child.index_params());
if (!index_map.count("metric_type")) {
auto default_metric_type =
data_type == DataType::VECTOR_FLOAT ? std::string("L2") : std::string("Jaccard");
index_map["metric_type"] = default_metric_type;  // stored as a name; resolved below via GetMetricType
}
AssertInfo(type_map.count("dim"), "dim not found");
auto dim = boost::lexical_cast<int64_t>(type_map.at("dim"));
AssertInfo(index_map.count("metric_type"), "index not found");
auto metric_type = GetMetricType(index_map.at("metric_type"));
schema->AddField(child.name(), data_type, dim, metric_type);
} else {
schema->AddField(child.name(), data_type);
}
}
return schema;
}
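To make the parsing contract above concrete, here is a hypothetical sketch of a CollectionSchema proto it accepts. The setter names mirror the accessors used in the parser (autoid(), fields(), name(), data_type(), type_params(), index_params()); the proto enum type name and the cast are assumptions:

milvus::proto::schema::CollectionSchema schema_proto;
schema_proto.set_autoid(true);

auto* vec = schema_proto.add_fields();
vec->set_name("fakevec");
vec->set_data_type(static_cast<milvus::proto::schema::DataType>(milvus::DataType::VECTOR_FLOAT));
auto* dim = vec->add_type_params();
dim->set_key("dim");
dim->set_value("16");
auto* metric = vec->add_index_params();
metric->set_key("metric_type");
metric->set_value("L2");   // if omitted, the parser falls back to L2 for float vectors, Jaccard for binary

auto* age = schema_proto.add_fields();
age->set_name("age");
age->set_data_type(static_cast<milvus::proto::schema::DataType>(milvus::DataType::INT32));

auto schema = milvus::Schema::ParseFrom(schema_proto);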

View File

@ -24,19 +24,15 @@ namespace milvus {
class Schema {
public:
void
AddField(std::string_view field_name, DataType data_type, int dim = 1) {
auto field_meta = FieldMeta(field_name, data_type, dim);
AddField(std::string_view field_name, DataType data_type) {
auto field_meta = FieldMeta(field_name, data_type);
this->AddField(std::move(field_meta));
}
void
AddField(FieldMeta field_meta) {
auto offset = fields_.size();
fields_.emplace_back(field_meta);
offsets_.emplace(field_meta.get_name(), offset);
auto field_sizeof = field_meta.get_sizeof();
sizeof_infos_.push_back(field_sizeof);
total_sizeof_ += field_sizeof;
AddField(std::string_view field_name, DataType data_type, int64_t dim, MetricType metric_type) {
auto field_meta = FieldMeta(field_name, data_type, dim, metric_type);
this->AddField(std::move(field_meta));
}
void
@ -44,17 +40,6 @@ class Schema {
is_auto_id_ = is_auto_id;
}
auto
begin() {
return fields_.begin();
}
auto
end() {
return fields_.end();
}
public:
bool
get_is_auto_id() const {
return is_auto_id_;
@ -123,11 +108,20 @@ class Schema {
static std::shared_ptr<Schema>
ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto);
void
AddField(FieldMeta&& field_meta) {
auto offset = fields_.size();
fields_.emplace_back(field_meta);
offsets_.emplace(field_meta.get_name(), offset);
auto field_sizeof = field_meta.get_sizeof();
sizeof_infos_.push_back(std::move(field_sizeof));
total_sizeof_ += field_sizeof;
}
private:
// this is where the data is held
std::vector<FieldMeta> fields_;
private:
// a mapping for random access
std::unordered_map<std::string, int> offsets_;
std::vector<int> sizeof_infos_;
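The Schema surface that remains public is the pair of AddField overloads shown above, with the per-field bookkeeping moved into the private AddField(FieldMeta&&). A minimal construction sketch, assuming the common/Schema.h include path (the same pattern the unit tests below use):

#include "common/Schema.h"
#include <memory>

void schema_example() {
    auto schema = std::make_shared<milvus::Schema>();
    schema->AddField("fakevec", milvus::DataType::VECTOR_FLOAT, 16, milvus::MetricType::METRIC_L2);
    schema->AddField("age", milvus::DataType::INT32);
    // offsets_, sizeof_infos_ and total_sizeof_ are kept in sync by the private AddField(FieldMeta&&)
}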

View File

@ -0,0 +1,45 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
//
// Created by mike on 12/3/20.
//
#include "common/Types.h"
#include <knowhere/index/vector_index/helpers/IndexParameter.h>
#include "utils/EasyAssert.h"
#include <boost/bimap.hpp>
#include <boost/algorithm/string/case_conv.hpp>
namespace milvus {
using boost::algorithm::to_lower_copy;
namespace Metric = knowhere::Metric;
static auto map = [] {
boost::bimap<std::string, MetricType> mapping;
using pos = boost::bimap<std::string, MetricType>::value_type;
mapping.insert(pos(to_lower_copy(std::string(Metric::L2)), MetricType::METRIC_L2));
mapping.insert(pos(to_lower_copy(std::string(Metric::IP)), MetricType::METRIC_INNER_PRODUCT));
mapping.insert(pos(to_lower_copy(std::string(Metric::JACCARD)), MetricType::METRIC_Jaccard));
mapping.insert(pos(to_lower_copy(std::string(Metric::TANIMOTO)), MetricType::METRIC_Tanimoto));
mapping.insert(pos(to_lower_copy(std::string(Metric::HAMMING)), MetricType::METRIC_Hamming));
mapping.insert(pos(to_lower_copy(std::string(Metric::SUBSTRUCTURE)), MetricType::METRIC_Substructure));
mapping.insert(pos(to_lower_copy(std::string(Metric::SUPERSTRUCTURE)), MetricType::METRIC_Superstructure));
return mapping;
}();
MetricType
GetMetricType(const std::string& type_name) {
auto real_name = to_lower_copy(type_name);
AssertInfo(map.left.count(real_name), "metric type not found: " + type_name);
return map.left.at(real_name);
}
} // namespace milvus
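A quick usage note on GetMetricType: both the bimap keys and the query string are lower-cased, so the lookup is case-insensitive, and an unknown name trips the AssertInfo. A small sketch, assuming the common/Types.h header below:

#include "common/Types.h"
#include <cassert>

void metric_type_example() {
    assert(milvus::GetMetricType("JACCARD") == milvus::MetricType::METRIC_Jaccard);
    assert(milvus::GetMetricType("l2") == milvus::MetricType::METRIC_L2);
    // milvus::GetMetricType("cosine") would fail with "metric type not found: cosine"
}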

View File

@ -0,0 +1,27 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include "utils/Types.h"
#include <faiss/MetricType.h>
#include <string>
namespace milvus {
using Timestamp = uint64_t; // TODO: use TiKV-like timestamp
using engine::DataType;
using engine::FieldElementType;
using engine::QueryResult;
using MetricType = faiss::MetricType;
faiss::MetricType
GetMetricType(const std::string& type);
} // namespace milvus

View File

@ -11,20 +11,66 @@
#include "BruteForceSearch.h"
#include <vector>
#include <common/Types.h>
#include <boost/dynamic_bitset.hpp>
#include <queue>
namespace milvus::query {
void
BinarySearchBruteForce(faiss::MetricType metric_type,
int64_t code_size,
const uint8_t* binary_chunk,
int64_t chunk_size,
int64_t topk,
int64_t num_queries,
const uint8_t* query_data,
float* result_distances,
idx_t* result_labels,
faiss::ConcurrentBitsetPtr bitset) {
BinarySearchBruteForceNaive(MetricType metric_type,
int64_t code_size,
const uint8_t* binary_chunk,
int64_t chunk_size,
int64_t topk,
int64_t num_queries,
const uint8_t* query_data,
float* result_distances,
idx_t* result_labels,
faiss::ConcurrentBitsetPtr bitset) {
// THIS IS A NAIVE IMPLEMENTATION, ready to be optimized
Assert(metric_type == faiss::METRIC_Jaccard);
Assert(code_size % 4 == 0);
using T = std::tuple<float, int>;
for (int64_t q = 0; q < num_queries; ++q) {
auto query_ptr = query_data + code_size * q;
auto query = boost::dynamic_bitset(query_ptr, query_ptr + code_size);
std::vector<T> max_heap(topk + 1, std::make_tuple(std::numeric_limits<float>::max(), -1));
for (int64_t i = 0; i < chunk_size; ++i) {
auto element_ptr = binary_chunk + code_size * i;
auto element = boost::dynamic_bitset(element_ptr, element_ptr + code_size);
auto the_and = (query & element).count();
auto the_or = (query | element).count();
auto distance = the_or ? (float)(the_or - the_and) / the_or : 0;
if (distance < std::get<0>(max_heap[0])) {
max_heap[topk] = std::make_tuple(distance, i);
std::push_heap(max_heap.begin(), max_heap.end());
std::pop_heap(max_heap.begin(), max_heap.end());
}
}
std::sort(max_heap.begin(), max_heap.end());
for (int k = 0; k < topk; ++k) {
auto info = max_heap[k];
result_distances[k + q * topk] = std::get<0>(info);
result_labels[k + q * topk] = std::get<1>(info);
}
}
}
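The naive kernel above scores each candidate with the Jaccard distance d(A, B) = 1 - |A ∩ B| / |A ∪ B|, which is exactly the (the_or - the_and) / the_or expression. A standalone sketch of that scoring step with the same boost::dynamic_bitset operations:

#include <boost/dynamic_bitset.hpp>

// Jaccard distance between two equal-sized bitsets; an empty union yields 0, matching the code above.
inline float
JaccardDistance(const boost::dynamic_bitset<>& a, const boost::dynamic_bitset<>& b) {
    auto the_and = (a & b).count();
    auto the_or = (a | b).count();
    return the_or ? static_cast<float>(the_or - the_and) / the_or : 0.0f;
}
// e.g. A = {bit0, bit1}, B = {bit1, bit2}: |A ∩ B| = 1, |A ∪ B| = 3, distance = 2/3 ≈ 0.667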
void
BinarySearchBruteForceFast(MetricType metric_type,
int64_t code_size,
const uint8_t* binary_chunk,
int64_t chunk_size,
int64_t topk,
int64_t num_queries,
const uint8_t* query_data,
float* result_distances,
idx_t* result_labels,
faiss::ConcurrentBitsetPtr bitset) {
const idx_t block_size = segcore::DefaultElementPerChunk;
bool use_heap = true;
@ -83,6 +129,21 @@ BinarySearchBruteForce(faiss::MetricType metric_type,
for (int i = 0; i < num_queries; ++i) {
result_distances[i] = static_cast<float>(int_distances[i]);
}
} else {
PanicInfo("Unsupported metric type");
}
}
void
BinarySearchBruteForce(const dataset::BinaryQueryDataset& query_dataset,
const uint8_t* binary_chunk,
int64_t chunk_size,
float* result_distances,
idx_t* result_labels,
faiss::ConcurrentBitsetPtr bitset) {
// TODO: refactor the internal function
BinarySearchBruteForceFast(query_dataset.metric_type, query_dataset.code_size, binary_chunk, chunk_size,
query_dataset.topk, query_dataset.num_queries, query_dataset.query_data,
result_distances, result_labels, bitset);
}
} // namespace milvus::query

View File

@ -15,15 +15,25 @@
#include "common/Schema.h"
namespace milvus::query {
using MetricType = faiss::MetricType;
namespace dataset {
struct BinaryQueryDataset {
MetricType metric_type;
int64_t num_queries;
int64_t topk;
int64_t code_size;
const uint8_t* query_data;
};
} // namespace dataset
void
BinarySearchBruteForce(faiss::MetricType metric_type,
int64_t code_size,
BinarySearchBruteForce(const dataset::BinaryQueryDataset& query_dataset,
const uint8_t* binary_chunk,
int64_t chunk_size,
int64_t topk,
int64_t num_queries,
const uint8_t* query_data,
float* result_distances,
idx_t* result_labels,
faiss::ConcurrentBitsetPtr bitset = nullptr);
} // namespace milvus::query

View File

@ -26,15 +26,25 @@ static std::unique_ptr<VectorPlanNode>
ParseVecNode(Plan* plan, const Json& out_body) {
Assert(out_body.is_object());
// TODO add binary info
auto vec_node = std::make_unique<FloatVectorANNS>();
Assert(out_body.size() == 1);
auto iter = out_body.begin();
std::string field_name = iter.key();
auto& vec_info = iter.value();
Assert(vec_info.is_object());
auto topK = vec_info["topk"];
AssertInfo(topK > 0, "topK must be greater than 0");
AssertInfo(topK < 16384, "topK is too large");
auto field_meta = plan->schema_.operator[](field_name);
auto vec_node = [&]() -> std::unique_ptr<VectorPlanNode> {
auto data_type = field_meta.get_data_type();
if (data_type == DataType::VECTOR_FLOAT) {
return std::make_unique<FloatVectorANNS>();
} else {
return std::make_unique<BinaryVectorANNS>();
}
}();
vec_node->query_info_.topK_ = topK;
vec_node->query_info_.metric_type_ = vec_info.at("metric_type");
vec_node->query_info_.search_params_ = vec_info.at("params");

View File

@ -16,6 +16,7 @@
#include <faiss/utils/distances.h>
#include "utils/tools.h"
#include "query/BruteForceSearch.h"
namespace milvus::query {
using segcore::DefaultElementPerChunk;
@ -41,7 +42,7 @@ QueryBruteForceImpl(const segcore::SegmentSmallIndex& segment,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
segcore::QueryResult& results) {
QueryResult& results) {
auto& schema = segment.get_schema();
auto& indexing_record = segment.get_indexing_record();
auto& record = segment.get_insert_record();
@ -131,7 +132,92 @@ QueryBruteForceImpl(const segcore::SegmentSmallIndex& segment,
}
results.result_ids_ = std::move(final_uids);
// TODO: deprecated code end
return Status::OK();
}
Status
BinaryQueryBruteForceImpl(const segcore::SegmentSmallIndex& segment,
const query::QueryInfo& info,
const uint8_t* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
QueryResult& results) {
auto& schema = segment.get_schema();
auto& indexing_record = segment.get_indexing_record();
auto& record = segment.get_insert_record();
// step 1: binary search to find the barrier of the snapshot
auto ins_barrier = get_barrier(record, timestamp);
auto max_chunk = upper_div(ins_barrier, DefaultElementPerChunk);
auto metric_type = GetMetricType(info.metric_type_);
// auto del_barrier = get_barrier(deleted_record_, timestamp);
#if 0
auto bitmap_holder = get_deleted_bitmap(del_barrier, timestamp, ins_barrier);
Assert(bitmap_holder);
auto bitmap = bitmap_holder->bitmap_ptr;
#endif
// step 2.1: get meta
// step 2.2: get which vector field to search
auto vecfield_offset_opt = schema.get_offset(info.field_id_);
Assert(vecfield_offset_opt.has_value());
auto vecfield_offset = vecfield_offset_opt.value();
auto& field = schema[vecfield_offset];
Assert(field.get_data_type() == DataType::VECTOR_BINARY);
auto dim = field.get_dim();
auto code_size = dim / 8;
auto topK = info.topK_;
auto total_count = topK * num_queries;
// step 3: small indexing search
std::vector<int64_t> final_uids(total_count, -1);
std::vector<float> final_dis(total_count, std::numeric_limits<float>::max());
query::dataset::BinaryQueryDataset query_dataset{metric_type, num_queries, topK, code_size, query_data};
using segcore::BinaryVector;
auto vec_ptr = record.get_entity<BinaryVector>(vecfield_offset);
auto max_indexed_id = 0;
// step 4: brute force search where small indexing is unavailable
for (int chunk_id = max_indexed_id; chunk_id < max_chunk; ++chunk_id) {
std::vector<int64_t> buf_uids(total_count, -1);
std::vector<float> buf_dis(total_count, std::numeric_limits<float>::max());
auto& chunk = vec_ptr->get_chunk(chunk_id);
auto nsize =
chunk_id != max_chunk - 1 ? DefaultElementPerChunk : ins_barrier - chunk_id * DefaultElementPerChunk;
auto bitmap_view = create_bitmap_view(bitmaps_opt, chunk_id);
BinarySearchBruteForce(query_dataset, chunk.data(), nsize, buf_dis.data(), buf_uids.data(), bitmap_view);
// convert chunk uid to segment uid
for (auto& x : buf_uids) {
if (x != -1) {
x += chunk_id * DefaultElementPerChunk;
}
}
segcore::merge_into(num_queries, topK, final_dis.data(), final_uids.data(), buf_dis.data(), buf_uids.data());
}
results.result_distances_ = std::move(final_dis);
results.internal_seg_offsets_ = std::move(final_uids);
results.topK_ = topK;
results.num_queries_ = num_queries;
// TODO: deprecated code begin
final_uids = results.internal_seg_offsets_;
for (auto& id : final_uids) {
if (id == -1) {
continue;
}
id = record.uids_[id];
}
results.result_ids_ = std::move(final_uids);
// TODO: deprecated code end
return Status::OK();
}
} // namespace milvus::query
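A short sketch of the bookkeeping in BinaryQueryBruteForceImpl, with illustrative values only: binary vectors pack 8 dimensions per byte, all queries share one flat result buffer, and per-chunk offsets are shifted into segment offsets before merge_into keeps the best topK:

int64_t dim = 512;                          // FieldMeta::get_dim() of the binary vector field
int64_t code_size = dim / 8;                // 64 bytes per binary vector
int64_t topK = 5, num_queries = 10;
int64_t total_count = topK * num_queries;   // entry for query q, rank k lives at index q * topK + k
// after searching chunk `chunk_id`: segment_offset = chunk_offset + chunk_id * DefaultElementPerChunk,
// except for the -1 sentinel, which is left untouched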

View File

@ -27,5 +27,14 @@ QueryBruteForceImpl(const segcore::SegmentSmallIndex& segment,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmap_opt,
segcore::QueryResult& results);
QueryResult& results);
Status
BinaryQueryBruteForceImpl(const segcore::SegmentSmallIndex& segment,
const query::QueryInfo& info,
const uint8_t* query_data,
int64_t num_queries,
Timestamp timestamp,
std::optional<const BitmapSimple*> bitmaps_opt,
QueryResult& results);
} // namespace milvus::query

View File

@ -18,7 +18,7 @@
#include <unordered_map>
#include <vector>
#include "utils/Types.h"
#include "common/Types.h"
#include "utils/Json.h"
namespace milvus {

View File

@ -28,7 +28,7 @@ class ExecPlanNodeVisitor : PlanNodeVisitor {
visit(BinaryVectorANNS& node) override;
public:
using RetType = segcore::QueryResult;
using RetType = QueryResult;
ExecPlanNodeVisitor(segcore::SegmentBase& segment, Timestamp timestamp, const PlaceholderGroup& placeholder_group)
: segment_(segment), timestamp_(timestamp), placeholder_group_(placeholder_group) {
}

View File

@ -26,7 +26,7 @@ namespace impl {
// WILL BE USED BY GENERATOR UNDER suvlim/core_gen/
class ExecPlanNodeVisitor : PlanNodeVisitor {
public:
using RetType = segcore::QueryResult;
using RetType = QueryResult;
ExecPlanNodeVisitor(segcore::SegmentBase& segment, Timestamp timestamp, const PlaceholderGroup& placeholder_group)
: segment_(segment), timestamp_(timestamp), placeholder_group_(placeholder_group) {
}
@ -75,7 +75,22 @@ ExecPlanNodeVisitor::visit(FloatVectorANNS& node) {
void
ExecPlanNodeVisitor::visit(BinaryVectorANNS& node) {
// TODO
// TODO: optimize here, remove the dynamic cast
assert(!ret_.has_value());
auto segment = dynamic_cast<segcore::SegmentSmallIndex*>(&segment_);
AssertInfo(segment, "support SegmentSmallIndex Only");
RetType ret;
auto& ph = placeholder_group_.at(0);
auto src_data = ph.get_blob<uint8_t>();
auto num_queries = ph.num_of_queries_;
if (node.predicate_.has_value()) {
auto bitmap = ExecExprVisitor(*segment).call_child(*node.predicate_.value());
auto ptr = &bitmap;
BinaryQueryBruteForceImpl(*segment, node.query_info_, src_data, num_queries, timestamp_, ptr, ret);
} else {
BinaryQueryBruteForceImpl(*segment, node.query_info_, src_data, num_queries, timestamp_, std::nullopt, ret);
}
ret_ = ret;
}
} // namespace milvus::query

View File

@ -73,7 +73,24 @@ ShowPlanNodeVisitor::visit(FloatVectorANNS& node) {
void
ShowPlanNodeVisitor::visit(BinaryVectorANNS& node) {
// TODO
assert(!ret_);
auto& info = node.query_info_;
Json json_body{
{"node_type", "BinaryVectorANNS"}, //
{"metric_type", info.metric_type_}, //
{"field_id_", info.field_id_}, //
{"topK", info.topK_}, //
{"search_params", info.search_params_}, //
{"placeholder_tag", node.placeholder_tag_}, //
};
if (node.predicate_.has_value()) {
ShowExprVisitor expr_show;
Assert(node.predicate_.value());
json_body["predicate"] = expr_show.call_child(node.predicate_->operator*());
} else {
json_body["predicate"] = "None";
}
ret_ = json_body;
}
} // namespace milvus::query

View File

@ -123,9 +123,10 @@ Collection::CreateIndex(std::string& index_config) {
void
Collection::parse() {
if (collection_proto_.empty()) {
// TODO: remove hard code once unittests are ready
std::cout << "WARN: Use default schema" << std::endl;
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
schema_ = schema;
return;

View File

@ -226,8 +226,14 @@ class ConcurrentVector : public ConcurrentVectorImpl<Type, true> {
using ConcurrentVectorImpl<Type, true>::ConcurrentVectorImpl;
};
class FloatVector {};
class BinaryVector {};
class VectorTrait {};
class FloatVector : public VectorTrait {
using embedded_type = float;
};
class BinaryVector : public VectorTrait {
using embedded_type = uint8_t;
};
template <>
class ConcurrentVector<FloatVector> : public ConcurrentVectorImpl<float, false> {

View File

@ -85,8 +85,6 @@ IndexingRecord::UpdateResourceAck(int64_t chunk_ack, const InsertRecord& record)
template <typename T>
void
ScalarIndexingEntry<T>::BuildIndexRange(int64_t ack_beg, int64_t ack_end, const VectorBase* vec_base) {
auto dim = field_meta_.get_dim();
auto source = dynamic_cast<const ConcurrentVector<T>*>(vec_base);
Assert(source);
auto chunk_size = source->chunk_size();

View File

@ -24,7 +24,7 @@ namespace milvus {
namespace segcore {
// using engine::DataChunk;
// using engine::DataChunkPtr;
using engine::QueryResult;
using QueryResult = milvus::QueryResult;
struct RowBasedRawData {
void* raw_data; // schema
int sizeof_per_row; // alignment

View File

@ -42,7 +42,7 @@ DeleteSegment(CSegmentBase segment) {
void
DeleteQueryResult(CQueryResult query_result) {
auto res = (milvus::segcore::QueryResult*)query_result;
auto res = (milvus::QueryResult*)query_result;
delete res;
}
@ -134,7 +134,7 @@ Search(CSegmentBase c_segment,
placeholder_groups.push_back((const milvus::query::PlaceholderGroup*)c_placeholder_groups[i]);
}
auto query_result = std::make_unique<milvus::segcore::QueryResult>();
auto query_result = std::make_unique<milvus::QueryResult>();
auto status = CStatus();
try {

View File

@ -42,8 +42,11 @@ EasyAssertInfo(
[[noreturn]] void
ThrowWithTrace(const std::exception& exception) {
if (typeid(exception) == typeid(WrappedRuntimError)) {
throw exception;
}
auto err_msg = exception.what() + std::string("\n") + EasyStackTrace();
throw std::runtime_error(err_msg);
throw WrappedRuntimError(err_msg);
}
} // namespace milvus::impl
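Roughly, the change above means a plain exception gets its message plus a stack trace wrapped into WrappedRuntimError, while an exception that is already a WrappedRuntimError is rethrown as-is so the trace is not appended twice. A minimal sketch of the first case (namespace qualifications assumed from the definition above):

#include "utils/EasyAssert.h"
#include <stdexcept>

void throw_with_trace_example() {
    try {
        milvus::impl::ThrowWithTrace(std::runtime_error("boom"));
    } catch (const milvus::WrappedRuntimError& e) {
        // e.what() starts with "boom" followed by the captured stack trace
    }
}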

View File

@ -11,6 +11,7 @@
#pragma once
#include <string_view>
#include <stdexcept>
#include <exception>
#include <stdio.h>
#include <stdlib.h>
@ -22,6 +23,10 @@ void
EasyAssertInfo(
bool value, std::string_view expr_str, std::string_view filename, int lineno, std::string_view extra_info);
class WrappedRuntimError : public std::runtime_error {
using std::runtime_error::runtime_error;
};
[[noreturn]] void
ThrowWithTrace(const std::exception& exception);

View File

@ -26,6 +26,12 @@ include( FetchContent )
set( FETCHCONTENT_BASE_DIR ${MILVUS_BINARY_DIR}/3rdparty_download )
set( FETCHCONTENT_QUIET OFF )
if( CUSTOM_THIRDPARTY_DOWNLOAD_PATH )
set( THIRDPARTY_DOWNLOAD_PATH ${CUSTOM_THIRDPARTY_DOWNLOAD_PATH} )
else()
set( THIRDPARTY_DOWNLOAD_PATH ${CMAKE_BINARY_DIR}/3rdparty_download/download )
endif()
message( STATUS "Thirdparty downloaded file path: ${THIRDPARTY_DOWNLOAD_PATH}" )
# ----------------------------------------------------------------------
# Find pthreads

View File

@ -24,7 +24,7 @@ FetchContent_Declare(
opentracing
URL ${OPENTRACING_SOURCE_URL}
URL_MD5 "e598ba4b81ae8e1ceed8cd8bbf86f2fd"
DOWNLOAD_DIR ${MILVUS_BINARY_DIR}/3rdparty_download/download
DOWNLOAD_DIR ${THIRDPARTY_DOWNLOAD_PATH}
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/opentracing-src
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/opentracing-build
)

View File

@ -25,7 +25,7 @@ FetchContent_Declare(
protobuf
URL ${GTEST_SOURCE_URL}
URL_MD5 "9562b27cc6ac5ebd087f201f1310c885"
DOWNLOAD_DIR ${MILVUS_BINARY_DIR}/3rdparty_download/download
DOWNLOAD_DIR ${THIRDPARTY_DOWNLOAD_PATH}
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/protobuf-src
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/protobuf-build

View File

@ -23,7 +23,7 @@ FetchContent_Declare(
yaml-cpp
URL ${YAMLCPP_SOURCE_URL}
URL_MD5 "b45bf1089a382e81f6b661062c10d0c2"
DOWNLOAD_DIR ${MILVUS_BINARY_DIR}/3rdparty_download/download
DOWNLOAD_DIR ${THIRDPARTY_DOWNLOAD_PATH}
SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/yaml-src
BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/yaml-build
)

View File

@ -21,7 +21,7 @@ TEST(Binary, Insert) {
int64_t num_queries = 10;
int64_t topK = 5;
auto schema = std::make_shared<Schema>();
schema->AddField("vecbin", DataType::VECTOR_BINARY, 128);
schema->AddField("vecbin", DataType::VECTOR_BINARY, 128, MetricType::METRIC_Jaccard);
schema->AddField("age", DataType::INT64);
auto dataset = DataGen(schema, N, 10);
auto segment = CreateSegment(schema);

View File

@ -98,7 +98,49 @@ TEST(Expr, Range) {
}
})";
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
auto plan = CreatePlan(*schema, dsl_string);
ShowPlanNodeVisitor shower;
Assert(plan->tag2field_.at("$0") == "fakevec");
auto out = shower.call_child(*plan->plan_node_);
std::cout << out.dump(4);
}
TEST(Expr, RangeBinary) {
SUCCEED();
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
std::string dsl_string = R"(
{
"bool": {
"must": [
{
"range": {
"age": {
"GT": 1,
"LT": 100
}
}
},
{
"vector": {
"fakevec": {
"metric_type": "Jaccard",
"params": {
"nprobe": 10
},
"query": "$0",
"topk": 10
}
}
}
]
}
})";
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
schema->AddField("age", DataType::INT32);
auto plan = CreatePlan(*schema, dsl_string);
ShowPlanNodeVisitor shower;
@ -140,7 +182,7 @@ TEST(Expr, InvalidRange) {
}
})";
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
ASSERT_ANY_THROW(CreatePlan(*schema, dsl_string));
}
@ -179,7 +221,7 @@ TEST(Expr, InvalidDSL) {
})";
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
ASSERT_ANY_THROW(CreatePlan(*schema, dsl_string));
}
@ -189,7 +231,7 @@ TEST(Expr, ShowExecutor) {
using namespace milvus::segcore;
auto node = std::make_unique<FloatVectorANNS>();
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
int64_t num_queries = 100L;
auto raw_data = DataGen(schema, num_queries);
auto& info = node->query_info_;
@ -248,7 +290,7 @@ TEST(Expr, TestRange) {
}
})";
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
auto seg = CreateSegment(schema);

View File

@ -235,14 +235,14 @@ TEST(Indexing, IVFFlatNM) {
}
}
TEST(Indexing, DISABLED_BinaryBruteForce) {
TEST(Indexing, BinaryBruteForce) {
int64_t N = 100000;
int64_t num_queries = 10;
int64_t topk = 5;
int64_t dim = 64;
int64_t dim = 512;
auto result_count = topk * num_queries;
auto schema = std::make_shared<Schema>();
schema->AddField("vecbin", DataType::VECTOR_BINARY, dim);
schema->AddField("vecbin", DataType::VECTOR_BINARY, dim, MetricType::METRIC_Jaccard);
schema->AddField("age", DataType::INT64);
auto dataset = DataGen(schema, N, 10);
vector<float> distances(result_count);
@ -250,8 +250,16 @@ TEST(Indexing, DISABLED_BinaryBruteForce) {
auto bin_vec = dataset.get_col<uint8_t>(0);
auto line_sizeof = schema->operator[](0).get_sizeof();
auto query_data = 1024 * line_sizeof + bin_vec.data();
query::BinarySearchBruteForce(faiss::MetricType::METRIC_Jaccard, line_sizeof, bin_vec.data(), N, topk, num_queries,
query_data, distances.data(), ids.data());
query::dataset::BinaryQueryDataset query_dataset{
faiss::MetricType::METRIC_Jaccard, //
num_queries, //
topk, //
line_sizeof, //
query_data //
};
query::BinarySearchBruteForce(query_dataset, bin_vec.data(), N, distances.data(), ids.data());
QueryResult qr;
qr.num_queries_ = num_queries;
qr.topK_ = topk;
@ -264,76 +272,78 @@ TEST(Indexing, DISABLED_BinaryBruteForce) {
[
[
"1024->0.000000",
"86966->0.395349",
"24843->0.404762",
"13806->0.416667",
"44313->0.421053"
"43190->0.578804",
"5255->0.586207",
"23247->0.586486",
"4936->0.588889"
],
[
"1025->0.000000",
"14226->0.348837",
"1488->0.365854",
"47337->0.377778",
"20913->0.377778"
"15147->0.562162",
"49910->0.564304",
"67435->0.567867",
"38292->0.569921"
],
[
"1026->0.000000",
"81882->0.386364",
"9215->0.409091",
"95024->0.409091",
"54987->0.414634"
"15332->0.569061",
"56391->0.572559",
"17187->0.572603",
"26988->0.573771"
],
[
"1027->0.000000",
"68981->0.394737",
"75528->0.404762",
"68794->0.405405",
"21975->0.425000"
"4502->0.559585",
"25879->0.566234",
"66937->0.566489",
"21228->0.566845"
],
[
"1028->0.000000",
"90290->0.375000",
"34309->0.394737",
"58559->0.400000",
"33865->0.400000"
"38490->0.578804",
"12946->0.581717",
"31677->0.582173",
"94474->0.583569"
],
[
"1029->0.000000",
"62722->0.388889",
"89070->0.394737",
"18528->0.414634",
"94971->0.421053"
"59011->0.551630",
"82575->0.555263",
"42914->0.561828",
"23705->0.564171"
],
[
"1030->0.000000",
"67402->0.333333",
"3988->0.347826",
"86376->0.354167",
"84381->0.361702"
"39782->0.579946",
"65553->0.589947",
"82154->0.590028",
"13374->0.590164"
],
[
"1031->0.000000",
"81569->0.325581",
"12715->0.347826",
"40332->0.363636",
"21037->0.372093"
"47826->0.582873",
"72669->0.587432",
"334->0.588076",
"80652->0.589333"
],
[
"1032->0.000000",
"60536->0.428571",
"93293->0.432432",
"70969->0.435897",
"64048->0.450000"
"31968->0.573034",
"63545->0.575758",
"76913->0.575916",
"6286->0.576000"
],
[
"1033->0.000000",
"99022->0.394737",
"11763->0.405405",
"50073->0.428571",
"97118->0.428571"
"95635->0.570248",
"93439->0.574866",
"6709->0.578534",
"6367->0.579634"
]
]
]
)");
ASSERT_EQ(json, ref);
auto json_str = json.dump(2);
auto ref_str = ref.dump(2);
ASSERT_EQ(json_str, ref_str);
}

View File

@ -72,7 +72,7 @@ TEST(Query, ShowExecutor) {
using namespace milvus;
auto node = std::make_unique<FloatVectorANNS>();
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
int64_t num_queries = 100L;
auto raw_data = DataGen(schema, num_queries);
auto& info = node->query_info_;
@ -98,7 +98,7 @@ TEST(Query, DSL) {
"must": [
{
"vector": {
"Vec": {
"fakevec": {
"metric_type": "L2",
"params": {
"nprobe": 10
@ -113,7 +113,7 @@ TEST(Query, DSL) {
})";
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto plan = CreatePlan(*schema, dsl_string);
auto res = shower.call_child(*plan->plan_node_);
@ -123,7 +123,7 @@ TEST(Query, DSL) {
{
"bool": {
"vector": {
"Vec": {
"fakevec": {
"metric_type": "L2",
"params": {
"nprobe": 10
@ -159,7 +159,7 @@ TEST(Query, ParsePlaceholderGroup) {
})";
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
auto plan = CreatePlan(*schema, dsl_string);
int64_t num_queries = 100000;
int dim = 16;
@ -172,7 +172,7 @@ TEST(Query, ExecWithPredicate) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::FLOAT);
std::string dsl = R"({
"bool": {
@ -217,8 +217,8 @@ TEST(Query, ExecWithPredicate) {
int topk = 5;
Json json = QueryResultToJson(qr);
auto ref = Json::parse(R"([
auto ref = Json::parse(R"(
[
[
[
"980486->3.149221",
@ -257,15 +257,14 @@ TEST(Query, ExecWithPredicate) {
]
]
])");
ASSERT_EQ(json, ref);
ASSERT_EQ(json.dump(2), ref.dump(2));
}
TEST(Query, ExecWihtoutPredicate) {
TEST(Query, ExecWithoutPredicate) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::FLOAT);
std::string dsl = R"({
"bool": {
@ -301,18 +300,49 @@ TEST(Query, ExecWihtoutPredicate) {
segment->Search(plan.get(), ph_group_arr.data(), &time, 1, qr);
std::vector<std::vector<std::string>> results;
int topk = 5;
for (int q = 0; q < num_queries; ++q) {
std::vector<std::string> result;
for (int k = 0; k < topk; ++k) {
int index = q * topk + k;
result.emplace_back(std::to_string(qr.result_ids_[index]) + "->" +
std::to_string(qr.result_distances_[index]));
}
results.emplace_back(std::move(result));
}
Json json{results};
std::cout << json.dump(2);
auto json = QueryResultToJson(qr);
auto ref = Json::parse(R"(
[
[
[
"980486->3.149221",
"318367->3.661235",
"302798->4.553688",
"321424->4.757450",
"565529->5.083780"
],
[
"233390->7.931535",
"238958->8.109344",
"230645->8.439169",
"901939->8.658772",
"380328->8.731251"
],
[
"749862->3.398494",
"701321->3.632437",
"897246->3.749835",
"750683->3.897577",
"105995->4.073595"
],
[
"138274->3.454446",
"124548->3.783290",
"840855->4.782170",
"936719->5.026924",
"709627->5.063170"
],
[
"810401->3.926393",
"46575->4.054171",
"201740->4.274491",
"669040->4.399628",
"231500->4.831223"
]
]
]
)");
ASSERT_EQ(json.dump(2), ref.dump(2));
}
TEST(Query, FillSegment) {
@ -331,6 +361,9 @@ TEST(Query, FillSegment) {
auto param = field->add_type_params();
param->set_key("dim");
param->set_value("16");
auto iparam = field->add_index_params();
iparam->set_key("metric_type");
iparam->set_value("L2");
}
{
@ -392,3 +425,57 @@ TEST(Query, FillSegment) {
++std_index;
}
}
TEST(Query, ExecWithPredicateBinary) {
using namespace milvus::query;
using namespace milvus::segcore;
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_BINARY, 512, MetricType::METRIC_Jaccard);
schema->AddField("age", DataType::FLOAT);
std::string dsl = R"({
"bool": {
"must": [
{
"range": {
"age": {
"GE": -1,
"LT": 1
}
}
},
{
"vector": {
"fakevec": {
"metric_type": "Jaccard",
"params": {
"nprobe": 10
},
"query": "$0",
"topk": 5
}
}
}
]
}
})";
int64_t N = 1000 * 1000;
auto dataset = DataGen(schema, N);
auto segment = std::make_unique<SegmentSmallIndex>(schema);
segment->PreInsert(N);
segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);
auto vec_ptr = dataset.get_col<uint8_t>(0);
auto plan = CreatePlan(*schema, dsl);
auto num_queries = 5;
auto ph_group_raw = CreateBinaryPlaceholderGroupFromBlob(num_queries, 512, vec_ptr.data() + 1024 * 512 / 8);
auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
QueryResult qr;
Timestamp time = 1000000;
std::vector<const PlaceholderGroup*> ph_group_arr = {ph_group.get()};
segment->Search(plan.get(), ph_group_arr.data(), &time, 1, qr);
int topk = 5;
Json json = QueryResultToJson(qr);
std::cout << json.dump(2);
// ASSERT_EQ(json.dump(2), ref.dump(2));
}

View File

@ -63,7 +63,7 @@ TEST(SegmentCoreTest, NormalDistributionTest) {
using namespace milvus::segcore;
using namespace milvus::engine;
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
int N = 1000 * 1000;
auto [raw_data, timestamps, uids] = generate_data(N);
@ -76,7 +76,7 @@ TEST(SegmentCoreTest, MockTest) {
using namespace milvus::segcore;
using namespace milvus::engine;
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
std::vector<char> raw_data;
std::vector<Timestamp> timestamps;
@ -116,7 +116,7 @@ TEST(SegmentCoreTest, SmallIndex) {
using namespace milvus::segcore;
using namespace milvus::engine;
auto schema = std::make_shared<Schema>();
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16);
schema->AddField("fakevec", DataType::VECTOR_FLOAT, 16, MetricType::METRIC_L2);
schema->AddField("age", DataType::INT32);
int N = 1024 * 1024;
auto data = DataGen(schema, N);

View File

@ -167,7 +167,7 @@ CreateBinaryPlaceholderGroup(int64_t num_queries, int64_t dim, int64_t seed = 42
ser::PlaceholderGroup raw_group;
auto value = raw_group.add_placeholders();
value->set_tag("$0");
value->set_type(ser::PlaceholderType::VECTOR_FLOAT);
value->set_type(ser::PlaceholderType::VECTOR_BINARY);
std::default_random_engine e(seed);
for (int i = 0; i < num_queries; ++i) {
std::vector<uint8_t> vec;
@ -175,7 +175,27 @@ CreateBinaryPlaceholderGroup(int64_t num_queries, int64_t dim, int64_t seed = 42
vec.push_back(e());
}
// std::string line((char*)vec.data(), (char*)vec.data() + vec.size() * sizeof(float));
value->add_values(vec.data(), vec.size() * sizeof(float));
value->add_values(vec.data(), vec.size());
}
return raw_group;
}
inline auto
CreateBinaryPlaceholderGroupFromBlob(int64_t num_queries, int64_t dim, const uint8_t* ptr) {
assert(dim % 8 == 0);
namespace ser = milvus::proto::service;
ser::PlaceholderGroup raw_group;
auto value = raw_group.add_placeholders();
value->set_tag("$0");
value->set_type(ser::PlaceholderType::VECTOR_BINARY);
for (int i = 0; i < num_queries; ++i) {
std::vector<uint8_t> vec;
for (int d = 0; d < dim / 8; ++d) {
vec.push_back(*ptr);
++ptr;
}
// std::string line((char*)vec.data(), (char*)vec.data() + vec.size() * sizeof(float));
value->add_values(vec.data(), vec.size());
}
return raw_group;
}

View File

@ -4,60 +4,126 @@ project(wrapper)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (NOT GIT_ARROW_REPO)
set(GIT_ARROW_REPO "https://github.com/apache/arrow.git")
endif ()
message(STATUS "Arrow Repo:" ${GIT_ARROW_REPO})
include( ExternalProject )
set( ARROW_VERSION "2.0.0" )
set( ARROW_SOURCE_URL
"https://github.com/apache/arrow/archive/apache-arrow-${ARROW_VERSION}.tar.gz")
if (NOT GIT_ARROW_TAG)
set(GIT_ARROW_TAG "apache-arrow-2.0.0")
endif ()
message(STATUS "Arrow Tag:" ${GIT_ARROW_TAG})
if( CUSTOM_THIRDPARTY_DOWNLOAD_PATH )
set( THIRDPARTY_DOWNLOAD_PATH ${CUSTOM_THIRDPARTY_DOWNLOAD_PATH} )
else()
set( THIRDPARTY_DOWNLOAD_PATH ${CMAKE_BINARY_DIR}/3rdparty_download/download )
endif()
message( STATUS "Thirdparty downloaded file path: ${THIRDPARTY_DOWNLOAD_PATH}" )
###################################################################################################
# - cmake modules ---------------------------------------------------------------------------------
macro( build_arrow )
message( STATUS "Building ARROW-${ARROW_VERSION} from source" )
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" ${CMAKE_MODULE_PATH})
set( ARROW_CMAKE_ARGS
"-DARROW_WITH_LZ4=OFF"
"-DARROW_WITH_ZSTD=OFF"
"-DARROW_WITH_BROTLI=OFF"
"-DARROW_WITH_SNAPPY=OFF"
"-DARROW_WITH_ZLIB=OFF"
"-DARROW_BUILD_STATIC=ON"
"-DARROW_BUILD_SHARED=OFF"
"-DARROW_BOOST_USE_SHARED=OFF"
"-DARROW_BUILD_TESTS=OFF"
"-DARROW_TEST_MEMCHECK=OFF"
"-DARROW_BUILD_BENCHMARKS=OFF"
"-DARROW_CUDA=OFF"
"-DARROW_JEMALLOC=OFF"
"-DARROW_PYTHON=OFF"
"-DARROW_BUILD_UTILITIES=OFF"
"-DARROW_PARQUET=ON"
"-DPARQUET_BUILD_SHARED=OFF"
"-DARROW_S3=OFF"
"-DCMAKE_VERBOSE_MAKEFILE=ON"
"-DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}"
)
###################################################################################################
# - build arrow ------------------------------------------------------------------------------------
ExternalProject_Add(
arrow-ep
PREFIX ${CMAKE_BINARY_DIR}/3rdparty_download/arrow-subbuild
BINARY_DIR arrow-bin
DOWNLOAD_DIR ${THIRDPARTY_DOWNLOAD_PATH}
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}
SOURCE_SUBDIR "cpp"
URL ${ARROW_SOURCE_URL}
URL_MD5 "37fbe42e7f155dd76cf6f63257373b24"
CMAKE_ARGS ${ARROW_CMAKE_ARGS}
${EP_LOG_OPTIONS}
)
message(STATUS "BUILDING ARROW")
include(ConfigureArrow)
ExternalProject_Get_Property( arrow-ep INSTALL_DIR )
ExternalProject_Get_Property( arrow-ep BINARY_DIR )
set( THRIFT_LOCATION ${BINARY_DIR}/thrift_ep-install )
set( UTF8PROC_LOCATION ${BINARY_DIR}/utf8proc_ep-install )
if (ARROW_FOUND)
message(STATUS "Apache Arrow found in ${ARROW_INCLUDE_DIR}")
else ()
message(FATAL_ERROR "Apache Arrow not found, please check your settings.")
endif (ARROW_FOUND)
if( NOT IS_DIRECTORY ${INSTALL_DIR}/include )
file( MAKE_DIRECTORY "${INSTALL_DIR}/include" )
endif()
if( NOT IS_DIRECTORY ${THRIFT_LOCATION}/include )
file( MAKE_DIRECTORY "${THRIFT_LOCATION}/include" )
endif()
if( NOT IS_DIRECTORY ${UTF8PROC_LOCATION}/include )
file( MAKE_DIRECTORY "${UTF8PROC_LOCATION}/include" )
endif()
add_library(arrow STATIC IMPORTED ${ARROW_LIB})
add_library(parquet STATIC IMPORTED ${PARQUET_LIB})
add_library(thrift STATIC IMPORTED ${THRIFT_LIB})
add_library(utf8proc STATIC IMPORTED ${UTF8PROC_LIB})
if (ARROW_FOUND)
set_target_properties(arrow PROPERTIES IMPORTED_LOCATION ${ARROW_LIB})
set_target_properties(parquet PROPERTIES IMPORTED_LOCATION ${PARQUET_LIB})
set_target_properties(thrift PROPERTIES IMPORTED_LOCATION ${THRIFT_LIB})
set_target_properties(utf8proc PROPERTIES IMPORTED_LOCATION ${UTF8PROC_LIB})
endif (ARROW_FOUND)
add_library( thrift STATIC IMPORTED )
set_target_properties( thrift
PROPERTIES
IMPORTED_GLOBAL TRUE
IMPORTED_LOCATION ${THRIFT_LOCATION}/lib/libthrift.a
INTERFACE_INCLUDE_DIRECTORIES ${THRIFT_LOCATION}/include )
add_dependencies(thrift arrow-ep)
###################################################################################################
add_library( utf8proc STATIC IMPORTED )
set_target_properties( utf8proc
PROPERTIES
IMPORTED_GLOBAL TRUE
IMPORTED_LOCATION ${UTF8PROC_LOCATION}/lib/libutf8proc.a
INTERFACE_INCLUDE_DIRECTORIES ${UTF8PROC_LOCATION}/include )
add_dependencies(utf8proc arrow-ep)
include_directories(${ARROW_INCLUDE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
add_library( arrow STATIC IMPORTED )
set_target_properties( arrow
PROPERTIES
IMPORTED_GLOBAL TRUE
IMPORTED_LOCATION ${INSTALL_DIR}/lib/libarrow.a
INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include )
add_dependencies(arrow arrow-ep )
add_library( parquet STATIC IMPORTED )
set_target_properties( parquet
PROPERTIES
IMPORTED_GLOBAL TRUE
IMPORTED_LOCATION ${INSTALL_DIR}/lib/libparquet.a
INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include )
add_dependencies(parquet arrow-ep)
target_link_libraries(parquet INTERFACE arrow thrift utf8proc)
endmacro()
build_arrow()
add_library(wrapper STATIC)
target_sources(wrapper PUBLIC ParquetWrapper.cpp
PayloadStream.cpp)
target_link_libraries(wrapper PUBLIC parquet arrow thrift utf8proc pthread)
target_sources(wrapper PUBLIC ParquetWrapper.cpp PayloadStream.cpp
)
set_target_properties( wrapper PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR} )
target_link_libraries(wrapper PUBLIC parquet pthread)
if(NOT CMAKE_INSTALL_PREFIX)
set(CMAKE_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR})
endif()
install(TARGETS wrapper DESTINATION ${CMAKE_INSTALL_PREFIX})
install(FILES ${ARROW_LIB} ${PARQUET_LIB} ${THRIFT_LIB} ${UTF8PROC_LIB} DESTINATION ${CMAKE_INSTALL_PREFIX})
add_subdirectory(test)
get_target_property( THRIFT_LIB thrift LOCATION )
get_target_property( ARROW_LIB arrow LOCATION )
get_target_property( PARQUET_LIB parquet LOCATION )
get_target_property( UTF8PROC_LIB utf8proc LOCATION )
install(TARGETS wrapper DESTINATION ${CMAKE_INSTALL_PREFIX})
install(
FILES ${ARROW_LIB} ${PARQUET_LIB} ${THRIFT_LIB} ${UTF8PROC_LIB} DESTINATION ${CMAKE_INSTALL_PREFIX})
add_subdirectory(test)

View File

@ -22,11 +22,13 @@ fi
mkdir ${OUTPUT_LIB}
BUILD_TYPE="Debug"
GIT_ARROW_REPO="https://github.com/apache/arrow.git"
GIT_ARROW_TAG="apache-arrow-2.0.0"
CUSTOM_THIRDPARTY_PATH=""
while getopts "a:b:t:h" arg; do
while getopts "a:b:t:h:f:" arg; do
case $arg in
f)
CUSTOM_THIRDPARTY_PATH=$OPTARG
;;
t)
BUILD_TYPE=$OPTARG # BUILD_TYPE
;;
@ -40,6 +42,7 @@ while getopts "a:b:t:h" arg; do
echo "-t: build type(default: Debug)
-a: arrow repo(default: https://github.com/apache/arrow.git)
-b: arrow tag(default: apache-arrow-2.0.0)
-f: custom thirdparty path(default:)
-h: help
"
exit 0
@ -51,8 +54,18 @@ while getopts "a:b:t:h" arg; do
esac
done
echo "BUILD_TYPE: " $BUILD_TYPE
echo "GIT_ARROW_REPO: " $GIT_ARROW_REPO
echo "GIT_ARROW_TAG: " $GIT_ARROW_TAG
echo "CUSTOM_THIRDPARTY_PATH: " $CUSTOM_THIRDPARTY_PATH
pushd ${CMAKE_BUILD}
cmake -DCMAKE_INSTALL_PREFIX=${OUTPUT_LIB} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DGIT_ARROW_REPO=${GIT_ARROW_REPO} -DGIT_ARROW_TAG=${GIT_ARROW_TAG} .. && make && make install
CMAKE_CMD="cmake \
-DCMAKE_INSTALL_PREFIX=${OUTPUT_LIB} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCUSTOM_THIRDPARTY_DOWNLOAD_PATH=${CUSTOM_THIRDPARTY_PATH} .."
${CMAKE_CMD}
echo ${CMAKE_CMD}
if [[ ! ${jobs+1} ]]; then
jobs=$(nproc)
fi
make -j ${jobs} && make install

View File

@ -1,98 +0,0 @@
set(ARROW_ROOT ${CMAKE_BINARY_DIR}/arrow)
set(ARROW_CMAKE_ARGS " -DARROW_WITH_LZ4=OFF"
" -DARROW_WITH_ZSTD=OFF"
" -DARROW_WITH_BROTLI=OFF"
" -DARROW_WITH_SNAPPY=OFF"
" -DARROW_WITH_ZLIB=OFF"
" -DARROW_BUILD_STATIC=ON"
" -DARROW_BUILD_SHARED=OFF"
" -DARROW_BOOST_USE_SHARED=OFF"
" -DARROW_BUILD_TESTS=OFF"
" -DARROW_TEST_MEMCHECK=OFF"
" -DARROW_BUILD_BENCHMARKS=OFF"
" -DARROW_CUDA=OFF"
" -DARROW_JEMALLOC=OFF"
" -DARROW_PYTHON=OFF"
" -DARROW_BUILD_UTILITIES=OFF"
" -DARROW_PARQUET=ON"
" -DPARQUET_BUILD_SHARED=OFF"
" -DARROW_S3=OFF"
" -DCMAKE_VERBOSE_MAKEFILE=ON")
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/Templates/Arrow.CMakeLists.txt.cmake"
"${ARROW_ROOT}/CMakeLists.txt")
file(MAKE_DIRECTORY "${ARROW_ROOT}/build")
file(MAKE_DIRECTORY "${ARROW_ROOT}/install")
execute_process(
COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
RESULT_VARIABLE ARROW_CONFIG
WORKING_DIRECTORY ${ARROW_ROOT})
if(ARROW_CONFIG)
message(FATAL_ERROR "Configuring Arrow failed: " ${ARROW_CONFIG})
endif(ARROW_CONFIG)
#set(PARALLEL_BUILD -j)
#if($ENV{PARALLEL_LEVEL})
# set(NUM_JOBS $ENV{PARALLEL_LEVEL})
# set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}")
#endif($ENV{PARALLEL_LEVEL})
set(NUM_JOBS 4)
set(PARALLEL_BUILD "-j${NUM_JOBS}")
if(${NUM_JOBS})
if(${NUM_JOBS} EQUAL 1)
message(STATUS "ARROW BUILD: Enabling Sequential CMake build")
elseif(${NUM_JOBS} GREATER 1)
message(STATUS "ARROW BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs")
endif(${NUM_JOBS} EQUAL 1)
else()
message(STATUS "ARROW BUILD: Enabling Parallel CMake build with all threads")
endif(${NUM_JOBS})
execute_process(
COMMAND ${CMAKE_COMMAND} --build .. -- ${PARALLEL_BUILD}
RESULT_VARIABLE ARROW_BUILD
WORKING_DIRECTORY ${ARROW_ROOT}/build)
if(ARROW_BUILD)
message(FATAL_ERROR "Building Arrow failed: " ${ARROW_BUILD})
endif(ARROW_BUILD)
message(STATUS "Arrow installed here: " ${ARROW_ROOT}/install)
set(ARROW_LIBRARY_DIR "${ARROW_ROOT}/install/lib")
set(ARROW_INCLUDE_DIR "${ARROW_ROOT}/install/include")
find_library(ARROW_LIB arrow
NO_DEFAULT_PATH
HINTS "${ARROW_LIBRARY_DIR}")
message(STATUS "Arrow library: " ${ARROW_LIB})
find_library(PARQUET_LIB parquet
NO_DEFAULT_PATH
HINTS "${ARROW_LIBRARY_DIR}")
message(STATUS "Parquet library: " ${PARQUET_LIB})
find_library(THRIFT_LIB thrift
NO_DEFAULT_PATH
HINTS "${ARROW_ROOT}/build/thrift_ep-install/lib")
message(STATUS "Thirft library: " ${THRIFT_LIB})
find_library(UTF8PROC_LIB utf8proc
NO_DEFAULT_PATH
HINTS "${ARROW_ROOT}/build/utf8proc_ep-install/lib")
message(STATUS "utf8proc library: " ${UTF8PROC_LIB})
if(ARROW_LIB AND PARQUET_LIB AND THRIFT_LIB AND UTF8PROC_LIB)
set(ARROW_FOUND TRUE)
endif(ARROW_LIB AND PARQUET_LIB AND THRIFT_LIB AND UTF8PROC_LIB)
# message(STATUS "FlatBuffers installed here: " ${FLATBUFFERS_ROOT})
# set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_ROOT}/include")
# set(FLATBUFFERS_LIBRARY_DIR "${FLATBUFFERS_ROOT}/lib")
add_definitions(-DARROW_METADATA_V4)

View File

@ -1,30 +0,0 @@
#=============================================================================
# Copyright (c) 2018-2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================
cmake_minimum_required(VERSION 3.14...3.17 FATAL_ERROR)
project(wrapper-Arrow)
include(ExternalProject)
ExternalProject_Add(Arrow
GIT_REPOSITORY ${GIT_ARROW_REPO}
GIT_TAG ${GIT_ARROW_TAG}
GIT_SHALLOW true
SOURCE_DIR "${ARROW_ROOT}/arrow"
SOURCE_SUBDIR "cpp"
BINARY_DIR "${ARROW_ROOT}/build"
INSTALL_DIR "${ARROW_ROOT}/install"
CMAKE_ARGS ${ARROW_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${ARROW_ROOT}/install)

View File

@ -10,12 +10,9 @@ FetchContent_MakeAvailable(googletest)
target_link_libraries(wrapper_test
gtest_main
pthread
wrapper
parquet arrow thrift utf8proc pthread
parquet
)
install(TARGETS wrapper_test DESTINATION ${CMAKE_INSTALL_PREFIX})
# Defines `gtest_discover_tests()`.
#include(GoogleTest)
#gtest_discover_tests(milvusd_test)

View File

@ -23,11 +23,13 @@ fi
mkdir ${OUTPUT_LIB}
BUILD_TYPE="Debug"
GIT_ARROW_REPO="https://github.com/apache/arrow.git"
GIT_ARROW_TAG="apache-arrow-2.0.0"
CUSTOM_THIRDPARTY_PATH=""
while getopts "a:b:t:h" arg; do
while getopts "a:b:t:h:f:" arg; do
case $arg in
f)
CUSTOM_THIRDPARTY_PATH=$OPTARG
;;
t)
BUILD_TYPE=$OPTARG # BUILD_TYPE
;;
@ -41,6 +43,7 @@ while getopts "a:b:t:h" arg; do
echo "-t: build type(default: Debug)
-a: arrow repo(default: https://github.com/apache/arrow.git)
-b: arrow tag(default: apache-arrow-2.0.0)
-f: custom thirdparty path(default: "")
-h: help
"
exit 0
@ -52,8 +55,18 @@ while getopts "a:b:t:h" arg; do
esac
done
echo "BUILD_TYPE: " $BUILD_TYPE
echo "GIT_ARROW_REPO: " $GIT_ARROW_REPO
echo "GIT_ARROW_TAG: " $GIT_ARROW_TAG
echo "CUSTOM_THIRDPARTY_PATH: " $CUSTOM_THIRDPARTY_PATH
pushd ${CMAKE_BUILD}
cmake -DCMAKE_INSTALL_PREFIX=${OUTPUT_LIB} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DGIT_ARROW_REPO=${GIT_ARROW_REPO} -DGIT_ARROW_TAG=${GIT_ARROW_TAG} ${SRC_DIR} && make && make install
CMAKE_CMD="cmake \
-DCMAKE_INSTALL_PREFIX=${OUTPUT_LIB} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCUSTOM_THIRDPARTY_DOWNLOAD_PATH=${CUSTOM_THIRDPARTY_PATH} ${SRC_DIR}"
${CMAKE_CMD}
echo ${CMAKE_CMD}
if [[ ! ${jobs+1} ]]; then
jobs=$(nproc)
fi
make -j ${jobs} && make install

View File

@ -260,8 +260,8 @@ def gen_binary_default_fields(auto_id=True):
"fields": [
{"name": "int64", "type": DataType.INT64, "is_primary_key": not auto_id},
{"name": "float", "type": DataType.FLOAT},
{"name": default_binary_vec_field_name, "type": DataType.BINARY_VECTOR, "params": {"dim": default_dim}}
],
{"name": default_binary_vec_field_name, "type": DataType.BINARY_VECTOR, "params": {"dim": default_dim}, "indexes": [{"metric_type": "JACCARD"}]}
],
"segment_row_limit": default_segment_row_limit,
"auto_id": auto_id
}