2023-09-19 14:23:23 +08:00
|
|
|
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
|
|
|
// with the License. You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
|
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
|
|
// or implied. See the License for the specific language governing permissions and limitations under the License
|
|
|
|
|
|
|
|
#include <gtest/gtest.h>
|
|
|
|
#include <random>
|
|
|
|
|
|
|
|
#include "common/Array.h"
|
|
|
|
|
|
|
|
TEST(Array, TestConstructArray) {
|
|
|
|
using namespace milvus;
|
|
|
|
|
|
|
|
int N = 10;
|
|
|
|
milvus::proto::schema::ScalarField field_int_data;
|
|
|
|
milvus::proto::plan::Array field_int_array;
|
|
|
|
field_int_array.set_same_type(true);
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
field_int_data.mutable_int_data()->add_data(i);
|
|
|
|
field_int_array.mutable_array()->Add()->set_int64_val(i);
|
|
|
|
}
|
|
|
|
auto int_array = Array(field_int_data);
|
|
|
|
ASSERT_EQ(N, int_array.length());
|
|
|
|
ASSERT_EQ(N * sizeof(int), int_array.byte_size());
|
|
|
|
for (int i = 0; i < N; ++i) {
|
|
|
|
ASSERT_EQ(int_array.get_data<int>(i), i);
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(int_array.is_same_array(field_int_array));
|
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704
Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.
Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.
Not supported: `ARRAY` and `JSON`.
Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search now. We will treat every row as a whole keyword instead of
tokenizing it into multiple terms.
- The inverted index don't support retrieval well, so if you create
inverted index for field, those operations which depend on the raw data
will fallback to use chunk storage, which will bring some performance
loss. For example, comparisons between two columns and retrieval of
output fields.
The inverted index is very easy to be used.
Taking below collection as an example:
```python
fields = [
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="int8", dtype=DataType.INT8),
FieldSchema(name="int16", dtype=DataType.INT16),
FieldSchema(name="int32", dtype=DataType.INT32),
FieldSchema(name="int64", dtype=DataType.INT64),
FieldSchema(name="float", dtype=DataType.FLOAT),
FieldSchema(name="double", dtype=DataType.DOUBLE),
FieldSchema(name="bool", dtype=DataType.BOOL),
FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```
Then we can simply create inverted index for field via:
```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```
Then, term query and range query on the field can be speed up
automatically by the inverted index:
```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```
---------
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
2023-12-31 19:50:47 +08:00
|
|
|
auto int_array_tmp = Array(const_cast<char*>(int_array.data()),
|
|
|
|
int_array.byte_size(),
|
|
|
|
int_array.get_element_type(),
|
2023-11-07 23:38:21 +08:00
|
|
|
{});
|
|
|
|
auto int_8_array = Array(const_cast<char*>(int_array.data()),
|
|
|
|
int_array.byte_size(),
|
|
|
|
DataType::INT8,
|
|
|
|
{});
|
|
|
|
ASSERT_EQ(int_array.length(), int_8_array.length());
|
|
|
|
auto int_16_array = Array(const_cast<char*>(int_array.data()),
|
|
|
|
int_array.byte_size(),
|
|
|
|
DataType::INT16,
|
|
|
|
{});
|
|
|
|
ASSERT_EQ(int_array.length(), int_16_array.length());
|
|
|
|
ASSERT_TRUE(int_array_tmp == int_array);
|
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704
Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.
Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.
Not supported: `ARRAY` and `JSON`.
Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search now. We will treat every row as a whole keyword instead of
tokenizing it into multiple terms.
- The inverted index don't support retrieval well, so if you create
inverted index for field, those operations which depend on the raw data
will fallback to use chunk storage, which will bring some performance
loss. For example, comparisons between two columns and retrieval of
output fields.
The inverted index is very easy to be used.
Taking below collection as an example:
```python
fields = [
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="int8", dtype=DataType.INT8),
FieldSchema(name="int16", dtype=DataType.INT16),
FieldSchema(name="int32", dtype=DataType.INT32),
FieldSchema(name="int64", dtype=DataType.INT64),
FieldSchema(name="float", dtype=DataType.FLOAT),
FieldSchema(name="double", dtype=DataType.DOUBLE),
FieldSchema(name="bool", dtype=DataType.BOOL),
FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```
Then we can simply create inverted index for field via:
```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```
Then, term query and range query on the field can be speed up
automatically by the inverted index:
```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```
---------
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
2023-12-31 19:50:47 +08:00
|
|
|
auto int_array_view = ArrayView(const_cast<char*>(int_array.data()),
|
|
|
|
int_array.byte_size(),
|
|
|
|
int_array.get_element_type(),
|
2023-11-07 23:38:21 +08:00
|
|
|
{});
|
|
|
|
ASSERT_EQ(int_array.length(), int_array_view.length());
|
|
|
|
ASSERT_EQ(int_array.byte_size(), int_array_view.byte_size());
|
|
|
|
ASSERT_EQ(int_array.get_element_type(), int_array_view.get_element_type());
|
2023-09-19 14:23:23 +08:00
|
|
|
|
|
|
|
milvus::proto::schema::ScalarField field_long_data;
|
|
|
|
milvus::proto::plan::Array field_long_array;
|
|
|
|
field_long_array.set_same_type(true);
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
field_long_data.mutable_long_data()->add_data(i);
|
|
|
|
field_long_array.mutable_array()->Add()->set_int64_val(i);
|
|
|
|
}
|
|
|
|
auto long_array = Array(field_long_data);
|
|
|
|
ASSERT_EQ(N, long_array.length());
|
|
|
|
ASSERT_EQ(N * sizeof(int64_t), long_array.byte_size());
|
|
|
|
for (int i = 0; i < N; ++i) {
|
|
|
|
ASSERT_EQ(long_array.get_data<int64_t>(i), i);
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(long_array.is_same_array(field_int_array));
|
2023-11-07 23:38:21 +08:00
|
|
|
auto long_array_tmp = Array(const_cast<char*>(long_array.data()),
|
|
|
|
long_array.byte_size(),
|
|
|
|
long_array.get_element_type(),
|
|
|
|
{});
|
|
|
|
ASSERT_TRUE(long_array_tmp == long_array);
|
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704
Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.
Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.
Not supported: `ARRAY` and `JSON`.
Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search now. We will treat every row as a whole keyword instead of
tokenizing it into multiple terms.
- The inverted index don't support retrieval well, so if you create
inverted index for field, those operations which depend on the raw data
will fallback to use chunk storage, which will bring some performance
loss. For example, comparisons between two columns and retrieval of
output fields.
The inverted index is very easy to be used.
Taking below collection as an example:
```python
fields = [
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="int8", dtype=DataType.INT8),
FieldSchema(name="int16", dtype=DataType.INT16),
FieldSchema(name="int32", dtype=DataType.INT32),
FieldSchema(name="int64", dtype=DataType.INT64),
FieldSchema(name="float", dtype=DataType.FLOAT),
FieldSchema(name="double", dtype=DataType.DOUBLE),
FieldSchema(name="bool", dtype=DataType.BOOL),
FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```
Then we can simply create inverted index for field via:
```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```
Then, term query and range query on the field can be speed up
automatically by the inverted index:
```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```
---------
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
2023-12-31 19:50:47 +08:00
|
|
|
auto long_array_view = ArrayView(const_cast<char*>(long_array.data()),
|
|
|
|
long_array.byte_size(),
|
|
|
|
long_array.get_element_type(),
|
2023-11-07 23:38:21 +08:00
|
|
|
{});
|
|
|
|
ASSERT_EQ(long_array.length(), long_array_view.length());
|
|
|
|
ASSERT_EQ(long_array.byte_size(), long_array_view.byte_size());
|
|
|
|
ASSERT_EQ(long_array.get_element_type(),
|
|
|
|
long_array_view.get_element_type());
|
2023-09-19 14:23:23 +08:00
|
|
|
|
|
|
|
milvus::proto::schema::ScalarField field_string_data;
|
|
|
|
milvus::proto::plan::Array field_string_array;
|
|
|
|
field_string_array.set_same_type(true);
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
field_string_data.mutable_string_data()->add_data(std::to_string(i));
|
|
|
|
proto::plan::GenericValue string_val;
|
|
|
|
string_val.set_string_val(std::to_string(i));
|
|
|
|
field_string_array.mutable_array()->Add()->CopyFrom(string_val);
|
|
|
|
}
|
|
|
|
auto string_array = Array(field_string_data);
|
|
|
|
ASSERT_EQ(N, string_array.length());
|
|
|
|
// ASSERT_EQ(N, string_array.size());
|
|
|
|
for (int i = 0; i < N; ++i) {
|
|
|
|
ASSERT_EQ(string_array.get_data<std::string_view>(i),
|
|
|
|
std::to_string(i));
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(string_array.is_same_array(field_string_array));
|
2023-11-07 23:38:21 +08:00
|
|
|
std::vector<uint64_t> string_element_offsets;
|
|
|
|
std::vector<uint64_t> string_view_element_offsets;
|
|
|
|
for (auto& offset : string_array.get_offsets()) {
|
|
|
|
string_element_offsets.emplace_back(offset);
|
|
|
|
string_view_element_offsets.emplace_back(offset);
|
|
|
|
}
|
|
|
|
auto string_array_tmp = Array(const_cast<char*>(string_array.data()),
|
|
|
|
string_array.byte_size(),
|
|
|
|
string_array.get_element_type(),
|
|
|
|
std::move(string_element_offsets));
|
|
|
|
ASSERT_TRUE(string_array_tmp == string_array);
|
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704
Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.
Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.
Not supported: `ARRAY` and `JSON`.
Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search now. We will treat every row as a whole keyword instead of
tokenizing it into multiple terms.
- The inverted index don't support retrieval well, so if you create
inverted index for field, those operations which depend on the raw data
will fallback to use chunk storage, which will bring some performance
loss. For example, comparisons between two columns and retrieval of
output fields.
The inverted index is very easy to be used.
Taking below collection as an example:
```python
fields = [
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="int8", dtype=DataType.INT8),
FieldSchema(name="int16", dtype=DataType.INT16),
FieldSchema(name="int32", dtype=DataType.INT32),
FieldSchema(name="int64", dtype=DataType.INT64),
FieldSchema(name="float", dtype=DataType.FLOAT),
FieldSchema(name="double", dtype=DataType.DOUBLE),
FieldSchema(name="bool", dtype=DataType.BOOL),
FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```
Then we can simply create inverted index for field via:
```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```
Then, term query and range query on the field can be speed up
automatically by the inverted index:
```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```
---------
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
2023-12-31 19:50:47 +08:00
|
|
|
auto string_array_view = ArrayView(const_cast<char*>(string_array.data()),
|
|
|
|
string_array.byte_size(),
|
|
|
|
string_array.get_element_type(),
|
2023-11-07 23:38:21 +08:00
|
|
|
std::move(string_view_element_offsets));
|
|
|
|
ASSERT_EQ(string_array.length(), string_array_view.length());
|
|
|
|
ASSERT_EQ(string_array.byte_size(), string_array_view.byte_size());
|
|
|
|
ASSERT_EQ(string_array.get_element_type(),
|
|
|
|
string_array_view.get_element_type());
|
2023-09-19 14:23:23 +08:00
|
|
|
|
|
|
|
milvus::proto::schema::ScalarField field_bool_data;
|
|
|
|
milvus::proto::plan::Array field_bool_array;
|
|
|
|
field_bool_array.set_same_type(true);
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
field_bool_data.mutable_bool_data()->add_data(bool(i));
|
|
|
|
field_bool_array.mutable_array()->Add()->set_bool_val(bool(i));
|
|
|
|
}
|
|
|
|
auto bool_array = Array(field_bool_data);
|
|
|
|
ASSERT_EQ(N, bool_array.length());
|
|
|
|
ASSERT_EQ(N * sizeof(bool), bool_array.byte_size());
|
|
|
|
for (int i = 0; i < N; ++i) {
|
|
|
|
ASSERT_EQ(bool_array.get_data<bool>(i), bool(i));
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(bool_array.is_same_array(field_bool_array));
|
2023-11-07 23:38:21 +08:00
|
|
|
auto bool_array_tmp = Array(const_cast<char*>(bool_array.data()),
|
|
|
|
bool_array.byte_size(),
|
|
|
|
bool_array.get_element_type(),
|
|
|
|
{});
|
|
|
|
ASSERT_TRUE(bool_array_tmp == bool_array);
|
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704
Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.
Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.
Not supported: `ARRAY` and `JSON`.
Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search now. We will treat every row as a whole keyword instead of
tokenizing it into multiple terms.
- The inverted index don't support retrieval well, so if you create
inverted index for field, those operations which depend on the raw data
will fallback to use chunk storage, which will bring some performance
loss. For example, comparisons between two columns and retrieval of
output fields.
The inverted index is very easy to be used.
Taking below collection as an example:
```python
fields = [
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="int8", dtype=DataType.INT8),
FieldSchema(name="int16", dtype=DataType.INT16),
FieldSchema(name="int32", dtype=DataType.INT32),
FieldSchema(name="int64", dtype=DataType.INT64),
FieldSchema(name="float", dtype=DataType.FLOAT),
FieldSchema(name="double", dtype=DataType.DOUBLE),
FieldSchema(name="bool", dtype=DataType.BOOL),
FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```
Then we can simply create inverted index for field via:
```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```
Then, term query and range query on the field can be speed up
automatically by the inverted index:
```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```
---------
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
2023-12-31 19:50:47 +08:00
|
|
|
auto bool_array_view = ArrayView(const_cast<char*>(bool_array.data()),
|
|
|
|
bool_array.byte_size(),
|
|
|
|
bool_array.get_element_type(),
|
2023-11-07 23:38:21 +08:00
|
|
|
{});
|
|
|
|
ASSERT_EQ(bool_array.length(), bool_array_view.length());
|
|
|
|
ASSERT_EQ(bool_array.byte_size(), bool_array_view.byte_size());
|
|
|
|
ASSERT_EQ(bool_array.get_element_type(),
|
|
|
|
bool_array_view.get_element_type());
|
2023-09-19 14:23:23 +08:00
|
|
|
|
|
|
|
milvus::proto::schema::ScalarField field_float_data;
|
|
|
|
milvus::proto::plan::Array field_float_array;
|
|
|
|
field_float_array.set_same_type(true);
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
field_float_data.mutable_float_data()->add_data(float(i) * 0.1);
|
|
|
|
field_float_array.mutable_array()->Add()->set_float_val(float(i * 0.1));
|
|
|
|
}
|
|
|
|
auto float_array = Array(field_float_data);
|
|
|
|
ASSERT_EQ(N, float_array.length());
|
|
|
|
ASSERT_EQ(N * sizeof(float), float_array.byte_size());
|
|
|
|
for (int i = 0; i < N; ++i) {
|
|
|
|
ASSERT_DOUBLE_EQ(float_array.get_data<float>(i), float(i * 0.1));
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(float_array.is_same_array(field_float_array));
|
2023-11-07 23:38:21 +08:00
|
|
|
auto float_array_tmp = Array(const_cast<char*>(float_array.data()),
|
|
|
|
float_array.byte_size(),
|
|
|
|
float_array.get_element_type(),
|
|
|
|
{});
|
|
|
|
ASSERT_TRUE(float_array_tmp == float_array);
|
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704
Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.
Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.
Not supported: `ARRAY` and `JSON`.
Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search now. We will treat every row as a whole keyword instead of
tokenizing it into multiple terms.
- The inverted index don't support retrieval well, so if you create
inverted index for field, those operations which depend on the raw data
will fallback to use chunk storage, which will bring some performance
loss. For example, comparisons between two columns and retrieval of
output fields.
The inverted index is very easy to be used.
Taking below collection as an example:
```python
fields = [
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="int8", dtype=DataType.INT8),
FieldSchema(name="int16", dtype=DataType.INT16),
FieldSchema(name="int32", dtype=DataType.INT32),
FieldSchema(name="int64", dtype=DataType.INT64),
FieldSchema(name="float", dtype=DataType.FLOAT),
FieldSchema(name="double", dtype=DataType.DOUBLE),
FieldSchema(name="bool", dtype=DataType.BOOL),
FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```
Then we can simply create inverted index for field via:
```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```
Then, term query and range query on the field can be speed up
automatically by the inverted index:
```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```
---------
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
2023-12-31 19:50:47 +08:00
|
|
|
auto float_array_view = ArrayView(const_cast<char*>(float_array.data()),
|
|
|
|
float_array.byte_size(),
|
|
|
|
float_array.get_element_type(),
|
2023-11-07 23:38:21 +08:00
|
|
|
{});
|
|
|
|
ASSERT_EQ(float_array.length(), float_array_view.length());
|
|
|
|
ASSERT_EQ(float_array.byte_size(), float_array_view.byte_size());
|
|
|
|
ASSERT_EQ(float_array.get_element_type(),
|
|
|
|
float_array_view.get_element_type());
|
2023-09-19 14:23:23 +08:00
|
|
|
|
|
|
|
milvus::proto::schema::ScalarField field_double_data;
|
|
|
|
milvus::proto::plan::Array field_double_array;
|
|
|
|
field_double_array.set_same_type(true);
|
|
|
|
for (int i = 0; i < N; i++) {
|
|
|
|
field_double_data.mutable_double_data()->add_data(double(i) * 0.1);
|
|
|
|
field_double_array.mutable_array()->Add()->set_float_val(
|
|
|
|
double(i * 0.1));
|
|
|
|
}
|
|
|
|
auto double_array = Array(field_double_data);
|
|
|
|
ASSERT_EQ(N, double_array.length());
|
|
|
|
ASSERT_EQ(N * sizeof(double), double_array.byte_size());
|
|
|
|
for (int i = 0; i < N; ++i) {
|
|
|
|
ASSERT_DOUBLE_EQ(double_array.get_data<double>(i), double(i * 0.1));
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(double_array.is_same_array(field_double_array));
|
2023-11-07 23:38:21 +08:00
|
|
|
auto double_array_tmp = Array(const_cast<char*>(double_array.data()),
|
|
|
|
double_array.byte_size(),
|
|
|
|
double_array.get_element_type(),
|
|
|
|
{});
|
|
|
|
ASSERT_TRUE(double_array_tmp == double_array);
|
feat: support inverted index (#28783)
issue: https://github.com/milvus-io/milvus/issues/27704
Add inverted index for some data types in Milvus. This index type can
save a lot of memory compared to loading all data into RAM and speed up
the term query and range query.
Supported: `INT8`, `INT16`, `INT32`, `INT64`, `FLOAT`, `DOUBLE`, `BOOL`
and `VARCHAR`.
Not supported: `ARRAY` and `JSON`.
Note:
- The inverted index for `VARCHAR` is not designed to serve full-text
search now. We will treat every row as a whole keyword instead of
tokenizing it into multiple terms.
- The inverted index don't support retrieval well, so if you create
inverted index for field, those operations which depend on the raw data
will fallback to use chunk storage, which will bring some performance
loss. For example, comparisons between two columns and retrieval of
output fields.
The inverted index is very easy to be used.
Taking below collection as an example:
```python
fields = [
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="int8", dtype=DataType.INT8),
FieldSchema(name="int16", dtype=DataType.INT16),
FieldSchema(name="int32", dtype=DataType.INT32),
FieldSchema(name="int64", dtype=DataType.INT64),
FieldSchema(name="float", dtype=DataType.FLOAT),
FieldSchema(name="double", dtype=DataType.DOUBLE),
FieldSchema(name="bool", dtype=DataType.BOOL),
FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=1000),
FieldSchema(name="random", dtype=DataType.DOUBLE),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields)
collection = Collection("demo", schema)
```
Then we can simply create inverted index for field via:
```python
index_type = "INVERTED"
collection.create_index("int8", {"index_type": index_type})
collection.create_index("int16", {"index_type": index_type})
collection.create_index("int32", {"index_type": index_type})
collection.create_index("int64", {"index_type": index_type})
collection.create_index("float", {"index_type": index_type})
collection.create_index("double", {"index_type": index_type})
collection.create_index("bool", {"index_type": index_type})
collection.create_index("varchar", {"index_type": index_type})
```
Then, term query and range query on the field can be speed up
automatically by the inverted index:
```python
result = collection.query(expr='int64 in [1, 2, 3]', output_fields=["pk"])
result = collection.query(expr='int64 < 5', output_fields=["pk"])
result = collection.query(expr='int64 > 2997', output_fields=["pk"])
result = collection.query(expr='1 < int64 < 5', output_fields=["pk"])
```
---------
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
2023-12-31 19:50:47 +08:00
|
|
|
auto double_array_view = ArrayView(const_cast<char*>(double_array.data()),
|
|
|
|
double_array.byte_size(),
|
|
|
|
double_array.get_element_type(),
|
2023-11-07 23:38:21 +08:00
|
|
|
{});
|
|
|
|
ASSERT_EQ(double_array.length(), double_array_view.length());
|
|
|
|
ASSERT_EQ(double_array.byte_size(), double_array_view.byte_size());
|
|
|
|
ASSERT_EQ(double_array.get_element_type(),
|
|
|
|
double_array_view.get_element_type());
|
2023-09-19 14:23:23 +08:00
|
|
|
|
|
|
|
milvus::proto::schema::ScalarField field_empty_data;
|
|
|
|
milvus::proto::plan::Array field_empty_array;
|
|
|
|
auto empty_array = Array(field_empty_data);
|
|
|
|
ASSERT_EQ(0, empty_array.length());
|
|
|
|
ASSERT_EQ(0, empty_array.byte_size());
|
|
|
|
ASSERT_TRUE(empty_array.is_same_array(field_empty_array));
|
|
|
|
}
|