From 45a74e1560f77fb6cf4c98827458b5da377aa446 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Wed, 17 Apr 2019 10:22:38 +0800 Subject: [PATCH 1/5] refactor(db): using index factory Former-commit-id: 7026fb087240214666a43f1c486d2683a5a1ecb2 --- cpp/src/db/DBImpl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index a076606515..62821c556c 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "DBImpl.h" #include "DBMetaImpl.h" #include "Env.h" @@ -113,21 +114,20 @@ Status DBImpl::merge_files(const std::string& group_id, const meta::DateT& date, return status; } - faiss::IndexFlat innerIndex(group_file.dimension); - faiss::IndexIDMap index(&innerIndex); + std::shared_ptr index(faiss::index_factory(group_file.dimension, "IDMap,Flat")); meta::GroupFilesSchema updated; for (auto& file : files) { auto file_index = dynamic_cast(faiss::read_index(file.location.c_str())); - index.add_with_ids(file_index->ntotal, dynamic_cast(file_index->index)->xb.data(), + index->add_with_ids(file_index->ntotal, dynamic_cast(file_index->index)->xb.data(), file_index->id_map.data()); auto file_schema = file; file_schema.file_type = meta::GroupFileSchema::TO_DELETE; updated.push_back(file_schema); } - faiss::write_index(&index, group_file.location.c_str()); + faiss::write_index(index.get(), group_file.location.c_str()); group_file.file_type = meta::GroupFileSchema::RAW; updated.push_back(group_file); status = _pMeta->update_files(updated); From ada3903352915e4dd317604a677be55e161143d6 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Wed, 17 Apr 2019 10:32:57 +0800 Subject: [PATCH 2/5] refactor(db): using index factory in MemManager Former-commit-id: 808f882176b0f933c609475bbd3930b3402525a8 --- cpp/src/db/MemManager.cpp | 28 ++++++++++++---------------- cpp/src/db/MemManager.h | 4 +--- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/cpp/src/db/MemManager.cpp b/cpp/src/db/MemManager.cpp index 31d1ed0db3..3f74301f47 100644 --- a/cpp/src/db/MemManager.cpp +++ b/cpp/src/db/MemManager.cpp @@ -1,5 +1,6 @@ -#include -#include +/* #include */ +/* #include */ +#include #include #include #include @@ -19,20 +20,19 @@ MemVectors::MemVectors(const std::string& group_id, _file_location(file_location), _pIdGenerator(new SimpleIDGenerator()), _dimension(dimension), - _pInnerIndex(new faiss::IndexFlat(_dimension)), - _pIdMapIndex(new faiss::IndexIDMap(_pInnerIndex)) { + pIndex_(faiss::index_factory(_dimension, "IDMap,Flat")) { } void MemVectors::add(size_t n_, const float* vectors_, IDNumbers& vector_ids_) { _pIdGenerator->getNextIDNumbers(n_, vector_ids_); - _pIdMapIndex->add_with_ids(n_, vectors_, &vector_ids_[0]); + pIndex_->add_with_ids(n_, vectors_, &vector_ids_[0]); for(auto i=0 ; intotal; + return pIndex_->ntotal; } size_t MemVectors::approximate_size() const { @@ -42,10 +42,10 @@ size_t MemVectors::approximate_size() const { Status MemVectors::serialize(std::string& group_id) { /* std::stringstream ss; */ /* ss << "/tmp/test/" << _pIdGenerator->getNextIDNumber(); */ - /* faiss::write_index(_pIdMapIndex, ss.str().c_str()); */ - /* std::cout << _pIdMapIndex->ntotal << std::endl; */ + /* faiss::write_index(pIndex_, ss.str().c_str()); */ + /* std::cout << pIndex_->ntotal << std::endl; */ /* std::cout << _file_location << std::endl; */ - faiss::write_index(_pIdMapIndex, _file_location.c_str()); + faiss::write_index(pIndex_, _file_location.c_str()); group_id = group_id_; return Status::OK(); } @@ -55,13 +55,9 @@ MemVectors::~MemVectors() { delete _pIdGenerator; _pIdGenerator = nullptr; } - if (_pIdMapIndex != nullptr) { - delete _pIdMapIndex; - _pIdMapIndex = nullptr; - } - if (_pInnerIndex != nullptr) { - delete _pInnerIndex; - _pInnerIndex = nullptr; + if (pIndex_ != nullptr) { + delete pIndex_; + pIndex_ = nullptr; } } diff --git a/cpp/src/db/MemManager.h b/cpp/src/db/MemManager.h index 86b3973d62..48aacc4fb6 100644 --- a/cpp/src/db/MemManager.h +++ b/cpp/src/db/MemManager.h @@ -10,7 +10,6 @@ #include "Status.h" namespace faiss { - class IndexIDMap; class Index; } @@ -50,8 +49,7 @@ private: const std::string _file_location; IDGenerator* _pIdGenerator; size_t _dimension; - faiss::Index* _pInnerIndex; - faiss::IndexIDMap* _pIdMapIndex; + faiss::Index* pIndex_; }; // MemVectors From 5c3319d5d28193a89f3348d6d1ab626b72847270 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Wed, 17 Apr 2019 10:46:12 +0800 Subject: [PATCH 3/5] refactor(db): using wrapper write_index Former-commit-id: a7ce7f72f6c50385c32d14aa01530db8fbfcdc3b --- cpp/src/db/MemManager.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/db/MemManager.cpp b/cpp/src/db/MemManager.cpp index 3f74301f47..37a2895e61 100644 --- a/cpp/src/db/MemManager.cpp +++ b/cpp/src/db/MemManager.cpp @@ -1,11 +1,10 @@ -/* #include */ -/* #include */ #include -#include #include #include #include +#include + #include "MemManager.h" #include "Meta.h" @@ -45,7 +44,8 @@ Status MemVectors::serialize(std::string& group_id) { /* faiss::write_index(pIndex_, ss.str().c_str()); */ /* std::cout << pIndex_->ntotal << std::endl; */ /* std::cout << _file_location << std::endl; */ - faiss::write_index(pIndex_, _file_location.c_str()); + /* faiss::write_index(pIndex_, _file_location.c_str()); */ + write_index(pIndex_, _file_location.c_str()); group_id = group_id_; return Status::OK(); } From c87cdc8736cc649a62098026e44f1c9820e42d3e Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Wed, 17 Apr 2019 11:08:16 +0800 Subject: [PATCH 4/5] feat(db): mock build_index related Former-commit-id: a4908e8eaf9b8ba57d873e0f9b1128980edbf3ff --- cpp/src/db/DBImpl.cpp | 4 ++-- cpp/src/db/DBMetaImpl.cpp | 30 ++++++++++++++++++++++++++++++ cpp/src/db/DBMetaImpl.h | 2 ++ cpp/src/db/Meta.h | 2 ++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index 62821c556c..026f34b64b 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -166,14 +166,14 @@ Status DBImpl::background_merge_files(const std::string& group_id) { Status DBImpl::build_index(const meta::GroupFileSchema& file) { //PXU TODO + std::cout << ">>Building Index for: " << file.location << std::endl; return Status::OK(); } Status DBImpl::background_build_index() { assert(bg_build_index_started_); meta::GroupFilesSchema to_index_files; - // PXU TODO - /* _pMeta->files_to_index(to_index_files); */ + _pMeta->files_to_index(to_index_files); Status status; for (auto& file : to_index_files) { status = build_index(file); diff --git a/cpp/src/db/DBMetaImpl.cpp b/cpp/src/db/DBMetaImpl.cpp index a9f7b1bf86..fd37038bfc 100644 --- a/cpp/src/db/DBMetaImpl.cpp +++ b/cpp/src/db/DBMetaImpl.cpp @@ -1,6 +1,9 @@ +#include +#include #include #include #include +#include #include "DBMetaImpl.h" #include "IDGenerator.h" @@ -9,6 +12,13 @@ namespace vecwise { namespace engine { namespace meta { +long GetFileSize(const std::string& filename) +{ + struct stat stat_buf; + int rc = stat(filename.c_str(), &stat_buf); + return rc == 0 ? stat_buf.st_size : -1; +} + DBMetaImpl::DBMetaImpl(const MetaOptions& options_) : _options(static_cast(options_)) { initialize(); @@ -57,6 +67,24 @@ Status DBMetaImpl::add_group_file(const std::string& group_id, return Status::OK(); } +Status DBMetaImpl::files_to_index(GroupFilesSchema& files) { + // PXU TODO + files.clear(); + std::stringstream ss; + ss << "/tmp/test/" << Meta::GetDate(); + boost::filesystem::path path(ss.str().c_str()); + boost::filesystem::directory_iterator end_itr; + for (boost::filesystem::directory_iterator itr(path); itr != end_itr; ++itr) { + std::cout << itr->path().string() << std::endl; + GroupFileSchema f; + f.location = itr->path().string(); + if (1024*1024*50 >= GetFileSize(f.location)) continue; + std::cout << "About to index " << f.location << std::endl; + files.push_back(f); + } + return Status::OK(); +} + Status DBMetaImpl::files_to_merge(const std::string& group_id, DatePartionedGroupFilesSchema& files) { //PXU TODO @@ -72,6 +100,8 @@ Status DBMetaImpl::files_to_merge(const std::string& group_id, std::cout << itr->path().string() << std::endl; GroupFileSchema f; f.location = itr->path().string(); + if (1024*1024*50 < GetFileSize(f.location)) continue; + std::cout << "About to merge " << f.location << std::endl; files[date].push_back(f); } diff --git a/cpp/src/db/DBMetaImpl.h b/cpp/src/db/DBMetaImpl.h index 4bf159cfd4..ade7ced50a 100644 --- a/cpp/src/db/DBMetaImpl.h +++ b/cpp/src/db/DBMetaImpl.h @@ -41,6 +41,8 @@ public: virtual Status files_to_merge(const std::string& group_id, DatePartionedGroupFilesSchema& files) override; + virtual Status files_to_index(GroupFilesSchema&) override; + private: Status initialize(); diff --git a/cpp/src/db/Meta.h b/cpp/src/db/Meta.h index 73f9ebeddf..847b31bef4 100644 --- a/cpp/src/db/Meta.h +++ b/cpp/src/db/Meta.h @@ -77,6 +77,8 @@ public: virtual Status files_to_merge(const std::string& group_id, DatePartionedGroupFilesSchema& files) = 0; + virtual Status files_to_index(GroupFilesSchema&) = 0; + static DateT GetDate(const std::time_t& t); static DateT GetDate(); From c0326906d972687d0ba8588056b21160cc96cb81 Mon Sep 17 00:00:00 2001 From: Xu Peng Date: Wed, 17 Apr 2019 11:47:57 +0800 Subject: [PATCH 5/5] feat(wrapper): add one more build_all api in IndexBuilder Former-commit-id: d6d7187a419bafb81751b815e2ebd235058e43f9 --- .gitignore | 1 + cpp/src/wrapper/IndexBuilder.cpp | 22 +++++++++++++++------- cpp/src/wrapper/IndexBuilder.h | 8 ++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 3eb961bb58..e0cd6a4d50 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,6 @@ cmake_build *.o *.lo *.tar.gz +*.log cpp/third_party/thrift-0.12.0/ diff --git a/cpp/src/wrapper/IndexBuilder.cpp b/cpp/src/wrapper/IndexBuilder.cpp index 1e28df1914..5d0180a3a6 100644 --- a/cpp/src/wrapper/IndexBuilder.cpp +++ b/cpp/src/wrapper/IndexBuilder.cpp @@ -21,9 +21,11 @@ IndexBuilder::IndexBuilder(const Operand_ptr &opd) { opd_ = opd; } -Index_ptr IndexBuilder::build_all(const long &nb, const vector &xb, - const vector &ids, - const long &nt, const vector &xt) { +Index_ptr IndexBuilder::build_all(const long &nb, + const float* xb, + const long* ids, + const long &nt, + const float* xt) { std::shared_ptr index = nullptr; index.reset(faiss::index_factory(opd_->d, opd_->index_type.c_str())); @@ -31,14 +33,20 @@ Index_ptr IndexBuilder::build_all(const long &nb, const vector &xb, // currently only cpu resources are used. std::lock_guard lk(cpu_resource); if (!index->is_trained) { - nt == 0 || xt.empty() ? index->train(nb, xb.data()) - : index->train(nt, xt.data()); + nt == 0 || xt == nullptr ? index->train(nb, xb) + : index->train(nt, xt); } - index->add(nb, xb.data()); - index->add_with_ids(nb, xb.data(), ids.data()); // todo(linxj): support add_with_idmap + index->add_with_ids(nb, xb, ids); // todo(linxj): support add_with_idmap } return std::make_shared(index); + +} + +Index_ptr IndexBuilder::build_all(const long &nb, const vector &xb, + const vector &ids, + const long &nt, const vector &xt) { + return build_all(nb, xb.data(), ids.data(), nt, xt.data()); } // Be Factory pattern later diff --git a/cpp/src/wrapper/IndexBuilder.h b/cpp/src/wrapper/IndexBuilder.h index 97479b91e5..ed5f8a3956 100644 --- a/cpp/src/wrapper/IndexBuilder.h +++ b/cpp/src/wrapper/IndexBuilder.h @@ -19,6 +19,12 @@ class IndexBuilder { public: explicit IndexBuilder(const Operand_ptr &opd); + Index_ptr build_all(const long &nb, + const float* xb, + const long* ids, + const long &nt = 0, + const float* xt = nullptr); + Index_ptr build_all(const long &nb, const std::vector &xb, const std::vector &ids, @@ -47,5 +53,3 @@ extern IndexBuilderPtr GetIndexBuilder(const Operand_ptr &opd); } } } - -