From a51241dfcd75340e28e02431ed47126fb61fbd9c Mon Sep 17 00:00:00 2001 From: Masajiro Iwasaki Date: Tue, 13 Apr 2021 15:24:42 +0900 Subject: [PATCH] add C APIs for NGTQG (#98) add C APIs for NGTQG --- VERSION | 2 +- bin/ngtqg/README.md | 5 +- bin/ngtqg/ngtqg.cpp | 2 +- lib/NGT/Capi.cpp | 10 +-- lib/NGT/Capi.h | 2 +- lib/NGT/Command.cpp | 2 +- lib/NGT/Index.cpp | 2 +- lib/NGT/NGTQ/Capi.cpp | 132 ++++++++++++++++++++++++++++++++ lib/NGT/NGTQ/Capi.h | 137 ++++++++++++++++++++++++++++++++++ lib/NGT/NGTQ/NGTQGCommand.cpp | 76 ++----------------- lib/NGT/NGTQ/NGTQGCommand.h | 14 +--- lib/NGT/NGTQ/QuantizedGraph.h | 121 +++++++++++++++++++++++++++++- lib/NGT/NGTQ/Quantizer.h | 8 +- lib/NGT/ObjectSpace.h | 2 +- 14 files changed, 415 insertions(+), 100 deletions(-) create mode 100644 lib/NGT/NGTQ/Capi.cpp create mode 100644 lib/NGT/NGTQ/Capi.h diff --git a/VERSION b/VERSION index 43ded90..2e3a551 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.13.5 +1.13.6 diff --git a/bin/ngtqg/README.md b/bin/ngtqg/README.md index 2176512..08af70b 100644 --- a/bin/ngtqg/README.md +++ b/bin/ngtqg/README.md @@ -34,7 +34,7 @@ before the command as follows. Quantize the objects of the specified index and build a quantized graph into the index. - $ ngtqg quantize [-E max_no_of_edges] index + $ ngtqg quantize [-E max_no_of_edges] [-Q dimension_of_subvector] index *index* Specify the name of the directory for the existing index such as ANNG or ONNG to be quantized. The index only with L2 distance and normalized cosine similarity distance can be quantized. You should build the ANNG or ONNG with normalized cosine similarity in order to use cosine similarity for the quantized graph. @@ -42,6 +42,9 @@ Specify the name of the directory for the existing index such as ANNG or ONNG to **-E** *max_no_of_edges* Specify the maximum number of edges to build a qunatized graph. Since every 16 objects that are associated with edges of each node are processed, the number should be a multiple of 16. 
+**-Q** *dimension_of_subvector* +Specify the dimension of a subvector for quantized objects. The dimension should be a divisor of the dimension of the inserted objects. + ### SEARCH Search the index using the specified query data. diff --git a/bin/ngtqg/ngtqg.cpp b/bin/ngtqg/ngtqg.cpp index bc760bf..0248c45 100644 --- a/bin/ngtqg/ngtqg.cpp +++ b/bin/ngtqg/ngtqg.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2016-2020 Yahoo Japan Corporation +// Copyright (C) 2020 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/lib/NGT/Capi.cpp b/lib/NGT/Capi.cpp index 698dc62..6ccb654 100644 --- a/lib/NGT/Capi.cpp +++ b/lib/NGT/Capi.cpp @@ -325,7 +325,7 @@ NGTObjectDistances ngt_create_empty_results(NGTError error) { } static bool ngt_search_index_(NGT::Index* pindex, NGT::Object *ngtquery, size_t size, float epsilon, float radius, NGTObjectDistances results, int edge_size = INT_MIN) { - // set search prameters. + // set search parameters. NGT::SearchContainer sc(*ngtquery); // search parametera container. sc.setResults(static_cast(results)); // set the result set. 
@@ -683,18 +683,18 @@ uint8_t* ngt_get_object_as_integer(NGTObjectSpace object_space, ObjectID id, NGT void ngt_destroy_results(NGTObjectDistances results) { if(results == NULL) return; - delete(static_cast(results)); + delete static_cast(results); } void ngt_destroy_property(NGTProperty prop) { if(prop == NULL) return; - delete(static_cast(prop)); + delete static_cast(prop); } void ngt_close_index(NGTIndex index) { if(index == NULL) return; (static_cast(index))->close(); - delete(static_cast(index)); + delete static_cast(index); } int16_t ngt_get_property_edge_size_for_creation(NGTProperty prop, NGTError error) { @@ -886,7 +886,7 @@ bool ngt_optimizer_set_processing_modes(NGTOptimizer optimizer, bool searchParam void ngt_destroy_optimizer(NGTOptimizer optimizer) { if(optimizer == NULL) return; - delete(static_cast(optimizer)); + delete static_cast(optimizer); } bool ngt_refine_anng(NGTIndex index, float epsilon, float accuracy, int noOfEdges, int exploreEdgeSize, size_t batchSize, NGTError error) diff --git a/lib/NGT/Capi.h b/lib/NGT/Capi.h index e583020..f429a21 100644 --- a/lib/NGT/Capi.h +++ b/lib/NGT/Capi.h @@ -184,7 +184,7 @@ bool ngt_optimizer_set_processing_modes(NGTOptimizer optimizer, bool searchParam void ngt_destroy_optimizer(NGTOptimizer); // refine: the specified index by searching each node. -// epsilon, exepectedAccuracy and edgeSize: the same as the prameters for search. but if edgeSize is INT_MIN, default is used. +// epsilon, expectedAccuracy and edgeSize: the same as the parameters for search, but if edgeSize is INT_MIN, the default is used. // noOfEdges: if this is not 0, kNNG with k = noOfEdges is build // batchSize: batch size for parallelism. 
bool ngt_refine_anng(NGTIndex index, float epsilon, float expectedAccuracy, diff --git a/lib/NGT/Command.cpp b/lib/NGT/Command.cpp index c595811..11f9ddb 100644 --- a/lib/NGT/Command.cpp +++ b/lib/NGT/Command.cpp @@ -764,7 +764,7 @@ using namespace std; const string usage = "Usage: ngt optimize-search-parameters [-m optimization-target(s|p|a)] [-q #-of-queries] [-n #-of-results] index\n" "\t-m mode\n" "\t\ts: optimize search parameters (the number of explored edges).\n" - "\t\tp: optimize prefetch prameters.\n" + "\t\tp: optimize prefetch parameters.\n" "\t\ta: generate an accuracy table to specify an expected accuracy instead of an epsilon for search.\n"; string indexPath; diff --git a/lib/NGT/Index.cpp b/lib/NGT/Index.cpp index bc95c13..de044da 100644 --- a/lib/NGT/Index.cpp +++ b/lib/NGT/Index.cpp @@ -1417,7 +1417,7 @@ findPathAmongIdenticalObjects(GraphAndTreeIndex &graph, size_t srcid, size_t dst done.insert(tid); GraphNode &node = *graph.GraphIndex::getNode(tid); #ifdef NGT_SHARED_MEMORY_ALLOCATOR - for (auto i = node.begin(graph.repository.allocator); i != node.end(graph.GraphIndex::repository.allocator); ++i) { + for (auto i = node.begin(graph.GraphIndex::repository.allocator); i != node.end(graph.GraphIndex::repository.allocator); ++i) { #else for (auto i = node.begin(); i != node.end(); ++i) { #endif diff --git a/lib/NGT/NGTQ/Capi.cpp b/lib/NGT/NGTQ/Capi.cpp new file mode 100644 index 0000000..c86ade0 --- /dev/null +++ b/lib/NGT/NGTQ/Capi.cpp @@ -0,0 +1,132 @@ +// +// Copyright (C) 2021 Yahoo Japan Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include +#include +#include + +#include "NGT/Capi.h" +#include "NGT/NGTQ/Capi.h" +#include "NGT/NGTQ/QuantizedGraph.h" + +static bool operate_error_string_(const std::stringstream &ss, NGTError error){ + if(error != NULL){ + try{ + std::string *error_str = static_cast(error); + *error_str = ss.str(); + }catch(std::exception &err){ + std::cerr << ss.str() << " > " << err.what() << std::endl; + return false; + } + }else{ + std::cerr << ss.str() << std::endl; + } + return true; +} + +void ngtqg_initialize_query(NGTQGQuery *query) { + query->query = 0; + query->size = 20; + query->epsilon = 0.03; + query->result_expansion = 3.0; + query->radius = FLT_MAX; +} + +NGTQGIndex ngtqg_open_index(const char *index_path, NGTError error) { + try{ + std::string index_path_str(index_path); + auto *index = new NGTQG::Index(index_path_str); + index->disableLog(); + return static_cast(index); + }catch(std::exception &err){ + std::stringstream ss; + ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what(); + operate_error_string_(ss, error); + return NULL; + } +} + +void ngtqg_close_index(NGTQGIndex index) { + if(index == NULL) return; + (static_cast(index))->close(); + delete static_cast(index); +} + +static bool ngtqg_search_index_(NGTQG::Index* pindex, std::vector &query, NGTQGQuery ¶m, NGTObjectDistances results) { + // set search parameters. + NGTQG::SearchQuery sq(query); // Query. + + sq.setResults(static_cast(results)); // set the result set. + sq.setSize(param.size); // the number of resultant objects. 
+ sq.setRadius(param.radius); // search radius. + sq.setEpsilon(param.epsilon); // exploration coefficient. + sq.setResultExpansion(param.result_expansion); // result expansion. + + auto tmp = static_cast(results); + + pindex->search(sq); + + return true; +} + +bool ngtqg_search_index(NGTQGIndex index, NGTQGQuery query, NGTObjectDistances results, NGTError error) { + if(index == NULL || query.query == NULL || results == NULL){ + std::stringstream ss; + ss << "Capi : " << __FUNCTION__ << "() : parametor error: index = " << index << " query = " << query.query << " results = " << results; + operate_error_string_(ss, error); + return false; + } + + NGTQG::Index* pindex = static_cast(index); + int32_t dim = pindex->getObjectSpace().getDimension(); + + NGT::Object *ngtquery = NULL; + + if(query.radius < 0.0){ + query.radius = FLT_MAX; + } + + try{ + std::vector vquery(&query.query[0], &query.query[dim]); + ngtqg_search_index_(pindex, vquery, query, results); + }catch(std::exception &err) { + std::stringstream ss; + ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what(); + operate_error_string_(ss, error); + if(ngtquery != NULL){ + pindex->deleteObject(ngtquery); + } + return false; + } + return true; +} + +void ngtqg_initialize_quantization_parameters(NGTQGQuantizationParameters *parameters) { + parameters->dimension_of_subvector = 0; + parameters->max_number_of_edges = 128; +} + +void ngtqg_quantize(const char *indexPath, NGTQGQuantizationParameters parameters, NGTError error) { + try{ + NGTQG::Index::quantize(indexPath, parameters.dimension_of_subvector, parameters.max_number_of_edges); + }catch(std::exception &err){ + std::stringstream ss; + ss << "Capi : " << __FUNCTION__ << "() : Error: " << err.what(); + operate_error_string_(ss, error); + return; + } +} + diff --git a/lib/NGT/NGTQ/Capi.h b/lib/NGT/NGTQ/Capi.h new file mode 100644 index 0000000..96b5932 --- /dev/null +++ b/lib/NGT/NGTQ/Capi.h @@ -0,0 +1,137 @@ +// +// Copyright (C) 2021 Yahoo Japan 
Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +/*** + { + // simple quantization and search example + + std::string indexPath = "onng_index"; // ONNG + std::string queryPath = "query.tsv"; // Query file. + NGTError err = ngt_create_error_object(); + + // quantize the specified existing index + // build quantized objects and a quantized graph + NGTQGQuantizationParameters quantizationParameters; + ngtqg_initialize_quantization_parameters(&quantizationParameters); + ngtqg_quantize(indexPath.c_str(), quantizationParameters, err); + + // open the index (ANNG or ONNG). + index = ngtqg_open_index(indexPath.c_str(), err); + if (index == NULL) { + std::cerr << ngt_get_error_string(err) << std::endl; + return false; + } + + std::ifstream is(queryPath); // open a query file. + if (!is) { + std::cerr << "Cannot open the specified file. " << queryPath << std::endl; + return false; + } + + // get the dimension of the index to check the dimension of the query + NGTProperty property = ngt_create_property(err); + ngt_get_property(index, property, err); + size_t dimension = ngt_get_property_dimension(property, err); + ngt_destroy_property(property); + + std::string line; + float queryVector[dimension]; + if (!getline(is, line)) { // read a query object from the query file. + std::cerr << "no data" << std::endl; + } + std::vector tokens; + NGT::Common::tokenize(line, tokens, " \t"); // split a string into words by the separators. 
+ // create a query vector from the tokens. + if (tokens.size() != dimension) { + std::cerr << "dimension of the query is invalid. dimesion=" << tokens.size() << ":" << dimension << std::endl; + return false; + } + for (std::vector::iterator ti = tokens.begin(); ti != tokens.end(); ++ti) { + queryVector[distance(tokens.begin(), ti)] = NGT::Common::strtod(*ti); + } + // set search parameters. + NGTObjectDistances result = ngt_create_empty_results(err); + NGTQGQuery query; + ngtqg_initialize_query(&query); + query.query = queryVector; + query.size = 20; + query.epsilon = 0.03; + query.result_expansion = 2; + + // search with the quantized graph + bool status = ngtqg_search_index(index, query, result, err); + NGTObjectSpace objectSpace = ngt_get_object_space(index, err); + auto rsize = ngt_get_result_size(result, err); + // show resultant objects. + std::cout << "Rank\tID\tDistance\tObject" << std::endl; + for (size_t i = 0; i < rsize; i++) { + NGTObjectDistance object = ngt_get_result(result, i, err); + std::cout << i + 1 << "\t" << object.id << "\t" << object.distance << "\t"; + float *objectVector = ngt_get_object_as_float(objectSpace, object.id, err); + for (size_t i = 0; i < dimension; i++) { + std::cout << objectVector[i] << " "; + } + std::cout << std::endl; + } + ngt_destroy_results(result); + ngtqg_close_index(index); + } +***/ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "NGT/Capi.h" + +typedef void* NGTQGIndex; +typedef NGTObjectDistance NGTObjectDistance; +typedef NGTError NGTQGError; + +typedef struct { + float *query; + size_t size; // # of returned objects + float epsilon; + float result_expansion; + float radius; +} NGTQGQuery; + +typedef struct { + float dimension_of_subvector; + size_t max_number_of_edges; +} NGTQGQuantizationParameters; + +NGTQGIndex ngtqg_open_index(const char *, NGTError); + +void ngtqg_close_index(NGTQGIndex); + +void 
ngtqg_initialize_quantization_parameters(NGTQGQuantizationParameters *); + +void ngtqg_quantize(const char *, NGTQGQuantizationParameters, NGTError); + +void ngtqg_initialize_query(NGTQGQuery *); + +bool ngtqg_search_index(NGTQGIndex, NGTQGQuery, NGTObjectDistances, NGTError); + +#ifdef __cplusplus +} +#endif diff --git a/lib/NGT/NGTQ/NGTQGCommand.cpp b/lib/NGT/NGTQ/NGTQGCommand.cpp index f24c3fc..f4c6599 100644 --- a/lib/NGT/NGTQ/NGTQGCommand.cpp +++ b/lib/NGT/NGTQ/NGTQGCommand.cpp @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2020 Yahoo Japan Corporation +// Copyright (C) 2020 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ NGTQG::Command::create(NGT::Args &args) "[-D distance-function] " "[-p #-of-thread] [-d dimension] [-R global-codebook-range] [-r local-codebook-range] " "[-C global-codebook-size-limit] [-c local-codebook-size-limit] " - "[-Q quantization-ratio] [-i index-type (t:Tree|g:Graph)] " + "[-Q dimension-of-subvector] [-i index-type (t:Tree|g:Graph)] " "[-M global-centroid-creation-mode (d|s)] [-l global-centroid-creation-mode (d|k|s)] " "[-s local-sample-coefficient] " "index(output)"; @@ -80,21 +80,13 @@ NGTQG::Command::create(NGT::Args &args) void NGTQG::Command::build(NGT::Args &args) { - NGT::Command::append(args); - } - void NGTQG::Command::quantize(NGT::Args &args) { - const std::string usage = "Usage: ngtqg quantize " - "[-m quantization-mode(q|g|a)] [-E max-number-of-edges] [creation parameters] index\n" - "\t-m mode\n" - "\t\ta: quantize the objects and build and save a quantized graph. 
(default)\n" - "\t\tq: just quantize the objects but not build and save a quantized graph.\n" - "\t\tg: not quantize the objects but build and save a quantized graph.\n"; + const std::string usage = "Usage: ngtqg quantize [-Q dimension-of-subvector] [-E max-number-of-edges] index"; string indexPath; try { indexPath = args.get("#1"); @@ -103,67 +95,9 @@ NGTQG::Command::quantize(NGT::Args &args) cerr << usage << endl; return; } - char mode = args.getChar("m", 'a'); size_t maxNumOfEdges = args.getl("E", 128); - if (mode == 'q') { - maxNumOfEdges = 0; - } - if (mode != 'g') { - NGT::Index index(indexPath); - NGT::ObjectSpace &objectSpace = index.getObjectSpace(); - - - { - std::string quantizedIndexPath = indexPath + "/qg"; - struct stat st; - if (stat(quantizedIndexPath.c_str(), &st) != 0) { - NGT::Property property; - index.getProperty(property); - NGTQG::Command::CreateParameters createParameters(args, property.dimension); - - try { - createParameters.property.centroidCreationMode = NGTQ::CentroidCreationModeStatic; - NGTQ::Index::create(quantizedIndexPath, createParameters.property, createParameters.globalProperty, createParameters.localProperty); - } catch(NGT::Exception &err) { - std::cerr << err.what() << std::endl; - //cerr << usage << endl; - } - } - - NGTQ::Index quantizedIndex(quantizedIndexPath); - NGTQ::Quantizer &quantizer = quantizedIndex.getQuantizer(); - - { - std::vector meanObject(objectSpace.getDimension(), 0); - quantizedIndex.getQuantizer().globalCodebook.insert(meanObject); - quantizedIndex.getQuantizer().globalCodebook.createIndex(8); - } - - vector > objects; - for (size_t id = 1; id < objectSpace.getRepository().size(); id++) { - if (id % 100000 == 0) { - std::cerr << "Processed " << id << " objects." << std::endl; - } - std::vector object; - try { - objectSpace.getObject(id, object); - } catch(...) 
{ - continue; - } - quantizer.insert(object, objects, id); - } - if (objects.size() > 0) { - quantizer.insert(objects); - } - - quantizedIndex.save(); - quantizedIndex.close(); - } - } - if (maxNumOfEdges != 0) { - NGTQG::Index index(indexPath, maxNumOfEdges); - index.save(); - } + size_t dimensionOfSubvector = args.getl("Q", 0); + NGTQG::Index::quantize(indexPath, dimensionOfSubvector, maxNumOfEdges); } void diff --git a/lib/NGT/NGTQ/NGTQGCommand.h b/lib/NGT/NGTQ/NGTQGCommand.h index 3d6f9cb..4f39767 100644 --- a/lib/NGT/NGTQ/NGTQGCommand.h +++ b/lib/NGT/NGTQ/NGTQGCommand.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2015-2020 Yahoo Japan Corporation +// Copyright (C) 2020 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -35,16 +35,8 @@ namespace NGTQG { property.globalCentroidLimit = args.getl("C", 1); property.localCentroidLimit = args.getl("c", 16); property.localClusteringSampleCoefficient = args.getl("s", 100); - size_t quantizationRatio = args.getl("Q", 0); - if (quantizationRatio == 0) { - if ((dimension > 400) && (dimension % 2 == 0)) { - property.localDivisionNo = dimension / 2; - } else { - property.localDivisionNo = dimension; - } - } else { - property.localDivisionNo = dimension / quantizationRatio; - } + size_t dimensionOfSubvector = args.getl("Q", 0); + property.localDivisionNo = NGTQG::Index::getNumberOfSubvectors(dimension, dimensionOfSubvector); property.dimension = dimension; } }; diff --git a/lib/NGT/NGTQ/QuantizedGraph.h b/lib/NGT/NGTQ/QuantizedGraph.h index bf3a556..d63a745 100644 --- a/lib/NGT/NGTQ/QuantizedGraph.h +++ b/lib/NGT/NGTQ/QuantizedGraph.h @@ -1,5 +1,5 @@ // -// Copyright (C) 2016-2020 Yahoo Japan Corporation +// Copyright (C) 2020 Yahoo Japan Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -20,8 +20,6 @@ #include "NGT/Index.h" #include "NGT/NGTQ/Quantizer.h" -#if !defined(NGT_SHARED_MEMORY_ALLOCATOR) && !defined(NGTQ_SHARED_INVERTED_INDEX) - #define GLOBAL_SIZE 1 @@ -78,8 +76,13 @@ namespace NGTQG { size_t numOfEdges = node.size() < maxNoOfEdges ? node.size() : maxNoOfEdges; (*this)[id].ids.reserve(numOfEdges); NGTQ::QuantizedObjectProcessingStream quantizedStream(quantizedIndex.getQuantizer(), numOfEdges); +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + for (auto i = node.begin(graphRepository.allocator); i != node.end(graphRepository.allocator); ++i) { + if (distance(node.begin(graphRepository.allocator), i) >= static_cast(numOfEdges)) { +#else for (auto i = node.begin(); i != node.end(); i++) { if (distance(node.begin(), i) >= static_cast(numOfEdges)) { +#endif break; } if ((*i).id == 0) { @@ -89,10 +92,18 @@ namespace NGTQG { } (*this)[id].ids.push_back((*i).id); for (size_t idx = 0; idx < numOfSubspaces; idx++) { +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + size_t dataNo = distance(node.begin(graphRepository.allocator), i); +#else size_t dataNo = distance(node.begin(), i); +#endif #if defined(NGT_SHARED_MEMORY_ALLOCATOR) abort(); #else + if (invertedIndexObjects[(*i).id].localID[idx] < 1 || invertedIndexObjects[(*i).id].localID[idx] > 16) { + std::cerr << "Fatal inner error! Invalid local centroid ID. ID=" << (*i).id << ":" << invertedIndexObjects[(*i).id].localID[idx] << std::endl; + abort(); + } quantizedStream.arrangeQuantizedObject(dataNo, idx, invertedIndexObjects[(*i).id].localID[idx] - 1); #endif } @@ -360,6 +371,109 @@ namespace NGTQG { deleteObject(query); } + static size_t getNumberOfSubvectors(size_t dimension, size_t dimensionOfSubvector) { + if (dimensionOfSubvector == 0) { + dimensionOfSubvector = dimension > 400 ? 2 : 1; + dimensionOfSubvector = (dimension % dimensionOfSubvector == 0) ? 
dimensionOfSubvector : 1; + } + if (dimension % dimensionOfSubvector != 0) { + stringstream msg; + msg << "Quantizer::getNumOfSubvectors: dimensionOfSubvector is invalid. " << dimension << " : " << dimensionOfSubvector << std::endl; + NGTThrowException(msg); + } + return dimension / dimensionOfSubvector; + } + + static void buildQuantizedGraph(const std::string indexPath, size_t maxNumOfEdges) { + NGTQG::Index index(indexPath, maxNumOfEdges); + index.save(); + } + + static void buildQuantizedObjects(const std::string quantizedIndexPath, NGT::ObjectSpace &objectSpace) { + NGTQ::Index quantizedIndex(quantizedIndexPath); + NGTQ::Quantizer &quantizer = quantizedIndex.getQuantizer(); + + { + std::vector meanObject(objectSpace.getDimension(), 0); + quantizedIndex.getQuantizer().globalCodebook.insert(meanObject); + quantizedIndex.getQuantizer().globalCodebook.createIndex(8); + } + + vector > objects; + for (size_t id = 1; id < objectSpace.getRepository().size(); id++) { + if (id % 100000 == 0) { + std::cerr << "Processed " << id << " objects." << std::endl; + } + std::vector object; + try { + objectSpace.getObject(id, object); + } catch(...) 
{ + continue; + } + quantizer.insert(object, objects, id); + } + if (objects.size() > 0) { + quantizer.insert(objects); + } + + quantizedIndex.save(); + quantizedIndex.close(); + } + + static void constructQuantizedGraphFrame(const std::string quantizedIndexPath, size_t dimension, size_t dimensionOfSubvector) { + NGTQ::Property property; + NGT::Property globalProperty; + NGT::Property localProperty; + + property.threadSize = 24; + property.globalRange = 0; + property.localRange = 0; + property.dataType = NGTQ::DataTypeFloat; + property.distanceType = NGTQ::DistanceTypeL2; + property.singleLocalCodebook = false; + property.batchSize = 1000; + property.centroidCreationMode = NGTQ::CentroidCreationModeStatic; + property.localCentroidCreationMode = NGTQ::CentroidCreationModeDynamicKmeans; + + property.globalCentroidLimit = 1; + property.localCentroidLimit = 16; + property.localClusteringSampleCoefficient = 100; + property.localDivisionNo = NGTQG::Index::getNumberOfSubvectors(dimension, dimensionOfSubvector); + property.dimension = dimension; + + globalProperty.edgeSizeForCreation = 10; + globalProperty.edgeSizeForSearch = 40; + globalProperty.indexType = NGT::Property::GraphAndTree; + globalProperty.insertionRadiusCoefficient = 1.1; + + localProperty.indexType = NGT::Property::GraphAndTree; + localProperty.insertionRadiusCoefficient = 1.1; + + NGTQ::Index::create(quantizedIndexPath, property, globalProperty, localProperty); + + } + + static void quantize(const std::string indexPath, float dimensionOfSubvector, size_t maxNumOfEdges) { + NGT::Index index(indexPath); + NGT::ObjectSpace &objectSpace = index.getObjectSpace(); + + + { + std::string quantizedIndexPath = indexPath + "/qg"; + struct stat st; + if (stat(quantizedIndexPath.c_str(), &st) != 0) { + NGT::Property ngtProperty; + index.getProperty(ngtProperty); + //NGTQG::Command::CreateParameters createParameters(args, property.dimension); + constructQuantizedGraphFrame(quantizedIndexPath, ngtProperty.dimension, 
dimensionOfSubvector); + buildQuantizedObjects(quantizedIndexPath, objectSpace); + if (maxNumOfEdges != 0) { + buildQuantizedGraph(indexPath, maxNumOfEdges); + } + } + } + } + const std::string path; NGTQ::Index quantizedIndex; NGTQ::Index blobIndex; @@ -370,4 +484,3 @@ namespace NGTQG { } -#endif diff --git a/lib/NGT/NGTQ/Quantizer.h b/lib/NGT/NGTQ/Quantizer.h index 501e0a3..6c6a424 100644 --- a/lib/NGT/NGTQ/Quantizer.h +++ b/lib/NGT/NGTQ/Quantizer.h @@ -1739,9 +1739,14 @@ class QuantizerInstance : public Quantizer { #else invertedIndexEntry.pushBack(object.second); #endif - if (id.distance != 0.0) { + if (property.centroidCreationMode == CentroidCreationModeStatic) { localData.push_back(LocalDatam(globalCentroidID, invertedIndexEntry.size() - 1)); + } else { + if (id.distance != 0.0) { + localData.push_back(LocalDatam(globalCentroidID, + invertedIndexEntry.size() - 1)); + } } } else { if (property.centroidCreationMode != CentroidCreationModeDynamic) { @@ -2899,7 +2904,6 @@ class Quantization { return quantizer->getSharedMemorySize(os, t); } - protected: static NGTQ::Quantizer *getQuantizer(const string &index) { diff --git a/lib/NGT/ObjectSpace.h b/lib/NGT/ObjectSpace.h index 2e31f65..ba1a687 100644 --- a/lib/NGT/ObjectSpace.h +++ b/lib/NGT/ObjectSpace.h @@ -256,7 +256,7 @@ namespace NGT { } if (sum == 0.0) { std::stringstream msg; - msg << "ObjectSpace::normalize: Error! the object is an invalid zero vector for the cosine similarity or angle distance."; + msg << "ObjectSpace::normalize: Error! the object is an invalid zero vector for the cosine similarity or normalized distances."; NGTThrowException(msg); } sum = sqrt(sum);