diff --git a/VERSION b/VERSION index 6b37cb7..0eed1a2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.11.6 +1.12.0 diff --git a/lib/NGT/Command.cpp b/lib/NGT/Command.cpp index c15b0f0..2dce27e 100644 --- a/lib/NGT/Command.cpp +++ b/lib/NGT/Command.cpp @@ -32,6 +32,9 @@ using namespace std; "[-e epsilon] [-o object-type(f|c)] [-D distance-function(1|2|a|A|h|j|c|C)] [-n #-of-inserted-objects] " "[-P path-adjustment-interval] [-B dynamic-edge-size-base] [-A object-alignment(t|f)] " "[-T build-time-limit] [-O outgoing x incoming] " +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + "[-N maximum-#-of-inserted-objects] " +#endif "index(output) [data.tsv(input)]"; string database; try { @@ -159,6 +162,9 @@ using namespace std; case 'j': property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeJaccard; break; + case 'J': + property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeSparseJaccard; + break; case 'c': property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeCosine; break; @@ -171,6 +177,15 @@ using namespace std; return; } +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + size_t maxNoOfObjects = args.getl("N", 0); + if (maxNoOfObjects > 0) { + property.graphSharedMemorySize + = property.treeSharedMemorySize + = property.objectSharedMemorySize = 512 * ceil(maxNoOfObjects / 50000000); + } +#endif + switch (indexType) { case 't': NGT::Index::createGraphAndTree(database, property, data, dataSize); diff --git a/lib/NGT/Common.h b/lib/NGT/Common.h index d9b7fb0..33f440e 100644 --- a/lib/NGT/Common.h +++ b/lib/NGT/Common.h @@ -1700,11 +1700,11 @@ namespace NGT { class SearchQuery : public NGT::SearchContainer { public: - template SearchQuery(std::vector &q):query(0) { setQuery(q); } - template SearchQuery(SearchContainer &sc, std::vector &q): SearchContainer(sc), query(0) { setQuery(q); } + template SearchQuery(const std::vector &q):query(0) { setQuery(q); } + template SearchQuery(SearchContainer &sc, const std::vector &q): SearchContainer(sc), query(0) { setQuery(q); } ~SearchQuery() { deleteQuery(); } - template void setQuery(std::vector &q) { + template void setQuery(const std::vector &q) { if (query != 0) { deleteQuery(); } diff --git a/lib/NGT/Graph.cpp b/lib/NGT/Graph.cpp index 1163509..1fed17e 100644 --- a/lib/NGT/Graph.cpp +++ b/lib/NGT/Graph.cpp @@ -100,6 +100,12 @@ NeighborhoodGraph::Search::l2Float(NeighborhoodGraph &graph, NGT::SearchContaine graph.searchReadOnlyGraph(sc, seeds); } +void +NeighborhoodGraph::Search::sparseJaccardFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds) +{ + graph.searchReadOnlyGraph(sc, seeds); +} + void NeighborhoodGraph::Search::l1Uint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds) { @@ -162,6 +168,12 @@ NeighborhoodGraph::Search::l2FloatForLargeDataset(NeighborhoodGraph &graph, NGT: graph.searchReadOnlyGraph(sc, seeds); } +void +NeighborhoodGraph::Search::sparseJaccardFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds) +{ + graph.searchReadOnlyGraph(sc, seeds); +} + void NeighborhoodGraph::Search::l1Uint8ForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds) { diff --git a/lib/NGT/Graph.h b/lib/NGT/Graph.h index a0b8ffb..a3708cd 100644 --- a/lib/NGT/Graph.h +++ b/lib/NGT/Graph.h @@ -287,6 +287,7 @@ namespace NGT { case NGT::ObjectSpace::DistanceTypeAngle : return angleFloat; case NGT::ObjectSpace::DistanceTypeL2 : return l2Float; case NGT::ObjectSpace::DistanceTypeL1 : return l1Float; + case NGT::ObjectSpace::DistanceTypeSparseJaccard : return sparseJaccardFloat; default: return l2Float; } break; @@ -312,6 +313,7 @@ namespace NGT { case NGT::ObjectSpace::DistanceTypeAngle : return angleFloatForLargeDataset; case NGT::ObjectSpace::DistanceTypeL2 : return l2FloatForLargeDataset; case NGT::ObjectSpace::DistanceTypeL1 : return l1FloatForLargeDataset; + case NGT::ObjectSpace::DistanceTypeSparseJaccard : return sparseJaccardFloatForLargeDataset; default: return l2FloatForLargeDataset; } break; @@ -334,6 +336,7 @@ namespace NGT { static void l2Float(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void hammingUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void jaccardUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); + static void sparseJaccardFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void cosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void angleFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void normalizedCosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); @@ -345,6 +348,7 @@ namespace NGT { static void l2FloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void hammingUint8ForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void jaccardUint8ForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); + static void sparseJaccardFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void cosineSimilarityFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void angleFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); static void normalizedCosineSimilarityFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds); diff --git a/lib/NGT/GraphReconstructor.h b/lib/NGT/GraphReconstructor.h index 0ecb2e1..1d80248 100644 --- a/lib/NGT/GraphReconstructor.h +++ b/lib/NGT/GraphReconstructor.h @@ -696,7 +696,7 @@ class GraphReconstructor { #endif } - // reconstruct a pseudo ANNG with a fewer edges form an actual ANNG with more edges. + // reconstruct a pseudo ANNG with a fewer edges from an actual ANNG with more edges. // graph is a source ANNG // index is an index with a reconstructed ANNG static diff --git a/lib/NGT/Index.cpp b/lib/NGT/Index.cpp index f52fa50..20e9aba 100644 --- a/lib/NGT/Index.cpp +++ b/lib/NGT/Index.cpp @@ -300,6 +300,25 @@ NGT::Index::exportIndex(const string &database, const string &file) { cerr << "# of objects=" << idx.getObjectRepositorySize() - 1 << endl; } +std::vector +NGT::Index::makeSparseObject(std::vector &object) +{ + if (static_cast(getIndex()).getProperty().distanceType != NGT::ObjectSpace::DistanceType::DistanceTypeSparseJaccard) { + NGTThrowException("NGT::Index::makeSparseObject: Not sparse jaccard."); + } + size_t dimension = getObjectSpace().getDimension(); + if (object.size() + 1 > dimension) { + std::stringstream msg; + dimension = object.size() + 1; + } + std::vector obj(dimension, 0.0); + for (size_t i = 0; i < object.size(); i++) { + float fv = *reinterpret_cast(&object[i]); + obj[i] = fv; + } + return obj; +} + void NGT::Index::Property::set(NGT::Property &prop) { if (prop.dimension != -1) dimension = prop.dimension; @@ -465,12 +484,17 @@ class BuildTimeController { void NGT::GraphIndex::constructObjectSpace(NGT::Property &prop) { assert(prop.dimension != 0); + size_t dimension = prop.dimension; + if (prop.distanceType == NGT::ObjectSpace::DistanceType::DistanceTypeSparseJaccard) { + dimension++; + } + switch (prop.objectType) { case NGT::ObjectSpace::ObjectType::Float : - objectSpace = new ObjectSpaceRepository(prop.dimension, typeid(float), prop.distanceType); + objectSpace = new ObjectSpaceRepository(dimension, typeid(float), prop.distanceType); break; case NGT::ObjectSpace::ObjectType::Uint8 : - objectSpace = new ObjectSpaceRepository(prop.dimension, typeid(uint8_t), prop.distanceType); + objectSpace = new ObjectSpaceRepository(dimension, typeid(uint8_t), prop.distanceType); break; default: stringstream msg; diff --git a/lib/NGT/Index.h b/lib/NGT/Index.h index 7f308ae..9cf5e1a 100644 --- a/lib/NGT/Index.h +++ b/lib/NGT/Index.h @@ -76,7 +76,7 @@ namespace NGT { databaseType = DatabaseType::MemoryMappedFile; graphSharedMemorySize = 512; // MB treeSharedMemorySize = 512; // MB - objectSharedMemorySize = 512; // MB 512 is up to 20M objects. + objectSharedMemorySize = 512; // MB 512 is up to 50M objects. #else databaseType = DatabaseType::Memory; #endif @@ -116,6 +116,7 @@ namespace NGT { case DistanceType::DistanceTypeL2: p.set("DistanceType", "L2"); break; case DistanceType::DistanceTypeHamming: p.set("DistanceType", "Hamming"); break; case DistanceType::DistanceTypeJaccard: p.set("DistanceType", "Jaccard"); break; + case DistanceType::DistanceTypeSparseJaccard: p.set("DistanceType", "SparseJaccard"); break; case DistanceType::DistanceTypeAngle: p.set("DistanceType", "Angle"); break; case DistanceType::DistanceTypeCosine: p.set("DistanceType", "Cosine"); break; case DistanceType::DistanceTypeNormalizedAngle: p.set("DistanceType", "NormalizedAngle"); break; @@ -177,6 +178,8 @@ namespace NGT { distanceType = DistanceType::DistanceTypeHamming; } else if (it->second == "Jaccard") { distanceType = DistanceType::DistanceTypeJaccard; + } else if (it->second == "SparseJaccard") { + distanceType = DistanceType::DistanceTypeSparseJaccard; } else if (it->second == "Angle") { distanceType = DistanceType::DistanceTypeAngle; } else if (it->second == "Cosine") { @@ -392,8 +395,8 @@ namespace NGT { static void createGraphAndTree(const std::string &database, NGT::Property &prop, const std::string &dataFile, size_t dataSize = 0, bool redirect = false); static void createGraphAndTree(const std::string &database, NGT::Property &prop, bool redirect = false) { createGraphAndTree(database, prop, "", redirect); } static void createGraph(const std::string &database, NGT::Property &prop, const std::string &dataFile, size_t dataSize = 0, bool redirect = false); - template size_t insert(std::vector &object); - template size_t append(std::vector &object); + template size_t insert(const std::vector &object); + template size_t append(const std::vector &object); static void append(const std::string &database, const std::string &dataFile, size_t threadSize, size_t dataSize); static void append(const std::string &database, const float *data, size_t dataSize, size_t threadSize); static void remove(const std::string &database, std::vector &objects, bool force = false); @@ -470,6 +473,7 @@ namespace NGT { ObjectDistances seeds; getIndex().search(sc, seeds); } + std::vector makeSparseObject(std::vector &object); Index &getIndex() { if (index == 0) { assert(index != 0); @@ -1638,9 +1642,8 @@ namespace NGT { } // namespace NGT - template -size_t NGT::Index::append(std::vector &object) +size_t NGT::Index::append(const std::vector &object) { if (getObjectSpace().getRepository().size() == 0) { getObjectSpace().getRepository().initialize(); @@ -1653,7 +1656,7 @@ size_t NGT::Index::append(std::vector &object) } template -size_t NGT::Index::insert(std::vector &object) +size_t NGT::Index::insert(const std::vector &object) { if (getObjectSpace().getRepository().size() == 0) { getObjectSpace().getRepository().initialize(); diff --git a/lib/NGT/ObjectRepository.h b/lib/NGT/ObjectRepository.h index d4c6684..a19ccc6 100644 --- a/lib/NGT/ObjectRepository.h +++ b/lib/NGT/ObjectRepository.h @@ -32,7 +32,7 @@ namespace NGT { public: typedef Repository Parent; #endif - ObjectRepository(size_t dim, const std::type_info &ot):dimension(dim), type(ot) { } + ObjectRepository(size_t dim, const std::type_info &ot):dimension(dim), type(ot), sparse(false) { } void initialize() { deleteAll(); @@ -220,22 +220,28 @@ namespace NGT { } template - Object *allocateObject(T *o, size_t size = 0) { - Object *po = new Object(paddedByteSize); - if (size != 0 && dimension != size) { - std::cerr << "ObjectSpace::allocateObject: Fatal error! dimension is invalid. The indexed objects=" - << dimension << " The specified object=" << size << std::endl; - assert(dimension == size); + Object *allocateObject(T *o, size_t size) { + size_t osize = paddedByteSize; + if (sparse) { + size_t vsize = size * (type == typeid(float) ? 4 : 1); + osize = osize < vsize ? vsize : osize; + } else { + if (dimension != size) { + std::cerr << "ObjectSpace::allocateObject: Fatal error! dimension is invalid. The indexed objects=" + << dimension << " The specified object=" << size << std::endl; + assert(dimension == size); + } } + Object *po = new Object(osize); void *object = static_cast(&(*po)[0]); if (type == typeid(uint8_t)) { uint8_t *obj = static_cast(object); - for (size_t i = 0; i < dimension; i++) { + for (size_t i = 0; i < size; i++) { obj[i] = static_cast(o[i]); } } else if (type == typeid(float)) { float *obj = static_cast(object); - for (size_t i = 0; i < dimension; i++) { + for (size_t i = 0; i < size; i++) { obj[i] = static_cast(o[i]); } } else { @@ -270,13 +276,14 @@ namespace NGT { } template - PersistentObject *allocatePersistentObject(T *o, size_t size = 0) { + PersistentObject *allocatePersistentObject(T *o, size_t size) { SharedMemoryAllocator &objectAllocator = getAllocator(); PersistentObject *po = new (objectAllocator) PersistentObject(objectAllocator, paddedByteSize); if (size != 0 && dimension != size) { - std::cerr << "ObjectSpace::allocateObject: Fatal error! dimension is invalid. The indexed objects=" - << dimension << " The specified object=" << size << std::endl; - assert(dimension == size); + std::stringstream msg; + msg << "ObjectSpace::allocatePersistentObject: Fatal error! The dimensionality is invalid. The specified dimensionality=" + << (sparse ? dimension - 1 : dimension) << ". The specified object=" << (sparse ? size - 1 : size) << "."; + NGTThrowException(msg); } void *object = static_cast(&(*po).at(0, allocator)); if (type == typeid(uint8_t)) { @@ -302,10 +309,20 @@ namespace NGT { } #else - // ObjectRepository + template + PersistentObject *allocatePersistentObject(T *o, size_t size) { + if (size != 0 && dimension != size) { + std::stringstream msg; + msg << "ObjectSpace::allocatePersistentObject: Fatal error! The dimensionality is invalid. The specified dimensionality=" + << (sparse ? dimension - 1 : dimension) << ". The specified object=" << (sparse ? size - 1 : size) << "."; + NGTThrowException(msg); + } + return allocateObject(o, size); + } + template PersistentObject *allocatePersistentObject(const std::vector &o) { - return allocateObject(o); + return allocatePersistentObject(o.data(), o.size()); } #endif @@ -344,13 +361,9 @@ namespace NGT { } #endif - void setLength(size_t l) { - byteSize = l; - } - void setPaddedLength(size_t l) { - paddedByteSize = l; - } - + void setLength(size_t l) { byteSize = l; } + void setPaddedLength(size_t l) { paddedByteSize = l; } + void setSparse() { sparse = true; } size_t getByteSize() { return byteSize; } size_t insert(PersistentObject *obj) { return Parent::insert(obj); } const size_t dimension; @@ -358,6 +371,7 @@ namespace NGT { protected: size_t byteSize; // the length of all of elements. size_t paddedByteSize; + bool sparse; // sparse data format }; } // namespace NGT diff --git a/lib/NGT/ObjectSpace.h b/lib/NGT/ObjectSpace.h index 5e575fe..abbaae7 100644 --- a/lib/NGT/ObjectSpace.h +++ b/lib/NGT/ObjectSpace.h @@ -172,7 +172,8 @@ namespace NGT { DistanceTypeCosine = 4, DistanceTypeNormalizedAngle = 5, DistanceTypeNormalizedCosine = 6, - DistanceTypeJaccard = 7 + DistanceTypeJaccard = 7, + DistanceTypeSparseJaccard = 8 }; enum ObjectType { diff --git a/lib/NGT/ObjectSpaceRepository.h b/lib/NGT/ObjectSpaceRepository.h index 38482a9..b967593 100644 --- a/lib/NGT/ObjectSpaceRepository.h +++ b/lib/NGT/ObjectSpaceRepository.h @@ -114,6 +114,27 @@ namespace NGT { #endif }; + class ComparatorSparseJaccardDistance : public Comparator { + public: +#ifdef NGT_SHARED_MEMORY_ALLOCATOR + ComparatorSparseJaccardDistance(size_t d, SharedMemoryAllocator &a) : Comparator(d, a) {} + double operator()(Object &objecta, Object &objectb) { + return PrimitiveComparator::compareSparseJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension); + } + double operator()(Object &objecta, PersistentObject &objectb) { + return PrimitiveComparator::compareSparseJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb.at(0, allocator), dimension); + } + double operator()(PersistentObject &objecta, PersistentObject &objectb) { + return PrimitiveComparator::compareSparseJaccardDistance((OBJECT_TYPE*)&objecta.at(0, allocator), (OBJECT_TYPE*)&objectb.at(0, allocator), dimension); + } +#else + ComparatorSparseJaccardDistance(size_t d) : Comparator(d) {} + double operator()(Object &objecta, Object &objectb) { + return PrimitiveComparator::compareSparseJaccardDistance((OBJECT_TYPE*)&objecta[0], (OBJECT_TYPE*)&objectb[0], dimension); + } +#endif + }; + class ComparatorAngleDistance : public Comparator { public: #ifdef NGT_SHARED_MEMORY_ALLOCATOR @@ -279,6 +300,10 @@ namespace NGT { case DistanceTypeJaccard: comparator = new ObjectSpaceRepository::ComparatorJaccardDistance(ObjectSpace::getPaddedDimension(), ObjectRepository::allocator); break; + case DistanceTypeSparseJaccard: + comparator = new ObjectSpaceRepository::ComparatorSparseJaccardDistance(ObjectSpace::getPaddedDimension(), ObjectRepository::allocator); + setSparse(); + break; case DistanceTypeAngle: comparator = new ObjectSpaceRepository::ComparatorAngleDistance(ObjectSpace::getPaddedDimension(), ObjectRepository::allocator); break; @@ -306,6 +331,10 @@ namespace NGT { case DistanceTypeJaccard: comparator = new ObjectSpaceRepository::ComparatorJaccardDistance(ObjectSpace::getPaddedDimension()); break; + case DistanceTypeSparseJaccard: + comparator = new ObjectSpaceRepository::ComparatorSparseJaccardDistance(ObjectSpace::getPaddedDimension()); + setSparse(); + break; case DistanceTypeAngle: comparator = new ObjectSpaceRepository::ComparatorAngleDistance(ObjectSpace::getPaddedDimension()); break; @@ -363,7 +392,11 @@ namespace NGT { for (size_t idx = 0; idx < rep.size(); idx++) { #ifndef NGT_PREFETCH_DISABLED if (idx + prefetchOffset < rep.size() && rep[idx + prefetchOffset] != 0) { +#if defined(NGT_SHARED_MEMORY_ALLOCATOR) + MemoryCache::prefetch((unsigned char*)&(*static_cast(ObjectRepository::get(idx + prefetchOffset))), byteSizeOfObject); +#else MemoryCache::prefetch((unsigned char*)&(*static_cast(rep[idx + prefetchOffset]))[0], byteSizeOfObject); +#endif } #endif if (rep[idx] == 0) { diff --git a/lib/NGT/PrimitiveComparator.h b/lib/NGT/PrimitiveComparator.h index fb9a71f..e031e19 100644 --- a/lib/NGT/PrimitiveComparator.h +++ b/lib/NGT/PrimitiveComparator.h @@ -379,6 +379,32 @@ namespace NGT { } #endif + inline static double compareSparseJaccardDistance(const unsigned char *a, unsigned char *b, size_t size) { + abort(); + } + + + inline static double compareSparseJaccardDistance(const float *a, const float *b, size_t size) { + size_t loca = 0; + size_t locb = 0; + const uint32_t *ai = reinterpret_cast(a); + const uint32_t *bi = reinterpret_cast(b); + size_t count = 0; + while (locb < size && ai[loca] != 0 && bi[loca] != 0) { + int64_t sub = static_cast(ai[loca]) - static_cast(bi[locb]); + count += sub == 0; + loca += sub <= 0; + locb += sub >= 0; + } + while (ai[loca] != 0) { + loca++; + } + while (locb < size && bi[locb] != 0) { + locb++; + } + return 1.0 - static_cast(count) / static_cast(loca + locb - count); + } + #if defined(NGT_NO_AVX) template inline static double compareDotProduct(const OBJECT_TYPE *a, const OBJECT_TYPE *b, size_t size) { @@ -595,6 +621,13 @@ namespace NGT { } }; + class SparseJaccardFloat { + public: + inline static double compare(const void *a, const void *b, size_t size) { + return PrimitiveComparator::compareSparseJaccardDistance((const float*)a, (const float*)b, size); + } + }; + class L2Float { public: inline static double compare(const void *a, const void *b, size_t size) { diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index d233b68..c4a9444 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -5,4 +5,5 @@ if( ${UNIX} ) add_subdirectory("${PROJECT_SOURCE_DIR}/samples/l2-uint8") add_subdirectory("${PROJECT_SOURCE_DIR}/samples/l2-uint8-range-search") add_subdirectory("${PROJECT_SOURCE_DIR}/samples/cosine-float") + add_subdirectory("${PROJECT_SOURCE_DIR}/samples/jaccard-sparse") endif() diff --git a/samples/cosine-float/cosine-float.cpp b/samples/cosine-float/cosine-float.cpp index 67df507..d76018b 100644 --- a/samples/cosine-float/cosine-float.cpp +++ b/samples/cosine-float/cosine-float.cpp @@ -25,8 +25,16 @@ main(int argc, char **argv) while (!linestream.eof()) { float value; linestream >> value; + if (linestream.fail()) { + obj.clear(); + break; + } obj.push_back(value); } + if (obj.empty()) { + cerr << "An empty line or invalid value: " << line << endl; + continue; + } obj.resize(property.dimension); // cut off additional data in the file. index.append(obj); } diff --git a/samples/hamming-uint8/hamming-uint8.cpp b/samples/hamming-uint8/hamming-uint8.cpp index 2a37680..ba73607 100644 --- a/samples/hamming-uint8/hamming-uint8.cpp +++ b/samples/hamming-uint8/hamming-uint8.cpp @@ -26,8 +26,16 @@ main(int argc, char **argv) while (!linestream.eof()) { int value; linestream >> value; + if (linestream.fail()) { + obj.clear(); + break; + } obj.push_back(value); } + if (obj.empty()) { + cerr << "An empty line or invalid value: " << line << endl; + continue; + } obj.resize(property.dimension); // cut off additional data in the file. index.append(obj); } diff --git a/samples/jaccard-sparse/CMakeLists.txt b/samples/jaccard-sparse/CMakeLists.txt new file mode 100644 index 0000000..bd6408d --- /dev/null +++ b/samples/jaccard-sparse/CMakeLists.txt @@ -0,0 +1,9 @@ +if( ${UNIX} ) + include_directories("${PROJECT_BINARY_DIR}/lib") + include_directories("${PROJECT_SOURCE_DIR}/lib") + link_directories("${PROJECT_SOURCE_DIR}/lib/NGT") + + add_executable(jaccard-sparse jaccard-sparse.cpp) + add_dependencies(jaccard-sparse ngt) + target_link_libraries(jaccard-sparse ngt pthread) +endif() diff --git a/samples/jaccard-sparse/jaccard-sparse.cpp b/samples/jaccard-sparse/jaccard-sparse.cpp new file mode 100644 index 0000000..1e4c472 --- /dev/null +++ b/samples/jaccard-sparse/jaccard-sparse.cpp @@ -0,0 +1,248 @@ + +// sort -R sparse_binary.tsv |head -10 > sparse_binary_query_10.tsv +// ./jaccard-sparse create -d 100 -D J sparse +// ./jaccard-sparse append sparse sparse_binary.tsv +// ./jaccard-sparse search sparse sparse_binary_query_10.tsv +// + +#include "NGT/Command.h" + +using namespace std; + +void help() { + cerr << "Usage : jaccard-sparse command index [data]" << endl; + cerr << " command : info create search append" << endl; +} + +void +append(NGT::Args &args) +{ + const string usage = "Usage: jaccard-sparse append [-p #-of-thread] [-n data-size] " + "index(output) [data.tsv(input)]"; + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "jaccard-sparse: Error: DB is not specified." << endl; + cerr << usage << endl; + return; + } + string data; + try { + data = args.get("#2"); + } catch (...) { + cerr << "jaccard-sparse: Warning: No specified object file. Just build an index for the existing objects." << endl; + } + + int threadSize = args.getl("p", 50); + size_t dataSize = args.getl("n", 0); + + std::istream *is; + std::ifstream *ifs = 0; + + try { + NGT::Index index(database); + if (data == "-") { + is = &std::cin; + } else { + ifs = new std::ifstream; + ifs->std::ifstream::open(data); + if (!(*ifs)) { + cerr << "Cannot open the specified data file. " << data << endl; + return; + } + is = ifs; + } + string line; + size_t count = 0; + while(getline(*is, line)) { + if (dataSize > 0 && count >= dataSize) { + break; + } + count++; + vector object; + stringstream linestream(line); + while (!linestream.eof()) { + uint32_t value; + linestream >> value; + if (linestream.fail()) { + object.clear(); + break; + } + object.push_back(value); + } + if (object.empty()) { + std::cerr << "jaccard-sparse: Empty line or invalid value. " << count << ":" << line << std::endl; + continue; + } + NGT::ObjectID id = index.append(index.makeSparseObject(object)); + } + if (data != "-") { + delete ifs; + } + index.createIndex(threadSize); + index.saveIndex(database); + } catch (NGT::Exception &err) { + if (data != "-") { + delete ifs; + } + cerr << "jaccard-sparse: Error " << err.what() << endl; + cerr << usage << endl; + } + return; +} + + +void +search(NGT::Index &index, NGT::Command::SearchParameter &searchParameter, ostream &stream) +{ + + std::ifstream is(searchParameter.query); + if (!is) { + std::cerr << "Cannot open the specified file. " << searchParameter.query << std::endl; + return; + } + + if (searchParameter.outputMode[0] == 'e') { + stream << "# Beginning of Evaluation" << endl; + } + + string line; + double totalTime = 0; + size_t queryCount = 0; + double epsilon = searchParameter.beginOfEpsilon; + + while(getline(is, line)) { + if (searchParameter.querySize > 0 && queryCount >= searchParameter.querySize) { + break; + } + vector query; + stringstream linestream(line); + while (!linestream.eof()) { + uint32_t value; + linestream >> value; + query.push_back(value); + } + auto sparseQuery = index.makeSparseObject(query); + queryCount++; + NGT::SearchQuery sc(sparseQuery); + NGT::ObjectDistances objects; + sc.setResults(&objects); + sc.setSize(searchParameter.size); + sc.setRadius(searchParameter.radius); + if (searchParameter.accuracy > 0.0) { + sc.setExpectedAccuracy(searchParameter.accuracy); + } else { + sc.setEpsilon(epsilon); + } + sc.setEdgeSize(searchParameter.edgeSize); + NGT::Timer timer; + switch (searchParameter.indexType) { + case 't': timer.start(); index.search(sc); timer.stop(); break; + case 'g': timer.start(); index.searchUsingOnlyGraph(sc); timer.stop(); break; + case 's': timer.start(); index.linearSearch(sc); timer.stop(); break; + } + totalTime += timer.time; + if (searchParameter.outputMode[0] == 'e') { + stream << "# Query No.=" << queryCount << endl; + stream << "# Query=" << line.substr(0, 20) + " ..." << endl; + stream << "# Index Type=" << searchParameter.indexType << endl; + stream << "# Size=" << searchParameter.size << endl; + stream << "# Radius=" << searchParameter.radius << endl; + stream << "# Epsilon=" << epsilon << endl; + stream << "# Query Time (msec)=" << timer.time * 1000.0 << endl; + stream << "# Distance Computation=" << sc.distanceComputationCount << endl; + stream << "# Visit Count=" << sc.visitCount << endl; + } else { + stream << "Query No." << queryCount << endl; + stream << "Rank\tID\tDistance" << endl; + } + for (size_t i = 0; i < objects.size(); i++) { + stream << i + 1 << "\t" << objects[i].id << "\t"; + stream << objects[i].distance << endl; + } + if (searchParameter.outputMode[0] == 'e') { + stream << "# End of Search" << endl; + } else { + stream << "Query Time= " << timer.time << " (sec), " << timer.time * 1000.0 << " (msec)" << endl; + } + if (searchParameter.outputMode[0] == 'e') { + stream << "# End of Query" << endl; + } + } + if (searchParameter.outputMode[0] == 'e') { + stream << "# Average Query Time (msec)=" << totalTime * 1000.0 / (double)queryCount << endl; + stream << "# Number of queries=" << queryCount << endl; + stream << "# End of Evaluation" << endl; + } else { + stream << "Average Query Time= " << totalTime / (double)queryCount << " (sec), " + << totalTime * 1000.0 / (double)queryCount << " (msec), (" + << totalTime << "/" << queryCount << ")" << endl; + } +} + +void +search(NGT::Args &args) { + const string usage = "Usage: ngt search [-i index-type(g|t|s)] [-n result-size] [-e epsilon] [-E edge-size] " + "[-m open-mode(r|w)] [-o output-mode] index(input) query.tsv(input)"; + + string database; + try { + database = args.get("#1"); + } catch (...) { + cerr << "jaccard-sparse: Error: DB is not specified" << endl; + cerr << usage << endl; + return; + } + + NGT::Command::SearchParameter searchParameter(args); + + try { + NGT::Index index(database, searchParameter.openMode == 'r'); + search(index, searchParameter, cout); + } catch (NGT::Exception &err) { + cerr << "jaccard-sparse: Error " << err.what() << endl; + cerr << usage << endl; + } catch (...) { + cerr << "jaccard-sparse: Error" << endl; + cerr << usage << endl; + } + +} + +int +main(int argc, char **argv) +{ + + NGT::Args args(argc, argv); + + NGT::Command ngt; + + string command; + try { + command = args.get("#0"); + } catch(...) { + help(); + return 0; + } + + try { + if (command == "create") { + ngt.create(args); + } else if (command == "append") { + append(args); + } else if (command == "search") { + search(args); + } else { + cerr << "jaccard-sparse: Error: Illegal command. " << command << endl; + help(); + } + } catch(NGT::Exception &err) { + cerr << "jaccard-sparse: Error: " << err.what() << endl; + help(); + return 0; + } + return 0; +} + + diff --git a/samples/l2-uint8-range-search/l2-uint8-range-search.cpp b/samples/l2-uint8-range-search/l2-uint8-range-search.cpp index 543cd77..55984cd 100644 --- a/samples/l2-uint8-range-search/l2-uint8-range-search.cpp +++ b/samples/l2-uint8-range-search/l2-uint8-range-search.cpp @@ -25,8 +25,16 @@ main(int argc, char **argv) while (!linestream.eof()) { int value; linestream >> value; + if (linestream.fail()) { + obj.clear(); + break; + } obj.push_back(value); } + if (obj.empty()) { + cerr << "An empty line or invalid value: " << line << endl; + continue; + } obj.resize(property.dimension); // cut off additional data in the file. index.append(obj); } diff --git a/samples/l2-uint8/l2-uint8.cpp b/samples/l2-uint8/l2-uint8.cpp index 3beba93..90a1b20 100644 --- a/samples/l2-uint8/l2-uint8.cpp +++ b/samples/l2-uint8/l2-uint8.cpp @@ -25,8 +25,16 @@ main(int argc, char **argv) while (!linestream.eof()) { int value; linestream >> value; + if (linestream.fail()) { + obj.clear(); + break; + } obj.push_back(value); } + if (obj.empty()) { + cerr << "An empty line or invalid value: " << line << endl; + continue; + } obj.resize(property.dimension); // cut off additional data in the file. index.append(obj); }