Skip to content

Commit

Permalink
v1.12.0 add sparse jaccard distance
Browse files Browse the repository at this point in the history
  • Loading branch information
masajiro committed Jul 1, 2020
1 parent b9f8c37 commit a158833
Show file tree
Hide file tree
Showing 19 changed files with 465 additions and 36 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.11.6
1.12.0
15 changes: 15 additions & 0 deletions lib/NGT/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ using namespace std;
"[-e epsilon] [-o object-type(f|c)] [-D distance-function(1|2|a|A|h|j|c|C)] [-n #-of-inserted-objects] "
"[-P path-adjustment-interval] [-B dynamic-edge-size-base] [-A object-alignment(t|f)] "
"[-T build-time-limit] [-O outgoing x incoming] "
#if defined(NGT_SHARED_MEMORY_ALLOCATOR)
"[-N maximum-#-of-inserted-objects] "
#endif
"index(output) [data.tsv(input)]";
string database;
try {
Expand Down Expand Up @@ -159,6 +162,9 @@ using namespace std;
case 'j':
property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeJaccard;
break;
case 'J':
property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeSparseJaccard;
break;
case 'c':
property.distanceType = NGT::Index::Property::DistanceType::DistanceTypeCosine;
break;
Expand All @@ -171,6 +177,15 @@ using namespace std;
return;
}

#ifdef NGT_SHARED_MEMORY_ALLOCATOR
// Optional -N: maximum number of inserted objects, used to size the three
// shared-memory regions. 0 (the default) leaves the property defaults untouched.
size_t maxNoOfObjects = args.getl("N", 0);
if (maxNoOfObjects > 0) {
  // One unit of 512 per started block of 50M objects.
  // The ceiling is computed in integer arithmetic on purpose: the previous
  // `ceil(maxNoOfObjects / 50000000)` divided two integers first, so the
  // fraction was already truncated before ceil() ran and every value below
  // 50M yielded a size of 0.
  const size_t objectsPerUnit = 50000000;
  const size_t units = maxNoOfObjects / objectsPerUnit
                       + (maxNoOfObjects % objectsPerUnit != 0 ? 1 : 0);
  property.graphSharedMemorySize
    = property.treeSharedMemorySize
    = property.objectSharedMemorySize = 512 * units;
}
#endif

switch (indexType) {
case 't':
NGT::Index::createGraphAndTree(database, property, data, dataSize);
Expand Down
6 changes: 3 additions & 3 deletions lib/NGT/Common.h
Original file line number Diff line number Diff line change
Expand Up @@ -1700,11 +1700,11 @@ namespace NGT {

class SearchQuery : public NGT::SearchContainer {
public:
template <typename QTYPE> SearchQuery(std::vector<QTYPE> &q):query(0) { setQuery(q); }
template <typename QTYPE> SearchQuery(SearchContainer &sc, std::vector<QTYPE> &q): SearchContainer(sc), query(0) { setQuery(q); }
template <typename QTYPE> SearchQuery(const std::vector<QTYPE> &q):query(0) { setQuery(q); }
template <typename QTYPE> SearchQuery(SearchContainer &sc, const std::vector<QTYPE> &q): SearchContainer(sc), query(0) { setQuery(q); }
~SearchQuery() { deleteQuery(); }

template <typename QTYPE> void setQuery(std::vector<QTYPE> &q) {
template <typename QTYPE> void setQuery(const std::vector<QTYPE> &q) {
if (query != 0) {
deleteQuery();
}
Expand Down
12 changes: 12 additions & 0 deletions lib/NGT/Graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ NeighborhoodGraph::Search::l2Float(NeighborhoodGraph &graph, NGT::SearchContaine
graph.searchReadOnlyGraph<PrimitiveComparator::L2Float, DistanceCheckedSet>(sc, seeds);
}

// Search entry point for float objects compared with the sparse Jaccard
// comparator: forwards the search container and seeds to
// searchReadOnlyGraph, instantiated with PrimitiveComparator::SparseJaccardFloat
// and the DistanceCheckedSet visited-set type.
void
NeighborhoodGraph::Search::sparseJaccardFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds)
{
graph.searchReadOnlyGraph<PrimitiveComparator::SparseJaccardFloat, DistanceCheckedSet>(sc, seeds);
}

void
NeighborhoodGraph::Search::l1Uint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds)
{
Expand Down Expand Up @@ -162,6 +168,12 @@ NeighborhoodGraph::Search::l2FloatForLargeDataset(NeighborhoodGraph &graph, NGT:
graph.searchReadOnlyGraph<PrimitiveComparator::L2Float, DistanceCheckedSetForLargeDataset>(sc, seeds);
}

// Large-dataset search entry point for float objects compared with the sparse
// Jaccard comparator. Forwards the search container and seeds to
// searchReadOnlyGraph using DistanceCheckedSetForLargeDataset as the
// visited-set type, matching the other *ForLargeDataset entry points
// (the non-large DistanceCheckedSet was used here before).
void
NeighborhoodGraph::Search::sparseJaccardFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds)
{
  graph.searchReadOnlyGraph<PrimitiveComparator::SparseJaccardFloat, DistanceCheckedSetForLargeDataset>(sc, seeds);
}

void
NeighborhoodGraph::Search::l1Uint8ForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds)
{
Expand Down
4 changes: 4 additions & 0 deletions lib/NGT/Graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ namespace NGT {
case NGT::ObjectSpace::DistanceTypeAngle : return angleFloat;
case NGT::ObjectSpace::DistanceTypeL2 : return l2Float;
case NGT::ObjectSpace::DistanceTypeL1 : return l1Float;
case NGT::ObjectSpace::DistanceTypeSparseJaccard : return sparseJaccardFloat;
default: return l2Float;
}
break;
Expand All @@ -312,6 +313,7 @@ namespace NGT {
case NGT::ObjectSpace::DistanceTypeAngle : return angleFloatForLargeDataset;
case NGT::ObjectSpace::DistanceTypeL2 : return l2FloatForLargeDataset;
case NGT::ObjectSpace::DistanceTypeL1 : return l1FloatForLargeDataset;
case NGT::ObjectSpace::DistanceTypeSparseJaccard : return sparseJaccardFloatForLargeDataset;
default: return l2FloatForLargeDataset;
}
break;
Expand All @@ -334,6 +336,7 @@ namespace NGT {
static void l2Float(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void hammingUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void jaccardUint8(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void sparseJaccardFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void cosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void angleFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void normalizedCosineSimilarityFloat(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
Expand All @@ -345,6 +348,7 @@ namespace NGT {
static void l2FloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void hammingUint8ForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void jaccardUint8ForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void sparseJaccardFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void cosineSimilarityFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void angleFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
static void normalizedCosineSimilarityFloatForLargeDataset(NeighborhoodGraph &graph, NGT::SearchContainer &sc, ObjectDistances &seeds);
Expand Down
2 changes: 1 addition & 1 deletion lib/NGT/GraphReconstructor.h
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ class GraphReconstructor {
#endif
}

// reconstruct a pseudo ANNG with a fewer edges form an actual ANNG with more edges.
// reconstruct a pseudo ANNG with a fewer edges from an actual ANNG with more edges.
// graph is a source ANNG
// index is an index with a reconstructed ANNG
static
Expand Down
28 changes: 26 additions & 2 deletions lib/NGT/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,25 @@ NGT::Index::exportIndex(const string &database, const string &file) {
cerr << "# of objects=" << idx.getObjectRepositorySize() - 1 << endl;
}

// Packs a list of 32-bit unsigned values into a float vector without numeric
// conversion: the bit pattern of each uint32_t is stored as-is in a float slot.
//
// The returned vector has max(object space dimension, object.size() + 1)
// elements; slots beyond the copied values are 0.0, so at least one trailing
// zero always follows the data.
//
// Throws (via NGTThrowException) when the index's distance type is not
// DistanceTypeSparseJaccard.
std::vector<float>
NGT::Index::makeSparseObject(std::vector<uint32_t> &object)
{
  if (static_cast<NGT::GraphIndex&>(getIndex()).getProperty().distanceType != NGT::ObjectSpace::DistanceType::DistanceTypeSparseJaccard) {
    NGTThrowException("NGT::Index::makeSparseObject: Not sparse jaccard.");
  }
  // Grow the output when the input (plus one spare slot) does not fit in the
  // object space dimension.
  size_t dimension = getObjectSpace().getDimension();
  if (object.size() + 1 > dimension) {
    dimension = object.size() + 1;
  }
  std::vector<float> obj(dimension, 0.0);
  for (size_t i = 0; i < object.size(); i++) {
    // Bit-for-bit reinterpretation, not a value conversion.
    // NOTE(review): this pointer cast is type punning; consider std::memcpy
    // once <cstring> is confirmed to be included by this translation unit.
    obj[i] = *reinterpret_cast<float*>(&object[i]);
  }
  return obj;
}

void
NGT::Index::Property::set(NGT::Property &prop) {
if (prop.dimension != -1) dimension = prop.dimension;
Expand Down Expand Up @@ -465,12 +484,17 @@ class BuildTimeController {
void
NGT::GraphIndex::constructObjectSpace(NGT::Property &prop) {
assert(prop.dimension != 0);
size_t dimension = prop.dimension;
if (prop.distanceType == NGT::ObjectSpace::DistanceType::DistanceTypeSparseJaccard) {
dimension++;
}

switch (prop.objectType) {
case NGT::ObjectSpace::ObjectType::Float :
objectSpace = new ObjectSpaceRepository<float, double>(prop.dimension, typeid(float), prop.distanceType);
objectSpace = new ObjectSpaceRepository<float, double>(dimension, typeid(float), prop.distanceType);
break;
case NGT::ObjectSpace::ObjectType::Uint8 :
objectSpace = new ObjectSpaceRepository<unsigned char, int>(prop.dimension, typeid(uint8_t), prop.distanceType);
objectSpace = new ObjectSpaceRepository<unsigned char, int>(dimension, typeid(uint8_t), prop.distanceType);
break;
default:
stringstream msg;
Expand Down
15 changes: 9 additions & 6 deletions lib/NGT/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ namespace NGT {
databaseType = DatabaseType::MemoryMappedFile;
graphSharedMemorySize = 512; // MB
treeSharedMemorySize = 512; // MB
objectSharedMemorySize = 512; // MB 512 is up to 20M objects.
objectSharedMemorySize = 512; // MB 512 is up to 50M objects.
#else
databaseType = DatabaseType::Memory;
#endif
Expand Down Expand Up @@ -116,6 +116,7 @@ namespace NGT {
case DistanceType::DistanceTypeL2: p.set("DistanceType", "L2"); break;
case DistanceType::DistanceTypeHamming: p.set("DistanceType", "Hamming"); break;
case DistanceType::DistanceTypeJaccard: p.set("DistanceType", "Jaccard"); break;
case DistanceType::DistanceTypeSparseJaccard: p.set("DistanceType", "SparseJaccard"); break;
case DistanceType::DistanceTypeAngle: p.set("DistanceType", "Angle"); break;
case DistanceType::DistanceTypeCosine: p.set("DistanceType", "Cosine"); break;
case DistanceType::DistanceTypeNormalizedAngle: p.set("DistanceType", "NormalizedAngle"); break;
Expand Down Expand Up @@ -177,6 +178,8 @@ namespace NGT {
distanceType = DistanceType::DistanceTypeHamming;
} else if (it->second == "Jaccard") {
distanceType = DistanceType::DistanceTypeJaccard;
} else if (it->second == "SparseJaccard") {
distanceType = DistanceType::DistanceTypeSparseJaccard;
} else if (it->second == "Angle") {
distanceType = DistanceType::DistanceTypeAngle;
} else if (it->second == "Cosine") {
Expand Down Expand Up @@ -392,8 +395,8 @@ namespace NGT {
static void createGraphAndTree(const std::string &database, NGT::Property &prop, const std::string &dataFile, size_t dataSize = 0, bool redirect = false);
static void createGraphAndTree(const std::string &database, NGT::Property &prop, bool redirect = false) { createGraphAndTree(database, prop, "", redirect); }
static void createGraph(const std::string &database, NGT::Property &prop, const std::string &dataFile, size_t dataSize = 0, bool redirect = false);
template<typename T> size_t insert(std::vector<T> &object);
template<typename T> size_t append(std::vector<T> &object);
template<typename T> size_t insert(const std::vector<T> &object);
template<typename T> size_t append(const std::vector<T> &object);
static void append(const std::string &database, const std::string &dataFile, size_t threadSize, size_t dataSize);
static void append(const std::string &database, const float *data, size_t dataSize, size_t threadSize);
static void remove(const std::string &database, std::vector<ObjectID> &objects, bool force = false);
Expand Down Expand Up @@ -470,6 +473,7 @@ namespace NGT {
ObjectDistances seeds;
getIndex().search(sc, seeds);
}
std::vector<float> makeSparseObject(std::vector<uint32_t> &object);
Index &getIndex() {
if (index == 0) {
assert(index != 0);
Expand Down Expand Up @@ -1638,9 +1642,8 @@ namespace NGT {

} // namespace NGT


template<typename T>
size_t NGT::Index::append(std::vector<T> &object)
size_t NGT::Index::append(const std::vector<T> &object)
{
if (getObjectSpace().getRepository().size() == 0) {
getObjectSpace().getRepository().initialize();
Expand All @@ -1653,7 +1656,7 @@ size_t NGT::Index::append(std::vector<T> &object)
}

template<typename T>
size_t NGT::Index::insert(std::vector<T> &object)
size_t NGT::Index::insert(const std::vector<T> &object)
{
if (getObjectSpace().getRepository().size() == 0) {
getObjectSpace().getRepository().initialize();
Expand Down
58 changes: 36 additions & 22 deletions lib/NGT/ObjectRepository.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ namespace NGT {
public:
typedef Repository<Object> Parent;
#endif
ObjectRepository(size_t dim, const std::type_info &ot):dimension(dim), type(ot) { }
ObjectRepository(size_t dim, const std::type_info &ot):dimension(dim), type(ot), sparse(false) { }

void initialize() {
deleteAll();
Expand Down Expand Up @@ -220,22 +220,28 @@ namespace NGT {
}

template <typename T>
Object *allocateObject(T *o, size_t size = 0) {
Object *po = new Object(paddedByteSize);
if (size != 0 && dimension != size) {
std::cerr << "ObjectSpace::allocateObject: Fatal error! dimension is invalid. The indexed objects="
<< dimension << " The specified object=" << size << std::endl;
assert(dimension == size);
Object *allocateObject(T *o, size_t size) {
size_t osize = paddedByteSize;
if (sparse) {
size_t vsize = size * (type == typeid(float) ? 4 : 1);
osize = osize < vsize ? vsize : osize;
} else {
if (dimension != size) {
std::cerr << "ObjectSpace::allocateObject: Fatal error! dimension is invalid. The indexed objects="
<< dimension << " The specified object=" << size << std::endl;
assert(dimension == size);
}
}
Object *po = new Object(osize);
void *object = static_cast<void*>(&(*po)[0]);
if (type == typeid(uint8_t)) {
uint8_t *obj = static_cast<uint8_t*>(object);
for (size_t i = 0; i < dimension; i++) {
for (size_t i = 0; i < size; i++) {
obj[i] = static_cast<uint8_t>(o[i]);
}
} else if (type == typeid(float)) {
float *obj = static_cast<float*>(object);
for (size_t i = 0; i < dimension; i++) {
for (size_t i = 0; i < size; i++) {
obj[i] = static_cast<float>(o[i]);
}
} else {
Expand Down Expand Up @@ -270,13 +276,14 @@ namespace NGT {
}

template <typename T>
PersistentObject *allocatePersistentObject(T *o, size_t size = 0) {
PersistentObject *allocatePersistentObject(T *o, size_t size) {
SharedMemoryAllocator &objectAllocator = getAllocator();
PersistentObject *po = new (objectAllocator) PersistentObject(objectAllocator, paddedByteSize);
if (size != 0 && dimension != size) {
std::cerr << "ObjectSpace::allocateObject: Fatal error! dimension is invalid. The indexed objects="
<< dimension << " The specified object=" << size << std::endl;
assert(dimension == size);
std::stringstream msg;
msg << "ObjectSpace::allocatePersistentObject: Fatal error! The dimensionality is invalid. The specified dimensionality="
<< (sparse ? dimension - 1 : dimension) << ". The specified object=" << (sparse ? size - 1 : size) << ".";
NGTThrowException(msg);
}
void *object = static_cast<void*>(&(*po).at(0, allocator));
if (type == typeid(uint8_t)) {
Expand All @@ -302,10 +309,20 @@ namespace NGT {
}

#else
// ObjectRepository
// Validates the element count of the given raw array against the repository
// dimension and then delegates to allocateObject.
// A size of 0 bypasses the check. On a mismatch an exception is raised via
// NGTThrowException; when the repository is flagged sparse, both numbers in
// the message are reported reduced by one.
template <typename T>
PersistentObject *allocatePersistentObject(T *o, size_t size) {
  const bool dimensionMismatch = (size != 0) && (dimension != size);
  if (dimensionMismatch) {
    const size_t reportedDimension = sparse ? dimension - 1 : dimension;
    const size_t reportedSize = sparse ? size - 1 : size;
    std::stringstream msg;
    msg << "ObjectSpace::allocatePersistentObject: Fatal error! The dimensionality is invalid. The specified dimensionality=";
    msg << reportedDimension;
    msg << ". The specified object=";
    msg << reportedSize;
    msg << ".";
    NGTThrowException(msg);
  }
  return allocateObject(o, size);
}

template <typename T>
PersistentObject *allocatePersistentObject(const std::vector<T> &o) {
return allocateObject(o);
return allocatePersistentObject(o.data(), o.size());
}
#endif

Expand Down Expand Up @@ -344,20 +361,17 @@ namespace NGT {
}
#endif

void setLength(size_t l) {
byteSize = l;
}
void setPaddedLength(size_t l) {
paddedByteSize = l;
}

void setLength(size_t l) { byteSize = l; }
void setPaddedLength(size_t l) { paddedByteSize = l; }
void setSparse() { sparse = true; }
size_t getByteSize() { return byteSize; }
size_t insert(PersistentObject *obj) { return Parent::insert(obj); }
const size_t dimension;
const std::type_info &type;
protected:
size_t byteSize; // the length of all of elements.
size_t paddedByteSize;
bool sparse; // sparse data format
};

} // namespace NGT
3 changes: 2 additions & 1 deletion lib/NGT/ObjectSpace.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ namespace NGT {
DistanceTypeCosine = 4,
DistanceTypeNormalizedAngle = 5,
DistanceTypeNormalizedCosine = 6,
DistanceTypeJaccard = 7
DistanceTypeJaccard = 7,
DistanceTypeSparseJaccard = 8
};

enum ObjectType {
Expand Down
Loading

0 comments on commit a158833

Please # to comment.