From b56fcb38d7e556109dce09ec9dc5259abe1f12c3 Mon Sep 17 00:00:00 2001
From: PyDataBlog <pimpfada@gmail.com>
Date: Mon, 24 Jan 2022 22:46:50 +0100
Subject: [PATCH 1/3] Added bulk insertion via files & cleanup

---
 Project.toml          |  4 ++-
 README.md             |  3 +++
 docs/src/index.md     |  4 ++-
 src/SimString.jl      |  1 -
 src/features.jl       | 60 +++++++++++++++++++++++++++++++++++++++++++
 test/dummy_sents.txt  |  2 ++
 test/dummy_words.txt  |  3 +++
 test/test01_dictdb.jl | 34 ++++++++++++++++++++++++
 8 files changed, 108 insertions(+), 3 deletions(-)
 create mode 100644 test/dummy_sents.txt
 create mode 100644 test/dummy_words.txt

diff --git a/Project.toml b/Project.toml
index 63a7c5e..6b72164 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,9 +7,11 @@ version = "0.1.0"
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 
 [compat]
+CircularArrays = "1"
+DataStructures = "0.18"
+OffsetArrays = "1"
 julia = "1"
 
 [extras]
diff --git a/README.md b/README.md
index 9f07e30..900293d 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,8 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
+- [X] Support for building databases directly from text files
+- [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
 
@@ -24,6 +26,7 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Jaccard coefficient
 - [X] Cosine coefficient
 - [X] Overlap coefficient
+- [X] Exact match
 
 ## Installation
 
diff --git a/docs/src/index.md b/docs/src/index.md
index ef9edaa..dc07919 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [ ] Support for building databases directly from text files
+- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases
 
 ## Suported String Similarity Measures
@@ -64,6 +64,8 @@ push!(db, "fooo");
 
 # Convinient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`
 
+# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");`
+
 # Retrieve the closest match(es)
 res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 # 2-element Vector{Tuple{String, Float64}}:
diff --git a/src/SimString.jl b/src/SimString.jl
index 8def7a0..12a0f45 100644
--- a/src/SimString.jl
+++ b/src/SimString.jl
@@ -2,7 +2,6 @@ module SimString
 
 import Base: push!, append!
 using DataStructures: DefaultOrderedDict, DefaultDict
-using ProgressMeter
 using CircularArrays
 using OffsetArrays
 
diff --git a/src/features.jl b/src/features.jl
index 2ebc0ca..9980503 100644
--- a/src/features.jl
+++ b/src/features.jl
@@ -99,8 +99,25 @@ end
 
 
 """
+    push!(db::AbstractSimStringDB, str::AbstractString)
+
 Add a new item to a new or existing collection of strings using
 the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The collection of strings to add to
+* `str`: AbstractString - The string to add to the collection
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo")
+push!(db, "bar")
+push!(db, "fooo")
+```
+
+# Returns:
+* `db`: AbstractSimStringDB - The collection of strings with the new string added
 """
 function push!(db::AbstractSimStringDB, str::AbstractString)
     # Extract features based on the specified feature extractor
@@ -125,11 +142,54 @@ end
 
 
 """
+    append!(db::AbstractSimStringDB, str::Vector)
+
 Add bulk items to a new or existing collection of strings using
 the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The database to add the strings to
+* `str`: Vector of AbstractString - Vector/Array of strings to add to the database
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, ["foo", "foo", "fooo"]);
+```
+
+# Returns:
+* `nothing`: `db` is modified in place; the method does not return the database
 """
 function append!(db::AbstractSimStringDB, str::Vector)
     @inbounds @simd for i in str
         push!(db, i)
     end
+end
+
+
+"""
+    append!(db::AbstractSimStringDB, file::AbstractString)
+
+Add bulk items to a new or existing collection of strings, read line by line
+from a file, using the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The database to add the items to
+* `file`: AbstractString - Path to the file to read from
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, "./data/test.txt")
+```
+
+# Returns:
+* `nothing`: `db` is modified in place; the method does not return the database
+"""
+function append!(db::AbstractSimStringDB, file::AbstractString)
+    open(file) do f
+        while !eof(f)
+            push!(db, readline(f))
+        end
+    end
 end
\ No newline at end of file
diff --git a/test/dummy_sents.txt b/test/dummy_sents.txt
new file mode 100644
index 0000000..b60b3c9
--- /dev/null
+++ b/test/dummy_sents.txt
@@ -0,0 +1,2 @@
+You are a really really really cool dude.
+Sometimes you are not really really cool tho
\ No newline at end of file
diff --git a/test/dummy_words.txt b/test/dummy_words.txt
new file mode 100644
index 0000000..d4b7bf6
--- /dev/null
+++ b/test/dummy_words.txt
@@ -0,0 +1,3 @@
+foo
+bar
+fooo
\ No newline at end of file
diff --git a/test/test01_dictdb.jl b/test/test01_dictdb.jl
index 6531a70..4896098 100644
--- a/test/test01_dictdb.jl
+++ b/test/test01_dictdb.jl
@@ -75,6 +75,40 @@ end
 end
 
 
+@testset "Test bulk insertion from a file using CharacterNGrams" begin
+    db = DictDB(CharacterNGrams(3, " "))
+    append!(db, "dummy_words.txt")
+
+    @test db.string_collection == ["foo", "bar", "fooo"]
+    @test db.string_size_map[5] == Set(["bar", "foo"])
+    @test db.string_size_map[6] == Set(["fooo"])
+
+    @test collect(keys(db.string_feature_map)) == [5, 6]
+
+    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
+    @test collect(values(db.string_feature_map[6])) ==  repeat([Set(["fooo"])], 6)
+
+    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
+end
+
+
+
+@testset "Test bulk insertion from a file using WordNGrams" begin
+    db = DictDB(WordNGrams(2, " ", " "))
+    append!(db, "dummy_sents.txt")
+
+    @test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
+    @test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+
+    @test collect(keys(db.string_feature_map)) == [9]
+    @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+    @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
+
+    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
+
+end
+
+
 
 
 end  # module
\ No newline at end of file

From b9ffde995b0934afefe8b26a58781b18183357da Mon Sep 17 00:00:00 2001
From: Bernard Brenyah <pimpfada@gmail.com>
Date: Tue, 25 Jan 2022 20:52:43 +0000
Subject: [PATCH 2/3] Added show functionality for better information

---
 Project.toml            |  3 ++-
 docs/src/index.md       |  2 +-
 extras/benchmark_sim.jl | 48 +++++++++++++++++++++++++++++++++++++++++
 src/dictdb.jl           | 17 +++++++++++++--
 src/features.jl         |  4 ++--
 test/test01_dictdb.jl   | 28 ++++++++++++------------
 test/test04_search.jl   | 13 +++++++++++
 7 files changed, 95 insertions(+), 20 deletions(-)
 create mode 100644 extras/benchmark_sim.jl

diff --git a/Project.toml b/Project.toml
index 6b72164..84d9468 100644
--- a/Project.toml
+++ b/Project.toml
@@ -17,6 +17,7 @@ julia = "1"
 [extras]
 Faker = "0efc519c-db33-5916-ab87-703215c3906f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"
 
 [targets]
-test = ["Test", "Faker"]
+test = ["Test", "Faker", "Suppressor"]
diff --git a/docs/src/index.md b/docs/src/index.md
index dc07919..06d50b2 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -74,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 
 # Describe a working database collection
 desc = describe_collection(db)
-# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
+# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 ```
 
 ## TODO: Benchmarks
diff --git a/extras/benchmark_sim.jl b/extras/benchmark_sim.jl
new file mode 100644
index 0000000..82f9e9e
--- /dev/null
+++ b/extras/benchmark_sim.jl
@@ -0,0 +1,48 @@
+using SimString
+using Faker
+using BenchmarkTools
+using DataStructures
+
+################################# Benchmark Bulk addition #####################
+db = DictDB(CharacterNGrams(3, " "));
+Faker.seed(2020)
+@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];
+
+
+f(d, x) = append!(d, x)
+@time f(db, fake_names)
+
+
+
+################################ Simple Addition ###############################
+
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo");
+push!(db, "bar");
+push!(db, "fooo");
+
+f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
+test = "foo";
+col = db;
+sim = Cosine();
+a = 0.8;
+r = true;
+
+f(Cosine(),  db, "foo", 0.8, true)
+
+@btime f($sim,  $col, $test, $a, $r)
+@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
+
+
+
+db2 = DictDB(CharacterNGrams(3, " "));
+append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector
+
+results = search(Cosine(), db, "foo"; α=0.8, ranked=true)  # yet to be implemented
+
+bs = ["foo", "bar", "foo", "foo", "bar"]
+SimString.extract_features(CharacterNGrams(3, " "), "prepress")
+SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
+
+db = DictDB(WordNGrams(2, " ", " "))
+push!(db, "You are a really really really cool dude.")
diff --git a/src/dictdb.jl b/src/dictdb.jl
index 7ae857c..f4570ec 100644
--- a/src/dictdb.jl
+++ b/src/dictdb.jl
@@ -87,6 +87,7 @@ Basic summary stats for the DB
 db = DictDB(CharacterNGrams(2, " "));
 append!(db, ["foo", "bar", "fooo"]);
 describe_collection(db)
+(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 
 # Returns
 * NamedTuples: Summary stats for the DB
@@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
 # Total number of strings in collection
 ∑ = length(db.string_collection)
 
-# Average number of ngram features
+# Average size of ngram features
 n = [x for x in keys(db.string_size_map)]
 μ = sum(n) / length(n)
 
@@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
     total_ngrams += length(i)
 end
 
-return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
+return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
+end
+
+
+"""
+Pretty print summary stats for the DB
+"""
+function Base.show(io::IO, x::DictDB)
+    metrics = describe_collection(x)
+    println(io, "DictDB($(x.feature_extractor))")
+    println(io, "Total collection: ", metrics.total_collection)
+    println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
+    println(io, "Total number of ngram features: ", metrics.total_ngrams)
 end
 
 
diff --git a/src/features.jl b/src/features.jl
index 9980503..8486b3b 100644
--- a/src/features.jl
+++ b/src/features.jl
@@ -188,8 +188,8 @@ append!(db, "./data/test.txt")
 """
 function append!(db::AbstractSimStringDB, file::AbstractString)
     open(file) do f
-        while !eof(f)
-            push!(db, readline(f))
+        for line in eachline(f)
+            push!(db, line)
         end
     end
 end
\ No newline at end of file
diff --git a/test/test01_dictdb.jl b/test/test01_dictdb.jl
index 4896098..ddb7f43 100644
--- a/test/test01_dictdb.jl
+++ b/test/test01_dictdb.jl
@@ -15,8 +15,8 @@ using Test
 
     @test collect(keys(db.string_feature_map)) == [5, 6]
 
-    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
-    @test collect(values(db.string_feature_map[6])) ==  repeat([Set(["fooo"])], 6)
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
 end
 
 
@@ -41,10 +41,10 @@ end
 
     @test collect(keys(db.string_feature_map)) == [5, 6]
 
-    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
-    @test collect(values(db.string_feature_map[6])) ==  repeat([Set(["fooo"])], 6)
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
 
-    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
+    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
 end
 
 
@@ -59,19 +59,19 @@ end
     @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
     @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
 
-    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
+    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
 end
 
 
 
 @testset "Test describe functionality" begin
-    db = DictDB(CharacterNGrams(2, " "));
-    append!(db, ["foo", "bar", "fooo"]);
+    db = DictDB(CharacterNGrams(2, " "))
+    append!(db, ["foo", "bar", "fooo"])
 
     # Interact with db
-    search(Dice(), db, "zep"; α=0.8, ranked=true)
+    search(Dice(), db, "zep"; α = 0.8, ranked = true)
 
-    @test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
+    @test describe_collection(db) == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 end
 
 
@@ -85,10 +85,10 @@ end
 
     @test collect(keys(db.string_feature_map)) == [5, 6]
 
-    @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
-    @test collect(values(db.string_feature_map[6])) ==  repeat([Set(["fooo"])], 6)
+    @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
+    @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
 
-    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
+    @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
 end
 
 
@@ -104,7 +104,7 @@ end
     @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
     @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
 
-    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
+    @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
 
 end
 
diff --git a/test/test04_search.jl b/test/test04_search.jl
index 14f93e0..8cdb283 100644
--- a/test/test04_search.jl
+++ b/test/test04_search.jl
@@ -2,6 +2,7 @@ module TestMeasures
 using SimString
 using Test
 using Faker
+using Suppressor
 
 
 @testset "Test Dice Search" begin
@@ -54,6 +55,7 @@ end
 
 end
 
+
 @testset "Test Micro Deep Dive Search" begin
     db = DictDB(CharacterNGrams(2, " "));
     append!(db, ["a", "ab", "abc", "abcd", "abcde"]);
@@ -76,6 +78,17 @@ end
 end
 
 
+@testset "Test output from show" begin
+    db = DictDB(CharacterNGrams(2, " "));
+    append!(db, ["foo", "bar", "fooo"]);
+
+    expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n"
+    r = @capture_out show(db)
+    @test r == expected_out
+end
+
+
+
 
 
 end  # module
\ No newline at end of file

From ee18b527d12e0b392f460089a4013ef047e632fa Mon Sep 17 00:00:00 2001
From: Bernard Brenyah <pimpfada@gmail.com>
Date: Thu, 10 Feb 2022 11:31:48 +0000
Subject: [PATCH 3/3] Initial public release

---
 src/search.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/search.jl b/src/search.jl
index 29d6e2d..fa0ddb4 100644
--- a/src/search.jl
+++ b/src/search.jl
@@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
     results = String[]
 
     for (candidate, match_count) in candidate_match_counts
-        for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
+        for i in (query_feature_length - τ + 1) : query_feature_length
             if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
                 match_count += 1
             end
@@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
     features = extract_features(db_collection.feature_extractor, query)
 
     # Metadata from the generated features (length, min & max sizes)
-    length_of_features = length(features)
-    min_feature_size = minimum_feature_size(measure, length_of_features, α)
-    max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
+    # length_of_features = length(features)
+    # min_feature_size = minimum_feature_size(measure, length_of_features, α)
+    # max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
 
     results = String[]
 
     # Generate and return results from the potential candidate size pool
-    @inbounds for candidate_size in min_feature_size:max_feature_size
+    @inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
         # Minimum overlap
-        τ = minimum_overlap(measure, length_of_features, candidate_size, α)
+        τ = minimum_overlap(measure, length(features), candidate_size, α)
 
         # Generate approximate candidates from the overlap join
         append!(results, overlap_join(db_collection, features, τ, candidate_size))