From b56fcb38d7e556109dce09ec9dc5259abe1f12c3 Mon Sep 17 00:00:00 2001 From: PyDataBlog Date: Mon, 24 Jan 2022 22:46:50 +0100 Subject: [PATCH 1/3] Added bulk insertion via files & cleanup --- Project.toml | 4 ++- README.md | 3 +++ docs/src/index.md | 4 ++- src/SimString.jl | 1 - src/features.jl | 60 +++++++++++++++++++++++++++++++++++++++++++ test/dummy_sents.txt | 2 ++ test/dummy_words.txt | 3 +++ test/test01_dictdb.jl | 34 ++++++++++++++++++++++++ 8 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 test/dummy_sents.txt create mode 100644 test/dummy_words.txt diff --git a/Project.toml b/Project.toml index 63a7c5e..6b72164 100644 --- a/Project.toml +++ b/Project.toml @@ -7,9 +7,11 @@ version = "0.1.0" CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" -ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" [compat] +CircularArrays = "1" +DataStructures = "0.18" +OffsetArrays = "1" julia = "1" [extras] diff --git a/README.md b/README.md index 9f07e30..900293d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,8 @@ This package is be particulary useful for natural language processing tasks whic - [X] Support for unicodes - [ ] Custom user defined feature generation methods - [ ] Mecab-based tokenizer support +- [X] Support for building databases directly from text files +- [ ] Support for persistent databases ## Suported String Similarity Measures @@ -24,6 +26,7 @@ This package is be particulary useful for natural language processing tasks whic - [X] Jaccard coefficient - [X] Cosine coefficient - [X] Overlap coefficient +- [X] Exact match ## Installation diff --git a/docs/src/index.md b/docs/src/index.md index ef9edaa..dc07919 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic - [X] Support for unicodes - [ ] Custom user defined 
feature generation methods - [ ] Mecab-based tokenizer support -- [ ] Support for building databases directly from text files +- [X] Support for building databases directly from text files - [ ] Support for persistent databases ## Suported String Similarity Measures @@ -64,6 +64,8 @@ push!(db, "fooo"); # Convinient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);` +# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");` + # Retrieve the closest match(es) res = search(Dice(), db, "foo"; α=0.8, ranked=true) # 2-element Vector{Tuple{String, Float64}}: diff --git a/src/SimString.jl b/src/SimString.jl index 8def7a0..12a0f45 100644 --- a/src/SimString.jl +++ b/src/SimString.jl @@ -2,7 +2,6 @@ module SimString import Base: push!, append! using DataStructures: DefaultOrderedDict, DefaultDict -using ProgressMeter using CircularArrays using OffsetArrays diff --git a/src/features.jl b/src/features.jl index 2ebc0ca..9980503 100644 --- a/src/features.jl +++ b/src/features.jl @@ -99,8 +99,25 @@ end """ + push!(db::AbstractSimStringDB, str::AbstractString) + Add a new item to a new or existing collection of strings using the custom AbstractSimStringDB type. + +# Arguments: +* `db`: AbstractSimStringDB - The collection of strings to add to +* `str`: AbstractString - The string to add to the collection + +# Example: +```julia +db = DictDB(CharacterNGrams(2, " ")); +push!(db, "foo") +push!(db, "bar") +push!(db, "fooo") +``` + +# Returns: +* `db`: AbstractSimStringDB - The collection of strings with the new string added """ function push!(db::AbstractSimStringDB, str::AbstractString) # Extract features based on the specified feature extractor @@ -125,11 +142,54 @@ end """ + append!(db::AbstractSimStringDB, str::Vector) + Add bulk items to a new or existing collection of strings using the custom AbstractSimStringDB type. 
+ +# Arguments: +* db: AbstractSimStringDB - The database to add the strings to +* str: Vector of AbstractString - Vector/Array of strings to add to the database + +# Example: +```julia +db = DictDB(CharacterNGrams(2, " ")); +append!(db, ["foo", "foo", "fooo"]); +``` + +# Returns: +* db: AbstractSimStringDB - The database with the new strings added """ function append!(db::AbstractSimStringDB, str::Vector) @inbounds @simd for i in str push!(db, i) end +end + + +""" + append!(db::AbstractSimStringDB, file::AbstractString) + +Add bulk items to a new or existing collection of strings +from a file using the custom AbstractSimStringDB type. + +# Arguments: +* `db`: AbstractSimStringDB - The database to add the items to +* `file`: AbstractString - Path to the file to read from + +# Example: +```julia +db = DictDB(CharacterNGrams(2, " ")); +append!(db, "./data/test.txt") +``` + +# Returns: +* `db`: AbstractSimStringDB - The database with the items added +""" +function append!(db::AbstractSimStringDB, file::AbstractString) + open(file) do f + while !eof(f) + push!(db, readline(f)) + end + end end \ No newline at end of file diff --git a/test/dummy_sents.txt b/test/dummy_sents.txt new file mode 100644 index 0000000..b60b3c9 --- /dev/null +++ b/test/dummy_sents.txt @@ -0,0 +1,2 @@ +You are a really really really cool dude. 
+Sometimes you are not really really cool tho \ No newline at end of file diff --git a/test/dummy_words.txt b/test/dummy_words.txt new file mode 100644 index 0000000..d4b7bf6 --- /dev/null +++ b/test/dummy_words.txt @@ -0,0 +1,3 @@ +foo +bar +fooo \ No newline at end of file diff --git a/test/test01_dictdb.jl b/test/test01_dictdb.jl index 6531a70..4896098 100644 --- a/test/test01_dictdb.jl +++ b/test/test01_dictdb.jl @@ -75,6 +75,40 @@ end end +@testset "Test bulk insertion from a file using CharacterNGrams" begin + db = DictDB(CharacterNGrams(3, " ")) + append!(db, "dummy_words.txt") + + @test db.string_collection == ["foo", "bar", "fooo"] + @test db.string_size_map[5] == Set(["bar", "foo"]) + @test db.string_size_map[6] == Set(["fooo"]) + + @test collect(keys(db.string_feature_map)) == [5, 6] + + @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) ) + @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) + + @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64} +end + + + +@testset "Test bulk insertion from a file using WordNGrams" begin + db = DictDB(WordNGrams(2, " ", " ")) + append!(db, "dummy_sents.txt") + + @test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"] + @test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) + + @test collect(keys(db.string_feature_map)) == [9] + @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) + @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) + + @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64} + +end + + end # module \ No newline at 
end of file From b9ffde995b0934afefe8b26a58781b18183357da Mon Sep 17 00:00:00 2001 From: Bernard Brenyah Date: Tue, 25 Jan 2022 20:52:43 +0000 Subject: [PATCH 2/3] Added show functionality for better information --- Project.toml | 3 ++- docs/src/index.md | 2 +- extras/benchmark_sim.jl | 48 +++++++++++++++++++++++++++++++++++++++++ src/dictdb.jl | 17 +++++++++++++-- src/features.jl | 4 ++-- test/test01_dictdb.jl | 28 ++++++++++++------------ test/test04_search.jl | 13 +++++++++++ 7 files changed, 95 insertions(+), 20 deletions(-) create mode 100644 extras/benchmark_sim.jl diff --git a/Project.toml b/Project.toml index 6b72164..84d9468 100644 --- a/Project.toml +++ b/Project.toml @@ -17,6 +17,7 @@ julia = "1" [extras] Faker = "0efc519c-db33-5916-ab87-703215c3906f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb" [targets] -test = ["Test", "Faker"] +test = ["Test", "Faker", "Suppressor"] diff --git a/docs/src/index.md b/docs/src/index.md index dc07919..06d50b2 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -74,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true) # Describe a working database collection desc = describe_collection(db) -# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13) +# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13) ``` ## TODO: Benchmarks diff --git a/extras/benchmark_sim.jl b/extras/benchmark_sim.jl new file mode 100644 index 0000000..82f9e9e --- /dev/null +++ b/extras/benchmark_sim.jl @@ -0,0 +1,48 @@ +using SimString +using Faker +using BenchmarkTools +using DataStructures + +################################# Benchmark Bulk addition ##################### +db = DictDB(CharacterNGrams(3, " ")); +Faker.seed(2020) +@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000]; + + +f(d, x) = append!(d, x) +@time f(db, fake_names) + + + +################################ Simple Addition ############################### + 
+db = DictDB(CharacterNGrams(2, " ")); +push!(db, "foo"); +push!(db, "bar"); +push!(db, "fooo"); + +f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r) +test = "foo"; +col = db; +sim = Cosine(); +a = 0.8; +r = true; + +f(Cosine(), db, "foo", 0.8, true) + +@btime f($sim, $col, $test, $a, $r) +@btime search(Cosine(), db, "foo"; α=0.8, ranked=true) + + + +db2 = DictDB(CharacterNGrams(3, " ")); +append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector + +results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented + +bs = ["foo", "bar", "foo", "foo", "bar"] +SimString.extract_features(CharacterNGrams(3, " "), "prepress") +SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.") + +db = DictDB(WordNGrams(2, " ", " ")) +push!(db, "You are a really really really cool dude.") diff --git a/src/dictdb.jl b/src/dictdb.jl index 7ae857c..f4570ec 100644 --- a/src/dictdb.jl +++ b/src/dictdb.jl @@ -87,6 +87,7 @@ Basic summary stats for the DB db = DictDB(CharacterNGrams(2, " ")); append!(db, ["foo", "bar", "fooo"]); describe_collection(db) +(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13) # Returns * NamedTuples: Summary stats for the DB @@ -98,7 +99,7 @@ function describe_collection(db::DictDB) # Total number of strings in collection ∑ = length(db.string_collection) -# Average number of ngram features +# Average size of ngram features n = [x for x in keys(db.string_size_map)] μ = sum(n) / length(n) @@ -108,7 +109,19 @@ for i in values(db.string_feature_map) total_ngrams += length(i) end -return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams) +return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams) +end + + +""" +Pretty print summary stats for the DB +""" +function Base.show(io::IO, x::DictDB) + metrics = describe_collection(x) + println(io, "DictDB($(x.feature_extractor))") + println(io, "Total collection: ", 
metrics.total_collection) + println(io, "Average number of ngram features: ", metrics.avg_size_ngrams) + println(io, "Total number of ngram features: ", metrics.total_ngrams) end diff --git a/src/features.jl b/src/features.jl index 9980503..8486b3b 100644 --- a/src/features.jl +++ b/src/features.jl @@ -188,8 +188,8 @@ append!(db, "./data/test.txt") """ function append!(db::AbstractSimStringDB, file::AbstractString) open(file) do f - while !eof(f) - push!(db, readline(f)) + for line in eachline(f) + push!(db, line) end end end \ No newline at end of file diff --git a/test/test01_dictdb.jl b/test/test01_dictdb.jl index 4896098..ddb7f43 100644 --- a/test/test01_dictdb.jl +++ b/test/test01_dictdb.jl @@ -15,8 +15,8 @@ using Test @test collect(keys(db.string_feature_map)) == [5, 6] - @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) ) - @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) + @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5))) + @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) end @@ -41,10 +41,10 @@ end @test collect(keys(db.string_feature_map)) == [5, 6] - @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) ) - @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) + @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5))) + @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) - @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64} + @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64} end @@ -59,19 +59,19 @@ end @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) 
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) - @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64} + @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64} end @testset "Test describe functionality" begin - db = DictDB(CharacterNGrams(2, " ")); - append!(db, ["foo", "bar", "fooo"]); + db = DictDB(CharacterNGrams(2, " ")) + append!(db, ["foo", "bar", "fooo"]) # Interact with db - search(Dice(), db, "zep"; α=0.8, ranked=true) + search(Dice(), db, "zep"; α = 0.8, ranked = true) - @test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13) + @test describe_collection(db) == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13) end @@ -85,10 +85,10 @@ end @test collect(keys(db.string_feature_map)) == [5, 6] - @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) ) - @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) + @test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5))) + @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) - @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64} + @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64} end @@ -104,7 +104,7 @@ end @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) - @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64} + @test eltype(collect(keys(db.string_feature_map[9]))) == 
Tuple{Tuple{String,String},Int64} end diff --git a/test/test04_search.jl b/test/test04_search.jl index 14f93e0..8cdb283 100644 --- a/test/test04_search.jl +++ b/test/test04_search.jl @@ -2,6 +2,7 @@ module TestMeasures using SimString using Test using Faker +using Suppressor @testset "Test Dice Search" begin @@ -54,6 +55,7 @@ end end + @testset "Test Micro Deep Dive Search" begin db = DictDB(CharacterNGrams(2, " ")); append!(db, ["a", "ab", "abc", "abcd", "abcde"]); @@ -76,6 +78,17 @@ end end +@testset "Test output from show" begin + db = DictDB(CharacterNGrams(2, " ")); + append!(db, ["foo", "bar", "fooo"]); + + expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n" + r = @capture_out show(db) + @test r == expected_out +end + + + end # module \ No newline at end of file From ee18b527d12e0b392f460089a4013ef047e632fa Mon Sep 17 00:00:00 2001 From: Bernard Brenyah Date: Thu, 10 Feb 2022 11:31:48 +0000 Subject: [PATCH 3/3] Initial public release --- src/search.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/search.jl b/src/search.jl index 29d6e2d..fa0ddb4 100644 --- a/src/search.jl +++ b/src/search.jl @@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat results = String[] for (candidate, match_count) in candidate_match_counts - for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify + for i in (query_feature_length - τ + 1) : query_feature_length if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i]) match_count += 1 end @@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer features = extract_features(db_collection.feature_extractor, query) # Metadata from the generated features (length, min & max sizes) - length_of_features = length(features) - 
min_feature_size = minimum_feature_size(measure, length_of_features, α) - max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α) + # length_of_features = length(features) + # min_feature_size = minimum_feature_size(measure, length_of_features, α) + # max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α) results = String[] # Generate and return results from the potential candidate size pool - @inbounds for candidate_size in min_feature_size:max_feature_size + @inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α) # Minimum overlap - τ = minimum_overlap(measure, length_of_features, candidate_size, α) + τ = minimum_overlap(measure, length(features), candidate_size, α) # Generate approximate candidates from the overlap join append!(results, overlap_join(db_collection, features, τ, candidate_size))