Initial public release #15

Merged: 3 commits, Feb 10, 2022

7 changes: 5 additions & 2 deletions Project.toml
@@ -7,14 +7,17 @@ version = "0.1.0"
CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"

[compat]
CircularArrays = "1"
DataStructures = "0.18"
OffsetArrays = "1"
julia = "1"

[extras]
Faker = "0efc519c-db33-5916-ab87-703215c3906f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"

[targets]
test = ["Test", "Faker"]
test = ["Test", "Faker", "Suppressor"]
3 changes: 3 additions & 0 deletions README.md
@@ -17,13 +17,16 @@ This package is be particulary useful for natural language processing tasks whic
- [X] Support for unicodes
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [X] Support for building databases directly from text files
- [ ] Support for persistent databases

## Supported String Similarity Measures

- [X] Dice coefficient
- [X] Jaccard coefficient
- [X] Cosine coefficient
- [X] Overlap coefficient
- [X] Exact match

## Installation

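For reference, a minimal sketch (not part of this diff) of how the coefficients listed in the README are conventionally defined over two n-gram feature sets; SimString.jl's own implementations may differ in detail.

```julia
# Standard set-based definitions over two feature sets X and Y.
dice(X, Y)    = 2 * length(intersect(X, Y)) / (length(X) + length(Y))
jaccard(X, Y) = length(intersect(X, Y)) / length(union(X, Y))
cosine(X, Y)  = length(intersect(X, Y)) / sqrt(length(X) * length(Y))
overlap(X, Y) = length(intersect(X, Y)) / min(length(X), length(Y))
exact(X, Y)   = X == Y ? 1.0 : 0.0

X = Set(["ab", "bc", "cd"])
Y = Set(["ab", "bc", "ce"])
dice(X, Y)      # ≈ 0.667
jaccard(X, Y)   # 0.5
```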
6 changes: 4 additions & 2 deletions docs/src/index.md
@@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic
- [X] Support for unicodes
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support
- [ ] Support for building databases directly from text files
- [X] Support for building databases directly from text files
- [ ] Support for persistent databases

## Supported String Similarity Measures
@@ -64,6 +64,8 @@ push!(db, "fooo");

# A convenient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`

# OR: Build the database directly from a text file: `append!(db, "YOUR_FILE_NAME.txt");`

# Retrieve the closest match(es)
res = search(Dice(), db, "foo"; α=0.8, ranked=true)
# 2-element Vector{Tuple{String, Float64}}:
@@ -72,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)

# Describe a working database collection
desc = describe_collection(db)
# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
```

## TODO: Benchmarks
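The file-based loading added in this PR appears in the docs above only as a one-line comment; here is a short end-to-end sketch of that workflow, mirroring the new test in test/test01_dictdb.jl (the file name and its contents are the test fixtures, one entry per line).

```julia
using SimString

# dummy_words.txt contains one entry per line: foo, bar, fooo
db = DictDB(CharacterNGrams(3, " "))
append!(db, "dummy_words.txt")   # each line becomes one database entry

search(Cosine(), db, "foo"; α=0.8, ranked=true)
```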
48 changes: 48 additions & 0 deletions extras/benchmark_sim.jl
@@ -0,0 +1,48 @@
using SimString
using Faker
using BenchmarkTools
using DataStructures

################################# Benchmark Bulk addition #####################
db = DictDB(CharacterNGrams(3, " "));
Faker.seed(2020)
@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];


f(d, x) = append!(d, x)
@time f(db, fake_names)



################################ Simple Addition ###############################

db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo");
push!(db, "bar");
push!(db, "fooo");

f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
test = "foo";
col = db;
sim = Cosine();
a = 0.8;
r = true;

f(Cosine(), db, "foo", 0.8, true)

@btime f($sim, $col, $test, $a, $r)
@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)



db2 = DictDB(CharacterNGrams(3, " "));
append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector

results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented

bs = ["foo", "bar", "foo", "foo", "bar"]
SimString.extract_features(CharacterNGrams(3, " "), "prepress")
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")

db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")
1 change: 0 additions & 1 deletion src/SimString.jl
@@ -2,7 +2,6 @@ module SimString

import Base: push!, append!
using DataStructures: DefaultOrderedDict, DefaultDict
using ProgressMeter
using CircularArrays
using OffsetArrays

17 changes: 15 additions & 2 deletions src/dictdb.jl
@@ -87,6 +87,7 @@ Basic summary stats for the DB
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);
describe_collection(db)
(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)

# Returns
* NamedTuples: Summary stats for the DB
@@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
# Total number of strings in collection
∑ = length(db.string_collection)

# Average number of ngram features
# Average size of ngram features
n = [x for x in keys(db.string_size_map)]
μ = sum(n) / length(n)

@@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
total_ngrams += length(i)
end

return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
end


"""
Pretty print summary stats for the DB
"""
function Base.show(io::IO, x::DictDB)
metrics = describe_collection(x)
println(io, "DictDB($(x.feature_extractor))")
println(io, "Total collection: ", metrics.total_collection)
println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
println(io, "Total number of ngram features: ", metrics.total_ngrams)
end


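With the new `Base.show` method above, printing a `DictDB` yields a readable summary. A sketch of the expected REPL interaction; the printed lines match the expected output asserted in the new show test in test/test04_search.jl below.

```julia
using SimString

db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);

show(db)
# DictDB(SimString.CharacterNGrams{Int64, String}(2, " "))
# Total collection: 3
# Average number of ngram features: 4.5
# Total number of ngram features: 13
```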
60 changes: 60 additions & 0 deletions src/features.jl
@@ -99,8 +99,25 @@ end


"""
push!(db::AbstractSimStringDB, str::AbstractString)

Add a new item to a new or existing collection of strings using
the custom AbstractSimStringDB type.

# Arguments:
* `db`: AbstractSimStringDB - The collection of strings to add to
* `str`: AbstractString - The string to add to the collection

# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo")
push!(db, "bar")
push!(db, "fooo")
```

# Returns:
* `db`: AbstractSimStringDB - The collection of strings with the new string added
"""
function push!(db::AbstractSimStringDB, str::AbstractString)
# Extract features based on the specified feature extractor
@@ -125,11 +142,54 @@ end


"""
append!(db::AbstractSimStringDB, str::Vector)

Add bulk items to a new or existing collection of strings using
the custom AbstractSimStringDB type.

# Arguments:
* db: AbstractSimStringDB - The database to add the strings to
* str: Vector of AbstractString - Vector/Array of strings to add to the database

# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "foo", "fooo"]);
```

# Returns:
* db: AbstractSimStringDB - The database with the new strings added
"""
function append!(db::AbstractSimStringDB, str::Vector)
@inbounds @simd for i in str
push!(db, i)
end
end


"""
append!(db::AbstractSimStringDB, file::AbstractString)

Add bulk items from a file to a new or existing collection of strings using
the custom AbstractSimStringDB type.

# Arguments:
* `db`: AbstractSimStringDB - The database to add the items to
* `file`: AbstractString - Path to the file to read from

# Example:
```julia
db = DictDB(CharacterNGrams(2, " "));
append!(db, "./data/test.txt")
```

# Returns:
* `db`: AbstractSimStringDB - The database with the items added
"""
function append!(db::AbstractSimStringDB, file::AbstractString)
open(file) do f
for line in eachline(f)
push!(db, line)
end
end
end
12 changes: 6 additions & 6 deletions src/search.jl
@@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
results = String[]

for (candidate, match_count) in candidate_match_counts
for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
for i in (query_feature_length - τ + 1) : query_feature_length
if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
match_count += 1
end
@@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
features = extract_features(db_collection.feature_extractor, query)

# Metadata from the generated features (length, min & max sizes)
length_of_features = length(features)
min_feature_size = minimum_feature_size(measure, length_of_features, α)
max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
# length_of_features = length(features)
# min_feature_size = minimum_feature_size(measure, length_of_features, α)
# max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)

results = String[]

# Generate and return results from the potential candidate size pool
@inbounds for candidate_size in min_feature_size:max_feature_size
@inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
# Minimum overlap
τ = minimum_overlap(measure, length_of_features, candidate_size, α)
τ = minimum_overlap(measure, length(features), candidate_size, α)

# Generate approximate candidates from the overlap join
append!(results, overlap_join(db_collection, features, τ, candidate_size))
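For context on the loop above: `minimum_feature_size`, `maximum_feature_size`, and `minimum_overlap` bound the candidate sizes and the required overlap τ for a given measure and threshold α. A rough sketch of the standard SimString/CPMerge-style bounds for the Cosine measure, assumed here purely for illustration; the package's own methods are the authoritative definitions.

```julia
# Assumed Cosine-measure bounds (illustration only): for a query with qlen
# features and threshold α, candidate sizes clen range over
# [⌈α²·qlen⌉, ⌊qlen/α²⌋], and a candidate of size clen must share at least
# τ = ⌈α·√(qlen·clen)⌉ features with the query.
cosine_min_size(qlen, α) = ceil(Int, α^2 * qlen)
cosine_max_size(qlen, α) = floor(Int, qlen / α^2)
cosine_min_overlap(qlen, clen, α) = ceil(Int, α * sqrt(qlen * clen))

cosine_min_size(5, 0.8)        # 4
cosine_max_size(5, 0.8)        # 7
cosine_min_overlap(5, 6, 0.8)  # 5
```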
2 changes: 2 additions & 0 deletions test/dummy_sents.txt
@@ -0,0 +1,2 @@
You are a really really really cool dude.
Sometimes you are not really really cool tho
3 changes: 3 additions & 0 deletions test/dummy_words.txt
@@ -0,0 +1,3 @@
foo
bar
fooo
54 changes: 44 additions & 10 deletions test/test01_dictdb.jl
@@ -15,8 +15,8 @@ using Test

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
end


@@ -41,10 +41,10 @@

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) )
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)
@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64}
@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
end


@@ -59,19 +59,53 @@ end
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64}
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
end



@testset "Test describe functionality" begin
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);
db = DictDB(CharacterNGrams(2, " "))
append!(db, ["foo", "bar", "fooo"])

# Interact with db
search(Dice(), db, "zep"; α=0.8, ranked=true)
search(Dice(), db, "zep"; α = 0.8, ranked = true)

@test describe_collection(db) == (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
end


@testset "Test bulk insertion from a file using CharacterNGrams" begin
db = DictDB(CharacterNGrams(3, " "))
append!(db, "dummy_words.txt")

@test db.string_collection == ["foo", "bar", "fooo"]
@test db.string_size_map[5] == Set(["bar", "foo"])
@test db.string_size_map[6] == Set(["fooo"])

@test collect(keys(db.string_feature_map)) == [5, 6]

@test collect(values(db.string_feature_map[5])) == vcat((repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)))
@test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6)

@test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String,Int64}
end



@testset "Test bulk insertion from a file using WordNGrams" begin
db = DictDB(WordNGrams(2, " ", " "))
append!(db, "dummy_sents.txt")

@test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]
@test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test collect(keys(db.string_feature_map)) == [9]
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])

@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}

@test describe_collection(db) == (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
end


13 changes: 13 additions & 0 deletions test/test04_search.jl
@@ -2,6 +2,7 @@ module TestMeasures
using SimString
using Test
using Faker
using Suppressor


@testset "Test Dice Search" begin
@@ -54,6 +55,7 @@

end


@testset "Test Micro Deep Dive Search" begin
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["a", "ab", "abc", "abcd", "abcde"]);
@@ -76,6 +78,17 @@ end
end


@testset "Test output from show" begin
db = DictDB(CharacterNGrams(2, " "));
append!(db, ["foo", "bar", "fooo"]);

expected_out = "DictDB(SimString.CharacterNGrams{Int64, String}(2, \" \"))\nTotal collection: 3\nAverage number of ngram features: 4.5\nTotal number of ngram features: 13\n"
r = @capture_out show(db)
@test r == expected_out
end





end # module