diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3254a4a..a817ad2 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -1,10 +1,7 @@ name: CI on: - push: - branches: - - main - tags: '*' - pull_request: + - push + - pull_request jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} @@ -13,8 +10,8 @@ jobs: fail-fast: false matrix: version: - - '1.0' - '1.6' + - '1.7' - 'nightly' os: - ubuntu-latest @@ -22,10 +19,6 @@ jobs: - windows-latest arch: - x64 - - x86 - exclude: - - os: macOS-latest - arch: x86 steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 diff --git a/.gitignore b/.gitignore index 20fe29d..f181182 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ *.jl.mem /Manifest.toml /docs/build/ +.vscode \ No newline at end of file diff --git a/Project.toml b/Project.toml index 690e774..63a7c5e 100644 --- a/Project.toml +++ b/Project.toml @@ -3,11 +3,18 @@ uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4" authors = ["Bernard Brenyah"] version = "0.1.0" +[deps] +CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" + [compat] julia = "1" [extras] +Faker = "0efc519c-db33-5916-ab87-703215c3906f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["Test", "Faker"] diff --git a/README.md b/README.md index 893e546..9f07e30 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,45 @@ [![Coverage](https://codecov.io/gh/PyDataBlog/SimString.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/PyDataBlog/SimString.jl) [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle) [![ColPrac: Contributor's Guide on Collaborative Practices for Community 
Packages](https://img.shields.io/badge/ColPrac-Contributor's%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
+
+A native Julia implementation of the CPMerge algorithm, which is designed for approximate string matching.
+This package is particularly useful for natural language processing tasks which demand the retrieval of strings/texts from a very large corpora (big amounts of texts). Currently, this package supports both Character and Word based N-grams feature generations and there are plans to open the package up for custom user defined feature generation methods.
+
+## Features
+
+- [X] Fast algorithm for string matching
+- [X] 100% exact retrieval
+- [X] Support for unicodes
+- [ ] Custom user defined feature generation methods
+- [ ] Mecab-based tokenizer support
+
+## Supported String Similarity Measures
+
+- [X] Dice coefficient
+- [X] Jaccard coefficient
+- [X] Cosine coefficient
+- [X] Overlap coefficient
+
+## Installation
+
+You can grab the latest stable version of this package from Julia registries by simply running;
+
+*NB:* Don't forget to invoke Julia's package manager with `]`
+
+```julia
+pkg> add SimString
+```
+
+The few (and selected) brave ones can simply grab the current experimental features by simply adding the master branch to your development environment after invoking the package manager with `]`:
+
+```julia
+pkg> add SimString#master
+```
+
+You are good to go with bleeding edge features and breakages!
+
+To revert to a stable version, you can simply run:
+
+```julia
+pkg> free SimString
+```
diff --git a/docs/src/index.md b/docs/src/index.md
index 26be98c..807e880 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -6,6 +6,76 @@ CurrentModule = SimString
 
 Documentation for [SimString](https://github.com/PyDataBlog/SimString.jl).
 
+A native Julia implementation of the CPMerge algorithm, which is designed for approximate string matching. 
+This package is particularly useful for natural language processing tasks which demand the retrieval of strings/texts from a very large corpora (big amounts of texts). Currently, this package supports both Character and Word based N-grams feature generations and there are plans to open the package up for custom user defined feature generation methods.
+
+## Features
+
+- [X] Fast algorithm for string matching
+- [X] 100% exact retrieval
+- [X] Support for unicodes
+- [ ] Custom user defined feature generation methods
+- [ ] Mecab-based tokenizer support
+
+## Supported String Similarity Measures
+
+- [X] Dice coefficient
+- [X] Jaccard coefficient
+- [X] Cosine coefficient
+- [X] Overlap coefficient
+
+## Installation
+
+You can grab the latest stable version of this package from Julia registries by simply running;
+
+*NB:* Don't forget to invoke Julia's package manager with `]`
+
+```julia
+pkg> add SimString
+```
+
+The few (and selected) brave ones can simply grab the current experimental features by simply adding the master branch to your development environment after invoking the package manager with `]`:
+
+```julia
+pkg> add SimString#master
+```
+
+You are good to go with bleeding edge features and breakages!
+
+To revert to a stable version, you can simply run:
+
+```julia
+pkg> free SimString
+```
+
+## Usage
+
+```julia
+using SimString
+
+# Initialise database and some strings
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo");
+push!(db, "bar");
+push!(db, "fooo");
+
+# Convenient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`
+
+# Retrieve the closest match(es)
+res = search(Dice(), db, "foo"; α=0.8, ranked=true)
+# 2-element Vector{Tuple{String, Float64}}:
+# ("foo", 1.0)
+# ("fooo", 0.8888888888888888)
+
+
+```
+
+## TODO: Benchmarks
+
+## Release History
+
+- 0.1.0 Initial release. 
+ ```@index ``` diff --git a/extras/examples.jl b/extras/examples.jl new file mode 100644 index 0000000..b4d9595 --- /dev/null +++ b/extras/examples.jl @@ -0,0 +1,46 @@ +using SimString +using Faker +using BenchmarkTools +using DataStructures + +################################# Benchmark Bulk addition ##################### +db = DictDB(CharacterNGrams(3, " ")); +Faker.seed(2020) +@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000]; + + +f(d, x) = append!(d, x) +@time f(db, fake_names) + + + +################################ Simple Addition ############################### + +db = DictDB(CharacterNGrams(2, " ")); +push!(db, "foo"); +push!(db, "bar"); +push!(db, "fooo"); + +f(x, c, s) = search(x, c, s) +test = "foo"; +col = db; +sim = Cosine(); + +f(Cosine(), db, "foo") + +@btime f($sim, $col, $test) +@btime search(Cosine(), db, "foo"; α=0.8, ranked=true) + + + +db2 = DictDB(CharacterNGrams(3, " ")); +append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector + +results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented + +bs = ["foo", "bar", "foo", "foo", "bar"] +SimString.extract_features(CharacterNGrams(3, " "), "prepress") +SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.") + +db = DictDB(WordNGrams(2, " ", " ")) +push!(db, "You are a really really really cool dude.") diff --git a/extras/py_benchmarks.py b/extras/py_benchmarks.py new file mode 100644 index 0000000..be23cb8 --- /dev/null +++ b/extras/py_benchmarks.py @@ -0,0 +1,16 @@ +from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor +from simstring.measure.cosine import CosineMeasure +from simstring.database.dict import DictDatabase +from simstring.searcher import Searcher +from faker import Faker + +db = DictDatabase(CharacterNgramFeatureExtractor(3)) + +fake = Faker() +fake_names = [fake.name() for i in range(100_000)] + +def f(x): + 
for i in x: + db.add(i) + +# %time f(fake_names) \ No newline at end of file diff --git a/src/SimString.jl b/src/SimString.jl index 6af507d..7bec0a7 100644 --- a/src/SimString.jl +++ b/src/SimString.jl @@ -1,5 +1,29 @@ module SimString -# Write your package code here. +import Base: push!, append! +using DataStructures: DefaultOrderedDict, DefaultDict +# using ProgressMeter +# using CircularArrays +# using OffsetArrays + +######### Import modules & utils ################ +include("db_collection.jl") +include("dictdb.jl") +include("features.jl") +include("measures.jl") +include("search.jl") + + + +####### Global export of user API ####### +export Dice, Jaccard, Cosine, Overlap, + AbstractSimStringDB, DictDB, + CharacterNGrams, WordNGrams, + search + + + + + end diff --git a/src/db_collection.jl b/src/db_collection.jl new file mode 100644 index 0000000..cffce8b --- /dev/null +++ b/src/db_collection.jl @@ -0,0 +1,35 @@ +# Custom Collections + +""" +Base type for all custom db collections. +""" +abstract type AbstractSimStringDB end + + +""" +Abstract type for feature extraction structs +""" +abstract type FeatureExtractor end + + +# Feature Extraction Definitions + +""" +Feature extraction on character-level ngrams +""" +struct CharacterNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor + n::T1 # number of n-grams to extract + padder::T2 # string to use to pad n-grams +end + + +""" +Feature extraction based on word-level ngrams +""" +struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor + n::T1 # number of n-grams to extract + padder::T2 # string to use to pad n-grams + splitter::T2 # string to use to split words +end + + diff --git a/src/dictdb.jl b/src/dictdb.jl new file mode 100644 index 0000000..77136b1 --- /dev/null +++ b/src/dictdb.jl @@ -0,0 +1,102 @@ +""" +Custom DB collection for storing SimString data using base Dictionary `Dict` +""" +struct DictDB{ + T1<:FeatureExtractor, + T2<:AbstractString, + T3<:AbstractDict, + T4<:AbstractDict, + 
T5<:AbstractDict, + } <: AbstractSimStringDB + + feature_extractor::T1 # NGram feature extractor + string_collection::Vector{T2} # Collection of strings in the DB + string_size_map::T3 # Index map of feature sizes + string_feature_map::T4 # Index map of all features with associated strings and sizes + lookup_cache::T5 # Cache for lookup results +end + + +""" + DictDB(x::CharacterNGrams) + +Initialize a dict DB with additional containers and Metadata for CharacterNGrams + +# Arguments +* `x`: CharacterNGrams object + +# Example +```julia +db = DictDB(CharacterNGrams(2, " ")) +``` + +# Returns +* `DictDB`: A DictDB object with additional containers and Metadata for CharacterNGrams +""" +function DictDB(x::CharacterNGrams) + DictDB( + x, + String[], + DefaultDict{Int, Set{String}}( () -> Set{String}() ), + DefaultDict{ Int, DefaultOrderedDict{Tuple{String, Int64}, Set{String}} }( () -> DefaultOrderedDict{Tuple{String, Int64}, Set{String} }(Set{String})), + DefaultDict{ Int, DefaultDict{Tuple{String, Int64}, Set{String}} }( () -> DefaultDict{Tuple{String, Int64}, Set{String}}(Set{String})) + ) +end + + +""" + DictDB(x::WordNGrams) + +Initialize a dict DB with additional containers and Metadata for WordNGrams + +# Arguments +* `x`: WordNGrams object + +# Example +```julia +db = DictDB(WordNGrams(2, " ", " ")) +``` + +# Returns +* `DictDB`: A DictDB object with additional containers and Metadata for WordNGrams +""" +function DictDB(x::WordNGrams) + DictDB( + x, + String[], + DefaultDict{Int, Set{String}}( () -> Set{String}() ), + DefaultDict{ Int, DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String} }(Set{String})), + DefaultDict{ Int, DefaultDict{Tuple{NTuple{x.n, String}, Int}, Set{String}} }( () -> DefaultDict{Tuple{NTuple{x.n, String}, Int}, Set{String}}(Set{String})) + ) +end + + + + +################################## DictDB UTIL Functions ############################ +""" 
+Internal function for retrieving existing features by size +""" +function retrieve_existing_feature_by_size(db::DictDB, size, feature) + return db.string_feature_map[size][feature] +end + + +# """ +# Basic summary stats for the DB +# """ +# function describe_db(db::DictDB) + +# end + + +""" +Internal function to lookup feature sets by size and feature +""" +function lookup_feature_set_by_size_feature(db::DictDB, size, feature) + # TODO: Clean this up and make it more efficient. Shouldn't updated db.string_feature_map + if feature ∉ keys(db.lookup_cache[size]) + db.lookup_cache[size][feature] = retrieve_existing_feature_by_size(db, size, feature) + end + return db.lookup_cache[size][feature] +end \ No newline at end of file diff --git a/src/features.jl b/src/features.jl new file mode 100644 index 0000000..f53acb5 --- /dev/null +++ b/src/features.jl @@ -0,0 +1,130 @@ +""" +Internal function to pad AbstractString types with specified padder +""" +function pad_string(x::AbstractString, padder::AbstractString) + return string(padder, x, padder) +end + + +""" +Internal function to pad AbstractVector types with specified padder +""" +function pad_string(x::AbstractVector, padder::AbstractString) + # Insert a padder as the first and last element of x + insert!(x, 1, padder) + push!(x, padder) + return x +end + + +""" +Internal function to generate intial uncounted ngrams on a character level +""" +function init_ngrams(extractor::CharacterNGrams, x, n) + map(0:length(x)-n) do i + x[i+1: i+n] + end +end + + +""" +Internal function to generate intial uncounted ngrams on a word level +""" +function init_ngrams(extractor::WordNGrams, x, n) + map(0:length(x)-n) do i + tuple(String.(x[i+1: i+n])...) 
+ end +end + + +""" +Internal function to create character-level ngrams features from an AbstractString +""" +function n_grams(extractor::CharacterNGrams, x, n) + # Return counted n-grams (including duplicates) + return cummulative_ngram_count(init_ngrams(extractor, x, n)) +end + + +""" +Internal function to create word-level ngrams from an AbstractVector +""" +function n_grams(extractor::WordNGrams, x, n) + return cummulative_ngram_count(init_ngrams(extractor, x, n)) +end + + +""" +Internal function to generate character-level ngrams features from an AbstractString +""" +function extract_features(extractor::CharacterNGrams, str) + n = extractor.n - 1 == 0 ? 1 : extractor.n - 1 + str = pad_string(str, repeat(extractor.padder, n)) + return n_grams(extractor, str, extractor.n) +end + + +""" +Internal function to generate word-level ngrams features from an AbstractString +""" +function extract_features(extractor::WordNGrams, str) + words_split = split(str, extractor.splitter) + padded_words = pad_string(words_split, extractor.padder) + return n_grams(extractor, padded_words, extractor.n) +end + + +""" +Internal function to count and pad generated character-level ngrams (including duplicates) +""" +function cummulative_ngram_count(x) + counter = Dict{eltype(x), Int}() + + unique_list = map(x) do val + if val in keys(counter) + counter[val] += 1 + else + counter[val] = 1 + end + (val, counter[val]) + end + + return unique_list +end + + +""" +Add a new item to a new or existing collection of strings using +the custom AbstractSimStringDB type. 
+""" +function push!(db::AbstractSimStringDB, str::AbstractString) + # Extract features based on the specified feature extractor + features = extract_features(db.feature_extractor, str) + + # Size of the new feature + size = length(features) + + # Add the string to the database + push!(db.string_collection, str) + + # Add the size of the incoming string to size map + push!(db.string_size_map[size], str) + + # Map each feature to a size map along with the originating string + @inbounds for n in features + push!(db.string_feature_map[size][n], str) + end + + return db +end + + +""" +Add bulk items to a new or existing collection of strings using +the custom AbstractSimStringDB type. +""" +function append!(db::AbstractSimStringDB, str::Vector) + @inbounds @simd for i in str + push!(db, i) + end +end \ No newline at end of file diff --git a/src/measures.jl b/src/measures.jl new file mode 100644 index 0000000..286a656 --- /dev/null +++ b/src/measures.jl @@ -0,0 +1,169 @@ +############## String Similarity Measure Definitions ############## + +""" +Abstract base type for all string similarity measures. +""" +abstract type AbstractSimilarityMeasure end + + +""" +Dice Similarity Measure. +""" +struct Dice <: AbstractSimilarityMeasure end + + +""" +Jaccard Similarity Measure. +""" +struct Jaccard <: AbstractSimilarityMeasure end + + +""" +Cosine Similarity Measure. +""" +struct Cosine <: AbstractSimilarityMeasure end + + +""" +Overlap Similarity Measure. +""" +struct Overlap <: AbstractSimilarityMeasure end + + + +############## Minimum Feature Sizes Per Measure ############## +""" +Calculate minimum feature size for Dice similarity measure. +""" +function minimum_feature_size(measure::Dice, query_size, α) + return ceil(Int, ( (α / (2 - α)) * query_size) ) +end + + +""" +Calculate minimum feature size for Jaccard similarity measure. 
+""" +function minimum_feature_size(measure::Jaccard, query_size, α) + return ceil(Int, (α * query_size)) +end + + +""" +Calculate minimum feature size for Cosine similarity measure. +""" +function minimum_feature_size(measure::Cosine, query_size, α) + return ceil(Int, (α * α * query_size) ) +end + + +""" +Calculate minimum feature size for Overlap similarity measure. +""" +function minimum_feature_size(measure::Overlap, query_size, α) + return 1 +end + + +############## Maximum Feature Size Per Measure ############## + +""" +Calculate maximum feature size for Dice similarity measure. +""" +function maximum_feature_size(measure::Dice, db::AbstractSimStringDB, query_size, α) + return floor(Int, ( ((2 - α) / α) * query_size) ) +end + + +""" +Calculate maximum feature size for Jaccard similarity measure. +""" +function maximum_feature_size(measure::Jaccard, db::AbstractSimStringDB, query_size, α) + return floor(Int, (query_size / α)) +end + + +""" +Calculate maximum feature size for Cosine similarity measure. +""" +function maximum_feature_size(measure::Cosine, db::AbstractSimStringDB, query_size, α) + return floor(Int, ( query_size / (α * α) )) +end + + +""" +Calculate maximum feature size for Overlap similarity measure. +""" +function maximum_feature_size(measure::Overlap, db::AbstractSimStringDB, query_size, α) + return min(typemax(Int), maximum(keys(db.string_feature_map))) +end + + +############## Similarity Score Per Measure ############## +""" +Calculate similarity score between X and Y using Dice similarity measure. +""" +function similarity_score(measure::Dice, X, Y) + return 2 * ( length( Set(X) ∩ Set(Y) ) ) / ( length( Set(X) ) + length( Set(Y) ) ) +end + + +""" +Calculate similarity score between X and Y using Jaccard similarity measure. +""" +function similarity_score(measure::Jaccard, X, Y) + return length( Set(X) ∩ Set(Y) ) / ( length( Set(X) ∪ Set(Y) ) ) +end + + +""" +Calculate similarity score between X and Y using Cosine similarity measure. 
+""" +function similarity_score(measure::Cosine, X, Y) + return length( Set(X) ∩ Set(Y) ) / ( √(length( Set(X) ) * length( Set(Y) )) ) +end + + +""" +Calculate similarity score between X and Y using Overlap similarity measure. +""" +function similarity_score(measure::Overlap, X, Y) + return length( Set(X) ∩ Set(Y) ) / min(length( Set(X) ), length( Set(Y) )) +end + + + +############## Number of Minimum Overlaps Per Measure ############## +""" +Calculate the minimum overlap (τ) for a query size, candidate size, and α +using Dice similarity measure. +""" +function minimum_overlap(measure::Dice, query_size, candidate_size, α) + return ceil(Int, (0.5 * α * query_size * candidate_size)) +end + + +""" +Calculate the minimum overlap (τ) for a query size, candidate size, and α +using Jaccard similarity measure. +""" +function minimum_overlap(measure::Jaccard, query_size, candidate_size, α) + return ceil(Int, ((α * (query_size + candidate_size)) / (1 + α)) ) +end + + +""" +Calculate the minimum overlap (τ) for a query size, candidate size, and α +using Cosine similarity measure. +""" +function minimum_overlap(measure::Cosine, query_size, candidate_size, α) + return ceil(Int, ( α * √(query_size * candidate_size) )) +end + + +""" +Calculate the minimum overlap (τ) for a query size, candidate size, and α +using Overlap similarity measure. +""" +function minimum_overlap(measure::Overlap, query_size, candidate_size, α) + return ceil(Int, (α * min(query_size, candidate_size)) ) +end \ No newline at end of file diff --git a/src/search.jl b/src/search.jl new file mode 100644 index 0000000..70d3b84 --- /dev/null +++ b/src/search.jl @@ -0,0 +1,146 @@ +# Main SimString search algorithm + +""" + search(measure::AbstractSimilarityMeasure, db_collection::AbstractSimStringDB, query::AbstractString; + α=0.7, ranked=true) + +Search for strings in a string collection using the SimString algorithm and a +similarity measure. 
+ +# Arguments: +* `measure`::AbstractSimilarityMeasure - The similarity measure to use. +* `db_collection`::AbstractSimStringDB - The database collection to search. +* `query`::AbstractString - The query string to search for. +* `α`::float - The α parameter for the SimString algorithm. +* `ranked`::Boolean - Whether to return the results in ranked order. + +# Example +```julia +db = DictDB(CharacterNGrams(2, " ")); +append!(db, ["foo", "bar", "fooo"]); + +search(Dice(), db, "foo"; α=0.8, ranked=true) +# 2-element Vector{Tuple{String, Float64}}: +# ("foo", 1.0) +# ("fooo", 0.8888888888888888) +``` + +# Returns +* A Vector of results, where each element is a Tuple of the form (`string`, `similarity measure score`). +""" +function search(measure::AbstractSimilarityMeasure, db_collection::AbstractSimStringDB, query::AbstractString; α=0.7, ranked=true) + return search!(measure, db_collection, query; α=α, ranked=ranked) +end + + +""" +Internal function which ranks the results of a search using the specified similarity measure. +""" +function rank_search_results(measure::AbstractSimilarityMeasure, db_collection::DictDB, query, results; ranked=true) + features = extract_features(db_collection.feature_extractor, query) + + # Compute similarity scores for each result + ranked_results = map(results) do x + x, similarity_score(measure, features, extract_features(db_collection.feature_extractor, x) ) + end + + # Sort by similarity score and return + return ranked ? 
sort(ranked_results, by = i -> i[2], rev=true) : ranked_results +end + + +""" +Internal function which performs the overlap join +""" +function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidate_size) + # length of features + query_feature_length = length(features) + + # Sort features from the most uncommon and the most common + features = sort(features, by = i -> length(lookup_feature_set_by_size_feature(db_collection, candidate_size, i) ) ) + + # Count the occurrences of each feature + candidate_match_counts = DefaultDict(0) + + feature_slice_index = query_feature_length - τ + 1 + + if feature_slice_index < 0 + focus_features = features[1:end + feature_slice_index] + else + focus_features = features[1:feature_slice_index] + end + + for i in focus_features + for s in lookup_feature_set_by_size_feature(db_collection, candidate_size, i) + candidate_match_counts[s] += 1 + end + end + + results = String[] + + # TODO: Return results in case of a perfect match?? + # if τ == 1 + # results = collect(keys(candidate_match_counts)) + # end + + for (candidate, match_count) in candidate_match_counts + + for i in (query_feature_length - τ + 1) : query_feature_length - 1 # TODO: Verify + + if i < 0 + feature = features[end + i] + elseif i == 0 + feature = features[i+1] + else + feature = features[i] + + end + + if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, feature) + match_count += 1 + end + + if match_count >= τ + append!(results, [candidate]) + break + end + + remaining_count = query_feature_length - i - 1 + + if (match_count + remaining_count) < τ + break + end + + end + end + return results +end + + +""" +Search for strings in custom DictDB string collection using the SimString algorithm +and a similarity measure. 
+""" +function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, query::AbstractString; α=0.7, ranked=true) + # Generate features from query string + features = extract_features(db_collection.feature_extractor, query) + + # Metadata from the generated features (length, min & max sizes) + length_of_features = length(features) + min_feature_size = minimum_feature_size(measure, length_of_features, α) + max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α) + + results = String[] + + # Generate and return results from the potential candidate size pool + for candidate_size in min_feature_size:max_feature_size + # Minimum overlap + τ = minimum_overlap(measure, length_of_features, candidate_size, α) + + # Generate approximate candidates from the overlap join + append!(results, overlap_join(db_collection, features, τ, candidate_size)) + end + + # Rank search results + return rank_search_results(measure, db_collection, query, results; ranked=ranked) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 63b8101..333e0e6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,23 @@ -using SimString +module TestSimString using Test -@testset "SimString.jl" begin - # Write your tests here. 
+for file in sort([file for file in readdir(@__DIR__) if occursin(r"^test[_0-9]+.*\.jl$", file)]) + m = match(r"test([0-9]+)_(.*).jl", file) + filename = String(m[2]) + testnum = string(parse(Int, m[1])) + + # with this test one can run only specific tests, for example + # Pkg.test("SimString", test_args = ["features"]) + # or + # Pkg.test("SimString", test_args = ["6"]) + if isempty(ARGS) || (filename in ARGS) || (testnum in ARGS) || (m[1] in ARGS) + @testset "$filename" begin + # Here you can optionally exclude some test files + # VERSION < v"1.1" && file == "test_xxx.jl" && continue + + include(file) + end + end end + +end # module \ No newline at end of file diff --git a/test/test01_dictdb.jl b/test/test01_dictdb.jl new file mode 100644 index 0000000..31f4e41 --- /dev/null +++ b/test/test01_dictdb.jl @@ -0,0 +1,71 @@ +module TestDBCollection +using SimString +using Test + + +@testset "Check single updates of DictDB using CharacterNGrams" begin + db = DictDB(CharacterNGrams(3, " ")) + push!(db, "foo") + push!(db, "bar") + push!(db, "fooo") + + @test db.string_collection == ["foo", "bar", "fooo"] + @test db.string_size_map[5] == Set(["bar", "foo"]) + @test db.string_size_map[6] == Set(["fooo"]) + + @test collect(keys(db.string_feature_map)) == [5, 6] + + @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) ) + @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) +end + + +@testset "Check single update of DictDB using WordNGrams" begin + db = DictDB(WordNGrams(2, " ", " ")) + push!(db, "You are a really really really cool dude.") + + @test db.string_collection == ["You are a really really really cool dude."] + @test db.string_size_map[9] == Set(["You are a really really really cool dude."]) + @test collect(keys(db.string_feature_map)) == [9] + @test collect(values(db.string_feature_map[9])) == repeat([Set(["You are a really really really cool dude."])], 9) +end + + +@testset 
"Check bulk updates of DictDB using CharacterNGrams" begin + db = DictDB(CharacterNGrams(3, " ")) + append!(db, ["foo", "bar", "fooo"]) + + @test db.string_collection == ["foo", "bar", "fooo"] + @test db.string_size_map[5] == Set(["bar", "foo"]) + @test db.string_size_map[6] == Set(["fooo"]) + + @test collect(keys(db.string_feature_map)) == [5, 6] + + @test collect(values(db.string_feature_map[5])) == vcat( (repeat([Set(["foo"])], 5)), (repeat([Set(["bar"])], 5)) ) + @test collect(values(db.string_feature_map[6])) == repeat([Set(["fooo"])], 6) + + @test eltype(collect(keys(db.string_feature_map[5]))) == Tuple{String, Int64} +end + + +@testset "Check bulk updates of DictDB using WordNGrams" begin + db = DictDB(WordNGrams(2, " ", " ")) + append!(db, ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) + + @test db.string_collection == ["You are a really really really cool dude.", "Sometimes you are not really really cool tho"] + @test db.string_size_map[9] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) + + @test collect(keys(db.string_feature_map)) == [9] + @test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) + @test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"]) + + @test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String, String}, Int64} +end + + + + + + + +end # module \ No newline at end of file diff --git a/test/test02_features.jl b/test/test02_features.jl new file mode 100644 index 0000000..dcf6146 --- /dev/null +++ b/test/test02_features.jl @@ -0,0 +1,35 @@ +module TestFeatures +using SimString +using Test + + +@testset "Test feature extraction" begin + char_ngram_res = SimString.extract_features(CharacterNGrams(3, " "), "prepress") + @test 
char_ngram_res[6] == ("pre", 2) + + word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.") + @test word_ngram_res[6] == (("really", "really"), 2) +end + + +@testset "Test padding" begin + @test SimString.pad_string(["one", "word"], " ") == [" ", "one", "word", " "] + @test SimString.pad_string("one word", " ") == " one word " +end + + + +@testset "Test cummulative_ngram_count" begin + +end + + + + + + + + + + +end # module \ No newline at end of file diff --git a/test/test03_measures.jl b/test/test03_measures.jl new file mode 100644 index 0000000..3083b80 --- /dev/null +++ b/test/test03_measures.jl @@ -0,0 +1,70 @@ +module TestMeasureUtils +using SimString +using Test + + +@testset "Test Similarity Scores" begin + X = [1, 2, 3] + Y = [1, 2, 4, 5] + @test SimString.similarity_score(Dice(), X, Y) ≈ 0.5714285714285714 + @test SimString.similarity_score(Jaccard(), X, Y) ≈ 0.4 + @test SimString.similarity_score(Cosine(), X, Y) ≈ 0.5773502691896258 + @test SimString.similarity_score(Overlap(), X, Y) ≈ 0.6666666666666666 +end + + +@testset "Test Minimum Candidate Feature Size" begin + @test SimString.minimum_feature_size(Dice(), 5, 1.) == 5 + @test SimString.minimum_feature_size(Dice(), 5, 0.5) == 2 + + @test SimString.minimum_feature_size(Jaccard(), 5, 1.) == 5 + @test SimString.minimum_feature_size(Jaccard(), 5, 0.5) == 3 + + @test SimString.minimum_feature_size(Cosine(), 5, 1.) == 5 + @test SimString.minimum_feature_size(Cosine(), 5, 0.5) == 2 + + @test SimString.minimum_feature_size(Overlap(), 5, 1.) == 1 + @test SimString.minimum_feature_size(Overlap(), 5, 0.5) == 1 +end + + +@testset "Test Maximum Candidate Feature Size" begin + db = DictDB(CharacterNGrams(3, " ")) + append!(db, ["foo", "bar", "fooo"]) + + @test SimString.maximum_feature_size(Dice(), db, 5, 1.) == 5 + @test SimString.maximum_feature_size(Dice(), db, 5, 0.5) == 15 + + @test SimString.maximum_feature_size(Jaccard(), db, 5, 1.) 
== 5 + @test SimString.maximum_feature_size(Jaccard(), db, 5, 0.5) == 10 + + @test SimString.maximum_feature_size(Cosine(), db, 5, 1.) == 5 + @test SimString.maximum_feature_size(Cosine(), db, 5, 0.5) == 20 + + @test SimString.maximum_feature_size(Overlap(), db, 5, 1.) == 6 + @test SimString.maximum_feature_size(Overlap(), db, 5, 0.5) == 6 +end + + +@testset "Test Minimum Feature Overlap" begin + @test SimString.minimum_overlap(Dice(), 5, 5, 1.0) == 13 + @test SimString.minimum_overlap(Dice(), 5, 20, 1.0) == 50 + @test SimString.minimum_overlap(Dice(), 5, 5, 0.5) == 7 + + @test SimString.minimum_overlap(Jaccard(), 5, 5, 1.0) == 5 + @test SimString.minimum_overlap(Jaccard(), 5, 20, 1.0) == 13 + @test SimString.minimum_overlap(Jaccard(), 5, 5, 0.5) == 4 + + @test SimString.minimum_overlap(Cosine(), 5, 5, 1.0) == 5 + @test SimString.minimum_overlap(Cosine(), 5, 20, 1.0) == 10 + @test SimString.minimum_overlap(Cosine(), 5, 5, 0.5) == 3 + + @test SimString.minimum_overlap(Overlap(), 5, 5, 1.0) == 5 + @test SimString.minimum_overlap(Overlap(), 5, 20, 1.0) == 5 + @test SimString.minimum_overlap(Overlap(), 5, 5, 0.5) == 3 + +end + + + +end # module \ No newline at end of file diff --git a/test/test04_search.jl b/test/test04_search.jl new file mode 100644 index 0000000..b062bcb --- /dev/null +++ b/test/test04_search.jl @@ -0,0 +1,52 @@ +module TestMeasures +using SimString +using Test +using Faker + + +@testset "Test Dice Search" begin + db = DictDB(CharacterNGrams(2, " ")); + append!(db, ["foo", "bar", "fooo"]); + + res = search(Dice(), db, "foo"; α=0.8, ranked=true) + @test res == [("foo", 1.0), ("fooo", 0.8888888888888888)] +end + + +@testset "Test Jaccard Search" begin + db = DictDB(CharacterNGrams(2, " ")); + append!(db, ["foo", "bar", "fooo"]); + + res = search(Jaccard(), db, "foo"; α=0.8, ranked=true) + @test res == [("foo", 1.0), ("fooo", 0.8)] + +end + + +@testset "Test Cosine Search" begin + db = DictDB(CharacterNGrams(2, " ")); + append!(db, ["foo", "bar", 
"fooo"]); + + res = search(Cosine(), db, "foo"; α=0.8, ranked=true) + @test res == [("foo", 1.0), ("fooo", 0.8944271909999159)] + +end + + +@testset "Test Overlap Search" begin + db = DictDB(CharacterNGrams(2, " ")); + append!(db, ["foo", "bar", "fooo"]); + + res = search(Overlap(), db, "foo"; α=0.8, ranked=true) + @test res == [("foo", 1.0), ("fooo", 1.0)] + +end + + + + + + + + +end # module \ No newline at end of file