Initial version #1

Merged: 25 commits, Jan 3, 2022

Commits (25)

- 551b98c: Updated gitignore (PyDataBlog, Oct 24, 2021)
- 80035f6: WIP on code structure (PyDataBlog, Oct 26, 2021)
- 3840cb0: Updated CI file (PyDataBlog, Oct 27, 2021)
- 0666879: WIP on features (PyDataBlog, Oct 27, 2021)
- 0d6a256: Initial architecture (PyDataBlog, Oct 28, 2021)
- 61810a6: WIP make db architecture (PyDataBlog, Oct 31, 2021)
- de7705d: Added ngram func (PyDataBlog, Oct 31, 2021)
- 22920c1: Added ngrams (PyDataBlog, Oct 31, 2021)
- 3a1cb6c: Cleaned up (PyDataBlog, Oct 31, 2021)
- 9f36e58: Initial codebase structure (PyDataBlog, Nov 8, 2021)
- a43fc26: Fixed module imports bug & removed utils.jl (PyDataBlog, Nov 8, 2021)
- 767ba86: Added ngram count for wordngrams (PyDataBlog, Nov 9, 2021)
- cd4ee56: Updated user API and switched from add! to push! (PyDataBlog, Nov 10, 2021)
- 8b59186: Replaced add! with push, added examples and implemented measures (PyDataBlog, Nov 12, 2021)
- 47684f1: Proposed DB structure (PyDataBlog, Nov 14, 2021)
- 7b184d8: Switched to datastructures for dictdb (PyDataBlog, Nov 23, 2021)
- 44ad356: Switched ngram counts as vectors (PyDataBlog, Dec 15, 2021)
- 79552ef: Draft working version of DictDB (PyDataBlog, Dec 18, 2021)
- a77ae71: Removed export of base functions (PyDataBlog, Dec 18, 2021)
- bd07a36: Code restructure (PyDataBlog, Dec 29, 2021)
- a63e872: Added tests for measures (PyDataBlog, Dec 29, 2021)
- ef0e6e5: Initial draft of search functionality (PyDataBlog, Jan 2, 2022)
- ff29e52: Working but dirty implementation of search (PyDataBlog, Jan 2, 2022)
- a78de69: Cleaned up & prepared for switch to 0 indexing implementation (PyDataBlog, Jan 3, 2022)
- d24bdb6: Alpha release (PyDataBlog, Jan 3, 2022)

Files changed

13 changes: 3 additions & 10 deletions .github/workflows/CI.yml
@@ -1,10 +1,7 @@
 name: CI
 on:
-  push:
-    branches:
-      - main
-    tags: '*'
-  pull_request:
+  - push
+  - pull_request
 jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
@@ -13,19 +10,15 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.0'
           - '1.6'
+          - '1.7'
           - 'nightly'
         os:
           - ubuntu-latest
           - macOS-latest
           - windows-latest
         arch:
           - x64
-          - x86
-        exclude:
-          - os: macOS-latest
-            arch: x86
     steps:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@
 *.jl.mem
 /Manifest.toml
 /docs/build/
+.vscode
9 changes: 8 additions & 1 deletion Project.toml
@@ -3,11 +3,18 @@ uuid = "2e3c4037-312d-4650-b9c0-fcd0fc09aae4"
 authors = ["Bernard Brenyah"]
 version = "0.1.0"
 
+[deps]
+CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
+DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
+
 [compat]
 julia = "1"
 
 [extras]
+Faker = "0efc519c-db33-5916-ab87-703215c3906f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["Test", "Faker"]
42 changes: 42 additions & 0 deletions README.md
@@ -6,3 +6,45 @@
[![Coverage](https://codecov.io/gh/PyDataBlog/SimString.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/PyDataBlog/SimString.jl)
[![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle)
[![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor's%20Guide-blueviolet)](https://github.com/SciML/ColPrac)

A native Julia implementation of the CPMerge algorithm, designed for approximate string matching.
This package is particularly useful for natural language processing tasks that require retrieving strings/texts from very large corpora. Currently, the package supports both character- and word-based N-gram feature generation, and there are plans to open it up to custom user-defined feature generation methods.
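
To give a concrete feel for the feature generation, here is a rough sketch of the two extractors in action, mirroring the calls in `extras/examples.jl`. Note that `extract_features` is an internal helper rather than part of the exported API, so its exact return representation may change:

```julia
using SimString

# Character-level 3-grams of "prepress", padded with " "
SimString.extract_features(CharacterNGrams(3, " "), "prepress")

# Word-level 2-grams, padded with " " and split on " "
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
```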

## Features

- [X] Fast algorithm for string matching
- [X] 100% exact retrieval
- [X] Unicode support
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support

## Supported String Similarity Measures

- [X] Dice coefficient
- [X] Jaccard coefficient
- [X] Cosine coefficient
- [X] Overlap coefficient

## Installation

You can grab the latest stable version of this package from the Julia registry by running:

*NB:* Don't forget to invoke Julia's package manager with `]`

```julia
pkg> add SimString
```

The brave few can grab the latest experimental features by adding the master branch to their development environment, again after invoking the package manager with `]`:

```julia
pkg> add SimString#master
```

You are good to go with bleeding edge features and breakages!

To revert to a stable version, you can simply run:

```julia
pkg> free SimString
```
70 changes: 70 additions & 0 deletions docs/src/index.md
@@ -6,6 +6,76 @@ CurrentModule = SimString

Documentation for [SimString](https://github.com/PyDataBlog/SimString.jl).

A native Julia implementation of the CPMerge algorithm, designed for approximate string matching.
This package is particularly useful for natural language processing tasks that require retrieving strings/texts from very large corpora. Currently, the package supports both character- and word-based N-gram feature generation, and there are plans to open it up to custom user-defined feature generation methods.

## Features

- [X] Fast algorithm for string matching
- [X] 100% exact retrieval
- [X] Unicode support
- [ ] Custom user defined feature generation methods
- [ ] Mecab-based tokenizer support

## Supported String Similarity Measures

- [X] Dice coefficient
- [X] Jaccard coefficient
- [X] Cosine coefficient
- [X] Overlap coefficient
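
All four are the usual set-overlap coefficients. For reference, here is a standalone sketch of their textbook definitions on two feature collections `X` and `Y`. This is illustrative only; SimString.jl computes the measures over its own n-gram feature representation (which keeps counts of repeated n-grams), so its scores can differ from a plain set-based calculation:

```julia
# Textbook set-based definitions (illustrative only; not the package's internal code)
dice(X, Y)    = 2 * length(intersect(X, Y)) / (length(X) + length(Y))
jaccard(X, Y) = length(intersect(X, Y)) / length(union(X, Y))
cosine(X, Y)  = length(intersect(X, Y)) / sqrt(length(X) * length(Y))
overlap(X, Y) = length(intersect(X, Y)) / min(length(X), length(Y))

X = Set(["ab", "bc", "cd"]); Y = Set(["bc", "cd", "de"]);
dice(X, Y), jaccard(X, Y), cosine(X, Y), overlap(X, Y)
# ≈ (0.67, 0.5, 0.67, 0.67)
```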

## Installation

You can grab the latest stable version of this package from the Julia registry by running:

*NB:* Don't forget to invoke Julia's package manager with `]`

```julia
pkg> add SimString
```

The brave few can grab the latest experimental features by adding the master branch to their development environment, again after invoking the package manager with `]`:

```julia
pkg> add SimString#master
```

You are good to go with bleeding edge features and breakages!

To revert to a stable version, you can simply run:

```julia
pkg> free SimString
```

## Usage

```julia
using SimString

# Initialize a database and add some strings
db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo");
push!(db, "bar");
push!(db, "fooo");

# A convenient way to add multiple entries is to append a vector of strings: `append!(db, ["foo", "bar", "fooo"]);`

# Retrieve the closest match(es)
res = search(Dice(), db, "foo"; α=0.8, ranked=true)
# 2-element Vector{Tuple{String, Float64}}:
# ("foo", 1.0)
# ("fooo", 0.8888888888888888)


```
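
Word-level n-grams follow the same API. Here is a small sketch adapted from `extras/examples.jl`; the second entry and the query string are made up for illustration, and `α=0.3` is just an arbitrary threshold:

```julia
using SimString

# Word bigrams: n = 2, with " " used both as the padding token and as the word splitter
wdb = DictDB(WordNGrams(2, " ", " "));
push!(wdb, "You are a really really really cool dude.");
push!(wdb, "Sometimes life is not so cool.");   # hypothetical second entry

# Any of the supported measures can be passed as the first argument
res = search(Cosine(), wdb, "You are a cool dude"; α=0.3, ranked=true)
```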

## TODO: Benchmarks

## Release History

- 0.1.0 Initial release.

```@index
```

46 changes: 46 additions & 0 deletions extras/examples.jl
@@ -0,0 +1,46 @@
using SimString
using Faker
using BenchmarkTools
using DataStructures

################################# Benchmark Bulk addition #####################
db = DictDB(CharacterNGrams(3, " "));
Faker.seed(2020)
@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];


f(d, x) = append!(d, x)
@time f(db, fake_names)



################################ Simple Addition ###############################

db = DictDB(CharacterNGrams(2, " "));
push!(db, "foo");
push!(db, "bar");
push!(db, "fooo");

f(x, c, s) = search(x, c, s)
test = "foo";
col = db;
sim = Cosine();

f(Cosine(), db, "foo")

@btime f($sim, $col, $test)
@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)



db2 = DictDB(CharacterNGrams(3, " "));
append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector

results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented

bs = ["foo", "bar", "foo", "foo", "bar"]
SimString.extract_features(CharacterNGrams(3, " "), "prepress")
SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")

db = DictDB(WordNGrams(2, " ", " "))
push!(db, "You are a really really really cool dude.")
16 changes: 16 additions & 0 deletions extras/py_benchmarks.py
@@ -0,0 +1,16 @@
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
from simstring.measure.cosine import CosineMeasure
from simstring.database.dict import DictDatabase
from simstring.searcher import Searcher
from faker import Faker

db = DictDatabase(CharacterNgramFeatureExtractor(3))

fake = Faker()
fake_names = [fake.name() for i in range(100_000)]

def f(x):
for i in x:
db.add(i)

# %time f(fake_names)
26 changes: 25 additions & 1 deletion src/SimString.jl
@@ -1,5 +1,29 @@
module SimString

# Write your package code here.
import Base: push!, append!
using DataStructures: DefaultOrderedDict, DefaultDict
# using ProgressMeter
# using CircularArrays
# using OffsetArrays

######### Import modules & utils ################
include("db_collection.jl")
include("dictdb.jl")
include("features.jl")
include("measures.jl")
include("search.jl")



####### Global export of user API #######
export Dice, Jaccard, Cosine, Overlap,
AbstractSimStringDB, DictDB,
CharacterNGrams, WordNGrams,
search






end
35 changes: 35 additions & 0 deletions src/db_collection.jl
@@ -0,0 +1,35 @@
# Custom Collections

"""
Base type for all custom db collections.
"""
abstract type AbstractSimStringDB end


"""
Abstract type for feature extraction structs
"""
abstract type FeatureExtractor end


# Feature Extraction Definitions

"""
Feature extraction on character-level ngrams
"""
struct CharacterNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
end


"""
Feature extraction based on word-level ngrams
"""
struct WordNGrams{T1<:Int, T2<:AbstractString} <: FeatureExtractor
n::T1 # number of n-grams to extract
padder::T2 # string to use to pad n-grams
splitter::T2 # string to use to split words
end
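
# Illustrative only (not part of this file): the extractors above are constructed
# elsewhere in this PR, e.g. in docs/src/index.md and extras/examples.jl:
#
#   CharacterNGrams(2, " ")     # character bigrams, padded with a space
#   WordNGrams(2, " ", " ")     # word bigrams, padded with and split on a space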

