Skip to content

Commit 33f1c57

Browse files
authored
Optimize allocations (#21)
* Initial allocation optimisations * Updated init_ngrams to support UTF8 texts
1 parent f822944 commit 33f1c57

File tree

4 files changed

+26
-10
lines changed

4 files changed

+26
-10
lines changed

src/dictdb.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ function DictDB(x::WordNGrams)
6565
x,
6666
String[],
6767
DefaultDict{Int, Set{String}}( () -> Set{String}() ),
68-
DefaultDict{ Int, DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{NTuple{x.n, String}, Int}, Set{String} }(Set{String})),
69-
DefaultDict{ Int, DefaultDict{Tuple{NTuple{x.n, String}, Int}, Set{String}} }( () -> DefaultDict{Tuple{NTuple{x.n, String}, Int}, Set{String}}(Set{String}))
68+
DefaultDict{ Int, DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultOrderedDict{Tuple{SubArray{SubString{String}}, Int}, Set{String} }(Set{String})),
69+
DefaultDict{ Int, DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}} }( () -> DefaultDict{Tuple{SubArray{SubString{String}}, Int}, Set{String}}(Set{String}))
7070
)
7171
end
7272

src/features.jl

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ end
1010
Internal function to pad AbstractVector types with specified padder
1111
"""
1212
function pad_string(x::AbstractVector, padder::AbstractString)
13-
# Insert a padder as the first and last element of x
13+
# TODO: Insert a padder as the first and last element of x with undef
1414
insert!(x, 1, padder)
1515
push!(x, padder)
1616
return x
@@ -21,18 +21,29 @@ end
2121
Internal function to generate initial uncounted ngrams on a character level
2222
"""
2323
function init_ngrams(extractor::CharacterNGrams, x, n)
24-
map(0:length(x)-n) do i
25-
x[i+1: i+n]
24+
y = Vector{SubString{String}}(undef, length(x)-n+1)
25+
26+
i = 0
27+
i1 = nextind(x, 0, 1)
28+
i2 = nextind(x, i1, n-1)
29+
30+
while i2 <= lastindex(x)
31+
i += 1
32+
y[i] = SubString(x, i1:i2)
33+
i1 = nextind(x, i1)
34+
i2 = nextind(x, i2)
2635
end
36+
return y
2737
end
2838

2939

40+
3041
"""
3142
Internal function to generate initial uncounted ngrams on a word level
3243
"""
3344
function init_ngrams(extractor::WordNGrams, x, n)
3445
map(0:length(x)-n) do i
35-
tuple(String.(x[i+1: i+n])...)
46+
@view x[i+1: i+n]
3647
end
3748
end
3849

@@ -85,6 +96,7 @@ end
8596
Internal function to count and pad generated character-level ngrams (including duplicates)
8697
"""
8798
function cummulative_ngram_count(x)
99+
# TODO: Use length of x to initiate non-allocated ngrams
88100
counter = Dict{eltype(x), Int}()
89101

90102
return map(x) do val

test/test01_dictdb.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ end
5959
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
6060
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
6161

62-
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
62+
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{SubArray{SubString{String}},Int64}
6363
end
6464

6565

@@ -104,7 +104,7 @@ end
104104
@test collect(values(db.string_feature_map[9]))[5] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
105105
@test collect(values(db.string_feature_map[9]))[7] == Set(["You are a really really really cool dude.", "Sometimes you are not really really cool tho"])
106106

107-
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{Tuple{String,String},Int64}
107+
@test eltype(collect(keys(db.string_feature_map[9]))) == Tuple{SubArray{SubString{String}},Int64}
108108

109109
end
110110

test/test02_features.jl

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,12 @@ using Test
77
char_ngram_res = SimString.extract_features(CharacterNGrams(3, " "), "prepress")
88
@test char_ngram_res[5] == ("pre", 2)
99

10-
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
11-
@test word_ngram_res[5] == (("really", "really"), 2)
10+
# Unicode text test
11+
@test SimString.extract_features(CharacterNGrams(2, " "), "∀∃😄🍕")[3] == ("😄🍕", 1)
12+
13+
word_ngram_res = SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude 😄🍕")
14+
@test word_ngram_res[5] == (["really", "really"], 2)
15+
@test word_ngram_res[8] == (["dude", "😄🍕"], 1)
1216
end
1317

1418

0 commit comments

Comments
 (0)