-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconfusables.go
112 lines (100 loc) · 3.29 KB
/
confusables.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package confusablehomoglyphs
import (
"strings"
)
// ConfusableResult describes a single character that IsConfusable reported,
// together with the homoglyph entries found for it.
type ConfusableResult struct {
// Character is the rune from the inspected string.
Character rune `json:"character"`
// Alias is the value returned by Alias for Character.
Alias string `json:"alias"`
// Homoglyphs holds the entries looked up for Character in the confusables table.
Homoglyphs []Homoglyph `json:"homoglyphs"`
}
// IsMixedScript reports whether str contains characters from more than one
// script alias, ignoring the aliases listed in allowedAliases.
//
// Entries of allowedAliases are upper-cased before being compared against the
// aliases returned by UniqueAliases. A nil allowedAliases defaults to
// []string{"COMMON"}; a non-nil empty slice excludes nothing.
//
// E.g. "B. C" is not considered mixed-script by default: it contains
// characters from Latin and Common, but Common is excluded by default.
func IsMixedScript(str string, allowedAliases []string) bool {
	if allowedAliases == nil {
		allowedAliases = []string{"COMMON"}
	}

	// A struct{}-valued map is the conventional Go set: no per-entry value
	// storage and no interface boxing.
	allowed := make(map[string]struct{}, len(allowedAliases))
	for _, a := range allowedAliases {
		allowed[strings.ToUpper(a)] = struct{}{}
	}

	count := 0
	for _, ua := range UniqueAliases(str) {
		if _, ok := allowed[ua]; ok {
			continue
		}
		count++
		if count > 1 {
			// Two distinct non-allowed aliases are enough; no need to look further.
			return true
		}
	}
	return false
}
// IsConfusable scans str for characters that could be mistaken for characters
// belonging to the script aliases named in preferredAliases.
//
// Each distinct character of str is examined once. A character whose own
// alias is one of preferredAliases is skipped. With greedy set to false the
// function returns as soon as the first confusable character is found; with
// greedy set to true it collects every confusable character in str.
//
// preferredAliases lists the unicode block aliases treated as the 'base'
// blocks (compared after upper-casing). When it is empty, every character
// that has entries in the confusables table is reported.
func IsConfusable(str string, greedy bool, preferredAliases []string) []ConfusableResult {
	preferred := make(map[string]struct{}, len(preferredAliases))
	for _, name := range preferredAliases {
		preferred[strings.ToUpper(name)] = struct{}{}
	}

	// touchesPreferred reports whether at least one glyph among the given
	// homoglyph entries has an alias in the preferred set.
	touchesPreferred := func(entries []Homoglyph) bool {
		for _, entry := range entries {
			for _, glyph := range entry.C {
				if _, hit := preferred[Alias(glyph)]; hit {
					return true
				}
			}
		}
		return false
	}

	results := make([]ConfusableResult, 0)
	seen := make(map[rune]struct{})
	for _, r := range str {
		if _, dup := seen[r]; dup {
			continue
		}
		seen[r] = struct{}{}

		alias := Alias(r)
		if _, safe := preferred[alias]; safe {
			// A character that already belongs to a preferred alias is safe,
			// even if it resembles glyphs from other aliases.
			continue
		}

		homoglyphs, known := confusablesData[string(r)]
		if !known || len(homoglyphs) == 0 {
			continue
		}

		// With preferred aliases given, a character only counts as confusable
		// when one of its look-alikes is from a preferred alias: for 'LATIN',
		// 'ρ' is confusable with the LATIN 'p', whereas 'Γ' is not, because
		// none of its look-alikes is LATIN.
		if len(preferred) > 0 && !touchesPreferred(homoglyphs) {
			continue
		}

		results = append(results, ConfusableResult{
			Character:  r,
			Alias:      alias,
			Homoglyphs: homoglyphs,
		})
		if !greedy {
			return results
		}
	}
	return results
}
// IsDangerous checks if str can be dangerous, i.e. is it not only mixed-scripts
// but also contains characters from other scripts than the ones in preferredAliases
// that might be confusable with characters from scripts in preferredAliases.
func IsDangerous(str string, preferredAliases []string) bool {
confusablesResult := IsConfusable(str, false, preferredAliases)
return IsMixedScript(str, nil) && len(confusablesResult) > 0
}