From 0cccc69126f38208c30a757f61d0a244b68daf29 Mon Sep 17 00:00:00 2001
From: LIFEfreedom
Date: Tue, 30 Jun 2020 11:57:45 +0300
Subject: [PATCH 1/2] add string Aho-Corasick algorithm

---
 Algorithms/Strings/AhoCorasick.cs             | 233 ++++++++++++++++++
 Algorithms/Strings/AhoCorasickVertex.cs       |  68 +++++
 .../AlgorithmsTests/StringAhoCorasickTest.cs  |  65 +++++
 3 files changed, 366 insertions(+)
 create mode 100644 Algorithms/Strings/AhoCorasick.cs
 create mode 100644 Algorithms/Strings/AhoCorasickVertex.cs
 create mode 100644 UnitTest/AlgorithmsTests/StringAhoCorasickTest.cs

diff --git a/Algorithms/Strings/AhoCorasick.cs b/Algorithms/Strings/AhoCorasick.cs
new file mode 100644
index 00000000..4e47f4ca
--- /dev/null
+++ b/Algorithms/Strings/AhoCorasick.cs
@@ -0,0 +1,233 @@
+using System;
+using System.Collections.Generic;
+
+namespace Algorithms.Strings
+{
+    /// <summary>
+    /// Aho-Corasick automaton: searches a given string for every pattern from a dictionary in a single pass.
+    /// </summary>
+    public class AhoCorasick
+    {
+        /// <summary>
+        /// Trie in which each vertex denotes a string (the root denotes the empty string - $).
+        /// The trie is stored as a list of vertices, where each vertex is identified by its index and the root has index 0.
+        /// </summary>
+        private readonly List<AhoCorasickVertex> Tree = new List<AhoCorasickVertex>();
+
+        /// <summary>
+        /// True once a search may have memoized suffix links and transitions inside the vertices.
+        /// </summary>
+        private bool LinksCached;
+
+        public AhoCorasick()
+        {
+            // Add root vertex.
+            Tree.Add(new AhoCorasickVertex(0, '$'));
+        }
+
+        /// <summary>
+        /// Adds a pattern to the dictionary.
+        /// </summary>
+        /// <param name="pattern">Pattern to look for in later searches.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
+        public void AddPattern(string pattern)
+        {
+            if (pattern == null)
+            {
+                throw new ArgumentNullException(nameof(pattern));
+            }
+
+            // Links memoized by an earlier search describe the old trie and would hide the new pattern.
+            ResetCachedLinks();
+
+            int num = 0;
+
+            foreach (char ch in pattern)
+            {
+                int next;
+                if (!Tree[num].NextVertex.TryGetValue(ch, out next)) // no edge for this symbol yet.
+                {
+                    next = Tree.Count;
+                    Tree.Add(new AhoCorasickVertex(num, ch));
+                    Tree[num].NextVertex.Add(ch, next);
+                }
+
+                num = next;
+            }
+
+            Tree[num].IsPattern = true;
+            Tree[num].Str = pattern;
+        }
+
+        /// <summary>
+        /// Removes all patterns, leaving only the root vertex.
+        /// </summary>
+        public void ClearPatterns()
+        {
+            Tree.Clear();
+            // Add root vertex.
+            Tree.Add(new AhoCorasickVertex(0, '$'));
+            LinksCached = false;
+        }
+
+        /// <summary>
+        /// Checks whether the exact pattern has been added to the dictionary.
+        /// </summary>
+        /// <param name="pattern">Pattern to look up.</param>
+        /// <returns>True if <paramref name="pattern"/> was added as a whole pattern; false if it is absent or only a prefix of one.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
+        public bool Exist(string pattern)
+        {
+            if (pattern == null)
+            {
+                throw new ArgumentNullException(nameof(pattern));
+            }
+
+            int num = 0;
+            foreach (char ch in pattern)
+            {
+                if (!Tree[num].NextVertex.TryGetValue(ch, out num))
+                {
+                    return false;
+                }
+            }
+
+            return Tree[num].IsPattern;
+        }
+
+        /// <summary>
+        /// Forgets every memoized suffix link and transition so that they are recomputed for the current trie.
+        /// </summary>
+        private void ResetCachedLinks()
+        {
+            if (!LinksCached)
+            {
+                return;
+            }
+
+            foreach (AhoCorasickVertex vertex in Tree)
+            {
+                vertex.AutoMove.Clear();
+                vertex.SuffLink = -1;
+                vertex.GoodSuffLink = -1;
+            }
+
+            LinksCached = false;
+        }
+
+        /// <summary>
+        /// Returns the suffix link of a vertex, computing and memoizing it on first use.
+        /// </summary>
+        /// <param name="index">Vertex index.</param>
+        private int GetSuffLink(int index)
+        {
+            AhoCorasickVertex node = Tree[index];
+            if (node.SuffLink == -1)
+            {
+                node.SuffLink = (index == 0 || node.Parent == 0) ? 0 : GetAutoMove(GetSuffLink(node.Parent), node.Symbol);
+            }
+
+            return node.SuffLink;
+        }
+
+        /// <summary>
+        /// Returns the state the automaton moves to from a vertex on a symbol, memoizing the answer.
+        /// </summary>
+        /// <param name="index">Vertex index.</param>
+        /// <param name="ch">Transition symbol.</param>
+        private int GetAutoMove(int index, char ch)
+        {
+            AhoCorasickVertex node = Tree[index];
+            int autoMove;
+            if (!node.AutoMove.TryGetValue(ch, out autoMove))
+            {
+                // If there is an edge with the symbol ch from the current vertex, then we follow it,
+                // otherwise we follow the suffix link and start recursively from the new vertex.
+                if (!node.NextVertex.TryGetValue(ch, out autoMove))
+                {
+                    autoMove = (index == 0) ? 0 : GetAutoMove(GetSuffLink(index), ch);
+                }
+
+                node.AutoMove.Add(ch, autoMove);
+            }
+
+            return autoMove;
+        }
+
+        /// <summary>
+        /// Returns the "good" suffix link of a vertex: the nearest vertex on its suffix-link chain that is a pattern, or the root.
+        /// </summary>
+        /// <param name="index">Vertex index.</param>
+        private int GetGoodSuffLink(int index)
+        {
+            AhoCorasickVertex node = Tree[index];
+            if (node.GoodSuffLink == -1)
+            {
+                int slink = GetSuffLink(index);
+
+                if (slink == 0)
+                {
+                    // Suffix link is root vertex.
+                    node.GoodSuffLink = 0;
+                }
+                else
+                {
+                    // If the vertex by the suffix link is a pattern, then this is the desired vertex; otherwise, we continue recursively from that vertex.
+                    node.GoodSuffLink = Tree[slink].IsPattern ? slink : GetGoodSuffLink(slink);
+                }
+            }
+
+            return node.GoodSuffLink;
+        }
+
+        /// <summary>
+        /// Walking on "good" suffix links.
+        /// </summary>
+        /// <param name="index">Current position of the automaton.</param>
+        /// <returns>Patterns that end at the current position, longest first.</returns>
+        private List<string> Check(int index)
+        {
+            List<string> patterns = new List<string>();
+            while (index != 0)
+            {
+                AhoCorasickVertex node = Tree[index];
+                if (node.IsPattern)
+                {
+                    patterns.Add(node.Str);
+                }
+
+                index = GetGoodSuffLink(index);
+            }
+
+            return patterns;
+        }
+
+        /// <summary>
+        /// Search for all patterns in a string.
+        /// </summary>
+        /// <param name="line">Line in which the search occurs.</param>
+        /// <returns>One entry per occurrence found, ordered by the position where the occurrence ends.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
+        public List<string> FindAllOccurrences(string line)
+        {
+            if (line == null)
+            {
+                throw new ArgumentNullException(nameof(line));
+            }
+
+            // From here on the vertices hold memoized links that a later AddPattern must discard.
+            LinksCached = true;
+
+            List<string> occurrences = new List<string>();
+            int index = 0;
+
+            foreach (char ch in line)
+            {
+                index = GetAutoMove(index, ch);
+                occurrences.AddRange(Check(index));
+            }
+
+            return occurrences;
+        }
+    }
+}
diff --git a/Algorithms/Strings/AhoCorasickVertex.cs b/Algorithms/Strings/AhoCorasickVertex.cs
new file mode 100644
index 00000000..c1d7e0de
--- /dev/null
+++ b/Algorithms/Strings/AhoCorasickVertex.cs
@@ -0,0 +1,68 @@
+using System.Collections.Generic;
+
+namespace Algorithms.Strings
+{
+    internal class AhoCorasickVertex
+    {
+        /// <summary>
+        /// A flag indicating whether the string of this vertex is one of the added patterns.
+        /// </summary>
+        public bool IsPattern;
+
+        /// <summary>
+        /// The number (Value) of the vertex to which we arrive by symbol (Key).
+        /// </summary>
+        public readonly SortedDictionary<char, int> NextVertex;
+
+        /// <summary>
+        /// Remembering the transition of the automaton.
+        /// </summary>
+        public readonly SortedDictionary<char, int> AutoMove;
+
+        /// <summary>
+        /// The suffix link of the vertex X is a pointer to the vertex Y,
+        /// such that the string Y is the longest proper suffix of the string X, or,
+        /// if there is no such vertex in the tree, then the pointer to the root.
+        /// In particular, a link from the root leads to it.
+        /// </summary>
+        public int SuffLink;
+
+        /// <summary>
+        /// "Good" suffix link.
+        /// </summary>
+        public int GoodSuffLink;
+
+        /// <summary>
+        /// Parent vertex in a tree.
+        /// </summary>
+        public readonly int Parent;
+
+        /// <summary>
+        /// Symbol on the vertex.
+        /// </summary>
+        public readonly char Symbol;
+
+        /// <summary>
+        /// The pattern that ends at this vertex; assigned together with IsPattern.
+        /// </summary>
+        public string Str;
+
+        /// <summary>
+        /// Create a vertex by initializing the variables and setting the parent and symbol.
+        /// </summary>
+        /// <param name="parent">Number of the parent.</param>
+        /// <param name="symbol">Symbol on the vertex in the tree.</param>
+        public AhoCorasickVertex(int parent, char symbol)
+        {
+            IsPattern = false;
+            NextVertex = new SortedDictionary<char, int>();
+            AutoMove = new SortedDictionary<char, int>();
+
+            Parent = parent;
+            Symbol = symbol;
+
+            GoodSuffLink = -1; // initially - no "good" suffix link.
+            SuffLink = -1; // initially - no suffix link.
+        }
+    }
+}
diff --git a/UnitTest/AlgorithmsTests/StringAhoCorasickTest.cs b/UnitTest/AlgorithmsTests/StringAhoCorasickTest.cs
new file mode 100644
index 00000000..783e8c82
--- /dev/null
+++ b/UnitTest/AlgorithmsTests/StringAhoCorasickTest.cs
@@ -0,0 +1,65 @@
+using Algorithms.Strings;
+
+using System.Collections.Generic;
+using System.Linq;
+
+using Xunit;
+
+namespace UnitTest.AlgorithmsTests
+{
+    public static class StringAhoCorasickTest
+    {
+        [Fact]
+        public static void DoTest()
+        {
+            AhoCorasick alg = new AhoCorasick();
+
+            // Initialize patterns
+
+            alg.AddPattern("a");
+            alg.AddPattern("b");
+            alg.AddPattern("c");
+            alg.AddPattern("d");
+            alg.AddPattern("aa");
+
+            List<string> foundPatterns = alg.FindAllOccurrences("caaab");
+
+            Assert.Equal(7, foundPatterns.Count);
+            Assert.Equal(1, foundPatterns.Count(q => q.Equals("c")));
+            Assert.Equal(3, foundPatterns.Count(q => q.Equals("a")));
+            Assert.Equal(2, foundPatterns.Count(q => q.Equals("aa")));
+            Assert.Equal(1, foundPatterns.Count(q => q.Equals("b")));
+            alg.ClearPatterns();
+
+            alg.AddPattern("test1");
+            alg.AddPattern("test2");
+            alg.AddPattern("test3");
+            alg.AddPattern("test33");
+            alg.AddPattern("verybigtest");
+
+            foundPatterns = alg.FindAllOccurrences("testtest1test1122test22test3549798test3656test333354654sdjkhbfabvdskhjfbashjdvbfjhksdbahjfvhusgdabvfhjsdvfgsdkhjvkverybigtesthdsagfhkgasdkhfverybigtestsdhgfjhkgsdfgk");
+
+            Assert.Equal(9, foundPatterns.Count);
+            Assert.Equal(2, foundPatterns.Count(q => q.Equals("test1")));
+            Assert.Equal(1, foundPatterns.Count(q => q.Equals("test2")));
+            Assert.Equal(3, foundPatterns.Count(q => q.Equals("test3")));
+            Assert.Equal(1, foundPatterns.Count(q => q.Equals("test33")));
+            Assert.Equal(2, foundPatterns.Count(q => q.Equals("verybigtest")));
+        }
+
+        [Fact]
+        public static void AddPatternAfterSearchTest()
+        {
+            AhoCorasick alg = new AhoCorasick();
+
+            alg.AddPattern("a");
+            Assert.Equal(new List<string> { "a" }, alg.FindAllOccurrences("ab"));
+
+            // A pattern added after a search must be picked up by the next search.
+            alg.AddPattern("b");
+            Assert.True(alg.Exist("b"));
+            Assert.False(alg.Exist("ab"));
+            Assert.Equal(new List<string> { "a", "b" }, alg.FindAllOccurrences("ab"));
+        }
+    }
+}

From 1abd62133d66e0730d6372c4f601fe8d9fd41e80 Mon Sep 17 00:00:00 2001
From: LIFEfreedom
Date: Tue, 30 Jun 2020 12:19:29 +0300
Subject: [PATCH 2/2] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 7526c80b..a5f8fd7a 100644
--- a/README.md
+++ b/README.md
@@ -190,6 +190,8 @@ If you wish to contribute to C# ALGORITHMS, then please make sure you check out
   * [Permutations and Anagrams](Algorithms/Strings/Permutations.cs)
   * [Edit Distance](Algorithms/Strings/EditDistance.cs)
     + Uses a generic custom class for passing costs: [EditDistanceCostsMap\<T\>](Algorithms/Strings/EditDistanceCostsMap.cs)
+  * [Aho-Corasick](Algorithms/Strings/AhoCorasick.cs)
+    + Uses a class to store information about vertices and the transitions between them: [AhoCorasickVertex](Algorithms/Strings/AhoCorasickVertex.cs)
 
 #### Numeric:
 