-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsimil.c
145 lines (125 loc) · 4.1 KB
/
simil.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*
* A Ratcliff-Obershelp style string matcher.
*
* The Ratcliff-Obershelp algorithm was presented in Dr. Dobb's Journal in
* the article "Pattern Matching: The Gestalt Approach" by John W. Ratcliff
* and David E. Metzener in July 1988.
* Unfortunately the original implementation was in assembly, limiting its
* portability and readability, and its relevance to modern architectures
* (although its performance was extremely optimized).
* In Nov 1988 Joe Preston sent a C implementation of the algorithm to
* Dr. Dobb's letters to the editor. The C implementation used a recursive
* algorithm. Unfortunately, according to the author it ran about twice as slow
* as the original. My implementation is inspired by that C implementation.
*
* The algorithm returns a value between 0 and 100 which indicates how alike
* two strings are. 0 means the strings have nothing in common, and 100 means
* they're exactly alike.
*
* The typical use of the algorithm is for validating user input. The original
* article described educational software that can deal with spelling errors in
* answers. Other uses are applications that could supply a user with feedback
* along the lines of "The command 'fobo' is invalid. Did you mean 'foo'?" or
* intelligent compilers that could make suggestions when identifiers or
* keywords are used incorrectly.
*
* This is free and unencumbered software released into the public domain.
* http://unlicense.org/
*/
/*
* See simil.h for more info
*/
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
/* Forward declaration of the recursive matcher defined later in this file;
 * simil() and isimil() call it before its definition appears. */
static int rsimil (const char *a, int alen, const char *b, int blen, int cs);
/*
 * Tests the similarity of two NUL-terminated strings a and b using the
 * Ratcliff-Obershelp method, comparing characters case-sensitively.
 *
 * Returns an integer between 0 and 100: twice the number of matched
 * characters reported by rsimil(), as a percentage of the combined length
 * of both strings (integer division, so the result is truncated).
 * 0 means the strings have nothing in common, 100 means they are identical.
 *
 * Returns 0 if either pointer is NULL or either string is empty; note that
 * two empty strings therefore score 0, not 100.
 */
int
simil (const char *a, const char *b)
{
  int alen, blen;

  /* strlen() on a null pointer is undefined; treat it as "nothing in common". */
  if (a == NULL || b == NULL)
    return 0;

  alen = (int) strlen (a);
  blen = (int) strlen (b);

  /* Also protects the division below from a zero denominator. */
  if (alen == 0 || blen == 0)
    return 0;

  return (rsimil (a, alen, b, blen, 1) * 200) / (alen + blen);
}
/*
 * Case insensitive version of simil().
 *
 * The strings are compared in place: rsimil() is called with its
 * case-sensitivity flag cleared, which makes it compare characters after
 * tolower(). No copies of the strings are made and nothing is allocated.
 *
 * Returns an integer between 0 and 100 computed the same way as simil():
 * twice the number of matched characters as a percentage of the combined
 * length of both strings (truncated).
 *
 * Returns 0 if either pointer is NULL or either string is empty.
 */
int
isimil (const char *a, const char *b)
{
  int alen, blen;

  /* strlen() on a null pointer is undefined; treat it as "nothing in common". */
  if (a == NULL || b == NULL)
    return 0;

  alen = (int) strlen (a);
  blen = (int) strlen (b);

  /* Also protects the division below from a zero denominator. */
  if (alen == 0 || blen == 0)
    return 0;

  return (rsimil (a, alen, b, blen, 0) * 200) / (alen + blen);
}
/*
 * Character comparison used by rsimil().
 * cs != 0 - compare the characters exactly.
 * cs == 0 - compare them after tolower().
 * The casts to unsigned char keep tolower() well defined for bytes with the
 * high bit set on platforms where plain char is signed.
 */
static int
rsimil_chars_equal (char x, char y, int cs)
{
  if (cs)
    return x == y;
  return tolower ((unsigned char) x) == tolower ((unsigned char) y);
}

/*
 * This is the core of the algorithm. It finds the longest matching substring
 * of a[0..alen) and b[0..blen), then recursively matches what remains to the
 * left and to the right of that substring.
 *
 * a, alen - first string and the number of characters of it to consider
 * b, blen - second string and the number of characters of it to consider
 * cs      - Case sensitive: non-zero compares characters exactly, zero
 *           compares them after tolower()
 *
 * Returns the total number of matched characters (0 if nothing matches).
 * When several substrings share the longest length, the first one found
 * while scanning a and then b from the left is used.
 */
static int
rsimil (const char *a, int alen, const char *b, int blen, int cs)
{
  int i, j, k, l;
  int p = 0, q = 0, len = 0, left = 0, right = 0;

  /*
   * Find the longest matching substring. `len` grows as better matches are
   * found, and both loop bounds shrink with it: a start position that
   * leaves fewer than len + 1 characters cannot improve on the best match.
   */
  for (i = 0; i < alen - len; i++)
    {
      for (j = 0; j < blen - len; j++)
        {
          /* len may have grown during this inner loop; once it reaches the
             end of a from position i, nothing longer can start here. */
          if (i + len >= alen)
            break;

          /* Cheap rejection: a match longer than len must agree both at
             its first character and at offset len. */
          if (!rsimil_chars_equal (a[i], b[j], cs)
              || !rsimil_chars_equal (a[i + len], b[j + len], cs))
            continue;

          /* Extend the match; bounds are tested before the characters are
             read so nothing outside either substring is touched. */
          for (k = i + 1, l = j + 1;
               k < alen && l < blen && rsimil_chars_equal (a[k], b[l], cs);
               k++, l++)
            ;

          /* Strictly longer only, so the earliest longest match wins. */
          if (k - i > len)
            {
              p = i;
              q = j;
              len = k - i;
            }
        }
    }

  /* No match */
  if (len == 0)
    return 0;

  /* Match the strings to the left, if both sides have anything there */
  if (p != 0 && q != 0)
    left = rsimil (a, p, b, q, cs);

  /* Skip past the matched substring in both strings */
  i = p + len;
  j = q + len;
  alen -= i;
  blen -= j;

  /* Match the strings to the right, if both sides have anything there */
  if (alen != 0 && blen != 0)
    right = rsimil (a + i, alen, b + j, blen, cs);

  /* Return the score */
  return len + left + right;
}