-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRemoveStopWordsExample.scala
102 lines (77 loc) · 2.5 KB
/
RemoveStopWordsExample.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
package application
import anorm._
import application.Utils._
/** Example: detect and strip company-suffix "stop words" from names.
  *
  * Usage: `RemoveStopWordsExample <dbUrl>`
  *
  * Reads up to 100000 `person_name` values from `TLS906_PERSON`, prints the
  * first ten names and the most frequent whitespace-separated tokens, then
  * lists names containing a stop word next to the same names with the stop
  * words removed.
  */
object RemoveStopWordsExample extends App {

  // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
  require(args.nonEmpty, "usage: RemoveStopWordsExample <dbUrl>")

  // anorm picks the connection up implicitly when the query is executed.
  implicit val conn: java.sql.Connection = getDbConnection(dbUrl = args(0))

  val names: List[String] =
    try SQL("select person_name from TLS906_PERSON limit 100000").as(SqlParser.str(1).*)
    finally conn.close() // the query above is the only user of the connection

  printList("first ten names", names.take(10))

  // show token frequency
  val tokenFreq: List[FreqCount[String]] =
    names.flatMap(_.split(" ")).countFreq

  printList("top tokens", tokenFreq.take(100))

  // Example stop words: Limited, Co., Corporation, Ltd., GmbH.
  // (?i) makes the match case-insensitive; \b restricts it to entire words.
  // The dotted abbreviations ("Co.", "Ltd.") deliberately have NO trailing \b:
  // a word boundary never exists between '.' and a following space or the end
  // of the text, so requiring one made those two stop words unmatchable in
  // practice (e.g. "GmbH & Co. KG" kept its "Co.").
  // NOTE: the sample output at the end of this file was captured before this
  // fix and therefore still shows "Co." left in place.
  val stopWords = "(?i)\\b(?:(?:Limited|Corporation|GmbH)\\b|(?:Co|Ltd)\\.)"

  // Compiled once; reused by both helpers below.
  val stopWordsRegex: scala.util.matching.Regex = stopWords.r

  /** Returns true when `text` contains at least one stop word. */
  def stopWordFound(text: String): Boolean =
    stopWordsRegex.findFirstIn(text).isDefined

  // show names which contain at least one stop word
  val someNames = names.filter(stopWordFound).take(100)

  printList("some companies", someNames)

  /** Returns `text` with every stop word deleted; surrounding whitespace is left as is. */
  def removeStopWords(text: String): String =
    stopWordsRegex.replaceAllIn(text, "")

  // show those names with stop words removed
  println("+++ some companies, stop words removed")
  someNames.foreach(name =>
    println(s"'$name' --> '${removeStopWords(name)}'")
  )
}
/*
+++ first ten names
Nokia Corporation
Lipponen, Markku
Laitinen, Timo
Aho, Ari
Knuutila, Jarno
NOKIA MOBILE PHONES LTD.
Medical Research Council
MEDIMMUNE LIMITED
Griffiths, Andrew David
Hoogenboom, Hendricus Renerus Jacobus Mattheus
+++ top tokens
4763 c/o
2893 Ltd.
2213 Inc.
2172 Co.,
1903 Corporation
1817 GmbH
1499 Dr.
1277 &
1194 J.
1154 A.
1097 Michael
1004 Limited
...
+++ some companies
Nokia Corporation
MEDIMMUNE LIMITED
CAMBRIDGE ANTIBODY TECHNOLOGY LIMITED
Cambridge Antibody Technology Limited
Philips Intellectual Property & Standards GmbH
Philips Corporate Intellectual Property GmbH
Marioff Corporation Oy
Saltigo GmbH
IVT - Industrie Vertrieb Technik GmbH & Co. KG
...
+++ some companies, stop words removed
'Nokia Corporation' --> 'Nokia '
'MEDIMMUNE LIMITED' --> 'MEDIMMUNE '
'CAMBRIDGE ANTIBODY TECHNOLOGY LIMITED' --> 'CAMBRIDGE ANTIBODY TECHNOLOGY '
'Cambridge Antibody Technology Limited' --> 'Cambridge Antibody Technology '
'Philips Intellectual Property & Standards GmbH' --> 'Philips Intellectual Property & Standards '
'Philips Corporate Intellectual Property GmbH' --> 'Philips Corporate Intellectual Property '
'Marioff Corporation Oy' --> 'Marioff Oy'
'Saltigo GmbH' --> 'Saltigo '
'IVT - Industrie Vertrieb Technik GmbH & Co. KG' --> 'IVT - Industrie Vertrieb Technik & Co. KG'
...
*/