From 0913d52203b40b52eca351aed4d3089bf09feacc Mon Sep 17 00:00:00 2001 From: "Harry R. Schwartz" Date: Tue, 16 May 2023 16:05:11 -0700 Subject: [PATCH] Add --no-stemming flag to skip stemming --- lib/config.go | 1 + lib/document.go | 7 ++++++- lib/main.go | 2 ++ man/docsim.1 | 10 ++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/config.go b/lib/config.go index 257bf64..bc47347 100644 --- a/lib/config.go +++ b/lib/config.go @@ -3,6 +3,7 @@ package main type Config struct { BestFirst bool Limit int + NoStemming bool NoStoplist bool OmitQuery bool ShowScores bool diff --git a/lib/document.go b/lib/document.go index b66fbc0..0f4a75f 100644 --- a/lib/document.go +++ b/lib/document.go @@ -64,7 +64,12 @@ func NewDocument(path string, config *Config) (*Document, error) { if word != "" { if config.NoStoplist || !inStoplist(word) { - termCount[stem(word)]++ + if config.NoStemming { + termCount[word]++ + } else { + termCount[stem(word)]++ + } + totalWordCount++ } } diff --git a/lib/main.go b/lib/main.go index 687c7fe..71a8f28 100644 --- a/lib/main.go +++ b/lib/main.go @@ -57,6 +57,7 @@ func sameFile(a, b string) bool { func main() { bestFirstFlag := flag.Bool("best-first", false, "print best matches first") limitFlag := flag.Int("limit", 0, "return at most `limit` results") + noStemmingFlag := flag.Bool("no-stemming", false, "don't perform stemming on words") noStoplistFlag := flag.Bool("no-stoplist", false, "don't omit common words by using a stoplist") omitQueryFlag := flag.Bool("omit-query", false, "don't include the query file itself in search results") queryFlag := flag.String("query", "", "path to the file that results should match") @@ -67,6 +68,7 @@ func main() { config := Config{ BestFirst: *bestFirstFlag, Limit: *limitFlag, + NoStemming: *noStemmingFlag, NoStoplist: *noStoplistFlag, OmitQuery: *omitQueryFlag, ShowScores: *showScoresFlag, diff --git a/man/docsim.1 b/man/docsim.1 index ddc7855..9567255 100644 --- a/man/docsim.1 +++ b/man/docsim.1 @@ -26,6 +26,16 @@ default (worst to best). .BR \-\-limit " " \fINUM\fR Show no more that the best NUM results. .TP +.BR \-\-no\-stemming +Don't stem words. Stemming reduces inflected words to their word stem before +comparing documents, which maps morphologically related words to a common term +(so "spinning", "spinner", and "spins" might all map to "spin"). +.PP +.RS +Stemming usually improves scoring accuracy in English, but should probably be +disabled when searching across code or documents in other languages. +.RE +.TP .BR \-\-no\-stoplist Don't filter out common words, like "the" and "because". .TP