Skip to content

Commit

Permalink
Add --no-stemming flag to skip stemming
Browse files Browse the repository at this point in the history
  • Loading branch information
hrs committed May 16, 2023
1 parent b41825e commit 0913d52
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 1 deletion.
1 change: 1 addition & 0 deletions lib/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
type Config struct {
BestFirst bool
Limit int
NoStemming bool
NoStoplist bool
OmitQuery bool
ShowScores bool
Expand Down
7 changes: 6 additions & 1 deletion lib/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ func NewDocument(path string, config *Config) (*Document, error) {

if word != "" {
if config.NoStoplist || !inStoplist(word) {
termCount[stem(word)]++
if config.NoStemming {
termCount[word]++
} else {
termCount[stem(word)]++
}

totalWordCount++
}
}
Expand Down
2 changes: 2 additions & 0 deletions lib/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ func sameFile(a, b string) bool {
func main() {
bestFirstFlag := flag.Bool("best-first", false, "print best matches first")
limitFlag := flag.Int("limit", 0, "return at most `limit` results")
noStemmingFlag := flag.Bool("no-stemming", false, "don't perform stemming on words")
noStoplistFlag := flag.Bool("no-stoplist", false, "don't omit common words by using a stoplist")
omitQueryFlag := flag.Bool("omit-query", false, "don't include the query file itself in search results")
queryFlag := flag.String("query", "", "path to the file that results should match")
Expand All @@ -67,6 +68,7 @@ func main() {
config := Config{
BestFirst: *bestFirstFlag,
Limit: *limitFlag,
NoStemming: *noStemmingFlag,
NoStoplist: *noStoplistFlag,
OmitQuery: *omitQueryFlag,
ShowScores: *showScoresFlag,
Expand Down
10 changes: 10 additions & 0 deletions man/docsim.1
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ default (worst to best).
.BR \-\-limit " " \fINUM\fR
Show no more than the best NUM results.
.TP
.BR \-\-no\-stemming
Don't stem words. Stemming reduces inflected words to their word stem before
comparing documents, which maps morphologically related words to a common term
(so "spinning", "spinner", and "spins" might all map to "spin").
.PP
.RS
Stemming usually improves scoring accuracy in English, but should probably be
disabled when searching across code or documents in other languages.
.RE
.TP
.BR \-\-no\-stoplist
Don't filter out common words, like "the" and "because".
.TP
Expand Down

0 comments on commit 0913d52

Please sign in to comment.