Skip to content

Commit

Permalink
Split tokens but retain contractions
Browse files Browse the repository at this point in the history
  • Loading branch information
hrs committed May 12, 2023
1 parent c5a08da commit 7959b68
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
17 changes: 16 additions & 1 deletion document.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"bufio"
"os"
"regexp"
"strings"
)

Expand All @@ -11,6 +12,8 @@ type Document struct {
Words []string
}

// nonAlphanumericRegex matches one or more consecutive characters that are not
// ASCII letters, ASCII digits, spaces, or single quotes. NewDocument splits
// each token on these matches; single quotes are deliberately excluded so that
// contractions such as "don't" are kept together.
var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z0-9 ']+`)

func NewDocument(path string) (*Document, error) {
// Open the file
file, err := os.Open(path)
Expand All @@ -30,7 +33,19 @@ func NewDocument(path string) (*Document, error) {

// Loop over the words and append each to the words slice
for scanner.Scan() {
words = append(words, strings.ToLower(scanner.Text()))
token := strings.ToLower(scanner.Text())

// Split each token on non-alphanumeric characters (except single quotes, to
// handle contractions)
for _, word := range nonAlphanumericRegex.Split(token, -1) {
// Since we didn't split on single quotes, we need to trim them off now.
// We'd like "don't" to stay "don't", but "'hello" to become "hello".
trimmedWord := strings.Trim(word, "'")

if trimmedWord != "" {
words = append(words, trimmedWord)
}
}
}

// Check for errors in scanning
Expand Down
2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ func main() {
fmt.Println("corpus:", docs)

targetDoc, _ := NewDocument(*targetFlag)
fmt.Println("target doc:", targetDoc)
fmt.Printf("target doc: #%v\n", targetDoc)
}

0 comments on commit 7959b68

Please sign in to comment.