diff --git a/document.go b/document.go index 5b22396..11a79f9 100644 --- a/document.go +++ b/document.go @@ -3,6 +3,7 @@ package main import ( "bufio" "os" + "regexp" "strings" ) @@ -11,6 +12,8 @@ type Document struct { Words []string } +var nonAlphanumericRegex = regexp.MustCompile(`[^a-zA-Z0-9 ']+`) + func NewDocument(path string) (*Document, error) { // Open the file file, err := os.Open(path) @@ -30,7 +33,19 @@ func NewDocument(path string) (*Document, error) { // Loop over the words and append each to the words slice for scanner.Scan() { - words = append(words, strings.ToLower(scanner.Text())) + token := strings.ToLower(scanner.Text()) + + // Split each token on non-alphanumeric characters (except single quotes, to + // handle contractions) + for _, word := range nonAlphanumericRegex.Split(token, -1) { + // Since we didn't split on single quotes, we need to trim them off now. + // We'd like "don't" to stay "don't", but "'hello" to become "hello". + trimmedWord := strings.Trim(word, "'") + + if trimmedWord != "" { + words = append(words, trimmedWord) + } + } } // Check for errors in scanning diff --git a/main.go b/main.go index a20e54e..6a04b8a 100644 --- a/main.go +++ b/main.go @@ -15,5 +15,5 @@ func main() { fmt.Println("corpus:", docs) targetDoc, _ := NewDocument(*targetFlag) - fmt.Println("target doc:", targetDoc) + fmt.Printf("target doc: #%v\n", targetDoc) }