@@ -377,6 +377,8 @@ import edu.stanford.nlp.util.logging.Redwood;
377
377
* If an apparent negative number is generated from inside a hyphenated word
378
378
* (e.g., for "11-20", we first tokenize "11" and then appear to have generated "-20"),
379
379
* then tokenize the hyphen separately as a hyphen or dash.
380
+ * <p>
381
+ * Note that this method has side effects: it may push back characters.
380
382
*/
381
383
private void handleHyphenatedNumber(String in) {
382
384
// Strip dashes from hyphenated words
@@ -573,9 +575,9 @@ SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
573
575
574
576
/* Note that JFlex doesn't support {2,} pattern form. Only {j,k}. */
575
577
DATE = {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {2,4}| {DIGIT} {4} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2}
576
- /* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
577
- NUM = {DIGIT} *( [ ., \u066B\u066C ] {DIGIT} +)+| {DIGIT} +( [ .:,\u00AD\u066B\u066C\u2009\u202F ] {DIGIT} +)*
578
- LEADING_NUM = {DIGIT} +( [ .,\u066B\u066C ] {DIGIT} +)+
578
+ /* Note that NUM also includes times like 12:55. A number can start with a . (or the Arabic decimal separator), but not with a : or a , */
579
+ NUM = {DIGIT} *( [ .\u066B ] {DIGIT} +)+| {DIGIT} +( [ .:,\u00AD\u066B\u066C\u2009\u202F ] {DIGIT} +)*
580
+ LEADING_NUM = {DIGIT} +( [ .: ,\u066B\u066C ] {DIGIT} +)+
579
581
/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
580
582
years or times in parentheses), and having them in tokens messes up treebank parsing.
581
583
NUMBER = [\-+]?{NUM}|\({NUM}\) */
@@ -1002,10 +1004,11 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
1002
1004
if (DEBUG ) { logger. info(" Used {DATE} to recognize " + origTxt + " as " + txt); }
1003
1005
return getNext(txt, origTxt);
1004
1006
}
1005
- {NUMBER} { String txt = yytext();
1006
- handleHyphenatedNumber(txt);
1007
- if (DEBUG ) { logger. info(" Used {NUMBER} to recognize " + yytext() + " as " + removeFromNumber(yytext())); }
1008
- return getNext(removeFromNumber(yytext()), yytext());
1007
+ {NUMBER} { handleHyphenatedNumber(yytext());
1008
+ String origTxt = yytext();
1009
+ String txt = removeFromNumber(origTxt);
1010
+ if (DEBUG ) { logger. info(" Used {NUMBER} to recognize " + origTxt + " as " + txt); }
1011
+ return getNext(txt, origTxt);
1009
1012
}
1010
1013
{SUBSUPNUM} { String txt = yytext();
1011
1014
if (DEBUG ) { logger. info(" Used {SUBSUPNUM} to recognize " + txt); }
0 commit comments