Skip to content

Commit 974383a

Browse files
committed
Adjust NUMBER to not be able to start with a comma
1 parent 87d0bd2 commit 974383a

File tree

1 file changed

+10
-7
lines changed

1 file changed

+10
-7
lines changed

src/edu/stanford/nlp/process/PTBLexer.flex

+10-7
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,8 @@ import edu.stanford.nlp.util.logging.Redwood;
377377
* If an apparent negative number is generated from inside a hyphenated word
378378
* (e.g., for "11-20", we first tokenize "11" and then appear to have generated "-20"),
379379
* then tokenize the hyphen separately as a hyphen or dash.
380+
* <p>
381+
* Note that this method has side effects: it may push back characters.
380382
*/
381383
private void handleHyphenatedNumber(String in) {
382384
// Strip dashes from hyphenated words
@@ -573,9 +575,9 @@ SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
573575

574576
/* Note that JFlex doesn't support the {2,} pattern form, only {j,k}. */
575577
DATE = {DIGIT}{1,2}[\-\u2012\/]{DIGIT}{1,2}[\-\u2012\/]{DIGIT}{2,4}|{DIGIT}{4}[\-\u2012\/]{DIGIT}{1,2}[\-\u2012\/]{DIGIT}{1,2}
576-
/* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
577-
NUM = {DIGIT}*([.,\u066B\u066C]{DIGIT}+)+|{DIGIT}+([.:,\u00AD\u066B\u066C\u2009\u202F]{DIGIT}+)*
578-
LEADING_NUM = {DIGIT}+([.,\u066B\u066C]{DIGIT}+)+
578+
/* Note that NUM also includes times like 12:55. One can start with a . (or \u066B) but not with a : or a , */
579+
NUM = {DIGIT}*([.\u066B]{DIGIT}+)+|{DIGIT}+([.:,\u00AD\u066B\u066C\u2009\u202F]{DIGIT}+)*
580+
LEADING_NUM = {DIGIT}+([.:,\u066B\u066C]{DIGIT}+)+
579581
/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
580582
years or times in parentheses), and having them in tokens messes up treebank parsing.
581583
NUMBER = [\-+]?{NUM}|\({NUM}\) */
@@ -1002,10 +1004,11 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
10021004
if (DEBUG) { logger.info("Used {DATE} to recognize " + origTxt + " as " + txt); }
10031005
return getNext(txt, origTxt);
10041006
}
1005-
{NUMBER} { String txt = yytext();
1006-
handleHyphenatedNumber(txt);
1007-
if (DEBUG) { logger.info("Used {NUMBER} to recognize " + yytext() + " as " + removeFromNumber(yytext())); }
1008-
return getNext(removeFromNumber(yytext()), yytext());
1007+
{NUMBER} { handleHyphenatedNumber(yytext());
1008+
String origTxt = yytext();
1009+
String txt = removeFromNumber(origTxt);
1010+
if (DEBUG) { logger.info("Used {NUMBER} to recognize " + origTxt + " as " + txt); }
1011+
return getNext(txt, origTxt);
10091012
}
10101013
{SUBSUPNUM} { String txt = yytext();
10111014
if (DEBUG) { logger.info("Used {SUBSUPNUM} to recognize " + txt); }

0 commit comments

Comments
 (0)