Skip to content

Commit c70ddec

Browse files
committed
Add a fake conllu output format
1 parent 7680af5 commit c70ddec

File tree

1 file changed

+63
-3
lines changed

1 file changed

+63
-3
lines changed

src/edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter.java

+63-3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import java.util.ArrayList;
88
import java.util.Iterator;
99
import java.util.List;
10+
import java.util.Locale;
1011
import java.util.Map;
1112
import java.util.Set;
1213
import java.util.regex.Pattern;
@@ -39,9 +40,13 @@
3940
/**
4041
* DocumentReader for Chinese segmentation task. (Sighan bakeoff 2005)
4142
* Reads in characters and labels them as 1 or 0 (word START or NONSTART).
42-
*
43+
* <br>
4344
* Note: maybe this can do less interning, since some is done in
4445
* ObjectBankWrapper, but this also calls trim() as it works....
46+
* <br>
47+
* Data can be output in two formats: plaintext, meaning whitespace
48+
* separated words, or a fake conllu document usable with the conllu
49+
* scoring script.
4550
*
4651
* @author Pi-Chuan Chang
4752
* @author Michel Galley (Viterbi search graph printing)
@@ -81,6 +86,12 @@ public class Sighan2005DocumentReaderAndWriter implements DocumentReaderAndWrite
8186
private SeqClassifierFlags flags;
8287
private IteratorFromReaderFactory<List<CoreLabel>> factory;
8388

89+
/** Formats in which answers can be printed; {@code init} selects PLAINTEXT when {@code flags.outputFormat} is null. */
private enum OutputFormat {
90+
PLAINTEXT, CONLLU
91+
}
92+
93+
private OutputFormat outputFormat;
94+
8495
@Override
8596
public Iterator<List<CoreLabel>> getIterator(Reader r) {
8697
return factory.getIterator(r);
@@ -108,6 +119,13 @@ public void init(SeqClassifierFlags flags) {
108119
String[] dicts2 = flags.dictionary2.split(",");
109120
cdict2 = new ChineseDictionary(dicts2, cdtos, flags.expandMidDot);
110121
}
122+
123+
if (flags.outputFormat != null) {
124+
outputFormat = OutputFormat.valueOf(flags.outputFormat.toUpperCase(Locale.ROOT));
125+
logger.info("Output format: " + outputFormat);
126+
} else {
127+
outputFormat = OutputFormat.PLAINTEXT;
128+
}
111129
}
112130

113131

@@ -309,13 +327,55 @@ private static void addDictionaryFeatures(ChineseDictionary dict, Class<? extend
309327
}
310328
}
311329

312-
@Override
313-
public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
330+
private void printPlainTextAnswer(List<CoreLabel> doc, PrintWriter pw) {
314331
String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags);
315332
pw.print(ansStr);
316333
pw.println();
317334
}
318335

336+
/**
337+
* Prints a fake Conllu document for use in the conllu tokenization scoring scripts
338+
*/
339+
private void printConlluAnswer(List<CoreLabel> doc, PrintWriter pw) {
340+
String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags);
341+
pw.print("# text = " + ansStr);
342+
pw.println();
343+
344+
List<String> words = StringUtils.split(ansStr);
345+
int idx = 0;
346+
for (String word : words) {
347+
idx = idx + 1;
348+
pw.print(idx + "\t" + word);
349+
// 4 _ - print blanks for lemma & tags
350+
pw.print("\t_\t_\t_\t_\t");
351+
pw.print(idx - 1);
352+
pw.print("\t");
353+
if (idx == 1) {
354+
pw.print("root");
355+
} else {
356+
pw.print("dep");
357+
}
358+
pw.print("\t_\t_");
359+
pw.println();
360+
}
361+
362+
pw.println();
363+
}
364+
365+
@Override
366+
public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
367+
switch (outputFormat) {
368+
case PLAINTEXT:
369+
printPlainTextAnswer(doc, pw);
370+
break;
371+
case CONLLU:
372+
printConlluAnswer(doc, pw);
373+
break;
374+
default:
375+
throw new IllegalArgumentException("Unknown outputFormat: " + outputFormat);
376+
}
377+
}
378+
319379

320380
private static String intern(String s) {
321381
return s.trim().intern();

0 commit comments

Comments
 (0)