|
7 | 7 | import java.util.ArrayList;
|
8 | 8 | import java.util.Iterator;
|
9 | 9 | import java.util.List;
|
| 10 | +import java.util.Locale; |
10 | 11 | import java.util.Map;
|
11 | 12 | import java.util.Set;
|
12 | 13 | import java.util.regex.Pattern;
|
|
39 | 40 | /**
|
40 | 41 | * DocumentReader for Chinese segmentation task. (Sighan bakeoff 2005)
|
41 | 42 | * Reads in characters and labels them as 1 or 0 (word START or NONSTART).
|
42 |
| - * |
| 43 | + * <br> |
43 | 44 | * Note: maybe this can do less interning, since some is done in
|
44 | 45 | * ObjectBankWrapper, but this also calls trim() as it works....
|
| 46 | + * <br> |
| 47 | + * Data can be output in two formats: plaintext, meaning whitespace-separated |
| 48 | + * words, or a fake CoNLL-U document usable with the CoNLL-U |
| 49 | + * scoring script. |
45 | 50 | *
|
46 | 51 | * @author Pi-Chuan Chang
|
47 | 52 | * @author Michel Galley (Viterbi search graph printing)
|
@@ -81,6 +86,12 @@ public class Sighan2005DocumentReaderAndWriter implements DocumentReaderAndWrite
|
81 | 86 | private SeqClassifierFlags flags;
|
82 | 87 | private IteratorFromReaderFactory<List<CoreLabel>> factory;
|
83 | 88 |
|
| 89 | + private enum OutputFormat { |
| 90 | + PLAINTEXT, CONLLU |
| 91 | + } |
| 92 | + |
| 93 | + private OutputFormat outputFormat; |
| 94 | + |
84 | 95 | @Override
|
85 | 96 | public Iterator<List<CoreLabel>> getIterator(Reader r) {
|
86 | 97 | return factory.getIterator(r);
|
@@ -108,6 +119,13 @@ public void init(SeqClassifierFlags flags) {
|
108 | 119 | String[] dicts2 = flags.dictionary2.split(",");
|
109 | 120 | cdict2 = new ChineseDictionary(dicts2, cdtos, flags.expandMidDot);
|
110 | 121 | }
|
| 122 | + |
| 123 | + if (flags.outputFormat != null) { |
| 124 | + outputFormat = OutputFormat.valueOf(flags.outputFormat.toUpperCase(Locale.ROOT)); |
| 125 | + logger.info("Output format: " + outputFormat); |
| 126 | + } else { |
| 127 | + outputFormat = OutputFormat.PLAINTEXT; |
| 128 | + } |
111 | 129 | }
|
112 | 130 |
|
113 | 131 |
|
@@ -309,13 +327,55 @@ private static void addDictionaryFeatures(ChineseDictionary dict, Class<? extend
|
309 | 327 | }
|
310 | 328 | }
|
311 | 329 |
|
312 |
| - @Override |
313 |
| - public void printAnswers(List<CoreLabel> doc, PrintWriter pw) { |
| 330 | + private void printPlainTextAnswer(List<CoreLabel> doc, PrintWriter pw) { |
314 | 331 | String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags);
|
315 | 332 | pw.print(ansStr);
|
316 | 333 | pw.println();
|
317 | 334 | }
|
318 | 335 |
|
| 336 | + /** |
| 337 | + * Prints a fake Conllu document for use in the conllu tokenization scoring scripts |
| 338 | + */ |
| 339 | + private void printConlluAnswer(List<CoreLabel> doc, PrintWriter pw) { |
| 340 | + String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags); |
| 341 | + pw.print("# text = " + ansStr); |
| 342 | + pw.println(); |
| 343 | + |
| 344 | + List<String> words = StringUtils.split(ansStr); |
| 345 | + int idx = 0; |
| 346 | + for (String word : words) { |
| 347 | + idx = idx + 1; |
| 348 | + pw.print(idx + "\t" + word); |
| 349 | + // 4 _ - print blanks for lemma & tags |
| 350 | + pw.print("\t_\t_\t_\t_\t"); |
| 351 | + pw.print(idx - 1); |
| 352 | + pw.print("\t"); |
| 353 | + if (idx == 1) { |
| 354 | + pw.print("root"); |
| 355 | + } else { |
| 356 | + pw.print("dep"); |
| 357 | + } |
| 358 | + pw.print("\t_\t_"); |
| 359 | + pw.println(); |
| 360 | + } |
| 361 | + |
| 362 | + pw.println(); |
| 363 | + } |
| 364 | + |
| 365 | + @Override |
| 366 | + public void printAnswers(List<CoreLabel> doc, PrintWriter pw) { |
| 367 | + switch (outputFormat) { |
| 368 | + case PLAINTEXT: |
| 369 | + printPlainTextAnswer(doc, pw); |
| 370 | + break; |
| 371 | + case CONLLU: |
| 372 | + printConlluAnswer(doc, pw); |
| 373 | + break; |
| 374 | + default: |
| 375 | + throw new IllegalArgumentException("Unknown outputFormat: " + outputFormat); |
| 376 | + } |
| 377 | + } |
| 378 | + |
319 | 379 |
|
320 | 380 | private static String intern(String s) {
|
321 | 381 | return s.trim().intern();
|
|
0 commit comments