diff --git a/README.md b/README.md index aba40ecf..b2f02511 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,7 @@ positional arguments: optional arguments: -h, --help show this help message and exit -H, --html HTML mode (default: False) - -m JSON, --model JSON custom model file path (default: /path/to/budoux/models/ja-knbc.json) + -m JSON, --model JSON custom model file path (default: /path/to/budoux/models/ja.json) -l LANG, --lang LANG language of custom model (default: None) -d STR, --delim STR output delimiter in TEXT mode (default: ---) -V, --version show program's version number and exit @@ -203,8 +203,8 @@ Good news is that the training algorithm is an [anytime algorithm](https://en.wi ## Constructing a training dataset from the KNBC corpus for Japanese -The default model for Japanese (`budoux/models/ja_knbc.json`) is built using the [KNBC corpus](https://nlp.ist.i.kyoto-u.ac.jp/kuntt/). -You can create a training dataset, which we name `source_knbc.txt` for example, from the corpus by running the commands below. +The default model for Japanese (`budoux/models/ja.json`) is built using the [KNBC corpus](https://nlp.ist.i.kyoto-u.ac.jp/kuntt/). +You can create a training dataset (named `source_knbc.txt` in the example below) from the corpus by running the following commands: ```shellsession $ curl -o knbc.tar.bz2 https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2 diff --git a/budoux/main.py b/budoux/main.py index d48f16c9..c1ff06fd 100644 --- a/budoux/main.py +++ b/budoux/main.py @@ -59,14 +59,11 @@ def get_model_langs() -> typing.Dict[str, str]: typing.Dict[str, str]: A dictionary of model languages and its paths. 
""" models = glob.glob( - pkg_resources.resource_filename(__name__, "models") + "/*-*.json") + pkg_resources.resource_filename(__name__, "models") + "/*.json") langs = {} for model in models: - model_name = model.split(os.sep)[-1][:-5] - if model_name.startswith('zh-'): - langs[model_name] = model - else: - langs[model_name[:2]] = model + lang = model.split(os.sep)[-1][:-5] + langs[lang] = model return langs @@ -129,7 +126,7 @@ def parse_args(test: ArgList = None) -> argparse.Namespace: "--model", metavar="JSON", type=check_file, - default=pkg_resources.resource_filename(__name__, "models/ja-knbc.json"), + default=pkg_resources.resource_filename(__name__, "models/ja.json"), help="custom model file path", ) model_select_group.add_argument( diff --git a/budoux/models/ja-knbc.json b/budoux/models/ja.json similarity index 100% rename from budoux/models/ja-knbc.json rename to budoux/models/ja.json diff --git a/budoux/parser.py b/budoux/parser.py index 21fd4ebf..0167b48f 100644 --- a/budoux/parser.py +++ b/budoux/parser.py @@ -164,7 +164,7 @@ def load_default_japanese_parser() -> Parser: Returns: A parser (:obj:`budoux.Parser`). 
""" - with open(os.path.join(MODEL_DIR, 'ja-knbc.json'), encoding='utf-8') as f: + with open(os.path.join(MODEL_DIR, 'ja.json'), encoding='utf-8') as f: model = json.load(f) return Parser(model) diff --git a/javascript/package.json b/javascript/package.json index f7cf5577..e7b38572 100644 --- a/javascript/package.json +++ b/javascript/package.json @@ -14,14 +14,15 @@ }, "scripts": { "build": "npm run build:es && npm run build:cjs", - "build:cjs": "tsc --outDir dist --module CommonJS --sourceMap false && cp -r src/tests/models dist/tests/models", - "build:es": "tsc --outDir module --module ES6 --sourceMap false && cp module/dom-browser.js module/dom.js && cp -r src/tests/models module/tests/models", + "build:cjs": "tsc --outDir dist --module CommonJS --sourceMap false && cp -r src/tests/models/ dist/tests/models/", + "build:es": "tsc --outDir module --module ES6 --sourceMap false && cp module/dom-browser.js module/dom.js && cp -r src/tests/models/ module/tests/models/", "bundle": "npm run bundle:webcomponent:ja && npm run bundle:webcomponent:zh-hans && npm run bundle:webcomponent:zh-hant", "bundle:webcomponent:ja": "esbuild module/webcomponents/budoux-ja.js --bundle --minify --sourcemap --outfile=bundle/budoux-ja.min.js", "bundle:webcomponent:zh-hans": "esbuild module/webcomponents/budoux-zh-hans.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hans.min.js", "bundle:webcomponent:zh-hant": "esbuild module/webcomponents/budoux-zh-hant.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hant.min.js", "copy": "node ./scripts/copy-data.js", - "prepare": "npm run copy && npm run build && npm run bundle", + "prepare": "npm run clean && npm run copy && npm run build && npm run bundle", + "pretest": "npm run build:cjs", "test": "npm run test:jasmine && npm run test:cli-version", "test:jasmine": "jasmine dist/tests/test_*.js", "test:cli-version": "node ./scripts/check-cli-version.js", diff --git a/javascript/src/cli.ts b/javascript/src/cli.ts index 
a68e2d69..7c42534b 100644 --- a/javascript/src/cli.ts +++ b/javascript/src/cli.ts @@ -15,12 +15,17 @@ */ import {readFileSync} from 'fs'; -import {resolve} from 'path'; +import * as path from 'path'; import * as readline from 'readline'; import {Command} from 'commander'; -import {Parser, loadDefaultJapaneseParser} from './parser.js'; +import { + Parser, + loadDefaultParsers, + loadDefaultJapaneseParser, +} from './parser.js'; const CLI_VERSION = '0.4.0'; +const defaultParsers = loadDefaultParsers(); /** * Run the command line interface program. @@ -29,14 +34,20 @@ const CLI_VERSION = '0.4.0'; export const cli = (argv: string[]) => { const program = new Command('budoux'); - program.usage('[-h] [-H] [-d STR] [-t THRES] [-m JSON] [-V] [TXT]'); + program.usage('[-h] [-H] [-d STR] [-t THRES] [-m JSON] [-l LANG] [-V] [TXT]'); program.description( 'BudouX is the successor to Budou, the machine learning powered line break organizer tool.' ); program .option('-H, --html', 'HTML mode', false) .option('-d, --delim ', 'output delimiter in TEXT mode', '---') - .option('-m, --model ', 'custom model file path') + .option('-m, --model ', 'model file path') + .option( + '-l, --lang ', + `language model to use. -m and --model will be prioritized if any.\navailable languages: ${[ + ...defaultParsers.keys(), + ].join(', ')}` + ) .argument('[txt]', 'text'); program.version(CLI_VERSION); @@ -44,14 +55,19 @@ export const cli = (argv: string[]) => { program.parse(argv); const options = program.opts(); - const {model, delim, html} = options as { + const {lang, model, delim, html} = options as { html: boolean; delim: string; model?: string; + lang?: string; }; const {args} = program; - const parser = model ? loadCustomParser(model) : loadDefaultJapaneseParser(); + const parser = model + ? loadCustomParser(model) + : lang && defaultParsers.has(lang) + ? defaultParsers.get(lang)! 
+ : loadDefaultJapaneseParser(); switch (args.length) { case 0: { @@ -115,8 +131,8 @@ const outputParsedTexts = ( * Loads a parser equipped with custom model. * @returns A parser with the loaded model. */ -const loadCustomParser = (path: string) => { - const file = readFileSync(resolve(path)).toString(); +const loadCustomParser = (modelPath: string) => { + const file = readFileSync(path.resolve(modelPath)).toString(); const json = JSON.parse(file); return new Parser(new Map(Object.entries(json))); }; diff --git a/javascript/src/parser.ts b/javascript/src/parser.ts index c8652b26..e3503ee0 100644 --- a/javascript/src/parser.ts +++ b/javascript/src/parser.ts @@ -14,7 +14,7 @@ * limitations under the License. */ -import {model as jaKNBCModel} from './data/models/ja-knbc.js'; +import {model as jaModel} from './data/models/ja.js'; import {model as zhHansModel} from './data/models/zh-hans.js'; import {model as zhHantModel} from './data/models/zh-hant.js'; import {parseFromString} from './dom.js'; @@ -150,7 +150,7 @@ export class Parser { * @returns A parser with the default Japanese model. */ export const loadDefaultJapaneseParser = () => { - return new Parser(new Map(Object.entries(jaKNBCModel))); + return new Parser(new Map(Object.entries(jaModel))); }; /** @@ -168,3 +168,15 @@ export const loadDefaultSimplifiedChineseParser = () => { export const loadDefaultTraditionalChineseParser = () => { return new Parser(new Map(Object.entries(zhHantModel))); }; + +/** + * Loads available default parsers. + * @returns A map between available lang codes and their default parsers. 
+ */ +export const loadDefaultParsers = () => { + return new Map([ + ['ja', loadDefaultJapaneseParser()], + ['zh-hans', loadDefaultSimplifiedChineseParser()], + ['zh-hant', loadDefaultTraditionalChineseParser()], + ]); +}; diff --git a/javascript/src/tests/test_cli.ts b/javascript/src/tests/test_cli.ts index 3bfd0741..b4324ee0 100644 --- a/javascript/src/tests/test_cli.ts +++ b/javascript/src/tests/test_cli.ts @@ -18,6 +18,7 @@ import {cli} from '../cli.js'; import {execFile, ExecFileException} from 'child_process'; import * as path from 'path'; import stream from 'stream'; +import {loadDefaultParsers} from '../parser.js'; type execFileCallBack = { error: ExecFileException | null; @@ -91,6 +92,53 @@ describe('cli', () => { }); }); + it('should use the corresponding language model when the -l parameter is given.', () => { + const inputTextHans = '我们的使命是整合全球信息,供大众使用,让人人受益。'; + const expectedStdOuts = loadDefaultParsers() + .get('zh-hans')! + .parse(inputTextHans); + const argv = ['node', 'budoux', '-l', 'zh-hans', inputTextHans]; + cli(argv); + expectedStdOuts.forEach(stdout => { + expect(console.log).toHaveBeenCalledWith(stdout); + }); + }); + + it('should use the corresponding language model when the --lang parameter is given.', () => { + const inputTextHans = '我們的使命是匯整全球資訊,供大眾使用,使人人受惠。'; + const expectedStdOuts = loadDefaultParsers() + .get('zh-hant')! 
+ .parse(inputTextHans); + const argv = ['node', 'budoux', '--lang', 'zh-hant', inputTextHans]; + cli(argv); + expectedStdOuts.forEach(stdout => { + expect(console.log).toHaveBeenCalledWith(stdout); + }); + }); + + it('should prioritize -m and --model over -l and --lang', () => { + const inputTextHans = '我們的使a命'; + const customModelPath = path.resolve( + __dirname, + 'models', + 'separate_right_before_a.json' + ); + const argv = [ + 'node', + 'budoux', + '--model', + customModelPath, + '--lang', + 'zh-hant', + inputTextHans, + ]; + cli(argv); + const expectedStdOuts = '我們的使\na命'.split('\n'); + expectedStdOuts.forEach(stdout => { + expect(console.log).toHaveBeenCalledWith(stdout); + }); + }); + it('should output the separated sentence with separater when execute budoux command with --delim option.', () => { const inputText = '今日は天気です。\n明日は雨かな?'; const argv = ['node', 'budoux', '--delim', '---', inputText]; diff --git a/tests/test_main.py b/tests/test_main.py index ad08d7a8..528e8af0 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -72,7 +72,7 @@ def test_cmdargs_invalid_lang_1(self) -> None: self.assertEqual(cm.exception.code, 2) def test_cmdargs_invalid_lang_2(self) -> None: - cmdargs = ['-l', 'ja-knbc'] + cmdargs = ['-l', 'ja-abc'] with self.assertRaises(SystemExit) as cm: main.parse_args(cmdargs)