Skip to content

Commit

Permalink
Add lang option to JS CLI (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
tushuhei authored Jan 5, 2023
1 parent 2b6e8af commit 823662b
Show file tree
Hide file tree
Showing 9 changed files with 99 additions and 25 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ positional arguments:
optional arguments:
-h, --help show this help message and exit
-H, --html HTML mode (default: False)
-m JSON, --model JSON custom model file path (default: /path/to/budoux/models/ja-knbc.json)
-m JSON, --model JSON custom model file path (default: /path/to/budoux/models/ja.json)
-l LANG, --lang LANG language of custom model (default: None)
-d STR, --delim STR output delimiter in TEXT mode (default: ---)
-V, --version show program's version number and exit
Expand Down Expand Up @@ -203,8 +203,8 @@ Good news is that the training algorithm is an [anytime algorithm](https://en.wi

## Constructing a training dataset from the KNBC corpus for Japanese

The default model for Japanese (`budoux/models/ja_knbc.json`) is built using the [KNBC corpus](https://nlp.ist.i.kyoto-u.ac.jp/kuntt/).
You can create a training dataset, which we name `source_knbc.txt` for example, from the corpus by running the commands below.
The default model for Japanese (`budoux/models/ja.json`) is built using the [KNBC corpus](https://nlp.ist.i.kyoto-u.ac.jp/kuntt/).
You can create a training dataset (named `source_knbc.txt` in the example below) from the corpus by running the following commands:

```shellsession
$ curl -o knbc.tar.bz2 https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2
Expand Down
11 changes: 4 additions & 7 deletions budoux/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,11 @@ def get_model_langs() -> typing.Dict[str, str]:
typing.Dict[str, str]: A dictionary of model languages and its paths.
"""
models = glob.glob(
pkg_resources.resource_filename(__name__, "models") + "/*-*.json")
pkg_resources.resource_filename(__name__, "models") + "/*.json")
langs = {}
for model in models:
model_name = model.split(os.sep)[-1][:-5]
if model_name.startswith('zh-'):
langs[model_name] = model
else:
langs[model_name[:2]] = model
lang = model.split(os.sep)[-1][:-5]
langs[lang] = model
return langs


Expand Down Expand Up @@ -129,7 +126,7 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
"--model",
metavar="JSON",
type=check_file,
default=pkg_resources.resource_filename(__name__, "models/ja-knbc.json"),
default=pkg_resources.resource_filename(__name__, "models/ja.json"),
help="custom model file path",
)
model_select_group.add_argument(
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion budoux/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def load_default_japanese_parser() -> Parser:
Returns:
A parser (:obj:`budoux.Parser`).
"""
with open(os.path.join(MODEL_DIR, 'ja-knbc.json'), encoding='utf-8') as f:
with open(os.path.join(MODEL_DIR, 'ja.json'), encoding='utf-8') as f:
model = json.load(f)
return Parser(model)

Expand Down
7 changes: 4 additions & 3 deletions javascript/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@
},
"scripts": {
"build": "npm run build:es && npm run build:cjs",
"build:cjs": "tsc --outDir dist --module CommonJS --sourceMap false && cp -r src/tests/models dist/tests/models",
"build:es": "tsc --outDir module --module ES6 --sourceMap false && cp module/dom-browser.js module/dom.js && cp -r src/tests/models module/tests/models",
"build:cjs": "tsc --outDir dist --module CommonJS --sourceMap false && cp -r src/tests/models/ dist/tests/models/",
"build:es": "tsc --outDir module --module ES6 --sourceMap false && cp module/dom-browser.js module/dom.js && cp -r src/tests/models/ module/tests/models/",
"bundle": "npm run bundle:webcomponent:ja && npm run bundle:webcomponent:zh-hans && npm run bundle:webcomponent:zh-hant",
"bundle:webcomponent:ja": "esbuild module/webcomponents/budoux-ja.js --bundle --minify --sourcemap --outfile=bundle/budoux-ja.min.js",
"bundle:webcomponent:zh-hans": "esbuild module/webcomponents/budoux-zh-hans.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hans.min.js",
"bundle:webcomponent:zh-hant": "esbuild module/webcomponents/budoux-zh-hant.js --bundle --minify --sourcemap --outfile=bundle/budoux-zh-hant.min.js",
"copy": "node ./scripts/copy-data.js",
"prepare": "npm run copy && npm run build && npm run bundle",
"prepare": "npm run clean && npm run copy && npm run build && npm run bundle",
"pretest": "npm run build:cjs",
"test": "npm run test:jasmine && npm run test:cli-version",
"test:jasmine": "jasmine dist/tests/test_*.js",
"test:cli-version": "node ./scripts/check-cli-version.js",
Expand Down
32 changes: 24 additions & 8 deletions javascript/src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,17 @@
*/

import {readFileSync} from 'fs';
import {resolve} from 'path';
import * as path from 'path';
import * as readline from 'readline';
import {Command} from 'commander';
import {Parser, loadDefaultJapaneseParser} from './parser.js';
import {
Parser,
loadDefaultParsers,
loadDefaultJapaneseParser,
} from './parser.js';

// Version string registered with commander via `program.version()`.
const CLI_VERSION = '0.4.0';
// Default parsers keyed by lang code. Built once at module load so that the
// `--lang` help text can list the available keys and `cli` can look a parser up.
const defaultParsers = loadDefaultParsers();

/**
* Run the command line interface program.
Expand All @@ -29,29 +34,40 @@ const CLI_VERSION = '0.4.0';
export const cli = (argv: string[]) => {
const program = new Command('budoux');

program.usage('[-h] [-H] [-d STR] [-t THRES] [-m JSON] [-V] [TXT]');
program.usage('[-h] [-H] [-d STR] [-t THRES] [-m JSON] [-l LANG] [-V] [TXT]');
program.description(
'BudouX is the successor to Budou, the machine learning powered line break organizer tool.'
);
program
.option('-H, --html', 'HTML mode', false)
.option('-d, --delim <str>', 'output delimiter in TEXT mode', '---')
.option('-m, --model <json>', 'custom model file path')
.option('-m, --model <json>', 'model file path')
.option(
'-l, --lang <str>',
`language model to use. -m and --model will be prioritized if any.\navailable languages: ${[
...defaultParsers.keys(),
].join(', ')}`
)
.argument('[txt]', 'text');

program.version(CLI_VERSION);

program.parse(argv);

const options = program.opts();
const {model, delim, html} = options as {
const {lang, model, delim, html} = options as {
html: boolean;
delim: string;
model?: string;
lang?: string;
};
const {args} = program;

const parser = model ? loadCustomParser(model) : loadDefaultJapaneseParser();
const parser = model
? loadCustomParser(model)
: lang && defaultParsers.has(lang)
? defaultParsers.get(lang)!
: loadDefaultJapaneseParser();

switch (args.length) {
case 0: {
Expand Down Expand Up @@ -115,8 +131,8 @@ const outputParsedTexts = (
* Loads a parser equipped with custom model.
* @returns A parser with the loaded model.
*/
const loadCustomParser = (path: string) => {
const file = readFileSync(resolve(path)).toString();
const loadCustomParser = (modelPath: string) => {
  // Resolve against the current working directory so relative paths given on
  // the command line behave as users expect.
  const resolvedPath = path.resolve(modelPath);
  const json: unknown = JSON.parse(readFileSync(resolvedPath, 'utf-8'));
  // A model must be a JSON object of feature entries. Arrays and primitives
  // would otherwise be turned into a meaningless Map without any error.
  if (typeof json !== 'object' || json === null || Array.isArray(json)) {
    throw new Error(
      `Invalid model file (expected a JSON object): ${resolvedPath}`
    );
  }
  return new Parser(new Map(Object.entries(json)));
};
16 changes: 14 additions & 2 deletions javascript/src/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

import {model as jaKNBCModel} from './data/models/ja-knbc.js';
import {model as jaModel} from './data/models/ja.js';
import {model as zhHansModel} from './data/models/zh-hans.js';
import {model as zhHantModel} from './data/models/zh-hant.js';
import {parseFromString} from './dom.js';
Expand Down Expand Up @@ -150,7 +150,7 @@ export class Parser {
* @returns A parser with the default Japanese model.
*/
export const loadDefaultJapaneseParser = () => {
return new Parser(new Map(Object.entries(jaKNBCModel)));
return new Parser(new Map(Object.entries(jaModel)));
};

/**
Expand All @@ -168,3 +168,15 @@ export const loadDefaultSimplifiedChineseParser = () => {
export const loadDefaultTraditionalChineseParser = () => {
return new Parser(new Map(Object.entries(zhHantModel)));
};

/**
 * Builds one default parser per supported lang code.
 * @returns A map from lang code (`ja`, `zh-hans`, `zh-hant`) to the default
 *     parser for that language, in that insertion order.
 */
export const loadDefaultParsers = () => {
  const parsersByLang = {
    ja: loadDefaultJapaneseParser(),
    'zh-hans': loadDefaultSimplifiedChineseParser(),
    'zh-hant': loadDefaultTraditionalChineseParser(),
  };
  return new Map(Object.entries(parsersByLang));
};
48 changes: 48 additions & 0 deletions javascript/src/tests/test_cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {cli} from '../cli.js';
import {execFile, ExecFileException} from 'child_process';
import * as path from 'path';
import stream from 'stream';
import {loadDefaultParsers} from '../parser.js';

type execFileCallBack = {
error: ExecFileException | null;
Expand Down Expand Up @@ -91,6 +92,53 @@ describe('cli', () => {
});
});

it('should use the corresponding language model when the -l parameter is given.', () => {
  // Simplified Chinese sample, segmented directly with the zh-hans parser to
  // obtain the chunks the CLI is expected to print.
  const simplifiedText = '我们的使命是整合全球信息,供大众使用,让人人受益。';
  const zhHansParser = loadDefaultParsers().get('zh-hans')!;
  const expectedChunks = zhHansParser.parse(simplifiedText);

  cli(['node', 'budoux', '-l', 'zh-hans', simplifiedText]);

  for (const chunk of expectedChunks) {
    expect(console.log).toHaveBeenCalledWith(chunk);
  }
});

it('should use the corresponding language model when the --lang parameter is given.', () => {
  // Traditional Chinese sample, segmented directly with the zh-hant parser to
  // obtain the chunks the CLI is expected to print.
  const traditionalText = '我們的使命是匯整全球資訊,供大眾使用,使人人受惠。';
  const zhHantParser = loadDefaultParsers().get('zh-hant')!;
  const expectedChunks = zhHantParser.parse(traditionalText);

  cli(['node', 'budoux', '--lang', 'zh-hant', traditionalText]);

  for (const chunk of expectedChunks) {
    expect(console.log).toHaveBeenCalledWith(chunk);
  }
});

it('should prioritize -m and --model over -l and --lang', () => {
  const traditionalText = '我們的使a命';
  const modelFile = path.resolve(
    __dirname,
    'models',
    'separate_right_before_a.json'
  );

  // Both options are passed; the expectation below is the custom model's
  // segmentation, so --model must take precedence over --lang.
  cli([
    'node',
    'budoux',
    '--model',
    modelFile,
    '--lang',
    'zh-hant',
    traditionalText,
  ]);

  const expectedChunks = '我們的使\na命'.split('\n');
  for (const chunk of expectedChunks) {
    expect(console.log).toHaveBeenCalledWith(chunk);
  }
});

it('should output the separated sentence with separater when execute budoux command with --delim option.', () => {
const inputText = '今日は天気です。\n明日は雨かな?';
const argv = ['node', 'budoux', '--delim', '---', inputText];
Expand Down
2 changes: 1 addition & 1 deletion tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_cmdargs_invalid_lang_1(self) -> None:
self.assertEqual(cm.exception.code, 2)

def test_cmdargs_invalid_lang_2(self) -> None:
cmdargs = ['-l', 'ja-knbc']
cmdargs = ['-l', 'ja-abc']
with self.assertRaises(SystemExit) as cm:
main.parse_args(cmdargs)

Expand Down

0 comments on commit 823662b

Please sign in to comment.