diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7fcd1c9..e3aabf6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,5 +1,4 @@ on: - - push - pull_request jobs: @@ -8,7 +7,7 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - - name: Use Node.js 20.x + - name: Use Node.js uses: actions/setup-node@v4 with: node-version: 20 diff --git a/.github/workflows/translations.yml b/.github/workflows/translations.yml index 0b3afd4..7aaf544 100644 --- a/.github/workflows/translations.yml +++ b/.github/workflows/translations.yml @@ -9,10 +9,10 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - - name: Use Node.js 18.x + - name: Use Node.js uses: actions/setup-node@v4 with: - node-version: 18 + node-version: 20 cache: 'npm' - name: Initialize project run: | diff --git a/README.md b/README.md index 3cc9fdd..94baad0 100644 --- a/README.md +++ b/README.md @@ -12,27 +12,44 @@ ## About -This is a CLI application for parsing all tldr pages from the [tldr-pages/tldr](https://github.com/tldr-pages/tldr) repository, and producing a dataset that maps the strings across localized pages. The primary motivation was to provide an additional corpus for [OPUS](https://opus.nlpl.eu/), a collection of translated resources from the web, readily available in standardized formats. +A CLI application for parsing tldr pages from the [tldr-pages/tldr](https://github.com/tldr-pages/tldr) repository, and producing a dataset that maps the strings across localized pages. The motivation was to provide an additional corpus for [OPUS](https://opus.nlpl.eu), see [What is Opus?](#what-is-opus) for more context. -### What is OPUS? +## Installation -OPUS is public dataset of translated text on the web. All translations are derived from freely available and openly licensed sources, so the translations themselves are safe to use with minimal restrictions. 
These datasets are helpful for a variety of applications such as research and machine learning. +You can install the tool by running the following commands: -A notable project that uses the OPUS corpuses is [LibreTranslate](https://libretranslate.com/), powered by [argos-translate](https://www.argosopentech.com/). It's a free, open-source, and self-hostable machine translation API that doesn't depend on third-party services. Now by translating tldr-pages, we're collectively contributing more data to improve open-source machine translations! +```sh +# Clone the repository +git clone https://github.com/tldr-pages/tldr-translation-pairs-gen.git -## Usage +# Enter the directory that git created when cloning +cd tldr-translation-pairs-gen -### Obtain a copy of tldr-pages +# Install dependencies +npm install -One way or another, obtain a copy of the tldr-pages. The easiest way is to use [Git](https://git-scm.com/). +# Build the project +npm run build + +# Install the project on your machine +npm install -g . +``` + +You should now have `tldr-translation-pairs-gen` on your path; try the help command to see the available options: ```sh -git clone https://github.com/tldr-pages/tldr.git +tldr-translation-pairs-gen --help ``` -### Execute tldr-translation-pairs-gen +## Usage -Once you have tldr-pages locally, you can point tldr-translation-pairs-gen to the directory using the `--source` argument. This will output a file for every combination of languages to the `dataset/` directory, with all alignments that can be found between localized pages. +One way or another, obtain a copy of tldr-pages. The easiest way is to use [Git](https://git-scm.com): + +```sh +git clone https://github.com/tldr-pages/tldr.git +``` + +Point tldr-translation-pairs-gen to the directory using the `--source` argument. This will output a file for every combination of languages to the `dataset/` directory, with all alignments that can be found between localized pages.
```sh tldr-translation-pairs-gen --source {{path/to/tldr_dir}} @@ -62,3 +79,9 @@ Here is a real-world example of the problem: the English version was modified af | - Print the tldr page for a specific subcommand:

`tldr {{command}}-{{subcommand}}` | - Affiche la page tldr de `cd`, en forçant la plateforme par défaut :

`tldr -p {{android\|linux\|osx\|sunos\|windows}} {{cd}}` | - Print the tldr page for a command for a specific [p]latform:

`tldr {{command}}` | - Affiche la page tldr d'une sous-commande :

`tldr {{git-checkout}}` | - [u]pdate the local cache of tldr pages:

`tldr -u` | - Met à jour les pages enregistrées localement (si le client supporte la mise en cache) :

`tldr -u` + +## What is OPUS? + +OPUS is a public dataset of translated resources on the web. All translations are derived from freely available and openly licensed sources, so the translations themselves are safe to use with minimal restrictions. These datasets are helpful for a variety of applications such as research and machine learning. + +A notable project that uses the OPUS corpuses is [LibreTranslate](https://libretranslate.com/), powered by [argos-translate](https://www.argosopentech.com/). It's a free, open-source, and self-hostable machine translation API that doesn't depend on third-party services. Now by translating tldr-pages, we're collectively contributing more data to improve open-source machine translations! diff --git a/package.json b/package.json index 1eaf41c..c86491c 100644 --- a/package.json +++ b/package.json @@ -2,31 +2,31 @@ "name": "tldr-translation-pairs-gen", "version": "0.2.1", "description": "Generates a structured dataset in various formats derived from tldr-pages.", - "bin": { - "tldr-translation-pairs-gen": "./dist/index.js" - }, - "scripts": { - "build": "tsc", - "build:watch": "tsc --watch", - "test": "mocha -r ts-node/register 'tests/**/*.ts'", - "tldr-translation-pairs-gen": "node ./dist/index.js" - }, + "author": "tldr", + "license": "MIT", + "homepage": "https://github.com/tldr-pages/tldr-translation-pairs-gen#readme", "repository": { "type": "git", "url": "git+https://github.com/tldr-pages/tldr-translation-pairs-gen.git" }, + "bugs": { + "url": "https://github.com/tldr-pages/tldr-translation-pairs-gen/issues" + }, "keywords": [ "tldr", "opus", "corpus", "i18n" ], - "author": "tldr", - "license": "MIT", - "bugs": { - "url": "https://github.com/tldr-pages/tldr-translation-pairs-gen/issues" + "bin": { + "tldr-translation-pairs-gen": "./dist/index.js" + }, + "scripts": { + "build": "tsc", + "build:watch": "tsc --watch", + "test": "mocha -r ts-node/register 'tests/**/*.ts'", + "tldr-translation-pairs-gen": "node ./dist/index.js" },
- "homepage": "https://github.com/tldr-pages/tldr-translation-pairs-gen#readme", "dependencies": { "commander": "^12.0.0", "csv-stringify": "^6.4.6", diff --git a/src/index.ts b/src/index.ts index be30809..2b24e5a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,3 +1,5 @@ +#!/usr/bin/env node + import fs from 'fs'; import path from 'path'; import { Command } from 'commander'; diff --git a/src/lib/lib.ts b/src/lib/lib.ts index 0b83ada..07b7f04 100644 --- a/src/lib/lib.ts +++ b/src/lib/lib.ts @@ -60,14 +60,14 @@ export function parseTldrPage(source: string): TldrPage { const markdownTokens = lexer.lex(source); if (markdownTokens[0].type !== 'heading' || markdownTokens[1].type !== 'blockquote') { - throw new Error('Invalid tldr page provided.'); + throw new Error('Malformed tldr page provided.'); } const name = markdownTokens[0].text; const descriptionText = markdownTokens[1].tokens?.[0]; if (descriptionText?.type !== 'paragraph') { - throw new Error('Invalid tldr page provided.'); + throw new Error('Malformed tldr page provided.'); } const descriptionTokens = descriptionText.tokens; @@ -98,7 +98,7 @@ export function parseTldrPage(source: string): TldrPage { while (index < markdownTokens.length) { if (markdownTokens[index].type !== 'list' || markdownTokens[index + 2].type !== 'paragraph') { - throw new Error('Invalid tldr page provided.'); + throw new Error('Malformed tldr page provided.'); } const description = (markdownTokens[index] as any).items[0].text;