diff --git a/README.md b/README.md index eda254f..a5923d6 100755 --- a/README.md +++ b/README.md @@ -80,7 +80,8 @@ import { NodeHtmlMarkdown, NodeHtmlMarkdownOptions } from 'node-html-markdown' NodeHtmlMarkdown.translate( /* html */ `hello`, /* options (optional) */ {}, - /* customTranslators (optional) */ undefined + /* customTranslators (optional) */ undefined, + /* customCodeBlockTranslators (optional) */ undefined ); // Multiple files @@ -90,7 +91,8 @@ NodeHtmlMarkdown.translate( 'file2.html': `goodbye` }, /* options (optional) */ {}, - /* customTranslators (optional) */ undefined + /* customTranslators (optional) */ undefined, + /* customCodeBlockTranslators (optional) */ undefined ); @@ -101,7 +103,8 @@ NodeHtmlMarkdown.translate( const nhm = new NodeHtmlMarkdown( /* options (optional) */ {}, - /* customTransformers (optional) */ undefined + /* customTranslators (optional) */ undefined, + /* customCodeBlockTranslators (optional) */ undefined ); // Single file @@ -160,12 +163,12 @@ export interface NodeHtmlMarkdownOptions { /** * Supplied elements will be ignored (ignores inner text does not parse children) */ - readonly ignore?: string[], + ignore?: string[], /** * Supplied elements will be treated as blocks (surrounded with blank lines) */ - readonly blockElements?: string[], + blockElements?: string[], /** * Max consecutive new lines allowed @@ -225,6 +228,8 @@ __For detail on how to use them see__: - [translator.ts](https://github.com/crosstype/node-html-markdown/blob/master/src/translator.ts) - Documentation for `TranslatorConfig` - [config.ts](https://github.com/crosstype/node-html-markdown/blob/master/src/config.ts) - Translators in `defaultTranslators` +The `NodeHtmlMarkdown#codeBlockTranslators` property is a collection of translators which handles elements within a `<pre><code>` block.
+
## Further improvements
Being a performance-centric library, we're always interested in further improvements.
diff --git a/src/config.ts b/src/config.ts
index d8fb420..5f10f13 100755
--- a/src/config.ts
+++ b/src/config.ts
@@ -171,11 +171,13 @@ export const defaultTranslators: TranslatorConfigObject = {
noEscape: true,
prefix: codeFence + language + '\n',
postfix: '\n' + codeFence,
+ childTranslators: visitor.instance.codeBlockTranslators
}
} else {
return {
noEscape: true,
- postprocess: ({ content }) => content.replace(/^/gm, ' ')
+ postprocess: ({ content }) => content.replace(/^/gm, ' '),
+ childTranslators: visitor.instance.codeBlockTranslators
}
}
},
@@ -215,6 +217,15 @@ export const defaultTranslators: TranslatorConfigObject = {
},
}
+export const defaultCodeBlockTranslators: TranslatorConfigObject = { // Defaults merged into NodeHtmlMarkdown#codeBlockTranslators, which the code-block translator hands to its children as `childTranslators`
+ 'br': { content: `\n`, recurse: false }, // Line break becomes a literal newline; no recursion into the element
+ 'hr': { content: '---', recurse: false }, // Horizontal rule becomes the literal text `---`; no recursion into the element
+ 'h1,h2,h3,h4,h5,h6': { prefix: '[', postfix: ']' }, // Heading content is wrapped in square brackets
+ 'ol,ul': defaultTranslators['ol,ul'], // Lists reuse the regular (non-code-block) list translator
+ 'li': defaultTranslators['li'], // List items reuse the regular (non-code-block) item translator
+ 'img': { recurse: false } // No content configured and no recursion, so images contribute no text
+}
+
// endregion
diff --git a/src/main.ts b/src/main.ts
index b03f92d..74b7070 100755
--- a/src/main.ts
+++ b/src/main.ts
@@ -1,6 +1,8 @@
import { NodeHtmlMarkdownOptions } from './options';
import { TranslatorCollection, TranslatorConfigObject } from './translator';
-import { defaultBlockElements, defaultIgnoreElements, defaultOptions, defaultTranslators } from './config';
+import {
+ defaultBlockElements, defaultCodeBlockTranslators, defaultIgnoreElements, defaultOptions, defaultTranslators
+} from './config';
import { parseHTML } from './utilities';
import { getMarkdownForHtmlNodes } from './visitor';
@@ -21,22 +23,33 @@ type Options = Partial
export class NodeHtmlMarkdown {
public translators = new TranslatorCollection();
+ public codeBlockTranslators = new TranslatorCollection();
public readonly options: NodeHtmlMarkdownOptions
- constructor(options?: Options, customTranslators?: TranslatorConfigObject) {
+ constructor(options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject) {
/* Setup Options */
this.options = { ...defaultOptions, ...options };
const ignoredElements = this.options.ignore?.concat(defaultIgnoreElements) ?? defaultIgnoreElements;
const blockElements = this.options.blockElements?.concat(defaultBlockElements) ?? defaultBlockElements;
/* Setup Translator Bases */
- ignoredElements?.forEach(el => this.translators.set(el, { ignore: true, recurse: false }));
- blockElements?.forEach(el => this.translators.set(el, { surroundingNewlines: 2 }));
+ ignoredElements?.forEach(el => {
+ this.translators.set(el, { ignore: true, recurse: false });
+ this.codeBlockTranslators.set(el, { ignore: true, recurse: false });
+ })
+
+ blockElements?.forEach(el => {
+ this.translators.set(el, { surroundingNewlines: 2 });
+ this.codeBlockTranslators.set(el, { surroundingNewlines: 2 });
+ });
/* Add and merge bases with default and custom translator configs */
for (const [ elems, cfg ] of Object.entries({ ...defaultTranslators, ...customTranslators }))
this.translators.set(elems, cfg, true);
+ for (const [ elems, cfg ] of Object.entries({ ...defaultCodeBlockTranslators, ...customCodeBlockTranslators }))
+ this.codeBlockTranslators.set(elems, cfg, true);
+
// TODO - Workaround for upstream issue (may not be fixed) - https://github.com/taoqf/node-html-parser/issues/78
if (!this.options.textReplace) this.options.textReplace = [];
this.options.textReplace.push([ /^/gmi, '' ]);
@@ -50,15 +63,15 @@ export class NodeHtmlMarkdown {
/**
* Translate HTML source text to markdown
*/
- static translate(html: string, options?: Options, customTranslators?: TranslatorConfigObject): string
+ static translate(html: string, options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject): string
/**
* Translate collection of HTML source text to markdown
*/
- static translate(files: FileCollection, options?: Options, customTranslators?: TranslatorConfigObject): FileCollection
- static translate(htmlOrFiles: string | FileCollection, opt?: Options, trans?: TranslatorConfigObject):
+ static translate(files: FileCollection, options?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject): FileCollection
+ static translate(htmlOrFiles: string | FileCollection, opt?: Options, customTranslators?: TranslatorConfigObject, customCodeBlockTranslators?: TranslatorConfigObject):
string | FileCollection
{
- return NodeHtmlMarkdown.prototype.translateWorker.call(new NodeHtmlMarkdown(opt, trans), htmlOrFiles);
+ return NodeHtmlMarkdown.prototype.translateWorker.call(new NodeHtmlMarkdown(opt, customTranslators, customCodeBlockTranslators), htmlOrFiles);
}
// endregion
diff --git a/src/translator.ts b/src/translator.ts
index d4a1628..238bd48 100755
--- a/src/translator.ts
+++ b/src/translator.ts
@@ -79,6 +79,11 @@ export type TranslatorConfig = {
* Keep whitespace as it is
*/
preserveWhitespace?: boolean
+
+ /**
+ * Custom translator collection to use for child HTML nodes
+ */
+ childTranslators?: TranslatorCollection
}
export enum PostProcessResult {
diff --git a/src/visitor.ts b/src/visitor.ts
index 515d78a..a763a95 100755
--- a/src/visitor.ts
+++ b/src/visitor.ts
@@ -3,7 +3,7 @@ import { ElementNode, HtmlNode, isElementNode, isTextNode } from './nodes';
import { getChildNodes, getTrailingWhitespaceInfo, perfStart, perfStop, trimNewLines } from './utilities';
import {
createTranslatorContext, isTranslatorConfig, PostProcessResult, TranslatorConfig, TranslatorConfigFactory,
- TranslatorContext
+ TranslatorConfigObject, TranslatorContext
} from './translator';
import { NodeHtmlMarkdownOptions } from './options';
import { contentlessElements } from './config';
@@ -19,6 +19,7 @@ export interface NodeMetadata {
listItemNumber?: number
noEscape?: boolean
preserveWhitespace?: boolean
+ translators?: TranslatorConfigObject
}
export type NodeMetadataMap = Map
@@ -160,7 +161,7 @@ export class Visitor {
if (textOnly || !isElementNode(node)) return;
/* Handle element node */
- const { instance: { translators } } = this;
+ const translators = metadata?.translators ?? this.instance.translators;
const translatorCfgOrFactory = translators[node.tagName] as TranslatorConfig | TranslatorConfigFactory;
/* Update metadata with list detail */
@@ -202,9 +203,9 @@ export class Visitor {
// Skip and don't check children if ignore flag set
if (cfg.ignore) return;
- /* Update metadata for noEscape flag */
- if (cfg.noEscape && !metadata?.noEscape) {
- metadata = { ...metadata, noEscape: true };
+ /* Update metadata if needed - a value this cfg does not supply falls back to the inherited one instead of being reset to undefined */
+ if ((cfg.noEscape && !metadata?.noEscape) || (cfg.childTranslators && !metadata?.translators)) {
+ metadata = { ...metadata, noEscape: cfg.noEscape || metadata?.noEscape, translators: cfg.childTranslators ?? metadata?.translators };
 this.nodeMetadata.set(node, metadata);
 }
diff --git a/test/default-tags-codeblock.test.ts b/test/default-tags-codeblock.test.ts
new file mode 100755
index 0000000..4cf5146
--- /dev/null
+++ b/test/default-tags-codeblock.test.ts
@@ -0,0 +1,80 @@
+// noinspection HtmlUnknownTarget
+
+import { NodeHtmlMarkdown } from '../src';
+
+
+/* ****************************************************************************************************************** *
+ * Tests
+ * ****************************************************************************************************************** */
+
+// Note: Newline handling for block elements within code blocks is not very clean. This can be fixed later.
+describe(`Default Tags`, () => {
+ let instance: NodeHtmlMarkdown;
+ const translateAsBlock = (html: string) => instance.translate(`${html}
`);
+ const getExpected = (s: string) => '```\n' + s + '\n```';
+ beforeAll(() => {
+ instance = new NodeHtmlMarkdown();
+ });
+
+ test(`Line Break (br)`, () => {
+ const res = translateAsBlock(`a
b`);
+ expect(res).toBe(getExpected(`a\nb`));
+ });
+
+ test(`Horizontal Rule (hr)`, () => {
+ const res = translateAsBlock(`a
b`);
+ expect(res).toBe(getExpected(`a\n\n---\n\nb`));
+ });
+
+ test(`Non-processed Elements (b, strong, del, s, strike, em, i, pre, code, blockquote, a)`, () => {
+ const tags = [ 'b', 'strong', 'del', 's', 'strike', 'em', 'i', 'code', 'a', 'pre', 'blockquote' ];
+ const html = tags.map(t => `<${t}>${t}${t}>`).join(' ');
+ const exp = 'b strong del s strike em i code a \n\npre\n\n blockquote\n\n';
+
+ const res = translateAsBlock(html);
+ expect(res).toBe(getExpected(exp));
+ });
+
+ test(`Image (img)`, () => {
+ const res = translateAsBlock(`a
b`);
+ expect(res).toBe(getExpected(`ab`));
+ });
+
+ test(`Headings (h1, h2, h3, h4, h5, h6)`, () => {
+ let nodes: string[] = [];
+ for (let i = 1; i < 8; i++) nodes.push(`a `);
+ const res = translateAsBlock(nodes.join(''));
+ expect(res).toBe(getExpected('\n[a]\n'.repeat(6) + '\na'));
+ });
+
+ // Note: Newline handling here for block elements is unusual
+ describe(`Lists (ol + li, ul + li)`, () => {
+ test(`Multi-level Ordered List`, () => {
+ const res = translateAsBlock(`
+
+ - a
b
+ -
+ - b
+
- c
d
+ - e
f
+
+
+ `);
+ expect(res).toBe(getExpected(` \n \n1. a \nb\n \n \n2. b \n 1. c \n d \n \n * e \n f\n \n `));
+ });
+
+ test(`Multi-level Unordered List`, () => {
+ const res = translateAsBlock(`
+
+ - a
b
+ -
+ - b
+
- c
d
+ - e
f
+
+
+ `);
+ expect(res).toBe(getExpected(` \n \n* a \nb\n \n \n* b \n * c \n d \n \n 1. e \n f\n \n `));
+ });
+ });
+});