Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ZWSP instead of WBR #346

Merged
merged 8 commits into from
Oct 29, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions budoux/html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,16 @@ class HTMLChunkResolver(HTMLParser):
"""
output = ''

def __init__(self, chunks: typing.List[str]):
def __init__(self, chunks: typing.List[str], separator: str):
"""Initializes the parser.

Args:
chunks (List[str]): The chunks to resolve.
separator (str): The separator string.
"""
HTMLParser.__init__(self)
self.chunks_joined = SEP.join(chunks)
self.separator = separator
self.to_skip = False
self.scan_index = 0
self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
Expand All @@ -73,7 +75,7 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
if tag.upper() in SKIP_NODES:
if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
self.scan_index += 1
self.output += '<wbr>'
self.output += self.separator
self.to_skip = True
self.output += '<%s%s>' % (tag, encoded_attrs)

Expand All @@ -85,7 +87,7 @@ def handle_data(self, data: str) -> None:
for char in data:
if not char == self.chunks_joined[self.scan_index]:
if not self.to_skip:
self.output += '<wbr>'
self.output += self.separator
self.scan_index += 1
self.output += char
self.scan_index += 1
Expand All @@ -105,17 +107,20 @@ def get_text(html: str) -> str:
return text_content_extractor.output


def resolve(phrases: typing.List[str], html: str) -> str:
def resolve(phrases: typing.List[str],
html: str,
separator: str = '\u200b') -> str:
"""Wraps phrases in the HTML string with non-breaking markup.

Args:
phrases (List[str]): The phrases included in the HTML string.
html (str): The HTML string to resolve.
separator (str, optional): The separator string.

Returns:
The HTML string with phrases wrapped in non-breaking markup.
"""
resolver = HTMLChunkResolver(phrases)
resolver = HTMLChunkResolver(phrases, separator)
resolver.feed(html)
result = '<span style="%s">%s</span>' % (PARENT_CSS_STYLE, resolver.output)
return result
2 changes: 1 addition & 1 deletion demo/src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ const run = () => {
const renderWithBR = brCheckElement.checked;
if (renderWithBR) {
outputContainerElement.innerHTML = window.DOMPurify.sanitize(
outputContainerElement.innerHTML.replace(/<wbr>/g, '<br>'));
outputContainerElement.innerHTML.replace(/\u200b/g, '<br>'));
}
};

Expand Down
23 changes: 18 additions & 5 deletions java/src/main/java/com/google/budoux/HTMLProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,28 @@ private HTMLProcessor() {}
private static class PhraseResolvingNodeVisitor implements NodeVisitor {
private static final char SEP = '\uFFFF';
private final String phrasesJoined;
private final String separator;
private final StringBuilder output = new StringBuilder();
private Integer scanIndex = 0;
private boolean toSkip = false;
private Stack<Boolean> elementStack = new Stack<Boolean>();

PhraseResolvingNodeVisitor(List<String> phrases) {
/**
* Constructs a PhraseResolvingNodeVisitor.
*
* @param phrases a list of phrase strings.
* @param separator the separator string.
*/
PhraseResolvingNodeVisitor(List<String> phrases, String separator) {
this.separator = separator;
this.phrasesJoined = String.join(Character.toString(SEP), phrases);
}

/**
* Returns the buffer that accumulates the resolved output.
*
* <p>The returned object is this visitor's own {@link StringBuilder} field, not a copy.
*
* @return the {@code StringBuilder} holding the output appended so far.
*/
public StringBuilder getOutput() {
return output;
}
Expand All @@ -86,7 +99,7 @@ public void head(Node node, int depth) {
final String nodeName = node.nodeName();
if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) {
if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) {
output.append("<wbr>");
output.append(separator);
scanIndex++;
}
toSkip = true;
Expand All @@ -98,7 +111,7 @@ public void head(Node node, int depth) {
char c = data.charAt(i);
if (c != phrasesJoined.charAt(scanIndex)) {
if (!toSkip) {
output.append("<wbr>");
output.append(separator);
}
scanIndex++;
}
Expand Down Expand Up @@ -126,9 +139,9 @@ public void tail(Node node, int depth) {
* @param html the HTML string to resolve.
* @return the HTML string of phrases wrapped in non-breaking markup.
*/
public static String resolve(List<String> phrases, String html) {
public static String resolve(List<String> phrases, String html, String separator) {
Document doc = Jsoup.parseBodyFragment(html);
PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases);
PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases, separator);
doc.body().traverse(nodeVisitor);
return String.format("<span style=\"%s\">%s</span>", STYLE, nodeVisitor.getOutput());
}
Expand Down
2 changes: 1 addition & 1 deletion java/src/main/java/com/google/budoux/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,6 @@ public List<String> parse(String sentence) {
public String translateHTMLString(String html) {
String sentence = HTMLProcessor.getText(html);
List<String> phrases = parse(sentence);
return HTMLProcessor.resolve(phrases, html);
return HTMLProcessor.resolve(phrases, html, "\u200b");
}
}
10 changes: 5 additions & 5 deletions java/src/test/java/com/google/budoux/HTMLProcessorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class HTMLProcessorTest {
public void testResolveWithSimpleTextInput() {
List<String> phrases = Arrays.asList("abc", "def");
String html = "abcdef";
String result = HTMLProcessor.resolve(phrases, html);
String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap: anywhere;\">abc<wbr>def</span>",
result);
Expand All @@ -42,7 +42,7 @@ public void testResolveWithSimpleTextInput() {
public void testResolveWithStandardHTMLInput() {
List<String> phrases = Arrays.asList("abc", "def");
String html = "ab<a href=\"http://example.com\">cd</a>ef";
String result = HTMLProcessor.resolve(phrases, html);
String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap: anywhere;\">ab<a"
+ " href=\"http://example.com\">c<wbr>d</a>ef</span>",
Expand All @@ -53,7 +53,7 @@ public void testResolveWithStandardHTMLInput() {
public void testResolveWithNodesToSkip() {
List<String> phrases = Arrays.asList("abc", "def", "ghi");
String html = "a<button>bcde</button>fghi";
String result = HTMLProcessor.resolve(phrases, html);
String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap:"
+ " anywhere;\">a<button>bcde</button>f<wbr>ghi</span>",
Expand All @@ -64,7 +64,7 @@ public void testResolveWithNodesToSkip() {
public void testResolveWithNodesBreakBeforeSkip() {
List<String> phrases = Arrays.asList("abc", "def", "ghi", "jkl");
String html = "abc<nobr>defghi</nobr>jkl";
String result = HTMLProcessor.resolve(phrases, html);
String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap:"
+ " anywhere;\">abc<wbr><nobr>defghi</nobr><wbr>jkl</span>",
Expand All @@ -75,7 +75,7 @@ public void testResolveWithNodesBreakBeforeSkip() {
public void testResolveWithNothingToSplit() {
List<String> phrases = Arrays.asList("abcdef");
String html = "abcdef";
String result = HTMLProcessor.resolve(phrases, html);
String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap: anywhere;\">abcdef</span>", result);
}
Expand Down
2 changes: 1 addition & 1 deletion java/src/test/java/com/google/budoux/ParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public void testTranslateHTMLString() {
String result = parser.translateHTMLString(html);
assertEquals(
"<span style=\"word-break: keep-all; overflow-wrap: anywhere;\"><a"
+ " href=\"http://example.com\">xyz<wbr>a</a>bc</span>",
+ " href=\"http://example.com\">xyz\u200ba</a>bc</span>",
result);
}
}
75 changes: 36 additions & 39 deletions javascript/src/html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -586,48 +586,45 @@ export class HTMLProcessor {
}
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
type Constructor<T = {}> = new (...args: any[]) => T;

/**
* Mixin to add HTML processing support to {@link Parser}.
* @param Base A base {@link Parser} class
* @returns An extended {@link Parser} class with {@link HTMLProcessor}.
* BudouX {@link Parser} with HTML processing support.
*/
function HTMLProcessing<TBase extends Constructor<Parser>>(Base: TBase) {
return class _HTMLProcessable extends Base {
/**
* Applies markups for semantic line breaks to the given HTML element.
* @param parentElement The input element.
*/
applyElement(parentElement: HTMLElement) {
const htmlProcessor = new HTMLProcessor(this, {
separator: parentElement.ownerDocument.createElement('wbr'),
});
htmlProcessor.applyToElement(parentElement);
export class HTMLProcessingParser extends Parser {
htmlProcessor: HTMLProcessor;

/**
* Constructs a parser and the HTMLProcessor it uses for HTML input.
* @param model The model passed through to the base Parser constructor.
* @param htmlProcessorOptions Options handed to the HTMLProcessor.
*     When omitted, an options object with `separator: ZWSP` is used.
*/
constructor(
model: {[key: string]: {[key: string]: number}},
htmlProcessorOptions: HTMLProcessorOptions = {
separator: ZWSP,
}
) {
super(model);
// The processor is created once here and stored on the instance.
this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
}

/**
* Translates the given HTML string to another HTML string with markups
* for semantic line breaks.
* @param html An input html string.
* @returns The translated HTML string.
*/
translateHTMLString(html: string) {
if (html === '') return html;
const doc = parseFromString(html);
if (HTMLProcessor.hasChildTextNode(doc.body)) {
const wrapper = doc.createElement('span');
wrapper.append(...doc.body.childNodes);
doc.body.append(wrapper);
}
this.applyElement(doc.body.childNodes[0] as HTMLElement);
return doc.body.innerHTML;
/**
* Applies markups for semantic line breaks to the given HTML element.
*
* Delegates to `applyToElement` on this instance's `htmlProcessor`.
* @param parentElement The input element.
*/
applyElement(parentElement: HTMLElement) {
this.htmlProcessor.applyToElement(parentElement);
}

/**
* Translates the given HTML string to another HTML string with markups
* for semantic line breaks.
* @param html An input html string.
* @returns The translated HTML string.
*/
translateHTMLString(html: string) {
if (html === '') return html;
const doc = parseFromString(html);
if (HTMLProcessor.hasChildTextNode(doc.body)) {
const wrapper = doc.createElement('span');
wrapper.append(...doc.body.childNodes);
doc.body.append(wrapper);
}
};
this.applyElement(doc.body.childNodes[0] as HTMLElement);
return doc.body.innerHTML;
}
}

/**
* BudouX {@link Parser} with HTML processing support.
*/
export class HTMLProcessingParser extends HTMLProcessing(Parser) {}
4 changes: 2 additions & 2 deletions javascript/src/tests/test_cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ describe('cli', () => {
const inputText = '今日は天気です。';
const argv = ['node', 'budoux', '--html', inputText];
const expectedStdOut =
'<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は<wbr>天気です。</span>';
'<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は\u200B天気です。</span>';
cli(argv);
expect(console.log).toHaveBeenCalledWith(expectedStdOut);
});
Expand All @@ -57,7 +57,7 @@ describe('cli', () => {
const inputText = '今日は天気です。';
const argv = ['node', 'budoux', '-H', inputText];
const expectedStdOut =
'<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は<wbr>天気です。</span>';
'<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は\u200B天気です。</span>';
cli(argv);
expect(console.log).toHaveBeenCalledWith(expectedStdOut);
});
Expand Down
30 changes: 15 additions & 15 deletions javascript/src/tests/test_html_processor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -332,35 +332,35 @@ describe('HTMLProcessingParser.applyElement', () => {
};
const style = 'word-break: keep-all; overflow-wrap: anywhere;';

it('should insert WBR tags where the sentence should break.', () => {
it('should insert ZWSPs where the sentence should break.', () => {
const inputHTML = '<p>xyzabcabc</p>';
const expectedHTML = `<p style="${style}">xyz<wbr>abc<wbr>abc</p>`;
const expectedHTML = `<p style="${style}">xyz\u200Babc\u200Babc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
checkEqual(model, inputHTML, expectedHTML);
});

it('should insert WBR tags even it overlaps with other HTML tags.', () => {
it('should insert ZWSPs even it overlaps with other HTML tags.', () => {
const inputHTML = '<p>xy<a href="#">zabca</a>bc</p>';
const expectedHTML = `<p style="${style}">xy<a href="#">z<wbr>abc<wbr>a</a>bc</p>`;
const expectedHTML = `<p style="${style}">xy<a href="#">z\u200Babc\u200Ba</a>bc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
checkEqual(model, inputHTML, expectedHTML);
});

it('should not insert WBR tags to where input has WBR tags.', () => {
it('should not insert ZWSPs to where input has WBR tags already.', () => {
const inputHTML = '<p>xyz<wbr>abcabc</p>';
const expectedHTML = `<p style="${style}">xyz<wbr>abc<wbr>abc</p>`;
const expectedHTML = `<p style="${style}">xyz<wbr>abc\u200Babc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
checkEqual(model, inputHTML, expectedHTML);
});
it('should not insert WBR tags to where input has ZWSP.', () => {
it('should not insert ZWSPs to where input has ZWSPs.', () => {
const inputHTML = '<p>xyz\u200Babcabc</p>';
const expectedHTML = `<p style="${style}">xyz\u200babc<wbr>abc</p>`;
const expectedHTML = `<p style="${style}">xyz\u200babc\u200Babc</p>`;
const model = {
UW4: {a: 1001}, // means "should separate right before 'a'".
};
Expand All @@ -387,7 +387,7 @@ describe('HTMLProcessingParser.translateHTMLString', () => {
it('should output a html string with a SPAN parent with proper style attributes.', () => {
const inputHTML = 'xyzabcd';
const expectedHTML = `
<span style="word-break: keep-all; overflow-wrap: anywhere;">xyz<wbr>abcd</span>`;
<span style="word-break: keep-all; overflow-wrap: anywhere;">xyz\u200Babcd</span>`;
checkEqual(defaultModel, inputHTML, expectedHTML);
});

Expand All @@ -396,7 +396,7 @@ describe('HTMLProcessingParser.translateHTMLString', () => {
const expectedHTML = `
<p class="foo"
style="color: red; word-break: keep-all; overflow-wrap: anywhere;"
>xyz<wbr>abcd</p>`;
>xyz\u200Babcd</p>`;
checkEqual(defaultModel, inputHTML, expectedHTML);
});

Expand All @@ -410,39 +410,39 @@ describe('HTMLProcessingParser.translateHTMLString', () => {
const inputHTML = 'xyz<script>alert(1);</script>xyzabc';
const expectedHTML = `<span
style="word-break: keep-all; overflow-wrap: anywhere;"
>xyz<script>alert(1);</script>xyz<wbr>abc</span>`;
>xyz<script>alert(1);</script>xyz\u200Babc</span>`;
checkEqual(defaultModel, inputHTML, expectedHTML);
});

it('script tags on top should be discarded by the DOMParser.', () => {
const inputHTML = '<script>alert(1);</script>xyzabc';
const expectedHTML = `<span
style="word-break: keep-all; overflow-wrap: anywhere;"
>xyz<wbr>abc</span>`;
>xyz\u200Babc</span>`;
checkEqual(defaultModel, inputHTML, expectedHTML);
});

it('should skip some specific tags.', () => {
const inputHTML = 'xyz<code>abc</code>abc';
const expectedHTML = `<span
style="word-break: keep-all; overflow-wrap: anywhere;"
>xyz<code>abc</code><wbr>abc</span>`;
>xyz<code>abc</code>\u200Babc</span>`;
checkEqual(defaultModel, inputHTML, expectedHTML);
});

it('should not ruin attributes of child elements.', () => {
const inputHTML = 'xyza<a href="#" hidden>bc</a>abc';
const expectedHTML = `<span
style="word-break: keep-all; overflow-wrap: anywhere;"
>xyz<wbr>a<a href="#" hidden>bc</a><wbr>abc</span>`;
>xyz\u200Ba<a href="#" hidden>bc</a>\u200Babc</span>`;
checkEqual(defaultModel, inputHTML, expectedHTML);
});

it('should work with emojis.', () => {
const inputHTML = 'xyza🇯🇵🇵🇹abc';
const expectedHTML = `<span
style="word-break: keep-all; overflow-wrap: anywhere;"
>xyz<wbr>a🇯🇵🇵🇹<wbr>abc</span>`;
>xyz\u200Ba🇯🇵🇵🇹\u200Babc</span>`;
checkEqual(defaultModel, inputHTML, expectedHTML);
});
});
Loading