google · tushuhei · Oct 29, 2023 · Oct 27, 2023 · Oct 28, 2023 · Oct 28, 2023
diff --git a/budoux/html_processor.py b/budoux/html_processor.py
@@ -49,14 +49,16 @@ class HTMLChunkResolver(HTMLParser):
   """
   output = ''
 
-  def __init__(self, chunks: typing.List[str]):
+  def __init__(self, chunks: typing.List[str], separator: str = '\u200b'):
     """Initializes the parser.
 
     Args:
       chunks (List[str]): The chunks to resolve.
+      separator (str, optional): The string inserted at chunk boundaries.
     """
     HTMLParser.__init__(self)
     self.chunks_joined = SEP.join(chunks)
+    self.separator = separator
     self.to_skip = False
     self.scan_index = 0
     self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
@@ -73,7 +75,7 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
     if tag.upper() in SKIP_NODES:
       if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
         self.scan_index += 1
-        self.output += '<wbr>'
+        self.output += self.separator
       self.to_skip = True
     self.output += '<%s%s>' % (tag, encoded_attrs)
 
@@ -85,7 +87,7 @@ def handle_data(self, data: str) -> None:
     for char in data:
       if not char == self.chunks_joined[self.scan_index]:
         if not self.to_skip:
-          self.output += '<wbr>'
+          self.output += self.separator
         self.scan_index += 1
       self.output += char
       self.scan_index += 1
@@ -105,17 +107,20 @@ def get_text(html: str) -> str:
   return text_content_extractor.output
 
 
-def resolve(phrases: typing.List[str], html: str) -> str:
+def resolve(phrases: typing.List[str],
+            html: str,
+            separator: str = '\u200b') -> str:
   """Wraps phrases in the HTML string with non-breaking markup.
 
   Args:
     phrases (List[str]): The phrases included in the HTML string.
     html (str): The HTML string to resolve.
+    separator (str, optional): String inserted at phrase boundaries (ZWSP).
 
   Returns:
     The HTML string with phrases wrapped in non-breaking markup.
   """
-  resolver = HTMLChunkResolver(phrases)
+  resolver = HTMLChunkResolver(phrases, separator)
   resolver.feed(html)
   result = '<span style="%s">%s</span>' % (PARENT_CSS_STYLE, resolver.output)
   return result
diff --git a/demo/src/app.ts b/demo/src/app.ts
@@ -59,7 +59,7 @@ const run = () => {
   const renderWithBR = brCheckElement.checked;
   if (renderWithBR) {
     outputContainerElement.innerHTML = window.DOMPurify.sanitize(
-      outputContainerElement.innerHTML.replace(/<wbr>/g, '<br>'));
+      outputContainerElement.innerHTML.replace(/\u200b/g, '<br>'));
   }
 };
 

diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java
@@ -59,15 +59,28 @@ private HTMLProcessor() {}
   private static class PhraseResolvingNodeVisitor implements NodeVisitor {
     private static final char SEP = '\uFFFF';
     private final String phrasesJoined;
+    private final String separator;
     private final StringBuilder output = new StringBuilder();
     private Integer scanIndex = 0;
     private boolean toSkip = false;
     private Stack<Boolean> elementStack = new Stack<Boolean>();
 
-    PhraseResolvingNodeVisitor(List<String> phrases) {
+    /**
+     * Constructs a PhraseResolvingNodeVisitor.
+     *
+     * @param phrases a list of phrase strings.
+     * @param separator the string appended to the output to mark a phrase boundary.
+     */
+    PhraseResolvingNodeVisitor(List<String> phrases, String separator) {
+      this.separator = separator;
       this.phrasesJoined = String.join(Character.toString(SEP), phrases);
     }
 
+    /**
+     * Returns the buffer that accumulates the resolved output.
+     *
+     * @return the output buffer.
+     */
     public StringBuilder getOutput() {
       return output;
     }
@@ -86,7 +99,7 @@ public void head(Node node, int depth) {
         final String nodeName = node.nodeName();
         if (skipNodes.contains(nodeName.toUpperCase(Locale.ENGLISH))) {
           if (!toSkip && phrasesJoined.charAt(scanIndex) == SEP) {
-            output.append("<wbr>");
+            output.append(separator);
             scanIndex++;
           }
           toSkip = true;
@@ -98,7 +111,7 @@ public void head(Node node, int depth) {
           char c = data.charAt(i);
           if (c != phrasesJoined.charAt(scanIndex)) {
             if (!toSkip) {
-              output.append("<wbr>");
+              output.append(separator);
             }
             scanIndex++;
           }
@@ -126,9 +139,21 @@ public void tail(Node node, int depth) {
    * @param html the HTML string to resolve.
    * @return the HTML string of phrases wrapped in non-breaking markup.
    */
   public static String resolve(List<String> phrases, String html) {
+    return resolve(phrases, html, "\u200b");
+  }
+
+  /**
+   * Wraps phrases in the HTML string with non-breaking markup, using a custom separator.
+   *
+   * @param phrases the phrases included in the HTML string.
+   * @param html the HTML string to resolve.
+   * @param separator the string inserted at phrase boundaries.
+   * @return the HTML string of phrases wrapped in non-breaking markup.
+   */
+  public static String resolve(List<String> phrases, String html, String separator) {
     Document doc = Jsoup.parseBodyFragment(html);
-    PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases);
+    PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases, separator);
     doc.body().traverse(nodeVisitor);
     return String.format("<span style=\"%s\">%s</span>", STYLE, nodeVisitor.getOutput());
   }

diff --git a/java/src/main/java/com/google/budoux/Parser.java b/java/src/main/java/com/google/budoux/Parser.java
@@ -183,6 +183,6 @@ public List<String> parse(String sentence) {
   public String translateHTMLString(String html) {
     String sentence = HTMLProcessor.getText(html);
     List<String> phrases = parse(sentence);
-    return HTMLProcessor.resolve(phrases, html);
+    return HTMLProcessor.resolve(phrases, html, "\u200b");
   }
 }
diff --git a/java/src/test/java/com/google/budoux/HTMLProcessorTest.java b/java/src/test/java/com/google/budoux/HTMLProcessorTest.java
@@ -32,7 +32,7 @@ public class HTMLProcessorTest {
   public void testResolveWithSimpleTextInput() {
     List<String> phrases = Arrays.asList("abc", "def");
     String html = "abcdef";
-    String result = HTMLProcessor.resolve(phrases, html);
+    String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
     assertEquals(
         "<span style=\"word-break: keep-all; overflow-wrap: anywhere;\">abc<wbr>def</span>",
         result);
@@ -42,7 +42,7 @@ public void testResolveWithSimpleTextInput() {
   public void testResolveWithStandardHTMLInput() {
     List<String> phrases = Arrays.asList("abc", "def");
     String html = "ab<a href=\"http://example.com\">cd</a>ef";
-    String result = HTMLProcessor.resolve(phrases, html);
+    String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
     assertEquals(
         "<span style=\"word-break: keep-all; overflow-wrap: anywhere;\">ab<a"
             + " href=\"http://example.com\">c<wbr>d</a>ef</span>",
@@ -53,7 +53,7 @@ public void testResolveWithStandardHTMLInput() {
   public void testResolveWithNodesToSkip() {
     List<String> phrases = Arrays.asList("abc", "def", "ghi");
     String html = "a<button>bcde</button>fghi";
-    String result = HTMLProcessor.resolve(phrases, html);
+    String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
     assertEquals(
         "<span style=\"word-break: keep-all; overflow-wrap:"
             + " anywhere;\">a<button>bcde</button>f<wbr>ghi</span>",
@@ -64,7 +64,7 @@ public void testResolveWithNodesToSkip() {
   public void testResolveWithNodesBreakBeforeSkip() {
     List<String> phrases = Arrays.asList("abc", "def", "ghi", "jkl");
     String html = "abc<nobr>defghi</nobr>jkl";
-    String result = HTMLProcessor.resolve(phrases, html);
+    String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
     assertEquals(
         "<span style=\"word-break: keep-all; overflow-wrap:"
             + " anywhere;\">abc<wbr><nobr>defghi</nobr><wbr>jkl</span>",
@@ -75,7 +75,7 @@ public void testResolveWithNodesBreakBeforeSkip() {
   public void testResolveWithNothingToSplit() {
     List<String> phrases = Arrays.asList("abcdef");
     String html = "abcdef";
-    String result = HTMLProcessor.resolve(phrases, html);
+    String result = HTMLProcessor.resolve(phrases, html, "<wbr>");
     assertEquals(
         "<span style=\"word-break: keep-all; overflow-wrap: anywhere;\">abcdef</span>", result);
   }

diff --git a/java/src/test/java/com/google/budoux/ParserTest.java b/java/src/test/java/com/google/budoux/ParserTest.java
@@ -61,7 +61,7 @@ public void testTranslateHTMLString() {
     String result = parser.translateHTMLString(html);
     assertEquals(
         "<span style=\"word-break: keep-all; overflow-wrap: anywhere;\"><a"
-            + " href=\"http://example.com\">xyz<wbr>a</a>bc</span>",
+            + " href=\"http://example.com\">xyz\u200ba</a>bc</span>",
         result);
   }
 }
diff --git a/javascript/src/html_processor.ts b/javascript/src/html_processor.ts
@@ -586,48 +586,45 @@ export class HTMLProcessor {
   }
 }
 
-// eslint-disable-next-line @typescript-eslint/no-explicit-any
-type Constructor<T = {}> = new (...args: any[]) => T;
-
 /**
- * Mixin to add HTML processing support to {@link Parser}.
- * @param Base A base {@link Parser} class
- * @returns An extended {@link Parser} class with {@link HTMLProcessor}.
+ * BudouX {@link Parser} with HTML processing support.
  */
-function HTMLProcessing<TBase extends Constructor<Parser>>(Base: TBase) {
-  return class _HTMLProcessable extends Base {
-    /**
-     * Applies markups for semantic line breaks to the given HTML element.
-     * @param parentElement The input element.
-     */
-    applyElement(parentElement: HTMLElement) {
-      const htmlProcessor = new HTMLProcessor(this, {
-        separator: parentElement.ownerDocument.createElement('wbr'),
-      });
-      htmlProcessor.applyToElement(parentElement);
+export class HTMLProcessingParser extends Parser {
+  htmlProcessor: HTMLProcessor;
+
+  constructor(
+    model: {[key: string]: {[key: string]: number}},
+    htmlProcessorOptions: HTMLProcessorOptions = {
+      separator: ZWSP,
     }
+  ) {
+    super(model);
+    this.htmlProcessor = new HTMLProcessor(this, htmlProcessorOptions);
+  }
 
-    /**
-     * Translates the given HTML string to another HTML string with markups
-     * for semantic line breaks.
-     * @param html An input html string.
-     * @returns The translated HTML string.
-     */
-    translateHTMLString(html: string) {
-      if (html === '') return html;
-      const doc = parseFromString(html);
-      if (HTMLProcessor.hasChildTextNode(doc.body)) {
-        const wrapper = doc.createElement('span');
-        wrapper.append(...doc.body.childNodes);
-        doc.body.append(wrapper);
-      }
-      this.applyElement(doc.body.childNodes[0] as HTMLElement);
-      return doc.body.innerHTML;
+  /**
+   * Applies markups for semantic line breaks to the given HTML element.
+   * @param parentElement The input element.
+   */
+  applyElement(parentElement: HTMLElement) {
+    this.htmlProcessor.applyToElement(parentElement);
+  }
+
+  /**
+   * Translates the given HTML string to another HTML string with markups
+   * for semantic line breaks.
+   * @param html An input html string.
+   * @returns The translated HTML string.
+   */
+  translateHTMLString(html: string) {
+    if (html === '') return html;
+    const doc = parseFromString(html);
+    if (HTMLProcessor.hasChildTextNode(doc.body)) {
+      const wrapper = doc.createElement('span');
+      wrapper.append(...doc.body.childNodes);
+      doc.body.append(wrapper);
     }
-  };
+    this.applyElement(doc.body.childNodes[0] as HTMLElement);
+    return doc.body.innerHTML;
+  }
 }
-
-/**
- * BudouX {@link Parser} with HTML processing support.
- */
-export class HTMLProcessingParser extends HTMLProcessing(Parser) {}
diff --git a/javascript/src/tests/test_cli.ts b/javascript/src/tests/test_cli.ts
@@ -48,7 +48,7 @@ describe('cli', () => {
     const inputText = '今日は天気です。';
     const argv = ['node', 'budoux', '--html', inputText];
     const expectedStdOut =
-      '<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は<wbr>天気です。</span>';
+      '<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は\u200B天気です。</span>';
     cli(argv);
     expect(console.log).toHaveBeenCalledWith(expectedStdOut);
   });
@@ -57,7 +57,7 @@ describe('cli', () => {
     const inputText = '今日は天気です。';
     const argv = ['node', 'budoux', '-H', inputText];
     const expectedStdOut =
-      '<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は<wbr>天気です。</span>';
+      '<span style="word-break: keep-all; overflow-wrap: anywhere;">今日は\u200B天気です。</span>';
     cli(argv);
     expect(console.log).toHaveBeenCalledWith(expectedStdOut);
   });

diff --git a/javascript/src/tests/test_html_processor.ts b/javascript/src/tests/test_html_processor.ts
@@ -332,35 +332,35 @@ describe('HTMLProcessingParser.applyElement', () => {
   };
   const style = 'word-break: keep-all; overflow-wrap: anywhere;';
 
-  it('should insert WBR tags where the sentence should break.', () => {
+  it('should insert ZWSPs where the sentence should break.', () => {
     const inputHTML = '<p>xyzabcabc</p>';
-    const expectedHTML = `<p style="${style}">xyz<wbr>abc<wbr>abc</p>`;
+    const expectedHTML = `<p style="${style}">xyz\u200Babc\u200Babc</p>`;
     const model = {
       UW4: {a: 1001}, // means "should separate right before 'a'".
     };
     checkEqual(model, inputHTML, expectedHTML);
   });
 
-  it('should insert WBR tags even it overlaps with other HTML tags.', () => {
+  it('should insert ZWSPs even it overlaps with other HTML tags.', () => {
     const inputHTML = '<p>xy<a href="#">zabca</a>bc</p>';
-    const expectedHTML = `<p style="${style}">xy<a href="#">z<wbr>abc<wbr>a</a>bc</p>`;
+    const expectedHTML = `<p style="${style}">xy<a href="#">z\u200Babc\u200Ba</a>bc</p>`;
     const model = {
       UW4: {a: 1001}, // means "should separate right before 'a'".
     };
     checkEqual(model, inputHTML, expectedHTML);
   });
 
-  it('should not insert WBR tags to where input has WBR tags.', () => {
+  it('should not insert ZWSPs to where input has WBR tags already.', () => {
     const inputHTML = '<p>xyz<wbr>abcabc</p>';
-    const expectedHTML = `<p style="${style}">xyz<wbr>abc<wbr>abc</p>`;
+    const expectedHTML = `<p style="${style}">xyz<wbr>abc\u200Babc</p>`;
     const model = {
       UW4: {a: 1001}, // means "should separate right before 'a'".
     };
     checkEqual(model, inputHTML, expectedHTML);
   });
-  it('should not insert WBR tags to where input has ZWSP.', () => {
+  it('should not insert ZWSPs to where input has ZWSPs.', () => {
     const inputHTML = '<p>xyz\u200Babcabc</p>';
-    const expectedHTML = `<p style="${style}">xyz\u200babc<wbr>abc</p>`;
+    const expectedHTML = `<p style="${style}">xyz\u200babc\u200Babc</p>`;
     const model = {
       UW4: {a: 1001}, // means "should separate right before 'a'".
     };
@@ -387,7 +387,7 @@ describe('HTMLProcessingParser.translateHTMLString', () => {
   it('should output a html string with a SPAN parent with proper style attributes.', () => {
     const inputHTML = 'xyzabcd';
     const expectedHTML = `
-    <span style="word-break: keep-all; overflow-wrap: anywhere;">xyz<wbr>abcd</span>`;
+    <span style="word-break: keep-all; overflow-wrap: anywhere;">xyz\u200Babcd</span>`;
     checkEqual(defaultModel, inputHTML, expectedHTML);
   });
 
@@ -396,7 +396,7 @@ describe('HTMLProcessingParser.translateHTMLString', () => {
     const expectedHTML = `
     <p class="foo"
        style="color: red; word-break: keep-all; overflow-wrap: anywhere;"
-    >xyz<wbr>abcd</p>`;
+    >xyz\u200Babcd</p>`;
     checkEqual(defaultModel, inputHTML, expectedHTML);
   });
 
@@ -410,39 +410,39 @@ describe('HTMLProcessingParser.translateHTMLString', () => {
     const inputHTML = 'xyz<script>alert(1);</script>xyzabc';
     const expectedHTML = `<span
     style="word-break: keep-all; overflow-wrap: anywhere;"
-    >xyz<script>alert(1);</script>xyz<wbr>abc</span>`;
+    >xyz<script>alert(1);</script>xyz\u200Babc</span>`;
     checkEqual(defaultModel, inputHTML, expectedHTML);
   });
 
   it('script tags on top should be discarded by the DOMParser.', () => {
     const inputHTML = '<script>alert(1);</script>xyzabc';
     const expectedHTML = `<span
     style="word-break: keep-all; overflow-wrap: anywhere;"
-    >xyz<wbr>abc</span>`;
+    >xyz\u200Babc</span>`;
     checkEqual(defaultModel, inputHTML, expectedHTML);
   });
 
   it('should skip some specific tags.', () => {
     const inputHTML = 'xyz<code>abc</code>abc';
     const expectedHTML = `<span
     style="word-break: keep-all; overflow-wrap: anywhere;"
-    >xyz<code>abc</code><wbr>abc</span>`;
+    >xyz<code>abc</code>\u200Babc</span>`;
     checkEqual(defaultModel, inputHTML, expectedHTML);
   });
 
   it('should not ruin attributes of child elements.', () => {
     const inputHTML = 'xyza<a href="#" hidden>bc</a>abc';
     const expectedHTML = `<span
     style="word-break: keep-all; overflow-wrap: anywhere;"
-    >xyz<wbr>a<a href="#" hidden>bc</a><wbr>abc</span>`;
+    >xyz\u200Ba<a href="#" hidden>bc</a>\u200Babc</span>`;
     checkEqual(defaultModel, inputHTML, expectedHTML);
   });
 
   it('should work with emojis.', () => {
     const inputHTML = 'xyza🇯🇵🇵🇹abc';
     const expectedHTML = `<span
     style="word-break: keep-all; overflow-wrap: anywhere;"
-    >xyz<wbr>a🇯🇵🇵🇹<wbr>abc</span>`;
+    >xyz\u200Ba🇯🇵🇵🇹\u200Babc</span>`;
     checkEqual(defaultModel, inputHTML, expectedHTML);
   });
 });