From 0293c7578b42f31c70d09eb7b008b5ea87d2e5fd Mon Sep 17 00:00:00 2001 From: Dayllan Maza Date: Mon, 1 Feb 2021 01:31:45 -0500 Subject: [PATCH] Fix references sometimes opening as external links I'm unsure about the repercussions of changing how we load the html content from XML to HTML but the latter seems to fix this issue. It is also how we load pages' content from parsoid. This issue was happening because `$this->xPath->query( '//html:a[contains(@style, "mw-Ref")]' )` was not working when the DOMDocument content was loaded via loadHTML. Removing the namespace from the above query fixes the book references but breaks the About page unless it is loaded as HTML. --- src/BookProvider.php | 8 ++------ src/PageParser.php | 12 +++--------- src/Util/Api.php | 5 +---- src/Util/Util.php | 16 ++++++++++++++++ tests/Util/ApiTest.php | 2 +- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/BookProvider.php b/src/BookProvider.php index dc1b63a8..1df48f1f 100644 --- a/src/BookProvider.php +++ b/src/BookProvider.php @@ -3,6 +3,7 @@ namespace App; use App\Util\Api; +use App\Util\Util; use DOMDocument; use GuzzleHttp\Pool; use GuzzleHttp\Promise\PromiseInterface; @@ -182,12 +183,7 @@ protected function getDocument( $title ) { } protected function domDocumentFromHtml( $html ) { - $document = new DOMDocument( '1.0', 'UTF-8' ); - libxml_use_internal_errors( true ); - $document->loadHTML( mb_convert_encoding( str_replace( '', '', $html ), 'HTML-ENTITIES', 'UTF-8' ) ); - libxml_clear_errors(); - $document->encoding = 'UTF-8'; - return $document; + return Util::buildDOMDocumentFromHtml( $html ); } /** diff --git a/src/PageParser.php b/src/PageParser.php index c5929718..355b1945 100644 --- a/src/PageParser.php +++ b/src/PageParser.php @@ -418,15 +418,9 @@ private function cleanRedLinks() { } private function cleanReferenceLinks() { - // Get all links that contain the "style" value "mw-Ref" - $links = $this->xPath->query( '//html:a[contains(@style, "mw-Ref")]' ); - 
foreach ( $links as $link ) { - $href = $link->getAttribute( 'href' ); - $pos = strpos( $href, '#' ); - $link->setAttribute( 'href', substr( $href, $pos ) ); - } - // Get all links that have the "rel" attribute equals to "mw-Ref" - $links = $this->xPath->query( '//html:a[@rel="mw:referencedBy"]' ); + $links = $this->xPath->query( + '//*[@typeof="mw:Extension/ref"]/a | //a[@rel="mw:referencedBy"]' + ); foreach ( $links as $link ) { $href = $link->getAttribute( 'href' ); $pos = strpos( $href, '#' ); diff --git a/src/Util/Api.php b/src/Util/Api.php index bf743c90..24578530 100644 --- a/src/Util/Api.php +++ b/src/Util/Api.php @@ -4,7 +4,6 @@ use App\PageParser; use DateInterval; -use DOMDocument; use Exception; use GuzzleHttp\Client; use GuzzleHttp\ClientInterface; @@ -145,9 +144,7 @@ public function getAboutPage(): string { return $oldWikisourceApi->getPageAsync( 'MediaWiki:Wsexport_about' )->wait(); } } ); - // Rewrite some parts of the returned HTML. - $document = new DOMDocument( '1.0', 'UTF-8' ); - $document->loadXML( $content ); + $document = Util::buildDOMDocumentFromHtml( $content ); $parser = new PageParser( $document ); $document = $parser->getContent( true ); // Add https to protocol-relative links. diff --git a/src/Util/Util.php b/src/Util/Util.php index 38ded627..6de37bbe 100644 --- a/src/Util/Util.php +++ b/src/Util/Util.php @@ -4,6 +4,7 @@ use App\FileCache; use App\Refresh; +use DOMDocument; use DOMElement; use DOMXPath; use Exception; @@ -214,4 +215,19 @@ public static function extractErrorMessage( ?ResponseInterface $resp, RequestInt return $text ? 
"$message: $text" : $message; } + + /** + * Build base DOMDocument from a html string + * + * @param string $html + * @return DOMDocument + */ + public static function buildDOMDocumentFromHtml( string $html ): DOMDocument { + $document = new DOMDocument( '1.0', 'UTF-8' ); + libxml_use_internal_errors( true ); + $document->loadHTML( mb_convert_encoding( str_replace( '', '', $html ), 'HTML-ENTITIES', 'UTF-8' ) ); + libxml_clear_errors(); + $document->encoding = 'UTF-8'; + return $document; + } } diff --git a/tests/Util/ApiTest.php b/tests/Util/ApiTest.php index 35ba292a..2c98f369 100644 --- a/tests/Util/ApiTest.php +++ b/tests/Util/ApiTest.php @@ -52,7 +52,7 @@ private function mockClient( $responses ) { } public function testGetAboutPage(): void { - $api = $this->apiWithResponse( 200, [], 'Foo' ); + $api = $this->apiWithResponse( 200, [], 'Foo' ); $this->assertStringContainsString( 'Foo', $api->getAboutPage() ); } }