diff --git a/src/BookProvider.php b/src/BookProvider.php index dc1b63a8..1df48f1f 100644 --- a/src/BookProvider.php +++ b/src/BookProvider.php @@ -3,6 +3,7 @@ namespace App; use App\Util\Api; +use App\Util\Util; use DOMDocument; use GuzzleHttp\Pool; use GuzzleHttp\Promise\PromiseInterface; @@ -182,12 +183,7 @@ protected function getDocument( $title ) { } protected function domDocumentFromHtml( $html ) { - $document = new DOMDocument( '1.0', 'UTF-8' ); - libxml_use_internal_errors( true ); - $document->loadHTML( mb_convert_encoding( str_replace( '', '', $html ), 'HTML-ENTITIES', 'UTF-8' ) ); - libxml_clear_errors(); - $document->encoding = 'UTF-8'; - return $document; + return Util::buildDOMDocumentFromHtml( $html ); } /** diff --git a/src/PageParser.php b/src/PageParser.php index c5929718..355b1945 100644 --- a/src/PageParser.php +++ b/src/PageParser.php @@ -418,15 +418,9 @@ private function cleanRedLinks() { } private function cleanReferenceLinks() { - // Get all links that contain the "style" value "mw-Ref" - $links = $this->xPath->query( '//html:a[contains(@style, "mw-Ref")]' ); - foreach ( $links as $link ) { - $href = $link->getAttribute( 'href' ); - $pos = strpos( $href, '#' ); - $link->setAttribute( 'href', substr( $href, $pos ) ); - } - // Get all links that have the "rel" attribute equals to "mw-Ref" - $links = $this->xPath->query( '//html:a[@rel="mw:referencedBy"]' ); + $links = $this->xPath->query( + '//*[@typeof="mw:Extension/ref"]/a | //a[@rel="mw:referencedBy"]' + ); foreach ( $links as $link ) { $href = $link->getAttribute( 'href' ); $pos = strpos( $href, '#' ); diff --git a/src/Util/Api.php b/src/Util/Api.php index bf743c90..24578530 100644 --- a/src/Util/Api.php +++ b/src/Util/Api.php @@ -4,7 +4,6 @@ use App\PageParser; use DateInterval; -use DOMDocument; use Exception; use GuzzleHttp\Client; use GuzzleHttp\ClientInterface; @@ -145,9 +144,7 @@ public function getAboutPage(): string { return $oldWikisourceApi->getPageAsync( 'MediaWiki:Wsexport_about' )->wait(); } } ); - // Rewrite some parts of the returned HTML. - $document = new DOMDocument( '1.0', 'UTF-8' ); - $document->loadXML( $content ); + $document = Util::buildDOMDocumentFromHtml( $content ); $parser = new PageParser( $document ); $document = $parser->getContent( true ); // Add https to protocol-relative links. diff --git a/src/Util/Util.php b/src/Util/Util.php index 38ded627..6de37bbe 100644 --- a/src/Util/Util.php +++ b/src/Util/Util.php @@ -4,6 +4,7 @@ use App\FileCache; use App\Refresh; +use DOMDocument; use DOMElement; use DOMXPath; use Exception; @@ -214,4 +215,19 @@ public static function extractErrorMessage( ?ResponseInterface $resp, RequestInt return $text ? "$message: $text" : $message; } + + /** + * Build base DOMDocument from a html string + * + * @param string $html + * @return DOMDocument + */ + public static function buildDOMDocumentFromHtml( string $html ): DOMDocument { + $document = new DOMDocument( '1.0', 'UTF-8' ); + libxml_use_internal_errors( true ); + $document->loadHTML( mb_convert_encoding( str_replace( '', '', $html ), 'HTML-ENTITIES', 'UTF-8' ) ); + libxml_clear_errors(); + $document->encoding = 'UTF-8'; + return $document; + } } diff --git a/tests/Util/ApiTest.php b/tests/Util/ApiTest.php index 35ba292a..2c98f369 100644 --- a/tests/Util/ApiTest.php +++ b/tests/Util/ApiTest.php @@ -52,7 +52,7 @@ private function mockClient( $responses ) { } public function testGetAboutPage(): void { - $api = $this->apiWithResponse( 200, [], 'Foo' ); + $api = $this->apiWithResponse( 200, [], 'Foo' ); $this->assertStringContainsString( 'Foo', $api->getAboutPage() ); } }