Skip to content

Commit

Permalink
Merge pull request #322 from wikimedia/T270372-ref-external-links
Browse files Browse the repository at this point in the history
Fix references sometimes opening as external links
  • Loading branch information
samwilson authored Feb 15, 2021
2 parents e1f0e4a + 0293c75 commit 8f7babd
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 20 deletions.
8 changes: 2 additions & 6 deletions src/BookProvider.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace App;

use App\Util\Api;
use App\Util\Util;
use DOMDocument;
use GuzzleHttp\Pool;
use GuzzleHttp\Promise\PromiseInterface;
Expand Down Expand Up @@ -182,12 +183,7 @@ protected function getDocument( $title ) {
}

/**
 * Parse an HTML string into a DOMDocument.
 *
 * Delegates to Util::buildDOMDocumentFromHtml() so that the HTML-to-DOM
 * conversion is implemented in a single place.
 *
 * @param string $html The HTML to parse.
 * @return DOMDocument
 */
protected function domDocumentFromHtml( $html ) {
	return Util::buildDOMDocumentFromHtml( $html );
}

/**
Expand Down
12 changes: 3 additions & 9 deletions src/PageParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -418,15 +418,9 @@ private function cleanRedLinks() {
}

private function cleanReferenceLinks() {
// Get all links that contain the "style" value "mw-Ref"
$links = $this->xPath->query( '//html:a[contains(@style, "mw-Ref")]' );
foreach ( $links as $link ) {
$href = $link->getAttribute( 'href' );
$pos = strpos( $href, '#' );
$link->setAttribute( 'href', substr( $href, $pos ) );
}
// Get all links that have the "rel" attribute equals to "mw-Ref"
$links = $this->xPath->query( '//html:a[@rel="mw:referencedBy"]' );
$links = $this->xPath->query(
'//*[@typeof="mw:Extension/ref"]/a | //a[@rel="mw:referencedBy"]'
);
foreach ( $links as $link ) {
$href = $link->getAttribute( 'href' );
$pos = strpos( $href, '#' );
Expand Down
5 changes: 1 addition & 4 deletions src/Util/Api.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

use App\PageParser;
use DateInterval;
use DOMDocument;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\ClientInterface;
Expand Down Expand Up @@ -145,9 +144,7 @@ public function getAboutPage(): string {
return $oldWikisourceApi->getPageAsync( 'MediaWiki:Wsexport_about' )->wait();
}
} );
// Rewrite some parts of the returned HTML.
$document = new DOMDocument( '1.0', 'UTF-8' );
$document->loadXML( $content );
$document = Util::buildDOMDocumentFromHtml( $content );
$parser = new PageParser( $document );
$document = $parser->getContent( true );
// Add https to protocol-relative links.
Expand Down
16 changes: 16 additions & 0 deletions src/Util/Util.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use App\FileCache;
use App\Refresh;
use DOMDocument;
use DOMElement;
use DOMXPath;
use Exception;
Expand Down Expand Up @@ -214,4 +215,19 @@ public static function extractErrorMessage( ?ResponseInterface $resp, RequestInt

return $text ? "$message: $text" : $message;
}

/**
 * Build a DOMDocument from an HTML string.
 *
 * Every occurrence of the exact declaration `<?xml version="1.0" encoding="UTF-8" ?>`
 * is stripped, and all non-ASCII characters are turned into numeric character
 * references before the markup is handed to the HTML parser. Parser warnings are
 * collected internally and discarded rather than emitted as PHP warnings.
 *
 * The libxml "internal errors" flag is restored to its previous value before
 * returning, so calling this method does not change global libxml state.
 *
 * @param string $html HTML markup, expected to be UTF-8 encoded.
 * @return DOMDocument The parsed document, with its encoding set to UTF-8.
 */
public static function buildDOMDocumentFromHtml( string $html ): DOMDocument {
	$html = str_replace( '<?xml version="1.0" encoding="UTF-8" ?>', '', $html );
	// Replaces mb_convert_encoding( …, 'HTML-ENTITIES', 'UTF-8' ), which is deprecated
	// since PHP 8.2. The convmap covers every code point from U+0080 upwards.
	$html = mb_encode_numericentity( $html, [ 0x80, 0x10FFFF, 0, 0x1FFFFF ], 'UTF-8' );

	$document = new DOMDocument( '1.0', 'UTF-8' );
	// Remember the caller's setting so it can be put back afterwards.
	$previousSetting = libxml_use_internal_errors( true );
	try {
		$document->loadHTML( $html );
	} finally {
		libxml_clear_errors();
		libxml_use_internal_errors( $previousSetting );
	}
	$document->encoding = 'UTF-8';
	return $document;
}
}
2 changes: 1 addition & 1 deletion tests/Util/ApiTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ private function mockClient( $responses ) {
}

/**
 * The about page's relative "./Foo" link must come back as an absolute
 * https://en.wikisource.org/wiki/ URL.
 */
public function testGetAboutPage(): void {
	// The mocked response body is a complete HTML document with an <html> root.
	$api = $this->apiWithResponse( 200, [], '<html><body><a href="./Foo">Foo</a></body></html>' );
	$this->assertStringContainsString(
		'<body><a href="https://en.wikisource.org/wiki/Foo">Foo</a></body>',
		$api->getAboutPage()
	);
}
}

0 comments on commit 8f7babd

Please sign in to comment.