From f6759ca1c55031b8b95efebfb8a4fd767c40be5a Mon Sep 17 00:00:00 2001 From: hmonroy Date: Mon, 26 Oct 2020 17:45:36 -0700 Subject: [PATCH] Migrate page content retrieval to Parsoid's API Point getPageAsync to new Parsoid API 'api/rest_v1' Bug: T264788 --- README.md | 2 + src/BookProvider.php | 4 +- src/Cleaner/BookCleanerEpub.php | 2 + src/PageParser.php | 59 +++++++++++++++++-- src/Refresh.php | 2 +- src/Util/Api.php | 46 ++++----------- src/Util/Util.php | 12 +++- tests/Book/BookProviderTest.php | 4 +- tests/Book/RefreshTest.php | 21 +++---- .../fixtures/Tales_of_Unrest/Navigation.html | 26 ++++---- tests/Http/BookTest.php | 2 +- 11 files changed, 110 insertions(+), 70 deletions(-) diff --git a/README.md b/README.md index d7ff2d8b..aa880862 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,8 @@ docker-compose exec wsexport ./bin/console app:install Wikisource Export should be up at http://localhost:8888/ +### Cache +Go to `/refresh` to clear the cache ### Setup Xdebug diff --git a/src/BookProvider.php b/src/BookProvider.php index 12188014..28f0af18 100644 --- a/src/BookProvider.php +++ b/src/BookProvider.php @@ -38,7 +38,7 @@ public function __construct( Api $api, array $options ) { /** * return all the data on a book needed to export it * @param $title string the title of the main page of the book in Wikisource - * @param $isMetadata bool only retrive metadata on the book + * @param $isMetadata bool only retrieve metadata on the book * @return Book */ public function get( $title, $isMetadata = false ) { @@ -123,6 +123,8 @@ public function getMetadata( $title, $isMetadata, DOMDocument $doc ) { } $chapterTitles = $parser->getFullChaptersList( $title, $pageList, $namespaces ); $chapters = $this->getPages( $chapterTitles ); + + // Generate all the chapters foreach ( $chapters as $chapter_key => $chapter ) { $parser = new PageParser( $chapter->content ); if ( $parser->metadataIsSet( 'ws-noinclude' ) ) { diff --git a/src/Cleaner/BookCleanerEpub.php b/src/Cleaner/BookCleanerEpub.php index 0fca6863..b6981d09 100644 --- a/src/Cleaner/BookCleanerEpub.php +++ b/src/Cleaner/BookCleanerEpub.php @@ -255,6 +255,8 @@ protected function setLinks( DOMDocument $dom ) { $node->setAttribute( 'href', 'http:' . $href ); } elseif ( substr( $href, 0, 1 ) === '/' ) { $node->setAttribute( 'href', $this->baseUrl . $href ); + } elseif ( substr( $href, 0, 2 ) === './' ) { + $node->setAttribute( 'href', $this->baseUrl . '/wiki/' . substr( $href, 2 ) ); } } } diff --git a/src/PageParser.php b/src/PageParser.php index 60a954b3..e5e542b0 100644 --- a/src/PageParser.php +++ b/src/PageParser.php @@ -2,7 +2,6 @@ namespace App; -use App\Util\Api; use DOMDocument; use DOMElement; use DOMXPath; @@ -51,7 +50,17 @@ public function metadataIsSet( $id ) { * @return Page[] */ public function getChaptersList( $pageList, $namespaces ) { - $list = $this->xPath->query( '//*[@id="ws-summary" or contains(@class,"ws-summary")]/descendant::a[not(contains(@href,"action=edit") or contains(@class,"extiw") or contains(@class,"external") or contains(@class,"internal") or contains(@class,"image"))]' ); + $list = $this->xPath->query( '//*[ + @id="ws-summary" or + contains(@class,"ws-summary")]/descendant::a[ + not( + contains(@class,"new") or + contains(@href,"action=edit") or + contains(@class,"extiw") or contains(@class,"external") or + contains(@class,"internal") or + contains(@class,"image") + ) + ]' ); $chapters = []; /** @var DOMElement $link */ foreach ( $list as $link ) { @@ -61,7 +70,14 @@ public function getChaptersList( $pageList, $namespaces ) { // If there's no path component, it can't be a link to a chapter. continue; } - $title = urldecode( substr( $urlParts['path'], strlen( '/wiki/' ) ) ); + + // Remove string "/wiki/" if it's found in $urlParts['path'] + if ( substr( $urlParts['path'], 0, strlen( '/wiki/' ) ) === "/wiki/" ) { + $title = urldecode( substr( $urlParts['path'], strlen( '/wiki/' ) ) ); + } else { + $title = urldecode( substr( $urlParts['path'], strlen( '/' ) ) ); + } + $parts = explode( ':', $title ); // Include the chapter if it's not already present and is a main-namespace page. if ( $title != '' && !in_array( $title, $pageList ) && !in_array( $parts[0], $namespaces ) ) { @@ -83,7 +99,17 @@ public function getChaptersList( $pageList, $namespaces ) { public function getFullChaptersList( $title, $pageList, $namespaces ) { $chapters = $this->getChaptersList( $pageList, $namespaces ); if ( empty( $chapters ) ) { - $list = $this->xPath->query( '//a[contains(@href,"' . Api::mediawikiUrlEncode( $title ) . '") and not(contains(@class,"extiw") or contains(@class,"external") or contains(@href,"#") or contains(@class,"internal") or contains(@href,"action=edit") or contains(@title,"/Texte entier") or contains(@class,"image"))]' ); + $list = $this->xPath->query( '//a[contains(@href,"' . $title . '") and + not( + contains(@class,"new") or + contains(@class,"extiw") or + contains(@class,"external") or + contains(@href,"#") or + contains(@class,"internal") or + contains(@href,"action=edit") or + contains(@title,"/Texte entier") or + contains(@class,"image") + )]' ); /** @var DOMElement $link */ foreach ( $list as $link ) { $title = str_replace( ' ', '_', $link->getAttribute( 'title' ) ); @@ -126,7 +152,11 @@ public function getPicturesList() { } - $list = $this->xPath->query( '//a[contains(@class,"image")]' ); + $list = $this->xPath->query( ' + //a[contains(@class,"image")] | + //figure[contains(@typeof,"mw:Image")] | + //figure-inline[contains(@typeof,"mw:Image")]' + ); /** @var DOMElement $node */ foreach ( $list as $node ) { /** @var DOMElement $img */ @@ -203,6 +233,7 @@ public function getContent( $isMainPage ) { $this->removeNodesWithXpath( '//span[@class="editsection" or @class="mw-editsection"]' ); $this->removeNodesWithXpath( '//a[@class="mw-headline-anchor"]' ); $this->removeNodesWithXpath( '//div[@class="mediaContainer"]' ); + $this->removeNodesWithXpath( '//link[@rel="mw:PageProp/Category"]' ); $this->deprecatedNodes( 'big', 'span', 'font-size:large;' ); $this->deprecatedNodes( 'center', 'div', 'text-align:center;' ); $this->deprecatedNodes( 'strike', 'span', 'text-decoration:line-through;' ); @@ -229,6 +260,7 @@ public function getContent( $isMainPage ) { $this->cleanIds(); $this->cleanRedLinks(); + $this->cleanReferenceLinks(); $this->moveStyleToHead(); return $this->xPath->document; @@ -314,6 +346,23 @@ private function cleanRedLinks() { } } + private function cleanReferenceLinks() { + // Get all links that contain the "style" value "mw-Ref" + $links = $this->xPath->query( '//html:a[contains(@style, "mw-Ref")]' ); + foreach ( $links as $link ) { + $href = $link->getAttribute( 'href' ); + $pos = strpos( $href, '#' ); + $link->setAttribute( 'href', substr( $href, $pos ) ); + } + // Get all links that have the "rel" attribute equals to "mw-Ref" + $links = $this->xPath->query( '//html:a[@rel="mw:referencedBy"]' ); + foreach ( $links as $link ) { + $href = $link->getAttribute( 'href' ); + $pos = strpos( $href, '#' ); + $link->setAttribute( 'href', substr( $href, $pos ) ); + } + } + private function moveStyleToHead() { /** @var DOMElement $head */ foreach ( $this->xPath->query( '//head' ) as $head ) { diff --git a/src/Refresh.php b/src/Refresh.php index 10732478..c531cd0c 100644 --- a/src/Refresh.php +++ b/src/Refresh.php @@ -74,7 +74,7 @@ protected function getAboutXhtmlWikisource() { // Add https to protocol-relative links. $aboutHtml = str_replace( 'href="//', 'href="https://', $document->saveXML() ); // Fully qualify unqualified links. - $aboutHtml = str_replace( 'href="/', 'href="https://' . $this->api->getDomainName() . '/', $aboutHtml ); + $aboutHtml = str_replace( 'href="./', 'href="https://' . $this->api->getDomainName() . '/wiki/', $aboutHtml ); $this->setTempFileContent( 'about.xhtml', $aboutHtml ); } } diff --git a/src/Util/Api.php b/src/Util/Api.php index 5eba0d3e..321d1865 100644 --- a/src/Util/Api.php +++ b/src/Util/Api.php @@ -143,7 +143,6 @@ public function query( $params ) { */ public function queryAsync( $params ) { $params += [ 'action' => 'query', 'format' => 'json' ]; - return $this->getAsync( 'https://' . $this->getDomainName() . '/w/api.php', [ 'query' => $params ] @@ -189,30 +188,16 @@ public function completeQuery( $params ) { * @return PromiseInterface promise with the content of a page */ public function getPageAsync( $title ) { - return $this->queryAsync( [ - 'titles' => $title, - 'prop' => 'revisions', - 'rvprop' => 'content', - 'rvparse' => true - ] )->then( function ( array $result ) { - return $this->parseGetPageResponse( $result ); - } ); - } - - private function parseGetPageResponse( $response ) { - $pages = $response['query']['pages'] ?? []; - foreach ( $pages as $page ) { - $title = $page['title']; - if ( isset( $page['revisions'] ) ) { - foreach ( $page['revisions'] as $revision ) { - return Util::getXhtmlFromContent( $this->getLang(), $revision['*'], $title ); - } - } - } - if ( !isset( $title ) ) { - throw new HttpException( 500, 'No page information found in response' ); - } - throw new NotFoundHttpException( "Page revision not found for: $title" ); + $url = 'https://' . $this->getDomainName() . '/api/rest_v1/page/html/' . urlencode( $title ); + return $this->getAsync( $url ) + ->then( + function ( string $result ) use ( $title ) { + return Util::getXhtmlFromContent( $this->getLang(), $result, $title ); + }, + function ( $reason ) use ( $title ) { + throw new NotFoundHttpException( "Page not found for: $title" ); + } + ); } /** @@ -223,17 +208,6 @@ public function get( $url ) { return $this->client->get( $url )->getBody()->getContents(); } - /** - * @param string $url - * @return string the url encoded like mediawiki does. - */ - public static function mediawikiUrlEncode( string $url ): string { - $search = [ '%21', '%24', '%28', '%29', '%2A', '%2C', '%2D', '%2E', '%2F', '%3A', '%3B', '%40' ]; - $replace = [ '!', '$', '(', ')', '*', ',', '-', '.', '/', ':', ';', '@' ]; - - return str_replace( $search, $replace, urlencode( str_replace( ' ', '_', $url ) ) ); - } - /** * @param LoggerInterface $logger * @return ClientInterface diff --git a/src/Util/Util.php b/src/Util/Util.php index 58df65bb..9ba757a5 100644 --- a/src/Util/Util.php +++ b/src/Util/Util.php @@ -28,6 +28,8 @@ class Util { public static function wikisourceUrl( $lang, $page = '' ) { if ( $lang === '' ) { $url = 'https://wikisource.org'; + } elseif ( $lang === 'beta' ) { + $url = 'https://en.wikisource.beta.wmflabs.org'; } else { $url = 'https://' . $lang . '.wikisource.org'; } @@ -63,6 +65,10 @@ public static function getFile( $file ) { * @return string */ public static function getXhtmlFromContent( $lang, $content, $title = ' ' ) { + $bodyPosition = stripos( $content, '#isU', '', $content ); } @@ -70,8 +76,12 @@ public static function getXhtmlFromContent( $lang, $content, $title = ' ' ) { if ( $lang != null ) { $html .= ' xml:lang="' . $lang . '" dir="' . static::getLanguageDirection( $lang ) . '"'; } + $html .= '>' . $title . ''; - return $html . '>' . $title . '' . $content . ''; + if ( $bodyPosition ) { + return $html . $content; + } + return $html . '' . $content . ''; } public static function getTempFile( Api $api, $lang, $name ) { diff --git a/tests/Book/BookProviderTest.php b/tests/Book/BookProviderTest.php index 20967ed3..92dd4379 100644 --- a/tests/Book/BookProviderTest.php +++ b/tests/Book/BookProviderTest.php @@ -29,8 +29,8 @@ public function setUp(): void { new Response( 200, [ 'Content-Type' => 'application/json' ], json_encode( $creditResponse ) ), // The rest of these responses are required for mocking the Refresh process (namespaces, 'about' page, etc.) new Response( 200, [], '' ), - new Response( 200, [], '' ), - new Response( 200, [], '' ), + new Response( 404, [], '' ), // mock returning 404 in first api call in Refresh::getAboutXhtmlWikisource + new Response( 200, [], '' ), // mock getting content from '$oldWikisourceApi' in Refresh::getAboutXhtmlWikisource new Response( 200, [], json_encode( [ 'query' => [ 'namespaces' => [ [ '*' => 'test' ] ], 'namespacealiases' => [] ] ] ) ), ]; $this->mockHandler = new MockHandler( $responses ); diff --git a/tests/Book/RefreshTest.php b/tests/Book/RefreshTest.php index 77752be2..25b167c5 100644 --- a/tests/Book/RefreshTest.php +++ b/tests/Book/RefreshTest.php @@ -84,16 +84,17 @@ private function mockCssWikisourceResponse( $content ) { } private function mockAboutWikisourceResponse( $title, $content ) { - return new Response( 200, [ 'Content' => 'application/json' ], json_encode( [ - 'query' => [ - 'pages' => [ - [ - 'title' => $title, - 'revisions' => [ [ '*' => $content ] ] - ], - ] - ] - ] ) ); + return new Response( 200, [ 'Content' => 'application/json' ], + ' + + + + + + ' . $title . ' + + ' . $content . '' + ); } private function mockNamespacesListResponse( $namespaces ) { diff --git a/tests/Book/fixtures/Tales_of_Unrest/Navigation.html b/tests/Book/fixtures/Tales_of_Unrest/Navigation.html index 7901b7dc..e5399ba2 100644 --- a/tests/Book/fixtures/Tales_of_Unrest/Navigation.html +++ b/tests/Book/fixtures/Tales_of_Unrest/Navigation.html @@ -37,28 +37,28 @@
FOR THE SAKE OF OLD DAYS
diff --git a/tests/Http/BookTest.php b/tests/Http/BookTest.php index 2ea89159..598a66f1 100644 --- a/tests/Http/BookTest.php +++ b/tests/Http/BookTest.php @@ -45,7 +45,7 @@ public function testGetPage( $title, $language ) { public function testGetNonExistingTitleDisplaysError() { $client = static::createClient(); $client->request( 'GET', '/book.php', [ 'page' => 'xxx' ] ); - $this->assertStringContainsString( 'Page revision not found', $client->getResponse()->getContent() ); + $this->assertStringContainsString( 'Page not found', $client->getResponse()->getContent() ); $this->assertSame( 404, $client->getResponse()->getStatusCode() ); }