Skip to content

Commit

Permalink
Merge pull request #267 from wsexport/migrate-to-parsoid-api
Browse files Browse the repository at this point in the history
Migrate to parsoid api
  • Loading branch information
samwilson authored Dec 14, 2020
2 parents 1b426f1 + f6759ca commit 526a5c0
Show file tree
Hide file tree
Showing 11 changed files with 110 additions and 70 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ docker-compose exec wsexport ./bin/console app:install

Wikisource Export should be up at http://localhost:8888/

### Cache
Go to `/refresh` to clear the cache

### Setup Xdebug
Xdebug is disabled by default. If you need to enable it, you can do so via an environment variable by creating a `./docker/docker-compose.override.yml` file with the following content:
Expand Down
4 changes: 3 additions & 1 deletion src/BookProvider.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public function __construct( Api $api, array $options ) {
/**
* return all the data on a book needed to export it
* @param $title string the title of the main page of the book in Wikisource
* @param $isMetadata bool only retrive metadata on the book
* @param $isMetadata bool only retrieve metadata on the book
* @return Book
*/
public function get( $title, $isMetadata = false ) {
Expand Down Expand Up @@ -123,6 +123,8 @@ public function getMetadata( $title, $isMetadata, DOMDocument $doc ) {
}
$chapterTitles = $parser->getFullChaptersList( $title, $pageList, $namespaces );
$chapters = $this->getPages( $chapterTitles );

// Generate all the chapters
foreach ( $chapters as $chapter_key => $chapter ) {
$parser = new PageParser( $chapter->content );
if ( $parser->metadataIsSet( 'ws-noinclude' ) ) {
Expand Down
2 changes: 2 additions & 0 deletions src/Cleaner/BookCleanerEpub.php
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ protected function setLinks( DOMDocument $dom ) {
$node->setAttribute( 'href', 'http:' . $href );
} elseif ( substr( $href, 0, 1 ) === '/' ) {
$node->setAttribute( 'href', $this->baseUrl . $href );
} elseif ( substr( $href, 0, 2 ) === './' ) {
$node->setAttribute( 'href', $this->baseUrl . '/wiki/' . substr( $href, 2 ) );
}
}
}
Expand Down
59 changes: 54 additions & 5 deletions src/PageParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

namespace App;

use App\Util\Api;
use DOMDocument;
use DOMElement;
use DOMXPath;
Expand Down Expand Up @@ -51,7 +50,17 @@ public function metadataIsSet( $id ) {
* @return Page[]
*/
public function getChaptersList( $pageList, $namespaces ) {
$list = $this->xPath->query( '//*[@id="ws-summary" or contains(@class,"ws-summary")]/descendant::a[not(contains(@href,"action=edit") or contains(@class,"extiw") or contains(@class,"external") or contains(@class,"internal") or contains(@class,"image"))]' );
$list = $this->xPath->query( '//*[
@id="ws-summary" or
contains(@class,"ws-summary")]/descendant::a[
not(
contains(@class,"new") or
contains(@href,"action=edit") or
contains(@class,"extiw") or contains(@class,"external") or
contains(@class,"internal") or
contains(@class,"image")
)
]' );
$chapters = [];
/** @var DOMElement $link */
foreach ( $list as $link ) {
Expand All @@ -61,7 +70,14 @@ public function getChaptersList( $pageList, $namespaces ) {
// If there's no path component, it can't be a link to a chapter.
continue;
}
$title = urldecode( substr( $urlParts['path'], strlen( '/wiki/' ) ) );

// Remove string "/wiki/" if it's found in $urlParts['path']
if ( substr( $urlParts['path'], 0, strlen( '/wiki/' ) ) === "/wiki/" ) {
$title = urldecode( substr( $urlParts['path'], strlen( '/wiki/' ) ) );
} else {
$title = urldecode( substr( $urlParts['path'], strlen( '/' ) ) );
}

$parts = explode( ':', $title );
// Include the chapter if it's not already present and is a main-namespace page.
if ( $title != '' && !in_array( $title, $pageList ) && !in_array( $parts[0], $namespaces ) ) {
Expand All @@ -83,7 +99,17 @@ public function getChaptersList( $pageList, $namespaces ) {
public function getFullChaptersList( $title, $pageList, $namespaces ) {
$chapters = $this->getChaptersList( $pageList, $namespaces );
if ( empty( $chapters ) ) {
$list = $this->xPath->query( '//a[contains(@href,"' . Api::mediawikiUrlEncode( $title ) . '") and not(contains(@class,"extiw") or contains(@class,"external") or contains(@href,"#") or contains(@class,"internal") or contains(@href,"action=edit") or contains(@title,"/Texte entier") or contains(@class,"image"))]' );
$list = $this->xPath->query( '//a[contains(@href,"' . $title . '") and
not(
contains(@class,"new") or
contains(@class,"extiw") or
contains(@class,"external") or
contains(@href,"#") or
contains(@class,"internal") or
contains(@href,"action=edit") or
contains(@title,"/Texte entier") or
contains(@class,"image")
)]' );
/** @var DOMElement $link */
foreach ( $list as $link ) {
$title = str_replace( ' ', '_', $link->getAttribute( 'title' ) );
Expand Down Expand Up @@ -126,7 +152,11 @@ public function getPicturesList() {

}

$list = $this->xPath->query( '//a[contains(@class,"image")]' );
$list = $this->xPath->query( '
//a[contains(@class,"image")] |
//figure[contains(@typeof,"mw:Image")] |
//figure-inline[contains(@typeof,"mw:Image")]'
);
/** @var DOMElement $node */
foreach ( $list as $node ) {
/** @var DOMElement $img */
Expand Down Expand Up @@ -203,6 +233,7 @@ public function getContent( $isMainPage ) {
$this->removeNodesWithXpath( '//span[@class="editsection" or @class="mw-editsection"]' );
$this->removeNodesWithXpath( '//a[@class="mw-headline-anchor"]' );
$this->removeNodesWithXpath( '//div[@class="mediaContainer"]' );
$this->removeNodesWithXpath( '//link[@rel="mw:PageProp/Category"]' );
$this->deprecatedNodes( 'big', 'span', 'font-size:large;' );
$this->deprecatedNodes( 'center', 'div', 'text-align:center;' );
$this->deprecatedNodes( 'strike', 'span', 'text-decoration:line-through;' );
Expand All @@ -229,6 +260,7 @@ public function getContent( $isMainPage ) {

$this->cleanIds();
$this->cleanRedLinks();
$this->cleanReferenceLinks();
$this->moveStyleToHead();

return $this->xPath->document;
Expand Down Expand Up @@ -314,6 +346,23 @@ private function cleanRedLinks() {
}
}

/**
 * Reduce the "href" of reference links to its fragment part.
 *
 * Two kinds of anchors are selected: those whose "style" attribute contains
 * "mw-Ref", and those whose "rel" attribute is exactly "mw:referencedBy".
 * For each of them everything before the first "#" in the "href" is dropped,
 * so the link points at an anchor inside the current document.
 *
 * Anchors whose "href" has no "#" are left untouched. Previously such anchors
 * had their attribute rewritten to the same value, which also added an empty
 * href="" to anchors that had no "href" attribute at all.
 */
private function cleanReferenceLinks() {
	// One union query replaces the two identical loops; an anchor matching both
	// conditions only needs to be processed once because the rewrite is idempotent.
	$links = $this->xPath->query(
		'//html:a[contains(@style, "mw-Ref")] | //html:a[@rel="mw:referencedBy"]'
	);
	/** @var DOMElement $link */
	foreach ( $links as $link ) {
		$href = $link->getAttribute( 'href' );
		$pos = strpos( $href, '#' );
		if ( $pos === false ) {
			// No fragment to keep: do not touch (or create) the attribute.
			continue;
		}
		$link->setAttribute( 'href', substr( $href, $pos ) );
	}
}

private function moveStyleToHead() {
/** @var DOMElement $head */
foreach ( $this->xPath->query( '//head' ) as $head ) {
Expand Down
2 changes: 1 addition & 1 deletion src/Refresh.php
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ protected function getAboutXhtmlWikisource() {
// Add https to protocol-relative links.
$aboutHtml = str_replace( 'href="//', 'href="https://', $document->saveXML() );
// Fully qualify unqualified links.
$aboutHtml = str_replace( 'href="/', 'href="https://' . $this->api->getDomainName() . '/', $aboutHtml );
$aboutHtml = str_replace( 'href="./', 'href="https://' . $this->api->getDomainName() . '/wiki/', $aboutHtml );
$this->setTempFileContent( 'about.xhtml', $aboutHtml );
}
}
Expand Down
46 changes: 10 additions & 36 deletions src/Util/Api.php
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,6 @@ public function query( $params ) {
*/
public function queryAsync( $params ) {
$params += [ 'action' => 'query', 'format' => 'json' ];

return $this->getAsync(
'https://' . $this->getDomainName() . '/w/api.php',
[ 'query' => $params ]
Expand Down Expand Up @@ -189,30 +188,16 @@ public function completeQuery( $params ) {
* @return PromiseInterface promise with the content of a page
*/
public function getPageAsync( $title ) {
return $this->queryAsync( [
'titles' => $title,
'prop' => 'revisions',
'rvprop' => 'content',
'rvparse' => true
] )->then( function ( array $result ) {
return $this->parseGetPageResponse( $result );
} );
}

private function parseGetPageResponse( $response ) {
$pages = $response['query']['pages'] ?? [];
foreach ( $pages as $page ) {
$title = $page['title'];
if ( isset( $page['revisions'] ) ) {
foreach ( $page['revisions'] as $revision ) {
return Util::getXhtmlFromContent( $this->getLang(), $revision['*'], $title );
}
}
}
if ( !isset( $title ) ) {
throw new HttpException( 500, 'No page information found in response' );
}
throw new NotFoundHttpException( "Page revision not found for: $title" );
$url = 'https://' . $this->getDomainName() . '/api/rest_v1/page/html/' . urlencode( $title );
return $this->getAsync( $url )
->then(
function ( string $result ) use ( $title ) {
return Util::getXhtmlFromContent( $this->getLang(), $result, $title );
},
function ( $reason ) use ( $title ) {
throw new NotFoundHttpException( "Page not found for: $title" );
}
);
}

/**
Expand All @@ -223,17 +208,6 @@ public function get( $url ) {
return $this->client->get( $url )->getBody()->getContents();
}

/**
 * Encode a string for use in a URL the way MediaWiki does.
 *
 * Spaces are turned into underscores, the result is passed through
 * urlencode(), and the percent-escapes of a fixed set of punctuation
 * characters are then converted back to the literal characters.
 *
 * @param string $url
 * @return string the url encoded like mediawiki does.
 */
public static function mediawikiUrlEncode( string $url ): string {
	// Percent-escape => literal character that is restored after urlencode().
	$literals = [
		'%21' => '!',
		'%24' => '$',
		'%28' => '(',
		'%29' => ')',
		'%2A' => '*',
		'%2C' => ',',
		'%2D' => '-',
		'%2E' => '.',
		'%2F' => '/',
		'%3A' => ':',
		'%3B' => ';',
		'%40' => '@',
	];

	$underscored = str_replace( ' ', '_', $url );
	$encoded = urlencode( $underscored );

	return str_replace( array_keys( $literals ), array_values( $literals ), $encoded );
}

/**
* @param LoggerInterface $logger
* @return ClientInterface
Expand Down
12 changes: 11 additions & 1 deletion src/Util/Util.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class Util {
public static function wikisourceUrl( $lang, $page = '' ) {
if ( $lang === '' ) {
$url = 'https://wikisource.org';
} elseif ( $lang === 'beta' ) {
$url = 'https://en.wikisource.beta.wmflabs.org';
} else {
$url = 'https://' . $lang . '.wikisource.org';
}
Expand Down Expand Up @@ -63,15 +65,23 @@ public static function getFile( $file ) {
* @return string
*/
public static function getXhtmlFromContent( $lang, $content, $title = ' ' ) {
$bodyPosition = stripos( $content, '<body' );
// Remove all content before <body tag
$content = substr( $content, $bodyPosition );

if ( $content != '' ) {
$content = preg_replace( '#<\!--(.+)-->#isU', '', $content );
}
$html = '<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml"';
if ( $lang != null ) {
$html .= ' xml:lang="' . $lang . '" dir="' . static::getLanguageDirection( $lang ) . '"';
}
$html .= '><head><meta content="application/xhtml+xml;charset=UTF-8" http-equiv="default-style" /><link type="text/css" rel="stylesheet" href="main.css" /><title>' . $title . '</title></head>';

return $html . '><head><meta content="application/xhtml+xml;charset=UTF-8" http-equiv="default-style" /><link type="text/css" rel="stylesheet" href="main.css" /><title>' . $title . '</title></head><body>' . $content . '</body></html>';
if ( $bodyPosition ) {
return $html . $content;
}
return $html . '<body>' . $content . '</body></html>';
}

public static function getTempFile( Api $api, $lang, $name ) {
Expand Down
4 changes: 2 additions & 2 deletions tests/Book/BookProviderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ public function setUp(): void {
new Response( 200, [ 'Content-Type' => 'application/json' ], json_encode( $creditResponse ) ),
// The rest of these responses are required for mocking the Refresh process (namespaces, 'about' page, etc.)
new Response( 200, [], '' ),
new Response( 200, [], '' ),
new Response( 200, [], '' ),
new Response( 404, [], '' ), // mock returning 404 in first api call in Refresh::getAboutXhtmlWikisource
new Response( 200, [], '' ), // mock getting content from '$oldWikisourceApi' in Refresh::getAboutXhtmlWikisource
new Response( 200, [], json_encode( [ 'query' => [ 'namespaces' => [ [ '*' => 'test' ] ], 'namespacealiases' => [] ] ] ) ),
];
$this->mockHandler = new MockHandler( $responses );
Expand Down
21 changes: 11 additions & 10 deletions tests/Book/RefreshTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,17 @@ private function mockCssWikisourceResponse( $content ) {
}

private function mockAboutWikisourceResponse( $title, $content ) {
return new Response( 200, [ 'Content' => 'application/json' ], json_encode( [
'query' => [
'pages' => [
[
'title' => $title,
'revisions' => [ [ '*' => $content ] ]
],
]
]
] ) );
return new Response( 200, [ 'Content' => 'application/json' ],
'<!DOCTYPE html>
<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://en.wikisource.org/wiki/Special:Redirect/revision/2952249">
<head prefix="mwr: https://en.wikisource.org/wiki/Special:Redirect/"><meta property="mw:TimeUuid" content="27feca60-13e5-11eb-ae2c-cd9b7fbfbfd2"/>
<meta charset="utf-8"/><meta property="mw:pageId" content="791503"/><meta property="mw:pageNamespace" content="0"/>
<link rel="dc:replaces" resource="mwr:revision/2952206"/><meta property="mw:revisionSHA1" content="3b67a798e367dda2bebc6a7a6f272ffd7cd7bfcf"/>
<meta property="dc:modified" content="2011-06-11T09:02:29.000Z"/><meta property="mw:html:version" content="2.1.0"/>
<link rel="dc:isVersionOf" href="//en.wikisource.org/wiki/' . urlencode( $title ) . '"/><title>' . $title . '</title>
<base href="//en.wikisource.org/wiki/"/><link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=mediawiki.skinning.content.parsoid%7Cmediawiki.skinning.interface%7Csite.styles%7Cmediawiki.page.gallery.styles%7Cext.cite.style%7Cext.cite.styles&amp;only=styles&amp;skin=vector"/><meta http-equiv="content-language" content="en"/><meta http-equiv="vary" content="Accept"/></head>
<body>' . $content . '</body></html>'
);
}

private function mockNamespacesListResponse( $namespaces ) {
Expand Down
26 changes: 13 additions & 13 deletions tests/Book/fixtures/Tales_of_Unrest/Navigation.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,28 +37,28 @@
<center><i>FOR THE SAKE OF OLD DAYS</i></center>
<div id="ws-summary">
<ul>
<li><a href="/wiki/Tales_of_Unrest/Author%27s_Note" title="Tales of Unrest/Author's Note">Author's Note</a></li>
<li><a href="./Tales_of_Unrest/Author%27s_Note" title="Tales of Unrest/Author's Note">Author's Note</a></li>
</ul>
<ul>
<li><a href="/wiki/Karain" title="Karain">Karain</a>
<ul>
<li><a href="/wiki/Karain/Chapter_I" title="Karain/Chapter I">Chapter I</a></li>
<li><a href="/wiki/Karain/Chapter_II" title="Karain/Chapter II">Chapter II</a></li>
<li><a href="/wiki/Karain/Chapter_III" title="Karain/Chapter III">Chapter III</a></li>
<li><a href="/wiki/Karain/Chapter_IV" title="Karain/Chapter IV - with a title that doesn't match">Chapter IV</a></li>
<li><a href="/wiki/Karain/Chapter_V#test-fragment" title="Karain/Chapter V">Chapter V</a> (with a fragment test)</li>
<li><a href="/wiki/Karain/Chapter_VI" title="Karain/Chapter VI">Chapter VI</a></li>
<li><a href="./Karain/Chapter_I" title="Karain/Chapter I">Chapter I</a></li>
<li><a href="./Karain/Chapter_II" title="Karain/Chapter II">Chapter II</a></li>
<li><a href="./Karain/Chapter_III" title="Karain/Chapter III">Chapter III</a></li>
<li><a href="./Karain/Chapter_IV" title="Karain/Chapter IV - with a title that doesn't match">Chapter IV</a></li>
<li><a href="./Karain/Chapter_V#test-fragment" title="Karain/Chapter V">Chapter V</a> (with a fragment test)</li>
<li><a href="./Karain/Chapter_VI" title="Karain/Chapter VI">Chapter VI</a></li>
</ul>
</li>
<li><a href="/wiki/The_Idiots" title="The Idiots">The Idiots</a></li>
<li><a href="/wiki/An_Outpost_of_Progress" title="An Outpost of Progress">An Outpost of Progress</a>
<li><a href="./The_Idiots" title="The Idiots">The Idiots</a></li>
<li><a href="./An_Outpost_of_Progress" title="An Outpost of Progress">An Outpost of Progress</a>
<ul>
<li><a href="/wiki/An_Outpost_of_Progress/Chapter_I" title="An Outpost of Progress/Chapter I">Chapter I</a></li>
<li><a href="/wiki/An_Outpost_of_Progress/Chapter_II" title="An Outpost of Progress/Chapter II">Chapter II</a></li>
<li><a href="./An_Outpost_of_Progress/Chapter_I" title="An Outpost of Progress/Chapter I">Chapter I</a></li>
<li><a href="./An_Outpost_of_Progress/Chapter_II" title="An Outpost of Progress/Chapter II">Chapter II</a></li>
</ul>
</li>
<li><a href="/wiki/The_Return_(Conrad)" title="The Return (Conrad)">The Return</a></li>
<li><a href="/wiki/The_Lagoon" title="The Lagoon">The Lagoon</a></li>
<li><a href="./The_Return_(Conrad)" title="The Return (Conrad)">The Return</a></li>
<li><a href="./The_Lagoon" title="The Lagoon">The Lagoon</a></li>
</ul>
<a href="//upload.wikimedia.org/wikipedia/commons/7/74/Guitar.pdf" class="internal" title="Guitar.pdf">Download</a>
</div>
Expand Down
2 changes: 1 addition & 1 deletion tests/Http/BookTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public function testGetPage( $title, $language ) {
public function testGetNonExistingTitleDisplaysError() {
$client = static::createClient();
$client->request( 'GET', '/book.php', [ 'page' => 'xxx' ] );
$this->assertStringContainsString( 'Page revision not found', $client->getResponse()->getContent() );
$this->assertStringContainsString( 'Page not found', $client->getResponse()->getContent() );
$this->assertSame( 404, $client->getResponse()->getStatusCode() );
}

Expand Down

0 comments on commit 526a5c0

Please sign in to comment.