diff --git a/README.md b/README.md index c92442cd..cea13ca9 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,8 @@ docker-compose exec wsexport ./bin/console app:install Wikisource Export should be up at http://localhost:8888/ +### Cache +Go to `/refresh` to clear the cache ### Setup Xdebug Xdebug is disabled by default. If you need to enable it you can do so via an env variable by creating a `./docker/docker-compose.override.yml` file with the following content diff --git a/src/BookProvider.php b/src/BookProvider.php index 12188014..28f0af18 100644 --- a/src/BookProvider.php +++ b/src/BookProvider.php @@ -38,7 +38,7 @@ public function __construct( Api $api, array $options ) { /** * return all the data on a book needed to export it * @param $title string the title of the main page of the book in Wikisource - * @param $isMetadata bool only retrive metadata on the book + * @param $isMetadata bool only retrieve metadata on the book * @return Book */ public function get( $title, $isMetadata = false ) { @@ -123,6 +123,8 @@ public function getMetadata( $title, $isMetadata, DOMDocument $doc ) { } $chapterTitles = $parser->getFullChaptersList( $title, $pageList, $namespaces ); $chapters = $this->getPages( $chapterTitles ); + + // Generate all the chapters foreach ( $chapters as $chapter_key => $chapter ) { $parser = new PageParser( $chapter->content ); if ( $parser->metadataIsSet( 'ws-noinclude' ) ) { diff --git a/src/Cleaner/BookCleanerEpub.php b/src/Cleaner/BookCleanerEpub.php index 0fca6863..b6981d09 100644 --- a/src/Cleaner/BookCleanerEpub.php +++ b/src/Cleaner/BookCleanerEpub.php @@ -255,6 +255,8 @@ protected function setLinks( DOMDocument $dom ) { $node->setAttribute( 'href', 'http:' . $href ); } elseif ( substr( $href, 0, 1 ) === '/' ) { $node->setAttribute( 'href', $this->baseUrl . $href ); + } elseif ( substr( $href, 0, 2 ) === './' ) { + $node->setAttribute( 'href', $this->baseUrl . '/wiki/' . 
substr( $href, 2 ) ); } } } diff --git a/src/PageParser.php b/src/PageParser.php index 60a954b3..e5e542b0 100644 --- a/src/PageParser.php +++ b/src/PageParser.php @@ -2,7 +2,6 @@ namespace App; -use App\Util\Api; use DOMDocument; use DOMElement; use DOMXPath; @@ -51,7 +50,17 @@ public function metadataIsSet( $id ) { * @return Page[] */ public function getChaptersList( $pageList, $namespaces ) { - $list = $this->xPath->query( '//*[@id="ws-summary" or contains(@class,"ws-summary")]/descendant::a[not(contains(@href,"action=edit") or contains(@class,"extiw") or contains(@class,"external") or contains(@class,"internal") or contains(@class,"image"))]' ); + $list = $this->xPath->query( '//*[ + @id="ws-summary" or + contains(@class,"ws-summary")]/descendant::a[ + not( + contains(@class,"new") or + contains(@href,"action=edit") or + contains(@class,"extiw") or contains(@class,"external") or + contains(@class,"internal") or + contains(@class,"image") + ) + ]' ); $chapters = []; /** @var DOMElement $link */ foreach ( $list as $link ) { @@ -61,7 +70,14 @@ public function getChaptersList( $pageList, $namespaces ) { // If there's no path component, it can't be a link to a chapter. continue; } - $title = urldecode( substr( $urlParts['path'], strlen( '/wiki/' ) ) ); + + // Strip the leading "/wiki/" prefix from $urlParts['path'] if present; otherwise strip only the leading "/" + if ( substr( $urlParts['path'], 0, strlen( '/wiki/' ) ) === "/wiki/" ) { + $title = urldecode( substr( $urlParts['path'], strlen( '/wiki/' ) ) ); + } else { + $title = urldecode( substr( $urlParts['path'], strlen( '/' ) ) ); + } + $parts = explode( ':', $title ); // Include the chapter if it's not already present and is a main-namespace page. 
if ( $title != '' && !in_array( $title, $pageList ) && !in_array( $parts[0], $namespaces ) ) { @@ -83,7 +99,17 @@ public function getChaptersList( $pageList, $namespaces ) { public function getFullChaptersList( $title, $pageList, $namespaces ) { $chapters = $this->getChaptersList( $pageList, $namespaces ); if ( empty( $chapters ) ) { - $list = $this->xPath->query( '//a[contains(@href,"' . Api::mediawikiUrlEncode( $title ) . '") and not(contains(@class,"extiw") or contains(@class,"external") or contains(@href,"#") or contains(@class,"internal") or contains(@href,"action=edit") or contains(@title,"/Texte entier") or contains(@class,"image"))]' ); + $list = $this->xPath->query( '//a[contains(@href,"' . $title . '") and + not( + contains(@class,"new") or + contains(@class,"extiw") or + contains(@class,"external") or + contains(@href,"#") or + contains(@class,"internal") or + contains(@href,"action=edit") or + contains(@title,"/Texte entier") or + contains(@class,"image") + )]' ); /** @var DOMElement $link */ foreach ( $list as $link ) { $title = str_replace( ' ', '_', $link->getAttribute( 'title' ) ); @@ -126,7 +152,11 @@ public function getPicturesList() { } - $list = $this->xPath->query( '//a[contains(@class,"image")]' ); + $list = $this->xPath->query( ' + //a[contains(@class,"image")] | + //figure[contains(@typeof,"mw:Image")] | + //figure-inline[contains(@typeof,"mw:Image")]' + ); /** @var DOMElement $node */ foreach ( $list as $node ) { /** @var DOMElement $img */ @@ -203,6 +233,7 @@ public function getContent( $isMainPage ) { $this->removeNodesWithXpath( '//span[@class="editsection" or @class="mw-editsection"]' ); $this->removeNodesWithXpath( '//a[@class="mw-headline-anchor"]' ); $this->removeNodesWithXpath( '//div[@class="mediaContainer"]' ); + $this->removeNodesWithXpath( '//link[@rel="mw:PageProp/Category"]' ); $this->deprecatedNodes( 'big', 'span', 'font-size:large;' ); $this->deprecatedNodes( 'center', 'div', 'text-align:center;' ); $this->deprecatedNodes( 
'strike', 'span', 'text-decoration:line-through;' ); @@ -229,6 +260,7 @@ public function getContent( $isMainPage ) { $this->cleanIds(); $this->cleanRedLinks(); + $this->cleanReferenceLinks(); $this->moveStyleToHead(); return $this->xPath->document; @@ -314,6 +346,23 @@ private function cleanRedLinks() { } } + private function cleanReferenceLinks() { + // Get all links whose "style" attribute contains "mw-Ref" + $links = $this->xPath->query( '//html:a[contains(@style, "mw-Ref")]' ); + foreach ( $links as $link ) { + $href = $link->getAttribute( 'href' ); + $pos = strpos( $href, '#' ); + $link->setAttribute( 'href', substr( $href, $pos ) ); + } + // Get all links whose "rel" attribute equals "mw:referencedBy" + $links = $this->xPath->query( '//html:a[@rel="mw:referencedBy"]' ); + foreach ( $links as $link ) { + $href = $link->getAttribute( 'href' ); + $pos = strpos( $href, '#' ); + $link->setAttribute( 'href', substr( $href, $pos ) ); + } + } + private function moveStyleToHead() { /** @var DOMElement $head */ foreach ( $this->xPath->query( '//head' ) as $head ) { diff --git a/src/Refresh.php b/src/Refresh.php index 10732478..c531cd0c 100644 --- a/src/Refresh.php +++ b/src/Refresh.php @@ -74,7 +74,7 @@ protected function getAboutXhtmlWikisource() { // Add https to protocol-relative links. $aboutHtml = str_replace( 'href="//', 'href="https://', $document->saveXML() ); // Fully qualify unqualified links. - $aboutHtml = str_replace( 'href="/', 'href="https://' . $this->api->getDomainName() . '/', $aboutHtml ); + $aboutHtml = str_replace( 'href="./', 'href="https://' . $this->api->getDomainName() . 
'/wiki/', $aboutHtml ); $this->setTempFileContent( 'about.xhtml', $aboutHtml ); } } diff --git a/src/Util/Api.php b/src/Util/Api.php index 5eba0d3e..321d1865 100644 --- a/src/Util/Api.php +++ b/src/Util/Api.php @@ -143,7 +143,6 @@ public function query( $params ) { */ public function queryAsync( $params ) { $params += [ 'action' => 'query', 'format' => 'json' ]; - return $this->getAsync( 'https://' . $this->getDomainName() . '/w/api.php', [ 'query' => $params ] @@ -189,30 +188,16 @@ public function completeQuery( $params ) { * @return PromiseInterface promise with the content of a page */ public function getPageAsync( $title ) { - return $this->queryAsync( [ - 'titles' => $title, - 'prop' => 'revisions', - 'rvprop' => 'content', - 'rvparse' => true - ] )->then( function ( array $result ) { - return $this->parseGetPageResponse( $result ); - } ); - } - - private function parseGetPageResponse( $response ) { - $pages = $response['query']['pages'] ?? []; - foreach ( $pages as $page ) { - $title = $page['title']; - if ( isset( $page['revisions'] ) ) { - foreach ( $page['revisions'] as $revision ) { - return Util::getXhtmlFromContent( $this->getLang(), $revision['*'], $title ); - } - } - } - if ( !isset( $title ) ) { - throw new HttpException( 500, 'No page information found in response' ); - } - throw new NotFoundHttpException( "Page revision not found for: $title" ); + $url = 'https://' . $this->getDomainName() . '/api/rest_v1/page/html/' . urlencode( $title ); + return $this->getAsync( $url ) + ->then( + function ( string $result ) use ( $title ) { + return Util::getXhtmlFromContent( $this->getLang(), $result, $title ); + }, + function ( $reason ) use ( $title ) { + throw new NotFoundHttpException( "Page not found for: $title" ); + } + ); } /** @@ -223,17 +208,6 @@ public function get( $url ) { return $this->client->get( $url )->getBody()->getContents(); } - /** - * @param string $url - * @return string the url encoded like mediawiki does. 
- */ - public static function mediawikiUrlEncode( string $url ): string { - $search = [ '%21', '%24', '%28', '%29', '%2A', '%2C', '%2D', '%2E', '%2F', '%3A', '%3B', '%40' ]; - $replace = [ '!', '$', '(', ')', '*', ',', '-', '.', '/', ':', ';', '@' ]; - - return str_replace( $search, $replace, urlencode( str_replace( ' ', '_', $url ) ) ); - } - /** * @param LoggerInterface $logger * @return ClientInterface diff --git a/src/Util/Util.php b/src/Util/Util.php index 58df65bb..9ba757a5 100644 --- a/src/Util/Util.php +++ b/src/Util/Util.php @@ -28,6 +28,8 @@ class Util { public static function wikisourceUrl( $lang, $page = '' ) { if ( $lang === '' ) { $url = 'https://wikisource.org'; + } elseif ( $lang === 'beta' ) { + $url = 'https://en.wikisource.beta.wmflabs.org'; } else { $url = 'https://' . $lang . '.wikisource.org'; } @@ -63,6 +65,10 @@ public static function getFile( $file ) { * @return string */ public static function getXhtmlFromContent( $lang, $content, $title = ' ' ) { + $bodyPosition = stripos( $content, '
#isU', '', $content ); } @@ -70,8 +76,12 @@ public static function getXhtmlFromContent( $lang, $content, $title = ' ' ) { if ( $lang != null ) { $html .= ' xml:lang="' . $lang . '" dir="' . static::getLanguageDirection( $lang ) . '"'; } + $html .= '>