Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for local files with file:// schema. #22

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/SitemapParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,16 @@ protected function getContent()
throw new Exceptions\SitemapParserException('Invalid URL');
}
try {
if (strpos($this->currentURL, 'file://') === 0) {
$path = parse_url($this->currentURL, PHP_URL_PATH);
if (!$this->urlValidatePath($path)) {
throw new Exceptions\SitemapParserException('Invalid file path');
}
if (!file_exists($path) && PHP_OS === 'WINNT') {
return file_get_contents(urldecode($path));
}
return file_get_contents($path);
}
if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
$this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
}
Expand Down
22 changes: 21 additions & 1 deletion src/SitemapParser/UrlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,12 @@ protected function urlValidate($url)
return (
filter_var($url, FILTER_VALIDATE_URL) &&
($parsed = parse_url($url)) !== false &&
$this->urlValidateHost($parsed['host']) &&
$this->urlValidateScheme($parsed['scheme']) &&
(
(in_array($parsed['scheme'], ['http', 'https'], true) && $this->urlValidateHost($parsed['host']))
||
(in_array($parsed['scheme'], ['file'], true) && $this->urlValidatePath($parsed['path']))
) &&
$this->urlValidateAgainstBlackList($url)
);
}
Expand Down Expand Up @@ -88,10 +92,26 @@ protected static function urlValidateScheme($scheme)
return in_array($scheme, [
'http',
'https',
'file'
]
);
}

/**
 * Check whether a local filesystem entry exists at the given path.
 *
 * Existence is tested with file_exists(), so a directory passes as well as a
 * regular file. On Windows, a path that is not found as given is tried once
 * more after URL-decoding it.
 *
 * @param mixed $path Path to test; anything but a non-empty string is rejected
 * @return bool True if the path (or, on Windows, its URL-decoded form) exists
 */
public function urlValidatePath(mixed $path)
{
    // A URL without a usable path component yields a non-string here
    // (e.g. null from parse_url()); report "not found" rather than handing
    // a non-string to file_exists()/urldecode().
    if (!is_string($path) || $path === '') {
        return false;
    }

    if (file_exists($path)) {
        return true;
    }

    // Try to reverse URL encoding for Windows paths.
    return PHP_OS_FAMILY === 'Windows' && file_exists(urldecode($path));
}

protected function urlValidateAgainstBlackList($url)
{
if (empty($this->config['url_black_list'])) {
Expand Down
36 changes: 36 additions & 0 deletions tests/LocalFileTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
use vipnytt\SitemapParser;

/**
 * Verifies that a sitemap stored on the local filesystem can be parsed
 * through a file:// URL.
 *
 * The class is named after its file (LocalFileTest.php) so that PHPUnit can
 * locate it and it cannot clash with another test class of a different file.
 */
class LocalFileTest extends TestCase
{
    /**
     * Writes a sitemap index to a temporary file, parses it via a file:// URL
     * and checks that its single <sitemap> entry is returned by getSitemaps().
     */
    public function testLocalFileXMLFile()
    {
        $parser = new SitemapParser('SitemapParser');
        $this->assertInstanceOf('vipnytt\SitemapParser', $parser);

        $tmpfname = tempnam(sys_get_temp_dir(), "sitemap_parser_test_file");
        // tempnam() returns false on failure; stop here instead of writing to a bogus path.
        $this->assertNotFalse($tmpfname, 'Unable to create a temporary file');

        $fileContent = <<<XMLSITEMAP
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://www.example.com/sitemap.xml</loc>
<lastmod>2004-10-01T18:23:17+00:00</lastmod>
</sitemap>
</sitemapindex>
XMLSITEMAP;

        try {
            file_put_contents($tmpfname, $fileContent);
            $parser->parse('file:///'.$tmpfname);
            $this->assertEquals([
                'http://www.example.com/sitemap.xml' => [
                    'loc' => 'http://www.example.com/sitemap.xml',
                    'lastmod' => '2004-10-01T18:23:17+00:00',
                    'namespaces' => [],
                ],
            ], $parser->getSitemaps());
        } finally {
            // Always remove the temporary file, even when parsing or an assertion fails.
            if (is_file($tmpfname)) {
                unlink($tmpfname);
            }
        }
    }
}