From edbfc2984a69e34a93d72505195508edacb8dd5f Mon Sep 17 00:00:00 2001 From: Aaron Weeden Date: Fri, 19 Jan 2024 16:31:26 -0500 Subject: [PATCH] Parse request method, URL, and protocol separately in web server logs. --- .gitignore | 4 +- classes/ETL/DataEndpoint/WebServerLogFile.php | 7 ++ .../input/webserverlogfile/test.log | 1 + .../ETL/DataEndpoint/WebServerLogFileTest.php | 113 ++++++++++++++++++ 4 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 tests/artifacts/xdmod/etlv2/dataendpoint/input/webserverlogfile/test.log create mode 100644 tests/unit/lib/ETL/DataEndpoint/WebServerLogFileTest.php diff --git a/.gitignore b/.gitignore index f896a065d7..8305c75f9c 100644 --- a/.gitignore +++ b/.gitignore @@ -317,8 +317,8 @@ configuration/aggregation_meta/ # configuration/etl/*.json # configuration/etl/**/*.json -# Include log files that are used for regression testing -!tests/artifacts/xdmod-test-artifacts/xdmod/referencedata/*.log +# Include log files that are used for CI testing +!tests/artifacts/**/*.log # Include CCR Log Class !classes/Log diff --git a/classes/ETL/DataEndpoint/WebServerLogFile.php b/classes/ETL/DataEndpoint/WebServerLogFile.php index f52022220f..0fa8207084 100644 --- a/classes/ETL/DataEndpoint/WebServerLogFile.php +++ b/classes/ETL/DataEndpoint/WebServerLogFile.php @@ -54,6 +54,13 @@ public function __construct(DataEndpointOptions $options, LoggerInterface $logge $this->web_parser->addPattern('%u', '(?P<user>(?:-|[\w\-\.@]+))'); if (isset($options->log_format)) { + // Replace `%r` with `%m %U %H` so the request method, URL, and + // protocol can be parsed separately. 
+ $options->log_format = str_replace( + '%r', + '%m %U %H', + $options->log_format + ); $this->web_parser->setFormat($options->log_format); } diff --git a/tests/artifacts/xdmod/etlv2/dataendpoint/input/webserverlogfile/test.log b/tests/artifacts/xdmod/etlv2/dataendpoint/input/webserverlogfile/test.log new file mode 100644 index 0000000000..a01a2d8f57 --- /dev/null +++ b/tests/artifacts/xdmod/etlv2/dataendpoint/input/webserverlogfile/test.log @@ -0,0 +1 @@ +127.0.0.0 - testuser1 [01/Jul/2021:03:17:06 -0500] "GET /pun/sys/dashboard/apps/icon/jupyter_quantum_chem/sys/sys?foo=bar HTTP/1.1" 200 381 "https://ondemand.ccr.buffalo.edu/pun/sys/dashboard/batch_connect/sessions" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36" diff --git a/tests/unit/lib/ETL/DataEndpoint/WebServerLogFileTest.php b/tests/unit/lib/ETL/DataEndpoint/WebServerLogFileTest.php new file mode 100644 index 0000000000..73c790bda7 --- /dev/null +++ b/tests/unit/lib/ETL/DataEndpoint/WebServerLogFileTest.php @@ -0,0 +1,113 @@ + false, + 'db' => false, + 'mail' => false, + 'consoleLogLevel' => Log::EMERG + ); + + self::$logger = Log::factory('PHPUnit', $conf); + } + + /** + * @dataProvider provideWebServerLogFile + */ + public function testWebServerLogFile($filename, $logFormat, $expected) + { + $config = [ + 'type' => 'directoryscanner', + 'name' => 'Web Server Logs', + 'path' => self::TEST_ARTIFACT_INPUT_PATH, + 'file_pattern' => "/$filename/", + 'handler' => (object)[ + 'type' => 'webserverlog', + 'record_separator' => "\n", + 'log_format' => $logFormat + ] + ]; + $options = new DataEndpointOptions($config); + $endpoint = DataEndpoint::factory($options, self::$logger); + $endpoint->verify(); + $endpoint->connect(); + $numIterations = 0; + foreach ($endpoint as $record) { + $this->assertSame($expected[$numIterations], $record); + $numIterations++; + } + $this->assertSame( + count($expected), + $numIterations, + 'Did not parse correct 
number of records.' + ); + } + + public function provideWebServerLogFile() + { + $logFormats = [ + '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i"', + '%h %l %u %t "%m %U %H" %>s %b "%{Referer}i" "%{User-Agent}i"' + ]; + $tests = []; + foreach ($logFormats as $logFormat) { + array_push( + $tests, + [ + 'test.log', + $logFormat, + [ + [ + 'host' => '127.0.0.0', + 'logname' => '-', + 'user' => 'testuser1', + 'stamp' => 1625127426, + 'time' => '01/Jul/2021:03:17:06 -0500', + 'requestMethod' => 'GET', + 'URL' => '/pun/sys/dashboard/apps/icon/jupyter_quantum_chem/sys/sys?foo=bar', + 'requestProtocol' => 'HTTP/1.1', + 'status' => '200', + 'responseBytes' => '381', + 'HeaderReferer' => 'https://ondemand.ccr.buffalo.edu/pun/sys/dashboard/batch_connect/sessions', + 'HeaderUserAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36', + 'ua_family' => 'Chrome', + 'ua_major' => '91', + 'ua_minor' => '0', + 'ua_patch' => '4472', + 'ua_os_family' => 'Windows', + 'ua_os_major' => '10', + 'ua_os_minor' => null, + 'ua_os_patch' => null, + 'ua_device_family' => 'Other', + 'ua_device_brand' => null, + 'ua_device_model' => null, + 'geo_city_name' => 'NA', + 'geo_subdivision' => 'NA', + 'geo_country' => 'NA' + ] + ] + ] + ); + } + return $tests; + } +}