forked from ubccr/xdmod
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWebServerLogFile.php
230 lines (178 loc) · 7.79 KB
/
WebServerLogFile.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
<?php
/**
* Web server log file endpoint. This endpoint contains the machinery
* to parse web server log files and user agent strings. It can also
* optionally use a GeoIP database file to associate a geographic
* location with each IP address found in the log.
*
* Configuration properties:
* - log_format: A format string that specifies the expected log file format. The
* syntax is identical to the format in the apache configuration.
* - geoip_file: Optional path to a GeoLite2 database file in MMDB format. If absent
* then the IP to location mapping will not be performed.
*
* The fields output by this endpoint will be dependent on the log_format string.
*/
namespace ETL\DataEndpoint;
use Psr\Log\LoggerInterface;
use ETL\DataEndpoint\DataEndpointOptions;
/**
 * Structured-file endpoint that turns web server access log lines into records.
 *
 * Each line is parsed with the Kassner log parser. When the parsed line has a
 * `HeaderUserAgent` property, `ua_*` properties are added from the user agent
 * parser. When it has a `host` property, `geo_*` properties are added from the
 * optional GeoIP database. User agent and GeoIP results are memoised in two
 * bounded caches so that repeated values are not parsed or looked up again.
 */
class WebServerLogFile extends aStructuredFile implements iStructuredFile
{
    /**
     * @const int Maximum number of entries kept in each of the lookup caches.
     */
    const CACHE_SIZE = 1000;

    /**
     * @const string Defines the name for this endpoint that should be used in configuration files.
     * It also allows us to implement auto-discovery.
     */
    const ENDPOINT_NAME = 'webserverlog';

    /** @var \Kassner\LogParser\LogParser Parser for individual log lines. */
    private $web_parser = null;

    /** @var object User agent string parser created by \UAParser\Parser::create(). */
    private $ua_parser = null;

    /** @var array Parsed user agent results keyed by the raw user agent string. */
    private $ua_parser_cache = array();

    /** @var \GeoIp2\Database\Reader|null GeoIP reader, or null when no geoip_file is configured. */
    private $geoip_lookup = null;

    /** @var array Location results keyed by the host value taken from the log line. */
    private $geoip_cache = array();

    /**
     * Set up the log line parser, the user agent parser and (optionally) the
     * GeoIP database reader.
     *
     * Note that when `log_format` is set it is rewritten in place on $options.
     *
     * @see iDataEndpoint::__construct()
     */
    public function __construct(DataEndpointOptions $options, LoggerInterface $logger = null)
    {
        parent::__construct($options, $logger);

        $this->web_parser = new \Kassner\LogParser\LogParser();

        // Allow the '@' character in the remote user (%u) field.
        // This can be removed if Kassner LogParser is updated to
        // version >2.1.1 (see note in composer.json).
        $this->web_parser->addPattern('%u', '(?P<user>(?:-|[\w\-\.@]+))');

        if (isset($options->log_format)) {
            // Replace `%r` with `%m %U %H` so the request method, URL, and
            // protocol can be parsed separately.
            $options->log_format = str_replace(
                '%r',
                '%m %U %H',
                $options->log_format
            );
            $this->web_parser->setFormat($options->log_format);
        }

        $this->ua_parser = \UAParser\Parser::create();

        if (isset($options->geoip_file)) {
            $this->geoip_lookup = new \GeoIp2\Database\Reader($options->geoip_file);
        }
    }

    /**
     * Store a value in one of the bounded lookup caches.
     *
     * When the cache already holds CACHE_SIZE entries the oldest inserted
     * entry is removed first. The entry is removed with unset() rather than
     * array_shift() because array_shift() renumbers integer keys, and PHP
     * stores numeric-looking string keys (e.g. "0" or "123") as integers.
     * Renumbering would leave cached values under keys they do not belong to.
     *
     * @param array $cache The cache to update (modified in place).
     * @param string $key The raw value that was looked up.
     * @param mixed $value The result to remember for $key.
     */
    private function addToCache(array &$cache, $key, $value)
    {
        if (count($cache) >= self::CACHE_SIZE) {
            // PHP arrays keep insertion order, so the first key is the oldest entry.
            reset($cache);
            unset($cache[key($cache)]);
        }

        $cache[$key] = $value;
    }

    /**
     * Look up the city, subdivision and country for a host.
     *
     * All three fields are 'NA' when no GeoIP database is configured or when
     * the reader rejects the host with an InvalidArgumentException. They are
     * 'unknown' when the reader reports that the address is not in the
     * database. Results are only cached when a database is configured.
     *
     * @param string $host The host value taken from the log line.
     *
     * @return \stdClass Object with `city`, `subdivision` and `country` properties.
     */
    private function lookupGeoIp($host)
    {
        if (array_key_exists($host, $this->geoip_cache)) {
            return $this->geoip_cache[$host];
        }

        $result = new \stdClass();
        $result->city = 'NA';
        $result->subdivision = 'NA';
        $result->country = 'NA';

        if ($this->geoip_lookup === null) {
            return $result;
        }

        try {
            $geoip = $this->geoip_lookup->city($host);
            $result->city = $geoip->city->name;
            $result->subdivision = $geoip->mostSpecificSubdivision->isoCode;
            $result->country = $geoip->country->isoCode;
        } catch (\GeoIp2\Exception\AddressNotFoundException $e) {
            $result->city = 'unknown';
            $result->subdivision = 'unknown';
            $result->country = 'unknown';
        } catch (\InvalidArgumentException $e) {
            // leave at the default value of 'NA'
        }

        $this->addToCache($this->geoip_cache, $host, $result);

        return $result;
    }

    /**
     * Parse one log line and append the resulting record to the record list.
     *
     * Lines that the log parser rejects with a FormatException are logged at
     * debug level and skipped; they do not stop processing.
     *
     * @see aStructuredFile::decodeRecord()
     *
     * @param string $data A single line from the log file.
     *
     * @return bool Always true.
     */
    protected function decodeRecord($data)
    {
        try {
            $decoded = $this->web_parser->parse($data);
        } catch (\Kassner\LogParser\FormatException $e) {
            // ignore failed lines
            $this->logger->debug("Skip " . $data);
            return true;
        }

        if (property_exists($decoded, 'HeaderUserAgent')) {
            $userAgent = $decoded->HeaderUserAgent;

            if (array_key_exists($userAgent, $this->ua_parser_cache)) {
                $ua_decoded = $this->ua_parser_cache[$userAgent];
            } else {
                $ua_decoded = $this->ua_parser->parse($userAgent);
                $this->addToCache($this->ua_parser_cache, $userAgent, $ua_decoded);
            }

            $decoded->ua_family = $ua_decoded->ua->family;
            $decoded->ua_major = $ua_decoded->ua->major;
            $decoded->ua_minor = $ua_decoded->ua->minor;
            $decoded->ua_patch = $ua_decoded->ua->patch;

            $decoded->ua_os_family = $ua_decoded->os->family;
            $decoded->ua_os_major = $ua_decoded->os->major;
            $decoded->ua_os_minor = $ua_decoded->os->minor;
            $decoded->ua_os_patch = $ua_decoded->os->patch;

            $decoded->ua_device_family = $ua_decoded->device->family;
            $decoded->ua_device_brand = $ua_decoded->device->brand;
            $decoded->ua_device_model = $ua_decoded->device->model;
        }

        if (property_exists($decoded, 'host')) {
            $location = $this->lookupGeoIp($decoded->host);
            $decoded->geo_city_name = $location->city;
            $decoded->geo_subdivision = $location->subdivision;
            $decoded->geo_country = $location->country;
        }

        $this->recordList[] = $decoded;

        return true;
    }

    /**
     * No additional verification is performed on the parsed records.
     *
     * @see aStructuredFile::verifyData()
     *
     * @return bool Always true.
     */
    protected function verifyData()
    {
        return true;
    }

    /**
     * Determine the record field names from the first parsed record.
     *
     * Object records supply their property names. Array records take their
     * names from a header record or from the requested field names. If no
     * field names were requested, all discovered names become the requested
     * names.
     *
     * @see aStructuredFile::discoverRecordFieldNames()
     */
    protected function discoverRecordFieldNames()
    {
        // If there are no records in the file then we don't need to set the discovered
        // field names.
        if ( 0 == count($this->recordList) ) {
            return;
        }

        // Determine the field names based on the structure of the first record.
        reset($this->recordList);
        $record = current($this->recordList);

        if ( is_array($record) ) {

            if ( $this->hasHeaderRecord ) {
                // If we have a header record skip the first record and use its values as
                // the field names
                $this->discoveredRecordFieldNames = array_shift($this->recordList);
            } elseif ( 0 !== count($this->requestedRecordFieldNames) ) {
                // If there is no header record and the requested field names have been
                // provided, use them as the discovered field names. If a subsequent
                // record contains fewer fields return NULL values for those fields, if a
                // subsequent record contains more fields ignore them.
                $this->discoveredRecordFieldNames = $this->requestedRecordFieldNames;
            } else {
                $this->logAndThrowException("Record field names must be specified for JSON array records");
            }

        } elseif ( is_object($record) ) {
            // Pull the record field names from the object's properties
            $this->discoveredRecordFieldNames = array_keys(get_object_vars($record));
        } else {
            $this->logAndThrowException(
                sprintf("Unsupported record type in %s. Got %s, expected array or object", $this->path, gettype($record))
            );
        }

        // If no field names were requested, return all discovered fields
        if ( 0 == count($this->requestedRecordFieldNames) ) {
            $this->requestedRecordFieldNames = $this->discoveredRecordFieldNames;
        }
    }
}