-
Notifications
You must be signed in to change notification settings - Fork 1
/
TikaClient.php
183 lines (176 loc) · 5.86 KB
/
TikaClient.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
<?php
/**
* PHP client for Apache Tika Application running in server mode.
* License: FSF GPLv3 or later (see http://www.fsf.org/licenses/gpl.html)
* Copyright (c) 2012, Vitaliy Filippov
*/
/**
 * Client for an Apache Tika instance running in server (raw TCP socket) mode.
 *
 * The client sends the raw bytes of a document to the server over a TCP
 * connection, half-closes the connection, and collects whatever the server
 * sends back as the extracted plain text.
 */
class TikaClient
{
    const VERSION = '2012-09-03';

    // Host and port parsed from the 'server:port' constructor argument
    protected $tikaServer, $tikaPort;
    // Original space-separated MIME wildcard list and the regexp compiled from it
    protected $mimeTypes, $mimeRegexp;
    // Logging configuration: verbosity flag, log file name, log callback
    protected $verbose, $logfile, $logcallback;

    /**
     * Create a client object.
     *
     * @param string $tikaServer Address of the Tika server in 'server:port' format
     * @param string $mimeTypes Space-separated list of wildcards for supported MIME types
     *                          ('*' matches any sequence of characters, matching is case-insensitive)
     * @param string $logfile Filename for error logging (used only when no callback is given)
     * @param callback $logcallback Callback function for error logging, receives the formatted message
     * @param boolean $verbose If true, empty response will be treated as error,
     *                         and also success responses will be logged
     * @throws Exception If $tikaServer does not contain a ':' separator
     */
    public function __construct($tikaServer, $mimeTypes, $logfile = NULL, $logcallback = NULL, $verbose = false)
    {
        $this->mimeTypes = $mimeTypes;
        $this->logfile = $logfile;
        $this->logcallback = $logcallback;
        $this->verbose = $verbose;
        if (strpos($tikaServer, ':') === false)
        {
            throw new Exception("Tika server address '$tikaServer' has incorrect format - correct format is 'server:port'");
        }
        list($this->tikaServer, $this->tikaPort) = explode(':', $tikaServer, 2);
        // Build regexp for MIME types: every wildcard becomes an anchored
        // alternative, with '*' translated to '.*'.
        // A by-value loop is used so no reference to the last element is left behind.
        $alternatives = array();
        foreach (preg_split('/\s+/', trim($mimeTypes)) as $wildcard)
        {
            // preg_quote() escapes '*' as '\*' and (given the delimiter) '/' as '\/'
            $alternatives[] = '^'.str_replace('\\*', '.*', preg_quote($wildcard, '/')).'$';
        }
        $this->mimeRegexp = '/'.implode('|', $alternatives).'/is';
    }

    /**
     * Get error message for socket: $msg followed by the last error
     * code and its description for $socket, if there is one.
     *
     * @param string $msg Message prefix
     * @param resource $socket Socket to read the last error from
     * @return string
     */
    protected function socketErr($msg, $socket)
    {
        $errno = socket_last_error($socket);
        $errstr = socket_strerror($errno);
        return $msg.($errno ? ": [$errno] $errstr" : '');
    }

    /**
     * Log message to function or to file.
     *
     * The message is prefixed with a timestamp and terminated with a newline.
     * The callback takes precedence over the log file; if neither is
     * configured, the message is dropped.
     *
     * @param string $msg
     */
    protected function log($msg)
    {
        $msg = date("[Y-m-d H:i:s] ").$msg."\n";
        if ($this->logcallback)
        {
            call_user_func($this->logcallback, $msg);
        }
        elseif ($this->logfile)
        {
            file_put_contents($this->logfile, $msg, FILE_APPEND);
        }
    }

    /**
     * Extract plaintext from binary data $data using Tika.
     *
     * No timeout is applied while waiting for the server.
     *
     * @param string $data Input binary data
     * @param string &$err Error message will be placed here on error, or 'false' on success
     * @return string $text Extracted text (possibly partial or empty when $err is set)
     */
    public function extractText($data, &$err)
    {
        $text = '';
        $err = false;
        // Connect to Tika
        $s = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
        if ($s === false)
        {
            // There is no socket to query, so use the global last error
            $errno = socket_last_error();
            $err = "Error creating socket".($errno ? ": [$errno] ".socket_strerror($errno) : '');
            return $text;
        }
        if (!socket_connect($s, $this->tikaServer, (int)$this->tikaPort))
        {
            $err = $this->socketErr("Error connecting to Tika server {$this->tikaServer}:{$this->tikaPort}", $s);
            socket_close($s);
            return $text;
        }
        socket_set_nonblock($s);
        // Tika is somewhat delicate about network IO
        // So read and write using select(2) system call
        do
        {
            $read = $except = array($s);
            // $data === false means everything is already sent, so
            // write readiness is not interesting anymore
            $write = $data === false ? NULL : array($s);
            if (socket_select($read, $write, $except, NULL) === false)
            {
                // select(2) itself failed; its error is not bound to the socket
                $errno = socket_last_error();
                $err = "Error waiting for Tika server".($errno ? ": [$errno] ".socket_strerror($errno) : '');
                break;
            }
            if ($read)
            {
                $part = socket_read($s, 65536);
                if ($part === false)
                {
                    // Read failure
                    $err = $this->socketErr("Error reading from Tika server", $s);
                    break;
                }
                elseif ($part === '')
                {
                    // EOF
                    break;
                }
                $text .= $part;
            }
            if ($write)
            {
                $l = socket_write($s, $data);
                if ($l !== false)
                {
                    // Keep only the part that was not written yet
                    $data = substr($data, $l);
                    if ($data === '' || $data === false)
                    {
                        // Shutdown output and forget about write events
                        $data = false;
                        socket_shutdown($s, 1);
                    }
                }
                else
                {
                    // Write failure
                    $err = $this->socketErr("Error writing to Tika server", $s);
                    break;
                }
            }
        } while (!$except); // except is also treated as EOF
        socket_close($s);
        if ($text === '' && $err === false && $this->verbose)
        {
            $err = 'Empty response from Tika server';
        }
        return $text;
    }

    /**
     * Extract plaintext content from a file using Tika, skipping unsupported mime types.
     *
     * Errors are not reported to the caller; they are written to the
     * configured log and an empty or partial text is returned.
     *
     * @param string $filename Filename to read data from
     * @param string $mimeType MIME type of this file
     * @param string $filenameForLog Name to show in log messages, equal to $filename by default
     * @return string $text Extracted text, or '' if the MIME type is unsupported,
     *                      the file is empty or the file can not be read
     */
    public function extractTextFromFile($filename, $mimeType, $filenameForLog = NULL)
    {
        if ($filenameForLog === NULL)
        {
            $filenameForLog = $filename;
        }
        if (!preg_match($this->mimeRegexp, $mimeType))
        {
            // Tika can't handle this mime type, return nothing
            return '';
        }
        // Read file
        $data = file_get_contents($filename);
        if ($data === false)
        {
            // Unreadable file is an error, not an empty document
            $this->log("Error reading file $filenameForLog ($mimeType)");
            return '';
        }
        $fsize = strlen($data);
        if (!$fsize)
        {
            // File is empty
            return '';
        }
        // Extract text
        $text = $this->extractText($data, $err);
        if ($err !== false)
        {
            $this->log("Error extracting text from $filenameForLog ($mimeType) of size $fsize: $err");
        }
        elseif ($this->verbose)
        {
            $this->log("Extracted ".strlen($text)." bytes from $filenameForLog ($mimeType) of size $fsize");
        }
        return $text;
    }
}