This repository has been archived by the owner on Nov 21, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathl2cs.py
335 lines (274 loc) · 10.9 KB
/
l2cs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
#!/usr/bin/env python
'''
l2cs (Lucene to CloudSearch) is a module for converting search queries
from Apache Lucene's base syntax
(http://lucene.apache.org/core/3_6_0/queryparsersyntax.html)
into an Amazon CloudSearch boolean query
(http://docs.amazonwebservices.com/cloudsearch/latest/developerguide/booleansearch.html).
'''
import sys
import whoosh.analysis
import whoosh.fields
import whoosh.qparser.default
import whoosh.qparser.plugins
import whoosh.qparser.syntax
import whoosh.qparser.taggers
import whoosh.query
# Version string of this module.
__version__ = "2.0.2"
# Registry mapping a query class to the generator function that renders it.
# Populated by the @handler decorator and consulted by walk_clause().
HANDLERS = {}


def handler(*classes):
    '''Decorator factory that registers a function in HANDLERS.

    Parameters:
        classes: one or more classes the decorated function should handle

    Returns a decorator that stores the function under each class and then
    returns the function unchanged.

    Raises ValueError (naming the offending class) if any of the classes
    already has a registered handler.
    '''
    def decorator(fn):
        for cls in classes:
            if cls in HANDLERS:
                # Wrap in a tuple so the class is always treated as the
                # single format argument.
                raise ValueError("%s already has a handler" % (cls,))
            HANDLERS[cls] = fn
        return fn
    return decorator
# whoosh.query.NullQuery is an instance, not a class, so register its class.
@handler(whoosh.query.NullQuery.__class__)
def build_null(clause):
    '''Render a null query: yields a single empty string.'''
    yield ""
@handler(whoosh.query.Term, whoosh.query.Phrase, whoosh.query.Prefix)
def build_field(clause):
    '''Yield the pieces of a CloudSearch field expression for a leaf clause.

    Clauses flagged with a truthy ``integer_field`` attribute (set by
    IntNode.query) are rendered as ``fieldname:text``. All other clauses are
    rendered as ``(field fieldname '...')``, with single quotes in the text
    backslash-escaped. A Prefix clause gets a trailing ``*``; a Phrase
    clause has its words joined with single spaces.
    '''
    def escape(text):
        # NOTE(review): only single quotes are escaped here; confirm whether
        # CloudSearch also requires backslashes to be escaped.
        return text.replace(r"'", r"\'")

    if getattr(clause, "integer_field", False):
        yield clause.fieldname
        yield ':'
        yield clause.text
        return

    yield "(field "
    yield clause.fieldname
    yield " '"
    if isinstance(clause, whoosh.query.Term):
        yield escape(clause.text)
    elif isinstance(clause, whoosh.query.Prefix):
        yield escape(clause.text)
        yield '*'
    elif isinstance(clause, whoosh.query.Phrase):
        # Escape every word, including the last one, and tolerate an empty
        # word list instead of failing on words[-1].
        yield " ".join([escape(word) for word in clause.words])
    yield "')"
@handler(whoosh.query.And, whoosh.query.Or, whoosh.query.Not,
         whoosh.query.AndMaybe)
def build_grouper(clause):
    '''Yield the pieces of ``(<operator> <child> <child> ...)``.

    The operator is the lower-cased class name of the clause, except for
    AndMaybe, which is rendered as a plain "and".
    '''
    # CloudSearch has no equivalent of AndMaybe's "boost results that also
    # match the optional part" behaviour, so it is treated as "and".
    if isinstance(clause, whoosh.query.AndMaybe):
        operator = "and"
    else:
        operator = clause.__class__.__name__.lower()

    yield "("
    yield operator
    for child in clause.children():
        yield " "
        for fragment in walk_clause(child):
            yield fragment
    yield ")"
@handler(whoosh.query.AndNot)
def build_compound(clause):
    '''Yield the pieces of ``(and <first> (not <second>))`` for an AndNot.

    The clause must have exactly two children: the first is rendered as-is,
    the second is wrapped in a "not".
    '''
    yield '(and '
    wanted, unwanted = tuple(clause.children())
    for fragment in walk_clause(wanted):
        yield fragment
    yield ' (not '
    for fragment in walk_clause(unwanted):
        yield fragment
    yield '))'
def walk_clause(clause):
    '''Yield the output pieces for ``clause`` using its registered handler.

    The handler is looked up in HANDLERS by the clause's exact class, so a
    class without a registered handler raises KeyError once iteration
    starts.
    '''
    for fragment in HANDLERS[clause.__class__](clause):
        yield fragment
class IntNode(whoosh.qparser.syntax.WordNode):
    '''WordNode whose text must be an integer.

    Queries built from this node carry ``integer_field = True``.
    '''

    def __init__(self, value):
        # int() raises ValueError for non-integer text; callers rely on that
        # to reject bad input before the node is built.
        self.__int_value = int(value)
        super(IntNode, self).__init__(value)

    def query(self, parser):
        '''Build the normal WordNode query and flag it as an integer field.'''
        built = super(IntNode, self).query(parser)
        built.integer_field = True
        return built
class PseudoFieldPlugin(whoosh.qparser.plugins.PseudoFieldPlugin):
    '''Base class for plugins that rewrite nodes of specific field names.

    For each name in ``fieldnames`` a callback is created that forwards to
    ``self.modify_node(fieldname, node)``; the name -> callback mapping is
    handed to whoosh's PseudoFieldPlugin. Subclasses implement modify_node.
    '''

    def __init__(self, fieldnames):
        callbacks = dict(
            (fieldname, self.modify_node_fn(fieldname, self.modify_node))
            for fieldname in fieldnames
        )
        super(PseudoFieldPlugin, self).__init__(callbacks)

    @staticmethod
    def modify_node_fn(fname, base_fn):
        '''Return a one-argument function that calls base_fn(fname, node).'''
        def fn(node):
            return base_fn(fname, node)
        return fn

    def modify_node(self, fieldname, node):
        '''Return the replacement for ``node``; must be overridden.'''
        raise NotImplementedError
class IntNodePlugin(PseudoFieldPlugin):
    '''Replaces text nodes of the configured fields with IntNode instances.'''

    def modify_node(self, fieldname, node):
        '''Return an IntNode for ``node``'s text, bound to ``fieldname``.

        Nodes without text are returned unchanged. If a ValueError is raised
        while building the IntNode (e.g. the text is not an integer), None
        is returned.
        '''
        if not node.has_text:
            return node
        try:
            int_node = IntNode(node.text)
            int_node.set_fieldname(fieldname)
        except ValueError:
            return None
        return int_node
class YesNoPlugin(PseudoFieldPlugin):
    '''Maps yes/no style text on the configured fields to integer 1 / 0.'''

    def modify_node(self, fieldname, node):
        '''Return IntNode(1) for "yes", "y" or "1"; IntNode(0) otherwise.

        Nodes without text are returned unchanged.
        '''
        if not node.has_text:
            return node
        digit = u'1' if node.text in (u"yes", u"y", u"1") else u'0'
        flag_node = IntNode(digit)
        flag_node.set_fieldname(fieldname)
        return flag_node
class FieldAliasPlugin(PseudoFieldPlugin):
    '''Rewrites alias field names to their canonical field name.

    ``aliases`` maps each canonical field name to a list of its aliases;
    it is inverted into ``self.aliases`` (alias -> canonical name).
    '''

    def __init__(self, aliases):
        self.aliases = dict(
            (alias, fieldname)
            for fieldname, alias_list in aliases.items()
            for alias in alias_list
        )
        super(FieldAliasPlugin, self).__init__(self.aliases.keys())

    def modify_node(self, fieldname, node):
        '''Point a text node at the canonical field for alias ``fieldname``.

        Nodes without text are returned unchanged.
        '''
        if node.has_text:
            canonical = self.aliases[fieldname]
            node.set_fieldname(canonical)
        return node
class MinusPlugin(whoosh.qparser.plugins.Plugin):
    '''Turns a leading "-" into a NOT around the node that follows it.

    This differs from whoosh's PlusMinusPlugin: the concept of "AndMaybe"
    does not apply to CloudSearch, so "+" actions are not needed, and the
    logic is simplified to just swapping out the nodes.
    '''

    class Minus(whoosh.qparser.syntax.MarkerNode):
        '''Marker node produced by the tagger for a "-" prefix.'''

    def __init__(self, minusexpr=r"(?=\B)-+(?=\w)"):
        self.minusexpr = minusexpr

    def taggers(self, parser):
        '''Return the tagger that emits Minus markers, at priority 0.'''
        tagger = whoosh.qparser.taggers.FnTagger(self.minusexpr, self.Minus)
        return [(tagger, 0)]

    def filters(self, parser):
        '''Return the do_minus filter, at priority 505.'''
        return [(self.do_minus, 505)]

    def do_minus(self, parser, group):
        '''Return a new group in which each Minus marker is replaced by a
        NotGroup wrapping the node that followed it.

        Nested groups are processed recursively. Consecutive Minus markers
        collapse into one, and a trailing Minus with nothing after it is
        dropped.
        '''
        NotGroup = whoosh.qparser.syntax.NotGroup
        GroupNode = whoosh.qparser.syntax.GroupNode

        result = group.__class__()
        pending = None  # NotGroup still waiting for its single child
        for node in group:
            if isinstance(node, self.Minus):
                # A second Minus in a row is ignored.
                if pending is None:
                    pending = NotGroup()
                    result.append(pending)
                continue

            # Nodes with children: handle nested Minus markers first.
            if isinstance(node, GroupNode):
                node = self.do_minus(parser, node)

            if pending is None:
                result.append(node)
            else:
                pending.append(node)
                pending = None

        if pending is not None:
            # The last thing appended was a NotGroup that never got a child.
            result.pop()
        return result
# Plugin instances used by make_parser() when no explicit ``plugins``
# argument is given. All but the last come from whoosh.qparser.plugins.
DEFAULT_PLUGINS = (
    whoosh.qparser.plugins.WhitespacePlugin(),
    whoosh.qparser.plugins.SingleQuotePlugin(),
    whoosh.qparser.plugins.FieldsPlugin(),
    whoosh.qparser.plugins.PhrasePlugin(),
    whoosh.qparser.plugins.PrefixPlugin(),
    whoosh.qparser.plugins.GroupPlugin(),
    # NOTE(review): passing None presumably disables the AndMaybe and
    # Require operators -- confirm against the whoosh documentation.
    whoosh.qparser.plugins.OperatorsPlugin(AndMaybe=None,
                                           Require=None),
    whoosh.qparser.plugins.EveryPlugin(),
    # This module's own plugin: wraps the node after a "-" in a NotGroup.
    MinusPlugin(),
)
def make_parser(default_field='text', plugins=DEFAULT_PLUGINS, schema=None,
                int_fields=None, yesno_fields=None, aliases=None):
    '''Build a whoosh QueryParser configured for l2cs.

    Parameters:
        default_field: field searched by queries that name no field
        plugins: the plugins the parser is created with
        schema: optional schema to check field names against. Without one,
            any query of the form "foo:bar" searches the "foo" field; with
            one, if "foo" is not a field the search looks for "foo bar" in
            default_field.
        int_fields: field names whose values are integers in CloudSearch;
            installs an IntNodePlugin when non-empty
        yesno_fields: field names whose "yes"/"no" values become 1 / 0;
            installs a YesNoPlugin when non-empty
        aliases: dictionary of canonical field name -> list of aliases;
            installs a FieldAliasPlugin when non-empty

    The returned parser's ``parse`` is wrapped so that it asserts the query
    text is a unicode object before delegating to the real parse method.
    '''
    parser = whoosh.qparser.default.QueryParser(default_field, schema,
                                                plugins=plugins)
    original_parse = parser.parse

    def parse(text, *args, **kwargs):
        assert isinstance(text, unicode), 'Cannot parse non-unicode objects (%r)' % text
        return original_parse(text, *args, **kwargs)

    parse.__doc__ = original_parse.__doc__
    parser.parse = parse

    # Optional plugins are added in this order, each only when its
    # configuration argument is non-empty.
    optional_plugins = (
        (int_fields, IntNodePlugin),
        (yesno_fields, YesNoPlugin),
        (aliases, FieldAliasPlugin),
    )
    for config, plugin_class in optional_plugins:
        if config:
            parser.add_plugin(plugin_class(config))
    return parser
def make_schema(fields, datefields=()):
    '''Build a whoosh.fields.Schema from field names.

    Every name in ``fields`` becomes a TEXT field using a SimpleAnalyzer.
    Every name in ``datefields`` becomes a DATETIME field, replacing a TEXT
    field of the same name if there is one.
    '''
    analyzer = whoosh.analysis.SimpleAnalyzer()
    field_types = dict.fromkeys(fields, whoosh.fields.TEXT(analyzer=analyzer))
    if datefields:
        field_types.update(dict.fromkeys(datefields, whoosh.fields.DATETIME))

    schema = whoosh.fields.Schema()
    for name, field_type in field_types.items():
        schema.add(name, field_type)
    return schema
def convert(query, parser):
    '''Parse ``query`` with ``parser`` and return the joined output of
    walk_clause() as a single unicode string.
    '''
    return u''.join(walk_clause(parser.parse(query)))
def __sample_parser(schema=None):
    '''Return a parser with sample int, yes/no and alias fields, used by
    main() for command line experimentation.
    '''
    options = {
        "int_fields": ["count", "number"],
        "yesno_fields": ["active", "ready"],
        "aliases": {"alias": ["alias1", "alias2"]},
        "schema": schema,
    }
    return make_parser(**options)
def __sample_schema():
    '''Return a sample schema covering the fields used by the sample
    parser, for command line experimentation.
    '''
    fieldnames = [
        "foo", "bar", "baz", "count", "number", "active",
        "text", "ready", "active", "alias", "alias1",
        "alias2",
    ]
    return make_schema(fieldnames)
def main(args):
'''For command line experimentation. Sample output:
$ python l2cs.py 'foo:bar AND baz:bork'
Lucene input: foo:bar AND baz:bork
Parsed representation: And([Term(u'foo', u'bar'), Term(u'baz', u'bork')])
Lucene form: (foo:bar AND baz:bork)
Cloudsearch form: (and (field foo 'bar') (field baz 'bork'))
'''
args = [unicode(u, 'utf-8') for u in args[1:]]
schema = __sample_schema() if "--schema" in args else None
if schema:
args.pop(args.index("--schema"))
query = u' '.join(args)
print "Lucene input:", query
parser = __sample_parser(schema=schema)
parsed = parser.parse(query)
print "Parsed representation:", repr(parsed)
print "Lucene form:", unicode(parsed)
cloudsearch_query = ''.join(walk_clause(parsed))
print "Cloudsearch form:", cloudsearch_query
if __name__ == '__main__':
main(sys.argv)