# regexComp.R
library(Rllvm)
source("genXPathWrapper.R")
if (FALSE) {
  # Interactive exploration only; this branch is never evaluated when the
  # file is sourced. It parses the PCRE2 header with RCIndex and lists the
  # routines and data structures it declares.
  library(RCIndex)
  tu <- createTU("/usr/local/include/pcre2.h")
  r <- getRoutines(tu)
  grep("pcre2_compile", names(r), value = TRUE)
  # The header provides _8 and _16 variants of these routines.
  # The rest of this file uses the _8 variants.
  ds <- getDataStructures(tu)
}
# When TRUE, printf() tracing calls are added to the generated routines.
DEBUG <- FALSE

# The pattern we will look for. The value is hard-coded in this version.
# It is probably easier to let the R user specify (and hence change) it, as
# was done in regex.c: we would then work directly with pointers coming from
# R rather than with string literals.
regex <- "foo"

m <- Module()

# Opaque structure declarations. We could probably just use vptrType
# (i.e. a void *) in their place.
real_match_data <- structType(character(), name = "struct.real_match_data", withNames = FALSE)
real_code <- structType(character(), name = "struct.real_code", withNames = FALSE)

preal_match_data <- pointerType(real_match_data)
vptrType <- pointerType(VoidType)

# NULL constants typed as pointers to 8-bit and 32-bit integers.
null <- getNULLPointer(pointerType(Int8Type))
null32 <- getNULLPointer(pointerType(Int32Type))
# Declarations of the external routines the generated code calls.
# PCRE2_SIZE is size_t in the PCRE2 C API and is declared here as Int64Type
# (this assumes a 64-bit platform).

# strlen() is not actually called below, since we pass PCRE2_ZERO_TERMINATED
# (i.e. -1 as an Int64Type) as the subject length. It returns a size_t, so
# its result is declared as Int64Type to match the length parameters.
strlen <- Function("strlen", Int64Type, list(StringType), module = m)

# int pcre2_match(code, subject, length, startoffset, options,
#                 match_data, mcontext)
# Both length and startoffset are PCRE2_SIZE, hence two Int64Type parameters.
pcre2_match <- Function("pcre2_match_8", Int32Type,
                        list(pointerType(real_code), StringType, Int64Type, Int64Type,
                             Int32Type, pointerType(real_match_data), vptrType),
                        module = m)

# pcre2_match_data *pcre2_match_data_create(ovecsize, gcontext)
pcre2_match_data_create <- Function("pcre2_match_data_create_8", pointerType(real_match_data),
                                    list(Int32Type, vptrType), module = m)

iptrType <- pointerType(Int32Type)

# pcre2_code *pcre2_compile(pattern, length, options, &errorcode,
#                           &erroroffset, ccontext)
pcre2_compile <- Function("pcre2_compile_8", pointerType(real_code),
                          list(StringType, Int64Type, Int32Type, iptrType,
                               pointerType(Int64Type), vptrType),
                          module = m)

if (DEBUG) {
  printf <- Function("printf", Int32Type, list(StringType), module = m, varArgs = TRUE)
}

# Two global variables, both filled in by initialize() and read by do_match():
#   pattern   - the compiled regular expression returned by pcre2_compile()
#   matchData - the match-data block passed to pcre2_match()
pattern <- createGlobalVariable("pattern", m, pointerType(real_code), NULL, alignment = 8L)
matchData <- createGlobalVariable("matchData", m, pointerType(real_match_data), NULL, alignment = 8L)
setUnnamedAddr(pattern, Rllvm:::Local)
setUnnamedAddr(matchData, Rllvm:::Local)

# An initialize() routine which calls pcre2_match_data_create() and
# pcre2_compile() and stores their results in the two globals.
f2 <- simpleFunction("initialize", VoidType, .types = list(), mod = m)
ir <- f2$ir
zero <- ir$createConstant(0L)

# ovecsize of 4 and a NULL general context.
md <- ir$createCall(pcre2_match_data_create, ir$createConstant(4L), null)
ir$createStore(md, matchData)

# The pattern text is stored in a constant global. Use the `regex` variable
# rather than repeating the literal so there is a single place to change it.
rx <- ir$createConstant(regex)
rxv <- createGlobalVariable("regex", m, val = rx, type = getType(rx), constant = TRUE, align = 1L)
setUnnamedAddr(rxv, "Local")

#XXX add the error handlers.
# Local variables that receive the error code and error offset from
# pcre2_compile(); both start at zero.
err <- ir$createLocalVariable(Int32Type, "err")
errOff <- ir$createLocalVariable(Int64Type, "errOff")
ir$createStore(ir$createConstant(0L), err)
ir$createStore(ir$createConstant(0L, Int64Type), errOff)

# The length is -1 (PCRE2_ZERO_TERMINATED) rather than nchar(regex) or a
# call to strlen(). Passing nchar(regex) as an Int64Type also works.
crx <- ir$createCall(pcre2_compile, ir$createGEP(rxv, c(0L, 0L)),
                     ir$createConstant(-1L, Int64Type), zero, err, errOff, null)
if (DEBUG) {
  fmt3 <- createGlobalVariable("printf.fmt3", m, val = ir$createConstant("pattern = %p, err = %d\n"), constant = TRUE, align = 1L)
  # err is the address of the local variable; load it so %d sees the code.
  ir$createCall(printf, ir$createGEP(fmt3, c(0L, 0L)), crx, ir$createLoad(err))
}
ir$createStore(crx, pattern)
ir$createReturn()

########################
# The match function: do_match(str) returns an i1 that is true when
# pcre2_match() reports a match of the compiled pattern against str.
f1 <- simpleFunction("do_match", Int1Type, str = StringType, mod = m)
ir <- f1$ir
if (DEBUG) {
  fmt1 <- createGlobalVariable("printf.fmt1", m, val = ir$createConstant("string = %s\n"), constant = TRUE, align = 1L)
  ir$createCall(printf, ir$createGEP(fmt1, c(0L, 0L)), f1$params$str)
}

# Instead of calling strlen(), pass PCRE2_ZERO_TERMINATED, i.e. -1 as a
# 64-bit integer.
len <- ir$createConstant(-1L, Int64Type)
# The start offset is a PCRE2_SIZE, so it needs a 64-bit zero; the options
# argument stays a 32-bit zero.
zero64 <- ir$createConstant(0L, Int64Type)
ans <- ir$createCall(pcre2_match, ir$createLoad(pattern), f1$params$str, len, zero64, zero,
                     ir$createLoad(matchData), null)
# pcre2_match() returns a negative code for no-match/errors. A return of 0
# means the subject matched but the offset vector was too small, so any
# non-negative value counts as a match.
ans2 <- ir$createICmp(ICMP_SGE, ans, zero)
if (DEBUG) {
  fmt2 <- createGlobalVariable("printf.fmt2", m, val = ir$createConstant("ans = %d, pattern = %p\n"), constant = TRUE, align = 1L)
  # Print the int returned by pcre2_match(), not the i1 comparison result.
  ir$createCall(printf, ir$createGEP(fmt2, c(0L, 0L)), ans, ir$createLoad(pattern))
}
ir$createReturn(ans2)
# Generate a routine named xpath_grepl around do_match() using
# genXPathWrapper() (from the sourced genXPathWrapper.R), then verify the
# module.
wrapper <- genXPathWrapper(
  f1$fun,
  retType = Int1Type,
  module = m,
  funName = "xpath_grepl"
)
verifyModule(m)
if (FALSE) {
  # Load the native libraries and pass the relevant addresses to LLVM.
  # pcre.p2 = dyn.load("/usr/local/lib/libpcre2-posix.2.dylib")
  pcre2 <- dyn.load("/usr/local/lib/libpcre2-8.0.dylib")
  # NOTE(review): the argument names used here (pcre2_match, ...) differ from
  # the routine names declared in the module (pcre2_match_8, ...); confirm
  # which of the two llvmAddSymbol() registers the address under.
  llvmAddSymbol(
    "printf", "strlen",
    pcre2_match = "pcre2_match_8",
    pcre2_compile = "pcre2_compile_8",
    pcre2_match_data_create = "pcre2_match_data_create_8"
  )

  # It is essential that both .llvm() calls are given ee, and the same ee.
  # If it is left out of the second call, pattern is NULL and the code just
  # returns 0.
  ee <- ExecutionEngine(m)
  .llvm(m$initialize, .ee = ee)
  .llvm(m$do_match, "xyz", .ee = ee)
  .llvm(m$do_match, "a fool and his money are soon parted", .ee = ee)

  # Use the compiled wrapper as an XPath extension function named grep_p.
  library(XML)
  doc <- xmlParse("doc3.xml")
  rref <- getPointerToFunction(m$xpath_grepl, ee)@ref
  els <- getNodeSet(
    doc,
    "//text()[grep_p(string(.))]/..",
    xpathFuns = list(grep_p = rref)
  )
}
###################################
if(FALSE) {
# If we were to build the wrapper ourselves, it would be along the following skeleton ....
# This is a sketch, not runnable code: it sits inside if(FALSE) and is never evaluated.
# NOTE(review): the result of this Function() call is not assigned, yet `valuePush` is
# referenced in the last createCall() below.
Function("valuePush", VoidType, list(vptrType))
# f = Function("do_match", VoidType, list(ctxt = vptrType, nargs = Int32Type), module = m)
# The routine takes the two arguments ctxt and nargs and returns nothing.
fn = simpleFunction("do_match", VoidType, .types = list(ctxt = vptrType, nargs = Int32Type), mod = m)
ir = fn$ir
# Fetch the string argument via popArg().
# NOTE(review): popArg and xmlXPathNewBoolean are not defined in the visible part of this
# file - presumably they come from genXPathWrapper.R; confirm.
str = popArg(ir, StringType, fn$params$ctxt, m)
# The third argument (the subject length) is left empty in this sketch, and the globals and
# R values are passed directly rather than as loaded values / LLVM constants.
ans = ir$createCall(pcre2_match, pattern, str, , 0L, 0L, matchData, 0L)
# Compare the pcre2_match() result against 0 with a signed greater-than.
ans2 = ir$binOp(ICMP_SGT, ans, 0)
# Wrap the comparison result with xmlXPathNewBoolean() and hand it to valuePush().
val = ir$createCall(xmlXPathNewBoolean, ans2)
ir$createCall(valuePush, val)
}