This repository has been archived by the owner on Nov 10, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscrubber.py
298 lines (225 loc) · 10.3 KB
/
scrubber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
# scrubber.py
# http://sites.google.com/site/pocketsense/
# fix known issues w/ OFX downloads
# Ver 1: rlc, 2010
# 05-Aug-2010*rlc
# - Added _scrubTime() function to fix NULL time stamps so that transactions record on the correct date
# regardless of time zone.
# 28-Jan-2011*rlc
# - Added _scrubDTSTART() to fix missing <DTEND> fields when a <DTSTART> exists.
# - Recoded scrub routines to use regex substitutions
# 28-Aug-2012*rlc
# - Added quietScrub option to sites.dat (suppresses scrub messages)
# 02-Feb-2013*rlc
# - Bug fix in _scrubDTSTART
# - Added scrub routine to verify Investment buy and sell transactions
# 17-Feb-2013*rlc
# - Added scrub routine for CORRECTACTION and CORRECTFITID tags (not supported by Money)
# 06-Jul-2013*rlc
# - Bug fix in _scrubDTSTART()
# 20-Feb-2014*rlc
# - Bug fix in _scrubINVsign() for SELL transactions
import os, sys, re, datetime
import site_cfg
from control2 import *
userdat = site_cfg.site_cfg()
nullTimeUpdated = False
stat=False
def scrubPrint(line):
if not userdat.quietScrub:
print line
def scrub(filename, site):
#filename = string
#site = DICT structure containing full site info from sites.dat
siteURL = FieldVal(site, 'url').upper()
dtHrs = FieldVal(site, 'timeOffset')
f = open(filename,'r')
ofx = f.read() #as-found ofx message
#print ofx
if 'DISCOVERCARD' in siteURL: ofx= _scrubDiscover(ofx)
ofx= _scrubTime(ofx) #fix 000000 and NULL datetime stamps
if dtHrs <> 0: ofx = _scrubShiftTime(ofx, dtHrs) #note: always call *after* _scrubTime()
ofx= _scrubDTSTART(ofx) #fix missing <DTEND> fields
#fix malformed investment buy/sell signs (neg vs pos), if they exist
if "<INVSTMTTRNRS>" in ofx.upper(): ofx= _scrubINVsign(ofx)
#perform general ofx cleanup
ofx = _scrubGeneral(ofx)
#close the input file
f.close()
#write the new version to the same file name
f = open(filename, 'w')
f.write(ofx)
f.close
#-----------------------------------------------------------------------------
# OFX.DISCOVERCARD.COM
# 1. Discover OFX files will contain transaction identifiers w/ the following format:
# FITIDYYYYMMDDamt#####, where
# FITID = string literal
# YYYY = year (numeric)
# MM = month (numeric)
# DD = day (numeric)
# amt = dollar amount of the transaction, including a hypen for negative entries (e.g., -24.95)
# ##### = 5 digit serial number
# 2. The 5-digit serial number can change each time you connect to the server,
# meaning that the same transaction can download with different FITID numbers.
# That's not good, since Money requires a unique FITID value for each valid transaction.
# Varying serial numbers result in duplicate transactions!
# 3. We'll replace the 5-digit serial number with one of our own.
# The default will be 0 for every transaction,
# and we'll increment by one for each subsequent transaction that that matches
# a previous transaction in the file.
_sD_knownvals = [] #global to keep track of Discover FITID values between regex.sub() calls
def _scrubDiscover(ofx):
scrubPrint(" +Scrubber: Processing Discover statement.")
ofx_final = '' #new ofx message
_sD_knownvals = [] #reset our global set of known vals (just in case)
#regex p captures everything from <FITID> up to the next <tag>, but excludes the next "<".
#p produces 2 results: r.group(1) = <FITID> field, r.group(2)=value
p = re.compile(r'(<FITID>)([^<\s]+)',re.IGNORECASE)
#call substitution (inline lamda, takes regex result = r as tuple)
ofx_final = p.sub(lambda r: _scrubDiscover_r1(r), ofx)
return ofx_final
def _scrubDiscover_r1(r):
#regex subsitution function for _scrubDiscover()
global _sD_knownvals
fieldtag = r.group(1)
fitid = r.group(2).strip(' ')
#pointer to end of "base" FITID value
bx = len(fitid) - 5
fitid_b = fitid[:bx]
#find a unique serial#, from 0 to 9999
seq = 0 #default
while seq < 9999:
fitid = fitid_b + str(seq)
exists = (fitid in _sD_knownvals)
if exists: #already used it... try another
seq=seq+1
else:
break #unique value... write it out
_sD_knownvals.append(fitid) #remember the assigned value between calls
return fieldtag + fitid #return the new string for regex.sub()
#--------------------------------
def _scrubTime(ofx):
#Replace NULL time stamps with noontime (12:00)
#regex p captures everything from <DT*> up to the next <tag>, but excludes the next "<".
#p produces 2 results: group(1) = <DT*> field, group(2)=dateval
p = re.compile(r'(<DT.+?>)([^<\s]+)',re.IGNORECASE)
#call date correct function (inline lamda, takes regex result = r tuple)
nullTimeUpdated = False
ofx_final = p.sub(lambda r: _scrubTime_r1(r), ofx)
if nullTimeUpdated: scrubPrint(" +Scrubber: Null time values updated.")
return ofx_final
def _scrubTime_r1(r):
# Replace zero and NULL time fields with a "NOON" timestamp (120000)
# Force "date" to be the same as the date listed, regardless of time zone by setting time to NOON.
# Applies when no time is given, and when time == MIDNIGHT (000000)
fieldtag = r.group(1)
DT = r.group(2).strip(' ') #date+time
# Full date/time format example: 20100730000000.000[-4:EDT]
if DT[8:] == '' or DT[8:14] == '000000':
#null time given. Adjust to 120000 value (noon).
DT = DT[:8] + '120000'
nullTimeUpdated = True
return fieldtag + DT
#--------------------------------
def _scrubDTSTART(ofx):
# <DTSTART> field for an account statement must have a matching <DTEND> field
# If DTEND is missing, insert <DTEND>="now"
# The assumption is made that only one statement exists in the OFX file (no multi-statement files!)
ofx_final = ofx
now = datetime.datetime.now()
nowstr = now.strftime("%Y%m%d%H%M00")
if ofx.find('<DTSTART>') >= 0 and ofx.find('<DTEND>') < 0:
#we have a dtstart, but no dtend... fix it.
scrubPrint(" +Scrubber: Fixing missing <DTEND> field")
#regex p captures everything from <DTSTART> up to the next <tag> or white space into group(1)
p = re.compile(r'(<DTSTART>[^<\s]+)',re.IGNORECASE)
if Debug: print "DTSTART: findall()=", p.findall(ofx_final)
#replace group1 with (group1 + <DTEND> + datetime)
ofx_final = p.sub(r'\1<DTEND>'+nowstr, ofx_final)
return ofx_final
def _scrubShiftTime(ofx, h):
#Shift DTASOF time values by (float) h hours
#Added: 15-Feb-2011, rlc
#regex p captures everything from <DTASOF> up to the next <tag> or white-space.
#p produces 2 results: group(1) = <DTASOF> field, group(2)=dateval
p = re.compile(r'(<DTASOF>)([^<\s]+)',re.IGNORECASE | re.DOTALL)
#call date correct function (inline lamda, takes regex result = r tuple)
if p.search(ofx):
scrubPrint(" +Scrubber: Shifting DTASOF time values " + str(h) + " hours.")
ofx_final = p.sub(lambda r: _scrubShiftTime_r1(r,h), ofx)
return ofx_final
def _scrubShiftTime_r1(r,h):
#Shift time value by (float) h hours for regex search result r.
#Added: 15-Feb-2011, rlc
fieldtag = r.group(1) #date field tag (e.g., <DTASOF>)
DT = r.group(2).strip(' ') #date+time
if Debug: print "fieldtag=", fieldtag, "| DT=" + DT
# Full date/time format example: 20100730120000.000[-4:EDT]
#separate into date/time + timezone
tz = ""
if '[' in DT:
p = DT.index('[')
tz = DT[p:]
DT = DT[:p]
#strip the decimal fraction, if we have it
if '.' in DT:
d = DT.index('.')
DT = DT[:d]
if Debug: scrubPrint("New DT=" + DT + "| tz=" + tz)
#shift the time
tval = datetime.datetime.strptime(DT,"%Y%m%d%H%M%S") #convert str to datetime
deltaT = datetime.timedelta(hours=h)
tval += deltaT #add hours
DT = tval.strftime("%Y%m%d%H%M%S") + tz #convert new datetime to str
return fieldtag + DT
def _scrubINVsign(ofx):
#Fix malformed parameters in Investment buy/sell sections, if they exist
#Issue first noticed with Fidelity netbenefits 401k accounts: rlc*2013
#BUY transactions:
# UNITS must be positive
# TOTAL must be negative
#SELL transactions:
# UNITS must be negative
# TOTAL must be positive
stat=False
p = re.compile(r'(<INVBUY>|<INVSELL>)(.+?<UNITS>)(.+?)(<.+?<TOTAL>)([^<\r\n]+)', re.IGNORECASE)
ofx_final=p.sub(lambda r: _scrubINVsign_r1(r), ofx)
if stat:
scrubPrint(" +Scrubber: Invalid investment sign (pos/neg) found. Corrected.")
return ofx_final
def _scrubINVsign_r1(r):
type=""
if "INVBUY" in r.group(1): type = "INVBUY"
if "INVSELL" in r.group(1): type="INVSELL"
qty = r.group(3)
total=r.group(5)
qty_v=0
total_v=0
try:
qty_v=float(qty)
total_v=float(total)
except:
pass
if (type=="INVBUY" and qty_v<0) or (type=="INVSELL" and qty_v>0):
stat=True
qty=str(-1*qty_v)
if (type=="INVBUY" and total_v>0) or (type=="INVSELL" and total_v<0):
stat=True
total=str(-1*total_v)
rtn=r.group(1) + r.group(2) + qty + r.group(4) + total
return rtn
def _scrubGeneral(ofx):
# General scrub routine for singular tag substitutions
# Remove tag/value pairs that Money doesn't support
#define unsupported tags that we've had trouble with
uTags = []
uTags.append('CORRECTACTION')
uTags.append('CORRECTFITID')
# remove tag/value pairs from ofx
for tag in uTags:
p = re.compile(r'<'+tag+'>[^<]+',re.IGNORECASE)
if p.search(ofx):
ofx = p.sub('',ofx)
print(" +Scrubber: <"+tag+"> tags removed. Not supported by Money.")
return ofx