diff --git a/LICENCE b/LICENCE
new file mode 100644
index 0000000..6d95953
--- /dev/null
+++ b/LICENCE
@@ -0,0 +1,23 @@
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/README b/README
new file mode 100644
index 0000000..6172d66
--- /dev/null
+++ b/README
@@ -0,0 +1,35 @@
+This repository contains the code used for data analysis in the BONSAI
+project (financed by the Swedish funding agency Vinnova, 2018-2020).
+
+The objective of the project was to explore causal inference methods
+for finding causal links between childhood cancer treatment and late
+effects. The idea is to use a constraint-based causal inference
+method adapted to small data sets (most medical data sets are small
+from a machine learning perspective) to distinguish whether a late
+effect is caused by personal predisposition, the primary cancer
+diagnosis, or one of the treatments for the cancer.
+
+This repository contains two parts. One part contains the code for
+arranging the data from several registers (person info, control person
+info, diagnosis, treatments, in-patient care registry, out-patient
+care registry, death registry, and drug prescriptions) into a common
+file with all information about a patient or control person on one
+line each. The other part contains the analysis and visualization code
+based on that common file.
+
+The actual data is not included here, since it contains sensitive
+personal information and is therefore not publicly available.
+
+All the code is "research project quality", not production quality,
+which means that it is patchy, inconsistent, unclear, not well
+documented, and thus not easy to use by someone else. Nevertheless, it
+is included here for transparency about the methods used in the project.
+
+For questions about the code and how to reuse snippets of it in
+other projects, contact Anders Holst: anders.holst@ri.se
+
+For questions about availability of data, contact Helena Linge:
+helena.linge@med.lu.se
+
+For suggestions about future research collaboration around these issues,
+you can contact any of the above.
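+
+A minimal sketch of how the analysis part is meant to be driven (the
+file name, the selection, the factor value 'CNS' and the late-effect
+code 'C81' below are placeholders, since the data is not included; the
+function names are from dataanalysis/exptest3.py):
+
+    from exptest3 import *
+
+    df = read_total_file('total.tsv')   # the common one-line-per-person file
+    set_prstyle('average')              # prior style used by the rate tests
+    # Does the factor ('radio_diff', 'CNS') correlate with late effect
+    # 'C81' among girls, with and without conditioning on other factors?
+    analyse_one_effect(df, [('sex', 'flicka')], ('radio_diff', 'CNS'), [], 'C81')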
diff --git a/dataanalysis/Color.py b/dataanalysis/Color.py
new file mode 100644
index 0000000..a23ce37
--- /dev/null
+++ b/dataanalysis/Color.py
@@ -0,0 +1,67 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: Color.py
+
+Author: anders.holst@ri.se
+
+"""
+
+import numpy as np
+
+def colorgamma(x):
+    if x < 0.0031308:
+        return x*12.92
+    else:
+        return 1.055 * np.power(x, 0.416667) - 0.055
+
+def invcolorgamma(x):
+    if x < 0.04045:
+        return x / 12.92
+    else:
+        return np.power((x + 0.055)/1.055, 2.4)
+
+def rgb_color(r, g, b):
+    return tuple(list(map(colorgamma, [r,g,b])))
+
+# mimics color.scm with 6color, bluebased, and smooth
+def hsl_color(h, s, l):
+    hramp = [0, 1/12, 1/6, 1/3, 2/3, 7/9, 1]
+    iramp = [(2.410996, 0.16, 0, 0, 1), (0.862552, 0.79, 0, 1, 1),
+             (-0.252442, 0.63, 0, 1, 0), (-1.981885, 0.84, 1, 1, 0),
+             (1.786451, 0.21, 1, 0, 0), (1.571190, 0.37, 1, 0, 1),
+             (2.410996, 0.16, 0, 0, 1)]
+    i = 0
+    while h > hramp[i+1]:
+        i += 1
+    p = (h - hramp[i])/(hramp[i+1] - hramp[i])
+    (a, br, r, g, b) = tuple(map(lambda x1, x2: p*(x2 - x1) + x1, iramp[i], iramp[i+1]))
+    ll0 = (l + 1.0)*0.5
+    ll = (np.exp(a*ll0) - 1.0)/(np.exp(a) - 1.0) if not a==0 else ll0
+    if ll < br:
+        t1 = s * ll / br
+        t2 = ll0 * (1.0 - s)
+    else:
+        t1 = s * (1.0 - ll) / (1.0 - br)
+        t2 = ll0 * (1.0 - s) + s * (ll - br) / (1.0 - br)
+    return rgb_color(r*t1+t2, g*t1+t2, b*t1+t2)
+
+def gencolor(light=0.0, starthue=0.0, startsat=1.0, unsat=False):
+    h = starthue
+    s = startsat
+    if unsat:
+        ss = s*s
+    l = light
+    gs1 = (3.0 - np.sqrt(5))/2.0
+    gs2 = gs1/np.sqrt(3)
+    while True:
+        yield hsl_color(h, s, l)
+        h -= gs1
+        if h < 0.0:
+            h += 1.0
+        if unsat:
+            ss -= gs2
+            if ss <= 0:
+                ss += 1.0
+            s = np.sqrt(ss)
+
diff --git a/dataanalysis/exptest3.py b/dataanalysis/exptest3.py
new file mode 100644
index 0000000..c3c85a8
--- /dev/null
+++ b/dataanalysis/exptest3.py
@@ -0,0 +1,1255 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: exptest3.py
+
+Author: anders.holst@ri.se
+
+"""
+
+import pandas as pd
+from math import *
+from scipy.special import hyp1f1
+
+from hist import *
+from visual import *
+
+eps = 0.001
+
+# Brute force summation, in logarithms to avoid overflow
+def log_hyper_1f1_negintab(a, b, x):
+    m = -a
+    lr = 0.0
+    lhg = 0.0
+    for k in range(m):
+        lr += log(x*(a+k)/((k+1)*(b+k)))
+        lhg += log(1 + exp(lr - lhg))
+    return lhg
+
+# r *= (x*(a+k) / ((k+1)*(b+k)))
+# hg += r
+
+def log_hyper_1f1(a, b, x):
+    # b not a negative integer -> log(hyp1f1)
+    # b a negative integer -> interpolate between b +- eps
+    eps = 0.0001
+    if b<0 and b==int(b):
+        ret = (hyp1f1(a, b-eps, x) + hyp1f1(a, b+eps, x))/2
+    else:
+        ret = hyp1f1(a, b, x)
+    if isinf(ret):
+        print("overflow at hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    elif ret<=0:
+        print("negative hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    return log(ret)
+
+def log_hyper_1f1_interpol(a, b, x):
+    # a not a negative integer -> interpolate log(hyp1f1)
+    if a==int(a):
+        return log_hyper_1f1_negintab(int(a), b, x)
+    else:
+        r1 = log_hyper_1f1_negintab(floor(a), b, x)
+        r2 = log_hyper_1f1_negintab(ceil(a), b, x)
+        prop = a - floor(a)
+        return r2*prop + r1*(1.0-prop)
+
+# pa = (n, t)
+def LogLambdaProb_un(pa1, ll):
+    (n1, t1) = pa1
+    if ll <= 0:
+        return -inf if n1 > 0 else 0.0
+    return n1 * log(ll) - t1*ll
+
+def LogLambdaProb_pr_un(pa1, ll, pr):
+    (n1, t1) = pa1
+    (n0, t0) = pr
+    if ll <= 0:
+        return -inf if n1+n0 > 0 else 0.0
+    return (n1+n0) * log(ll) - (t1+t0)*ll
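+
+# Note on the probability functions here: pa1 = (n1, t1) and pa2 = (n2, t2)
+# are (event count, exposure time) observations from two Poisson processes,
+# ll is an intensity and dl a difference between two intensities. Each
+# intensity has a Gamma-shaped (log-)posterior, and integrating out the
+# common rate level from the product of two such posteriors yields a
+# confluent hypergeometric function 1F1 with negative integer parameters,
+# which is why the brute-force log summation above is used.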
+
+def LogDifflambdaProbOne_un(pa1, pa2, dl):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl * t2) + log_hyper_1f1_negintab(-n2, -n1 - n2, dl*(t1 + t2))
+    else:
+        ret = (dl * t1) + log_hyper_1f1_negintab(-n1, -n1 - n2, -dl*(t1 + t2))
+    return ret
+
+def LogDifflambdaProbOne_pr_un_old(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+    else:
+        ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0 or t1+t2 == 0.0 or n1+n2 == 0:
+        ret = 0.0
+    else:
+        if prior_style == 'partaverage': # separate priors per condition
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2+pr[0]))
+        elif prior_style == 'partaveragezero': # no extra event
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2))
+        else:
+            (n0, t0) = pr
+        if dl > 0.0:
+            ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+        else:
+            ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_num1_old(pa1, pa2, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_pr_un(pa1, ll, pr)
+    func2 = lambda ll: LogLambdaProb_pr_un(pa2, ll, pr)
+    (rng, vals) = find_calc_hrange(func1, (n1+n0+1)/(t1+t0), (n1+n0+1)/(5*(t1+t0)), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+n0+1)/(t2+t0), (n2+n0+1)/(5*(t2+t0)), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num1(pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1+t2==0.0 or n1+n2==0:
+        return ()
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_un(pa1, ll)
+    func2 = lambda ll: LogLambdaProb_un(pa2, ll)
+    (rng, vals) = find_calc_hrange(func1, (n1+1)/t1, (n1+1)/(5*t1), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+1)/t2, (n2+1)/(5*t2), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num2(tup, dl):
+    if not tup:
+        return 0.0
+    (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2) = tup
+    # then sweep through for fixed dl and sum up (numerical convolution of
+    # the two posteriors over their overlapping range)
+    dd = min(d1, d2)
+    aa = max(a1, a2-dl)
+    bb = min(b1, b2-dl)
+    sm = 0.0
+    lst = []
+    x = aa
+    while x <= bb:
+        lst.append(func1(x) + func2(x+dl) - mx1 - mx2)
+        x += dd
+    for v in lst:
+        sm += exp(v)
+    return mx1 + mx2 + log(sm*dd) if sm>0.0 else -inf
+
+#def LogDifflambdaProb_un(da1, da2, dl):
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_un(pa1, pa2, dl), da1, da2)))
+
+#def LogDifflambdaProb_pr_un(da1, da2, dl, pr):
+#    pr = (pr[0]/len(da1), pr[1]/len(da1))
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr), da1, da2)))
+
+def LogDifflambdaProb_num1(da1, da2):
+    return list(map(lambda pa1,pa2: LogDifflambdaProbOne_num1(pa1, pa2), da1, da2))
+
+def LogDifflambdaProb_num2(tupl, dl):
+    return sum(list(map(lambda tup: LogDifflambdaProbOne_num2(tup, dl), tupl)))
+
+prior_style = None
+
+def set_prstyle(st):
+    global prior_style
+    prior_style = st
+
+def DifflambdaHist(da1, da2, pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1 == 0.0 or t2 == 0.0 or (n1 == 0 and n2 == 0):
+        return False
+    tupl = LogDifflambdaProb_num1(da1, da2)
+    func = lambda dl: LogDifflambdaProb_num2(tupl, dl)
+    (xres, yres) = find_calc_hrange(func, 0.0, (n1+n2+1)/(t1+t2), 0.1, 20)
+    return [xres, normalize_logprob(yres)]
+
+def LambdaHist(pa1):
+    (n1, t1) = pa1
+    if t1 == 0.0 or n1 == 0:
+        return False
+    func = lambda dl: LogLambdaProb_un(pa1, dl)
+    (xres, yres) = find_calc_hrange(func, (n1+1)/t1, (n1+1)/(5*t1), 0.2, 20)
+    return [xres, normalize_logprob(yres)]
+
+def normalize_logprob(vec):
+    mx = max(vec)
+    sm = 0.0
+    for i in range(len(vec)):
+        sm += exp(vec[i] - mx)
+    norm = log(sm) + mx
+    return [exp(x-norm) for x in vec]
+
+def find_calc_hrange(func, start, step, mindiff, maxdiff):
+    def zpush(v):
+        return 0.0 if abs(v)<2e-11 else v
+    fa = nan
+    fb = nan
+    a = nan
+    b = nan
+    m1 = zpush(start - 0.5*step)
+    fm1 = func(m1)
+    m2 = zpush(start + 0.5*step)
+    fm2 = func(m2)
+    left = []
+    right = []
+    while True:
+        #print(a, fa, m1, fm1, m2, fm2, b, fb)
+        if fm1 > fm2:
+            if not isnan(b):
+                right = [(b, fb)] + right
+            (m, fm) = (m1, fm1)
+            (b, fb) = (m2, fm2)
+        else:
+            if not isnan(a):
+                left = [(a, fa)] + left
+            (a, fa) = (m1, fm1)
+            (m, fm) = (m2, fm2)
+        if isnan(a):
+            m1 = zpush(3*m - 2*b)
+            fm1 = func(m1)
+            (m2, fm2) = (m, fm)
+        elif isnan(b):
+            (m1, fm1) = (m, fm)
+            m2 = zpush(3*m - 2*a)
+            fm2 = func(m2)
+        else:
+            tmp = [fa, fm, fb]
+            mx = max(tmp)
+            mn = min(tmp)
+            if mx-mn < mindiff or (b-a) < 1e-10:
+                break
+            if (m-a) > (b-m)*1.5:
+                dir = 0
+            elif (m-a)*1.5 < (b-m):
+                dir = 1
+            else:
+                dir = 0 if fa > fb else 1
+            if dir == 0:
+                m1 = zpush(0.5*(a+m))
+                fm1 = func(m1)
+                (m2, fm2) = (m, fm)
+            else:
+                (m1, fm1) = (m, fm)
+                m2 = zpush(0.5*(m+b))
+                fm2 = func(m2)
+    delta = min(m-a, b-m)
+    left = [(a, fa)] + left
+    right = [(b, fb)] + right
+    if m-a < b-m:
+        right = [(m, fm)] + right
+    else:
+        left = [(m, fm)] + left
+    tr = mx - maxdiff
+    #print("^", mn, mx, tr)
+    resx = []
+    resy = []
+    leftind = 0
+    if delta < 1e-10:
+        delta = 1e-3
+    while leftind < len(left) and left[leftind][1] > tr:
+        resx = [left[leftind][0]] + resx
+        resy = [left[leftind][1]] + resy
+        a = zpush(left[leftind][0] - delta)
+        leftind += 1
+    while leftind >= len(left) or a > left[leftind][0] + delta/2:
+        fa = func(a)
+        #print("L", a, fa)
+        if isnan(fa) or isinf(fa) or fa < tr:
+            break
+        resx = [a] + resx
+        resy = [fa] + resy
+        a = zpush(a-delta)
+    rightind = 0
+    while rightind < len(right) and right[rightind][1] > tr:
+        resx = resx + [right[rightind][0]]
+        resy = resy + [right[rightind][1]]
+        b = zpush(right[rightind][0] + delta)
+        rightind += 1
+    while rightind >= len(right) or b < right[rightind][0] - delta/2:
+        fb = func(b)
+        #print("R", b, fb)
+        if isnan(fb) or isinf(fb) or fb < tr:
+            break
+        resx = resx + [b]
+        resy = resy + [fb]
+        b = zpush(b+delta)
+    #print(a, m, b)
+    return (resx, resy)
+
+def selfunc(entry, val):
+    if type(entry) == list:
+        if len(entry)>0 and type(entry[0])==list:
+            entry = list(map(lambda x:x[0], entry))
+        if type(val) == list:
+            for x in val:
+                if x in entry:
+                    return True
+            return False
+        else:
+            return val in entry
+    else:
+        if type(val) == list:
+            return entry in val
+        else:
+            return val == entry
+
+def selassoc(sel, key):
+    if sel is False:
+        return []
+    else:
+        return [x[-1] for x in sel if key == x[0] or (len(x)==3 and key == x[1])]
+
+def assoc(lst, key, defl):
+    for ele in lst:
+        if ele[0]==key:
+            return ele[1]
+    return defl
+
+def assoc_a0(lst, key, defl):
+    if type(lst)==list:
+        for ele in lst:
+            if type(ele)==list:
+                if ele[0]==key:
+                    return ele[1]
+            else:
+                if ele==key:
+                    return 0
+        return defl
+    else:
+        return 0 if lst==key else defl
+
+def subsets(lst, ord):
+    if ord==0:
+        return [[]]
+    elif ord > len(lst):
+        return []
+    else:
+        rest = subsets(lst[1:], ord-1)
+        return list(map(lambda l:[lst[0]]+l, rest)) + subsets(lst[1:],ord)
+
+def issubset(l1, l2):
+    for e in l1:
+        if not e in l2:
+            return False
+    return True
+
+def tupleadd(t1, t2):
+    return tuple((t1[i]+t2[i] for i in range(min(len(t1),len(t2)))))
+    #return (t1[0]+t2[0],t1[1]+t2[1])
+
+def tuplescale(t1, scale):
+    return tuple((t1[i]*scale for i in range(len(t1))))
+    #return (t1[0]*scale,t1[1]*scale)
+
+def listminus(l1, l2):
+    return [e for e in l1 if not e in l2]
+
+def read_total_file(file):
+    df = pd.read_csv(file, sep='\t')
+    for c in df:
+        if type(df[c][0])==str and df[c][0][0] == '[':
+            df[c] = df[c].apply(eval)
+    return df
+
+def valsincolumn_old(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if not e in res:
+                    res.append(e)
+        else:
+            if not val in res:
+                res.append(val)
+    return sorted(res)
+
+def valsincolumn(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if type(e)==list:
+                    if not e[0] in res:
+                        res.append(e[0])
+                else:
+                    if not e in res:
+                        res.append(e)
+        else:
+            if not val in res and not (type(val)==float and isnan(val)):
+                res.append(val)
+    return sorted(res)
+
+#--------------------------
+
+# New approach to avoid conditional dilution: For each late effect, in a
+# suitable subset of the data (patients, sex, etc), check whether a given
+# factor (characteristics, diagnosis, treatment) is significantly
+# correlated. Then condition on each other factor that is also correlated,
+# and see whether the significance disappears completely, or the frequency
+# changes significantly. List those that do not disappear/change, in order
+# of significance. Keep track of whether all of them disappear under
+# cross-wise conditioning, or whether several remain. Ideally, each
+# significant correlation should be explained by one factor, or by several
+# factors of the same type.
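+
+# A minimal illustrative sketch (not part of the original analysis pipeline)
+# of the rate-difference machinery above, on hypothetical toy numbers:
+# group 1 has 5 events over 40 person-years, group 2 has 1 event over 60.
+# DifflambdaHist returns a discretized posterior over the rate difference,
+# and hist_mean_var / hist_significance (from hist.py, as used in
+# analysediff below) summarize it.
+def _demo_rate_difference():
+    da1 = [(5, 40.0)]    # one (event count, exposure time) pair per stratum
+    da2 = [(1, 60.0)]
+    hist = DifflambdaHist(da1, da2, (5, 40.0), (1, 60.0))
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+        print("rate difference %.4f +- %.4f, significance %.4f"
+              % (mean, sqrt(var), sig))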
+
+def select_data(df, sel1, sel2, selg):
+    # sel = [(col, val)]   selg = [('otherdia', 't21')]  [('DIA',1)]  [False, 'stralung','ANSIKTE']
+    # cond = [col1, col2, ]
+    if df.empty:
+        return (df,df)
+    if selg:
+        test = df[df.columns[0]].apply(lambda entry: True)
+        for ele in selg:
+            if len(ele) == 3:
+                test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+            else:
+                test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+        df = df[test]
+    test = df[df.columns[0]].apply(lambda entry: True)
+    for ele in sel1:
+        if len(ele) == 3:
+            test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+        else:
+            test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+    d1 = df[test]
+    if sel2 is False:
+        d2 = df[~test]
+    else:
+        test = df[df.columns[0]].apply(lambda entry: True)
+        for ele in sel2:
+            if len(ele) == 3:
+                test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+            else:
+                test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+        d2 = df[test]
+    return (d1, d2)
+
+def select_data_alt(df, selalt, selg):
+    if df.empty:
+        return (df,df)
+    if selg:
+        test = df[df.columns[0]].apply(lambda entry: True)
+        for ele in selg:
+            if len(ele) == 3:
+                test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+            else:
+                test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+        df = df[test]
+    test = df[df.columns[0]].apply(lambda entry: False)
+    for ele in selalt:
+        if len(ele) == 3:
+            test = test | df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+        else:
+            test = test | df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+    df = df[test]
+    return df
+
+def count_effect(df, eff, ncol, ecol):
+    n = 0
+    t = 0
+    for i in range(len(df)):
+        row = df.iloc[i]
+        x = assoc(row[ecol], eff, False)
+        if x is not False:
+            n += 1
+            t += x
+        else:
+            t += row[ncol]
+    return (n, t)
+
+def count_effect_mtag(df, eff, tags):
+    ecols = ['event_time_' + tag for tag in tags]
+    ncols = ['no_event_time_' + tag for tag in tags]
+    #ccols = ['censored_time_' + tag for tag in tags]
+    res = (0, 0, 0)
+    for i in range(len(df)):
+        row = df.iloc[i]
+        x = min([assoc(row[ecol], eff, inf) for ecol in ecols])
+        if x is not inf:
+            res = tupleadd(res, (1, x, 1))
+        else:
+            x = min([row[ncol] for ncol in ncols])
+            res = tupleadd(res, (0, x, 1))
+    return res
+
+#def count_cond_effect(df, eff, ncol, ecol, cols, ignore):
+#    dic = {}
+#    for i in range(len(df)):
+#        row = df.iloc[i]
+#        x = assoc(row[ecol], eff, False)
+#        if x is not False:
+#            incr = (1, x)
+#        else:
+#            incr = (0, row[ncol])
+#        increment_tuple_value(row, dic, cols, ignore, incr)
+#    return dic
+
+def get_cond_stats_daydiff(df, pair, conds, selg, tags, eff):
+    # We have to run through all the data, and for each sample sort the
+    # values into the right dict and key.
+    # The rule is that a condition is true if the eff day is later than any cond day.
+    (dd, dummy) = select_data(df, [], False, selg)
+    kl = [""]
+    for cond in conds:
+        kl = [(k+"_"+str(cond[-1]), k+"_~"+str(cond[-1])) for k in kl]
+        kl = [k[0] for k in kl] + [k[1] for k in kl]
+    dick = { k : kl.index(k) for k in kl}
+    dic1 = { k : (0, 0, 0) for k in kl}
+    dic2 = { k : (0, 0, 0) for k in kl}
+    invk = { kl.index(k) : k for k in kl}
+    bits = [2**i for i in range(len(conds))]
+    ecols = ['event_time_' + tag for tag in tags]
+    ncols = ['no_event_time_' + tag for tag in tags]
+    for i in range(len(dd)):
+        row = dd.iloc[i]
+        x = min([assoc(row[ecol], eff, inf) for ecol in ecols])
+        # here, sort it into correct key
+        # for each cond and pair, find the latest day before the eff day (minus a margin)
+        ylst = [y if y
+
+def estimatefactor(da1, da2):
+    lf = 0.0
+    sn = 0
+    for ((n1, t1), (n2, t2)) in zip(da1, da2):
+        if n1 > 0 and n2 > 0:
+            lf += min(n1,n2)*(log(n2/t2) - log(n1/t1))
+            sn += min(n1,n2)
+    return exp(lf/sn) if sn>0 else 1.0
+
+def dictolistwithprior(cdic1, cdic2, cdic0, tscale = 1.0):
+    n1,t1,s1 = (0,0,0)
+    n2,t2,s2 = (0,0,0)
+    for k in cdic0:
+        (tmp1, tmp2) = (cdic1[k], cdic2[k])
+        n1 += tmp1[0]
+        t1 += tmp1[1]*tscale
+        s1 += tmp1[2]
+        n2 += tmp2[0]
+        t2 += tmp2[1]*tscale
+        s2 += tmp2[2]
+    if prior_style == 'noninfo': # Noninformative prior
+        prn = 1.0/len(cdic0)
+        lst1 = [ (cdic1[k][0] - prn, cdic1[k][1]*tscale) for k in cdic0 ]
+        lst2 = [ (cdic2[k][0] - prn, cdic2[k][1]*tscale) for k in cdic0 ]
+    elif prior_style == 'average': # Average prior
+        prn = 1.0/len(cdic0)
+        prt = prn*(t1+t2)/(n1+n2+1)
+        lst1 = [ (cdic1[k][0] + prn, cdic1[k][1]*tscale + prt) for k in cdic0 ]
+        lst2 = [ (cdic2[k][0] + prn, cdic2[k][1]*tscale + prt) for k in cdic0 ]
+    elif prior_style == 'partaverage': # separate priors per condition
+        prn = 1.0/len(cdic0)
+        lst1 = []
+        lst2 = []
+        for k in cdic0:
+            (nn1, tt1, ss1) = cdic1[k]
+            (nn2, tt2, ss2) = cdic2[k]
+            prt = prn*(tt1+tt2)/(nn1+nn2+prn)
+            lst1.append((nn1+prn, (tt1+prt)*tscale))
+            lst2.append((nn2+prn, (tt2+prt)*tscale))
+    elif prior_style == 'partaveragezero': # no extra event
+        prn = 1.0/len(cdic0)
+        lst1 = []
+        lst2 = []
+        for k in cdic0:
+            (nn1, tt1, ss1) = cdic1[k]
+            (nn2, tt2, ss2) = cdic2[k]
+            if nn1+nn2 > 0:
+                prt = prn*(tt1+tt2)/(nn1+nn2)
+                lst1.append((nn1+prn, (tt1+prt)*tscale))
+                lst2.append((nn2+prn, (tt2+prt)*tscale))
+            else:
+                lst1.append((nn1, tt1*tscale))
+                lst2.append((nn2, tt2*tscale))
+    else: # No prior
+        lst1 = [ (cdic1[k][0], cdic1[k][1]*tscale) for k in cdic0 ]
+        lst2 = [ (cdic2[k][0], cdic2[k][1]*tscale) for k in cdic0 ]
+    return (lst1, lst2, (n1,t1,s1), (n2,t2,s2))
+
+def analysediff(cdic1, cdic2, cdic0):
+    (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+    hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+    fact = estimatefactor(dlst1, dlst2)
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+    else:
+        mean = 0.0
+        var = 0.0
+        sig = 1.0
+    nn1 = sum(list(map(lambda p:p[0], dlst1)))
+    tt1 = sum(list(map(lambda p:p[1], dlst1)))
+    nn2 = sum(list(map(lambda p:p[0], dlst2)))
+    tt2 = sum(list(map(lambda p:p[1], dlst2)))
+    hist1 = LambdaHist((nn1, tt1))
+    if hist1 is not False:
+        mean1, var1 = hist_mean_var(hist1)
+    else:
+        mean1 = 0.0
+    hist2 = LambdaHist((nn2, tt2))
+    if hist2 is not False:
+        mean2, var2 = hist_mean_var(hist2)
+    else:
+        mean2 = 0.0
+    return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact,
+            'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1,
+            'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2,
+            'hist':hist, 'hist1':hist1, 'hist2':hist2}
+
+def analysediff01(cdic1, cdic2, cdic0):
+    (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+    hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+    fact = estimatefactor(dlst1, dlst2)
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+    else:
+        mean = 0.0
+        var = 0.0
+        sig = 1.0
+    nn1 = sum(list(map(lambda p:p[0], dlst1)))
+    tt1 = sum(list(map(lambda p:p[1], dlst1)))
+    nn2 = sum(list(map(lambda p:p[0], dlst2)))
+    tt2 = sum(list(map(lambda p:p[1], dlst2)))
+    mean1 = nn1/tt1 if tt1 > 0 else 0.0
+    mean2 = nn2/tt2 if tt2 > 0 else 0.0
+    return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact,
+            'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1,
+            'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2,
+            'hist':hist, 'hist1':False, 'hist2':False}
+
+def analysediff0(cdic1, cdic2, cdic0):
+    #dlst1 = cdictolist(cdic1, cdic0, 1.0/36525)
+    #dlst2 = cdictolist(cdic2, cdic0, 1.0/36525)
+    #if prior_style == 'noninfo':
+    #    pr = (-1, 0)  # Noninformative prior
+    #elif prior_style in ['average','partaverage','partaveragezero']:
+    #    n1 = sum(list(map(lambda p:p[0], dlst1)))
+    #    t1 = sum(list(map(lambda p:p[1], dlst1)))
+    #    n2 = sum(list(map(lambda p:p[0], dlst2)))
+    #    t2 = sum(list(map(lambda p:p[1], dlst2)))
+    #    pr = (1, 1*(t1+t2)/(n1+n2+1))  # Average prior
+    #else:
+    #    pr = (0, 0)  # No prior
+    #hist = DifflambdaHist(dlst1, dlst2, pr)
+    (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+    hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+    else:
+        mean = 0.0
+        var = 0.0
+        sig = 1.0
+    return {'mean':mean, 'std':sqrt(var), 'sig':sig}
+
+def analyseeffects1(df, selg, coldic, tags, eff):
+    resdic = {}
+    for col in coldic:
+        for val in coldic[col]:
+            #print((col,val))
+            #(dic1, dic2, kdic) = get_cond_stats_one(df, (col, val), [], selg, tags, eff)
+            (dic1, dic2, kdic) = get_cond_stats_daydiff(df, (col, val), [], selg, tags, eff)
+            resdic[(col,val)] = analysediff(dic2, dic1, kdic)
+    return resdic
+
+def analyseeffects_back(df, selg, pair, tags, efflst):
+    resdic = {}
+    for eff in efflst:
+        col,val = pair
+        (dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+        resdic[eff] = analysediff(dic2, dic1, kdic)
+    return resdic
+
+def analyseeffects2(df, selg, resdic1, sig, tags, eff):
+    resdic2 = {}
+    lst = []
+    for pair in resdic1:
+        if resdic1[pair]['sig'] <= sig:
+            lst.append(pair)
+    lst.sort(key=lambda x: resdic1[x]['sig'])
+    for (ind,pair1) in reversed(list(enumerate(lst))):
+        mnsig = 0.0
+        mntmp = resdic1[pair1]
+        for pair2 in lst:
+            if pair1 != pair2:
+                #print(pair1,pair2)
+                #(dic1, dic2, kdic) = get_cond_stats_one(df, pair1, [pair2], selg, tags, eff)
+                (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair1, [pair2], selg, tags, eff)
+                tmp = analysediff(dic2, dic1, kdic)
+                if tmp['sig'] > mnsig:
+                    mnsig = tmp['sig']
+                    tmp['cond'] = pair2
+                    mntmp = tmp
+        resdic2[pair1] = mntmp
+        if mnsig > sig:
+            del lst[ind]
+    return resdic2
+
+def movetosaved(remaining, saved, removed, dic):
+    # everyone not removed by anything other than 'removed' is moved to 'saved'
+    changed = False
+    for (ind,pair) in reversed(list(enumerate(remaining))):
+        lst = dic[pair]
+        ok = True
+        for conds,sig in lst:
+            ok = False
+            for cond in conds:
+                if cond in removed:
+                    ok = True
+            if not ok:
+                break
+        if ok:
+            changed = True
+            saved.append(pair)
+            del remaining[ind]
+    return changed
+
+def movetoremoved(remaining, saved, removed, dic):
+    # everyone removed by someone in 'saved' is moved to 'removed'
+    changed = False
+    for (ind,pair) in reversed(list(enumerate(remaining))):
+        lst = dic[pair]
+        ok = False
+        for conds,sig in lst:
+            ok = True
+            for cond in conds:
+                if cond not in saved:
+                    ok = False
+            if ok:
+                break
+        if ok:
+            changed = True
+            removed.append(pair)
+            del remaining[ind]
+    return changed
+
+def allexcept(lst, ele):
+    return [e for e in lst if e != ele]
+
+def analyseeffects2new(df, selg, resdic1, sig, tags, eff):
+    ciidic = {}
+    lst = []
+    # select those with a significant indirect effect
+    for pair in resdic1:
+        if resdic1[pair]['sig'] <= sig:
+            lst.append(pair)
+            ciidic[pair] = []
+    # in the first pass, condition each one on each of the others,
+    # and save a list of which ones make it insignificant
+    # remove those that
+    # 1) are made insignificant by something (that they do not themselves make insignificant)
+    # 2) and are not needed to make anything else insignificant
+    # In a loop: keep those that are not removed by anything, drop those removed by them,
+    # iterate with the rest, i.e. of the rest keep those not removed by the remaining ones
+    # in the next pass, condition on pairs (and later triples) of the remaining ones
+    # and remove according to the analogous principle
+    remaining = lst
+    saved = []
+    totsaved = []
+    for order in [1,2,3]:
+        remaining = saved + remaining
+        conds = subsets(remaining, order)
+        for pair in lst:
+            for cond in conds:
+                if not pair in cond:
+                    #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+                    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+                    tmp = analysediff0(dic2, dic1, kdic)
+                    if tmp['sig'] > sig:
+                        ciidic[pair].append((cond,tmp['sig']))
+        remaining = lst.copy()
+        saved = []
+        removed = []
+        changed = True
+        while changed:
+            # everyone not removed by anything other than 'removed' is moved to 'saved'
+            # everyone removed by someone in 'saved' is moved to 'removed'
+            changed = False
+            if movetosaved(remaining, saved, removed, ciidic):
+                changed = True
+            if movetoremoved(remaining, saved, removed, ciidic):
+                changed = True
+    if remaining:
+        alternatives = []
+        origsaved = saved.copy()
+        origremaining = remaining.copy()
+        akeys = []
+        for pair in remaining:
+            for sp in ciidic[pair]:
+                sp2 = listminus(sp[0], saved)
+                if sp2 and sp2 not in akeys:
+                    akeys.append(sp2)
+        for sp in akeys:
+            # also check, for each one in the list sp, whether it is nullified
+            # by the others in sp plus saved
+            if len(sp) > 1:
+                ok = True
+                for p in sp:
+                    ll = ciidic[p]
+                    tset = listminus(sp,p) + saved
+                    for ele,sig in ll:
+                        if issubset(ele, tset):
+                            ok = False
+                            break
+                if not ok:
+                    continue
+            saved = origsaved + sp
+            remaining = listminus(origremaining, sp)
+            removed = []
+            changed = movetoremoved(remaining, saved, removed, ciidic)
+            while changed:
+                changed = False
+                if movetosaved(remaining, saved, removed, ciidic):
+                    changed = True
+                if movetoremoved(remaining, saved, removed, ciidic):
+                    changed = True
+            if not remaining:
+                s = set(saved)
+                if s not in alternatives:
+                    alternatives.append(s)
+        if not alternatives:
+            alternatives = [set(saved)]
+            print("Failed to find clean condition alternatives")
+            print("Saved: ", saved)
+            print("Remaining: ", remaining)
+            print("Dict: ", ciidic)
+    else:
+        alternatives = [set(saved)]
+    resdic2lst = []
+    for saved in alternatives:
+        # prepare resdic2 from saved, i.e. condition each one on the others
+        saved = list(saved)
+        resdic2 = {}
+        for pair in lst:
+            if pair in saved:
+                cond = allexcept(saved, pair)
+            else:
+                mx = ((),0.0)
+                for sp in ciidic[pair]:
+                    if issubset(sp[0], saved) and sp[1]>mx[1]:
+                        mx = sp
+                if mx[1] > 0.0:
+                    cond = mx[0]
+                else:
+                    cond = allexcept(saved, pair)
+            #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+            (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+            #tmp = analysediff(dic2, dic1, kdic)
+            tmp = analysediff01(dic2, dic1, kdic)
+            tmp['cond'] = cond
+            resdic2[pair] = tmp
+        for s in saved:
+            if not s in totsaved:
+                totsaved.append(s)
+        resdic2lst.append(resdic2)
+    return resdic2lst, totsaved
+
+def analyse_one_effect(df, selg, pair, cond, eff):
+    tags = []
+    for tag in ["death", "inc", "nic"]:
+        if eff in valsincolumn(df, 'event_time_' + tag):
+            tags.append(tag)
+    #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [], selg, tags, eff)
+    res0 = analysediff(dic2, dic1, kdic)
+    #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+    res1 = analysediff(dic2, dic1, kdic)
+    print("Correlates to " + eff)
+    display_analysis_one_row(pair, res1, res0)
+    print()
+    #return { 'mean0': res0['mean'], 'sig0':res0['sig'], 'mean1': res1['mean'], 'sig1': res1['sig']}
+
+def showeffects1(win, df, selg, coldic, tags, eff):
+    name = "Correlates of " + str(list(coldic.keys())) + " to " + eff
+    resdic = analyseeffects1(df, selg, coldic, tags, eff)
+    show_analysis_rows_dict(win, name, resdic, False)
+
+def show_all_effects(win, df, selg, sig, eff):
+    coldic = {}
+    selcols = [s[0] for s in selg if s[0] is not False]
+    if 'sex' not in selcols:
+        coldic['sex'] = ['flicka'] # special since using both are redundant
+    for col in ['other_dia','diagnosis_class','surgery_diff','radio_diff','cytoclass_diff','stemcell_diff']:
+        if col not in selcols:
+            coldic[col] = valsincolumn(df, col)
+    tags = []
+    for tag in ["death", "inc", "nic"]:
+        if eff in valsincolumn(df, 'event_time_' + tag):
+            tags.append(tag)
+    name = "Direct correlates to " + eff
+    resdic1 = analyseeffects1(df, selg, coldic, tags, eff)
+    resdic2lst,remaining = analyseeffects2new(df, selg, resdic1, sig, tags, eff)
+    if len(resdic2lst) == 1:
+        show_analysis_rows_dict(win, name, resdic2lst[0], resdic1)
+    else:
+        print("There are %d alternatives. Press return to switch." % len(resdic2lst))
+        for i,resdic2 in enumerate(resdic2lst):
+            show_analysis_rows_dict(win, name, resdic2lst[i], resdic1)
+            if (i < len(resdic2lst)-1):
+                input()
+
+def get_list_stats2(dd, efftag, eff):
+    mn = inf
+    mx = -inf
+    tsum_f = 0
+    nsum_f = 0
+    for i in range(len(dd)):
+        s = dd.iloc[i]
+        r = s['no_event_time_' + efftag]
+        ok = False
+        for ele in s['event_time_' + efftag]:
+            if ele[0] == eff:
+                if ele[1] < mn:
+                    mn = ele[1]
+                if ele[1] > mx:
+                    mx = ele[1]
+                tsum_f += ele[1]
+                nsum_f += 1
+                ok = True
+        if not ok:
+            tsum_f += r if not isnan(r) else 0
+    return (mn, mx, tsum_f, nsum_f)
+
+def show_profile_data(win, df, selg, pair, cond, eff):
+    # first without conditioning
+    (d1, d2) = select_data(df, [pair], False, selg)
+    stats = {}
+    for efftag in ['nic', 'inc', 'death']:
+        stats[(1,efftag)] = get_list_stats2(d1, efftag, eff)
+        stats[(2,efftag)] = get_list_stats2(d2, efftag, eff)
+    mn = min([stats[k][0] for k in stats])
+    mx = max([stats[k][1] for k in stats])
+    win.fig.clear()
+    show_profile_axes(win, 100, 200, 800, 100, mn, mx, 365)
+    for (dind, dd, c1, c2, c3) in zip([False,True], [d1,d2], ['pink','lightgreen'], ['red','#00CD00'], ['#8B0000','#006400']):
+        tsum = 0
+        nsum = 0
+        lst1 = []
+        lst2 = []
+        lst3 = []
+        for i in range(len(dd)):
+            s = dd.iloc[i]
+            tacc = max([s['no_event_time_nic'], s['no_event_time_inc'], s['no_event_time_death']])
+            nacc = 0
+            for ele in s['event_time_nic']:
+                if (ele[0] == eff):
+                    lst1.append(ele[1])
+                    if ele[1] < tacc:
+                        tacc = ele[1]
+                        nacc = 1
+            for ele in s['event_time_inc']:
+                if (ele[0] == eff):
+                    lst2.append(ele[1])
+                    if ele[1] < tacc:
+                        tacc = ele[1]
+                        nacc = 1
+            for ele in s['event_time_death']:
+                if (ele[0] == eff):
+                    lst3.append(ele[1])
+                    if ele[1] < tacc:
+                        tacc = ele[1]
+                        nacc = 1
+            tsum += tacc
+            nsum += nacc
+        ft = tsum / nsum if nsum > 0 else None
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst1, c1, c1, dind, 0)
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst2, c2, c2, dind, 0)
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst3, c3, c3, dind, 0)
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, [ft], c2, c2, dind, 1)
+
+def printsorteddicts(diclst):
+    iset = set()
+    for dic in diclst:
+        iset = iset.union(dic.keys())
+    ilst = sorted(list(iset))
+    fstr = "%s" + ("\t%d" * len(diclst))
+    for ele in ilst:
+        tmp = [ele] + [dic[ele] if ele in dic else 0 for dic in diclst]
+        print(fstr % tuple(tmp))
+
+# Full walk-through:
+# relative to the late effect eff and the competing root causes pairlist,
+# for each occurring combination of pair, show:
+# how many got eff out of how many got the treatment, and how long the total
+# observation time is.
+# visualize the individual times and the mean frequency
+
+def display_effect_all_comb(df, selg, conds, eff):
+    effdic = {tag : valsincolumn(df, 'event_time_' + tag) for tag in ["death", "inc", "nic"]}
+    tags = []
+    for tag in effdic:
+        if eff in effdic[tag]:
+            tags.append(tag)
+    name = "Statistics of " + eff
+    #(dic, datadic) = get_cond_stats_comb(df, conds, selg, tags, eff)
+    (dic, dummy1, dummy2) = get_cond_stats_daydiff(df, [], conds, selg, tags, eff)
+    print(name)
+    tscale = 1.0/36525
+    sum0 = 0
+    sum1 = 0
+    sum2 = 0
+    for k in dic:
+        sum0 += dic[k][0]
+        sum1 += dic[k][1]
+        sum2 += dic[k][2]
+    pr1 = sum1/sum0
+    pr2 = sum2/sum0
+    for k in dic:
+        if dic[k][2] > 0:
+            resstr = ""
+            for kk in k.split('_'):
+                if kk:
+                    if kk[0]=='~':
+                        resstr += " "+kk
+                    else:
+                        resstr += " "+kk
+            resstr += " "
+            f1 = (dic[k][0] + 1)/((dic[k][1] + pr1)*tscale)
+            f2 = (dic[k][0] + 1)*pr1/(dic[k][1] + pr1)
+            resstr += "%c %f %.2f ( %d / %d : %.2f )" % ('+' if f2 >= 1.5 else ' ', f1, f2, dic[k][0], dic[k][2], dic[k][1]*tscale)
+            #if dic[k][1] > 0:
+            #    resstr += "%f ( %d / %d : %.2f )" % (dic[k][0]/(dic[k][1]*tscale),dic[k][0],dic[k][2],dic[k][1]*tscale)
+            #else:
+            #    resstr += "%f ( %d / %d : %.2f )" % (0.0, dic[k][0],dic[k][2],dic[k][1]*tscale)
+            print(resstr)
+    n = 0
+    for kk in k.split('_'):
+        if kk:
+            n += len(kk)+1
+    resstr = " average " + " "*max(0, n-3)
+    resstr += "%f %.2f ( %d / %d : %.2f )" % (1/(pr1*tscale), 1.0, 1, round(pr2), pr1*tscale)
+    print(resstr)
+    print()
+
+def display_condensed_data(df, selg, eff):
+    effcols = ['event_time_' + tag for tag in ["death", "inc", "nic"]]
+    df = select_data_alt(df, [(col, eff) for col in effcols], selg)
+    for i in range(len(df)):
+        vec = df.iloc[i]
+        resid = [c[0] for c in vec['event_time_inc'] if c[0][0]=='C']
+        resstr = "%s\t%s\t%s" % (vec['LopNr'],vec['diagnosis'],("("+",".join(resid)+")") if resid else "")
+        for col in ['cytoclass_diff','radio_diff','surgery_diff','stemcell_diff','other_dia']:
+            for b in vec[col]:
+                if type(b)==list:
+                    resstr += "\t" + b[0]
+                else:
+                    resstr += "\t" + b
+        print(resstr)
+
+def collect_days(df, col):
+    dic={}
+    #cols = ['surgery_diff','cytoclass_diff','stemcell_diff','radio_diff']
+    for lst in df[col]:
+        for ele in lst:
+            if not ele[0] in dic:
+                dic[ele[0]] = (0,0)
+            dic[ele[0]] = tupleadd(dic[ele[0]], (1,ele[1]))
+    for key in dic:
+        pair = dic[key]
+        dic[key] = pair[1]/pair[0] if pair[0]>0 else 0
+    return dic
+
+def ddate_hist_dict(df, mn, mx):
+    hdict = { k : 0 for k in range(mn, mx+1)}
+    for i in range(len(df)):
+        dd = df.iloc[i]['DiagnosDat']
+        if dd in hdict:
+            hdict[dd] += 1
+        else:
+            hdict[dd] = 1
+    return hdict
+
+def show_dd_axes(win, x, y, wdt, hgt, labels, count):
+    win.add_line(x, y, x+wdt, y, 0.75, 'black')
+    win.add_line(x, y+hgt+1, x+wdt, y+hgt+1, 0.75, 'black')
+    for pr in [0.25, 0.5, 0.75]:
+        ln = win.add_line(x, y+pr*hgt+1, x+wdt, y+pr*hgt+1, 0.75, 'lightgray')
+        ln.set_zorder(1)
+    onewdt = min(wdt/len(labels), hgt)
+    for ind in range(len(labels)):
+        win.add_text(x+onewdt/2+ind*onewdt, y-20, labels[ind], 12, 'center')
+    win.add_text(x-8, y-4, "0", 9, 'right')
+    win.add_text(x-8, y+hgt-4, "%d" % count, 9, 'right')
+
+def show_dd_bar(win, x, y, wdt, hgt, num, ind, maxcount, val, col):
+    onewdt = min(wdt/num, hgt)
+    bwdt = onewdt-2
+    boff = 1
+    win.add_rect(x + onewdt*ind + boff, y, bwdt, hgt*val/maxcount, 0, None, col)
+
+def show_dd_data(win, dic, x1, x2, mx, pos):
+    win.fig.clear()
+    show_histogram_axes(win, 60, 50+pos*250, 800, 200, ['1970','2015'], 100)
+    for k in dic:
+        show_dd_bar(win, 60, 50+pos*250, 800, 200, 46, int(k)-1970, 100, dic[k], 'orange')
+
+def show_dd_data__(win, dic, sel1, sel3, selg, letters):
+    dd = select_data3(df, sel1, sel3, selg)
+    win.fig.clear()
+    show_histogram_axes(win, 100, 100, 800, 200, letters, len(dd[0]))
+    cols = ['#8B0000','red','pink',None]
+    #alpha = 0.0
+    #counts = [alpha/4, alpha/4, alpha/4, alpha/4]
+    for ind in range(len(letters)):
+        letter = letters[ind]
+        counts = [0.0, 0.0, 0.0, 0.0]
+        for i in range(len(dd[0])):
+            s = dd[0].iloc[i]
+            ok = False
+            for ele in s['event_time_death']:
+                if (letter is False or ele[0] == letter):
+                    counts[0] += 1
+                    ok = True
+            if not ok:
+                for ele in s['event_time_inc']:
+                    if (letter is False or ele[0] == letter):
+                        counts[1] += 1
+                        ok = True
+            if not ok:
+                for ele in s['event_time_nic']:
+                    if (letter is False or ele[0] == letter):
+                        counts[2] += 1
+                        ok = True
+            if not ok:
+                counts[3] += 1
+        total = sum(counts)
+        normcount = 0.0
+        normtotal = 0.0
+        for i in range(len(dd[2])):
+            s = dd[2].iloc[i]
+            ok = False
+            for ele in s['event_time_death']:
+                if (letter is False or ele[0] == letter):
+                    normcount += 1
+                    ok = True
+            if not ok:
+                for ele in s['event_time_inc']:
+                    if (letter is False or ele[0] == letter):
+                        normcount += 1
+                        ok = True
+            if not ok:
+                for ele in s['event_time_nic']:
+                    if (letter is False or ele[0] == letter):
+                        normcount += 1
+                        ok = True
+            normtotal += 1
+        show_histogram_bars(win, 100, 100, 800, 200, len(letters), ind, [x/total for x in counts], normcount/normtotal, cols)
+
+def add_complex_attribute(df, col, name, oldlst):
+    for i in range(len(df)):
+        lst = df.iloc[i][col]
+        x = [ ele[1] for ele in lst if ele[0] in oldlst]
+        if x:
+            lst.append([name, min(x)])
diff --git a/dataanalysis/exptest4.py b/dataanalysis/exptest4.py
new file mode 100644
index 0000000..98c374a
--- /dev/null
+++ b/dataanalysis/exptest4.py
@@ -0,0 +1,1357 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: exptest4.py
+
+Author: anders.holst@ri.se
+
+"""
+
+import pandas as pd
+from math import *
+from scipy.special import hyp1f1
+
+from hist import *
+from visual import *
+
+eps = 0.001
+
+# Brute force summation, in logarithms to avoid overflow
+def log_hyper_1f1_negintab(a, b, x):
+    m = -a
+    lr = 0.0
+    lhg = 0.0
+    for k in range(m):
+        lr += log(x*(a+k)/((k+1)*(b+k)))
+        lhg += log(1 + exp(lr - lhg))
+    return lhg
+
+# r *= (x*(a+k) / ((k+1)*(b+k)))
+# hg += r
+
+def log_hyper_1f1(a, b, x):
+    # b not a negative integer -> log(hyp1f1)
+    # b a negative integer -> interpolate between b +- eps
+    eps = 0.0001
+    if b<0 and b==int(b):
+        ret = (hyp1f1(a, b-eps, x) + hyp1f1(a, b+eps, x))/2
+    else:
+        ret = hyp1f1(a, b, x)
+    if isinf(ret):
+        print("overflow at hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    elif ret<=0:
+        print("negative hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    return log(ret)
+
+def log_hyper_1f1_interpol(a, b, x):
+    # a not a negative integer -> interpolate log(hyp1f1)
+    if a==int(a):
+        return log_hyper_1f1_negintab(int(a), b, x)
+    else:
+        r1 = log_hyper_1f1_negintab(floor(a), b, x)
+        r2 = log_hyper_1f1_negintab(ceil(a), b, x)
+        prop = a - floor(a)
+        return r2*prop + r1*(1.0-prop)
+
+# pa = (n, t)
+def LogLambdaProb_un(pa1, ll):
+    (n1, t1) = pa1
+    if ll <= 0:
+        return -inf if n1 > 0 else 0.0
+    return n1 * log(ll) - t1*ll
+
+def LogLambdaProb_pr_un(pa1, ll, pr):
+    (n1, t1) = pa1
+    (n0, t0) = pr
+    if ll <= 0:
+        return -inf if n1+n0 > 0 else 0.0
+    return (n1+n0) * log(ll) - (t1+t0)*ll
+
+def LogDifflambdaProbOne_un(pa1, pa2, dl):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl * t2) + log_hyper_1f1_negintab(-n2, -n1 - n2, dl*(t1 + t2))
+    else:
+        ret = (dl * t1) + log_hyper_1f1_negintab(-n1, -n1 - n2, -dl*(t1 + t2))
+    return ret
+
+def LogDifflambdaProbOne_pr_un_old(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+    else:
+        ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0 or t1+t2 == 0.0 or n1+n2 == 0:
+        ret = 0.0
+    else:
+        if prior_style == 'partaverage': # separate priors per condition
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2+pr[0]))
+        elif prior_style == 'partaveragezero': # no extra event
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2))
+        else:
+            (n0, t0) = pr
+        if dl > 0.0:
+            ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+        else:
+            ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_num1_old(pa1, pa2, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_pr_un(pa1, ll, pr)
+    func2 = lambda ll: LogLambdaProb_pr_un(pa2, ll, pr)
+    (rng, vals) = find_calc_hrange(func1, (n1+n0+1)/(t1+t0), (n1+n0+1)/(5*(t1+t0)), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+n0+1)/(t2+t0), (n2+n0+1)/(5*(t2+t0)), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num1(pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1+t2==0.0 or n1+n2==0:
+        return ()
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_un(pa1, ll)
+    func2 = lambda ll: LogLambdaProb_un(pa2, ll)
+    (rng, vals) = find_calc_hrange(func1, (n1+1)/t1, (n1+1)/(5*t1), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+1)/t2, (n2+1)/(5*t2), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num2(tup, dl):
+    if not tup:
+        return 0.0
+    (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2) = tup
+    # then sweep through for fixed dl and sum up (numerical convolution of
+    # the two posteriors over their overlapping range)
+    dd = min(d1, d2)
+    aa = max(a1, a2-dl)
+    bb = min(b1, b2-dl)
+    sm = 0.0
+    lst = []
+    x = aa
+    while x <= bb:
+        lst.append(func1(x) + func2(x+dl) - mx1 - mx2)
+        x += dd
+    for v in lst:
+        sm += exp(v)
+    return mx1 + mx2 + log(sm*dd) if sm>0.0 else -inf
+
+#def LogDifflambdaProb_un(da1, da2, dl):
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_un(pa1, pa2, dl), da1, da2)))
+
+#def LogDifflambdaProb_pr_un(da1, da2, dl, pr):
+#    pr = (pr[0]/len(da1), pr[1]/len(da1))
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr), da1, da2)))
+
+def LogDifflambdaProb_num1(da1, da2):
+    return list(map(lambda pa1,pa2: LogDifflambdaProbOne_num1(pa1, pa2), da1, da2))
+
+def LogDifflambdaProb_num2(tupl, dl):
+    return sum(list(map(lambda tup: LogDifflambdaProbOne_num2(tup, dl), tupl)))
+
+prior_style = 'partaveragezero'
+
+def set_prstyle(st):
+    global prior_style
+    prior_style = st
+
+def DifflambdaHist(da1, da2, pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1 == 0.0 or t2 == 0.0 or (n1 == 0 and n2 == 0):
+        return False
+    tupl = LogDifflambdaProb_num1(da1, da2)
+    func = lambda dl: LogDifflambdaProb_num2(tupl, dl)
+    (xres, yres) = find_calc_hrange(func, 0.0, (n1+n2+1)/(t1+t2), 0.1, 20)
+    return [xres, normalize_logprob(yres)]
+
+def LambdaHist(pa1):
+    (n1, t1) = pa1
+    if t1 == 0.0 or n1 == 0:
+        return False
+    func = lambda dl: LogLambdaProb_un(pa1, dl)
+    (xres, yres) = find_calc_hrange(func, (n1+1)/t1, (n1+1)/(5*t1), 0.2, 20)
+    return [xres, normalize_logprob(yres)]
+
+def normalize_logprob(vec):
+    mx = max(vec)
+    sm = 0.0
+    for i in range(len(vec)):
+        sm += exp(vec[i] - mx)
+    norm = log(sm) + mx
+    return [exp(x-norm) for x in vec]
+
+def find_calc_hrange(func, start, step, mindiff, maxdiff):
+    def zpush(v):
+        return 0.0 if abs(v)<2e-11 else v
+    fa = nan
+    fb = nan
+    a = nan
+    b = nan
+    m1 = zpush(start - 0.5*step)
+    fm1 = func(m1)
+    m2 = zpush(start + 0.5*step)
+    fm2 = func(m2)
+    left = []
+    right = []
+    while True:
+        #print(a, fa, m1, fm1, m2, fm2, b, fb)
+        if fm1 > fm2:
+            if not isnan(b):
+                right = [(b, fb)] + right
+            (m, fm) = (m1, fm1)
+            (b, fb) = (m2, fm2)
+        else:
+            if not isnan(a):
+                left = [(a, fa)] + left
+            (a, fa) = (m1, fm1)
+            (m, fm) = (m2, fm2)
+        if isnan(a):
+            m1 = zpush(3*m - 2*b)
+            fm1 = func(m1)
+            (m2, fm2) = (m, fm)
+        elif isnan(b):
+            (m1, fm1) = (m, fm)
+            m2 = zpush(3*m - 2*a)
+            fm2 = func(m2)
+        else:
+            tmp = [fa, fm, fb]
+            mx = max(tmp)
+            mn = min(tmp)
+            if mx-mn < mindiff or (b-a) < 1e-10:
+                break
+            if (m-a) > (b-m)*1.5:
+                dir = 0
+            elif (m-a)*1.5 < (b-m):
+                dir = 1
+            else:
+                dir = 0 if fa > fb else 1
+            if dir == 0:
+                m1 = zpush(0.5*(a+m))
+                fm1 = func(m1)
+                (m2, fm2) = (m, fm)
+            else:
+                (m1, fm1) = (m, fm)
+                m2 = zpush(0.5*(m+b))
+                fm2 = func(m2)
+    delta = min(m-a, b-m)
+    left = [(a, fa)] + left
+    right = [(b, fb)] + right
+    if m-a < b-m:
+        right = [(m, fm)] + right
+    else:
+        left = [(m, fm)] + left
+    tr = mx - maxdiff
+    #print("^", mn, mx, tr)
+    resx = []
+    resy = []
+    leftind = 0
+    if delta < 1e-10:
+        delta = 1e-3
+    while leftind < len(left) and left[leftind][1] > tr:
+        resx = [left[leftind][0]] + resx
+        resy = [left[leftind][1]] + resy
+        a = zpush(left[leftind][0] - delta)
+        leftind += 1
+    while leftind >= len(left) or a > left[leftind][0] + delta/2:
+        fa = func(a)
+        #print("L", a, fa)
+        if isnan(fa) or isinf(fa) or fa < tr:
+            break
+        resx = [a] + resx
+        resy = [fa] + resy
+        a = zpush(a-delta)
+    rightind = 0
+    while rightind < len(right) and right[rightind][1] > tr:
+        resx = resx + [right[rightind][0]]
+        resy = resy + [right[rightind][1]]
+        b = zpush(right[rightind][0] + delta)
+        rightind += 1
+    while rightind >= len(right) or b < right[rightind][0] - delta/2:
+        fb = func(b)
+        #print("R", b, fb)
+        if isnan(fb) or isinf(fb) or fb < tr:
+            break
+        resx = resx + [b]
+        resy = resy + [fb]
+        b = zpush(b+delta)
+    #print(a, m, b)
+    return (resx, resy)
+
+def selfunc(entry, val):
+    if type(entry) == list:
+        if len(entry)>0 and type(entry[0])==list:
+            entry = list(map(lambda x:x[0], entry))
+        if type(val) == list:
+            for x in val:
+                if x in entry:
+                    return True
+            return False
+        else:
+            return val in entry
+    else:
+        if type(val) == list:
+            return entry in val
+        else:
+            return val == entry
+
+def dictincr(dic, key, incr=1):
+    if key not in dic:
+        dic[key] = incr
+    else:
+        dic[key] += incr
+
+def selassoc(sel, key):
+    if sel is False:
+        return []
+    else:
+        return [x[-1] for x in sel if key == x[0] or (len(x)==3 and key == x[1])]
+
+def assoc(lst, key, defl):
+    for ele in lst:
+        if ele[0]==key:
+            return ele[1]
+    return defl
+
+def assoc_a0(lst, key, defl):
+    if type(lst)==list:
+        for ele in lst:
+            if type(ele)==list:
+                if ele[0]==key:
+                    return ele[1]
+            else:
+                if ele==key:
+                    return 0
+        return defl
+    else:
+        return 0 if lst==key else defl
+
+def subsets(lst, ord):
+    if ord==0:
+        return [[]]
+    elif ord > len(lst):
+        return []
+    else:
+        rest = subsets(lst[1:], ord-1)
+        return list(map(lambda l:[lst[0]]+l, rest)) + subsets(lst[1:],ord)
+
+def issubset(l1, l2):
+    for e in l1:
+        if not e in l2:
+            return False
+    return True
+
+def tupleadd(t1, t2):
+    return tuple((t1[i]+t2[i] for i in range(min(len(t1),len(t2)))))
+    #return (t1[0]+t2[0],t1[1]+t2[1])
+
+def tuplescale(t1, scale):
+    return tuple((t1[i]*scale for i in range(len(t1))))
+    #return (t1[0]*scale,t1[1]*scale)
+
+def listminus(l1, l2):
+    return [e for e in l1 if not e in l2]
+
+def read_total_file(file):
+    df = pd.read_csv(file, sep='\t')
+    for c in df:
+        if type(df[c][0])==str and df[c][0][0] == '[':
+            df[c] = df[c].apply(eval)
+    return df
+
+def valsincolumn_old(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if not e in res:
+                    res.append(e)
+        else:
+            if not val in res:
+                res.append(val)
+    return sorted(res)
+
+def valsincolumn(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if type(e)==list:
+                    if not e[0] in res:
+                        res.append(e[0])
+                else:
+                    if not e in res:
+                        res.append(e)
+        else:
+            if not val in res and not (type(val)==float and isnan(val)):
+                res.append(val)
+    return sorted(res)
+
+#--------------------------
+
+# New approach to avoid conditional dilution: For each late effect, in a
+# suitable subset of the data (patients, sex, etc), check whether a given
+# factor (characteristics, diagnosis, treatment) is significantly
+# correlated. Then condition on each other factor that is also correlated,
+# and see whether the significance disappears completely, or the frequency
+# changes significantly. List those that do not disappear/change, in order
+# of significance. Keep track of whether all of them disappear under
+# cross-wise conditioning, or whether several remain. Ideally, each
+# significant correlation should be explained by one factor, or by several
+# factors of the same type.
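+
+# A minimal illustrative sketch (not part of the original pipeline) of the
+# conditioning step described above: test whether the correlation of one
+# (column, value) factor with a late effect survives conditioning on another
+# factor, which is what analyseeffects2new below does systematically for all
+# combinations. All arguments are placeholders to be taken from real data.
+def _demo_conditioning(df, selg, tags, eff, pair, other):
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [], selg, tags, eff)
+    raw = analysediff0(dic2, dic1, kdic)
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [other], selg, tags, eff)
+    cond = analysediff0(dic2, dic1, kdic)
+    # if raw['sig'] is small but cond['sig'] is not, then 'other' explains
+    # the correlation of 'pair' with the late effect
+    return (raw['sig'], cond['sig'])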
+ +def select_data(df, sel1, sel2, selg): + # sel = [(col, val)] selg = [('otherdia', 't21')] [('DIA',1)] [False, 'stralung','ANSIKTE'] + # cond = [col1, col2, ] + if df.empty: + return (df,df) + if selg: + test = df[df.columns[0]].apply(lambda entry: True) + for ele in selg: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + df = df[test] + test = df[df.columns[0]].apply(lambda entry: True) + for ele in sel1: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + d1 = df[test] + if sel2 is False: + d2 = df[~test] + else: + test = df[df.columns[0]].apply(lambda entry: True) + for ele in sel2: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + d2 = df[test] + return (d1, d2) + +def select_data_alt(df, selalt, selg): + if df.empty: + return (df,df) + if selg: + test = df[df.columns[0]].apply(lambda entry: True) + for ele in selg: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + df = df[test] + test = df[df.columns[0]].apply(lambda entry: False) + for ele in selalt: + if len(ele) == 3: + test = test | df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test | df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + df = df[test] + return df + +def count_effect(df, eff, ncol, ecol): + n = 0 + t = 0 + for i in range(len(df)): + row = df.iloc[i] + x = assoc(row[ecol], eff, False) + if x is not False: + n += 1 + t += x + else: + t += row[ncol] + return (n, t) + +def count_effect_mtag(df, eff, tags): + ecols = ['event_time_' + tag for tag in tags] + ncols = ['no_event_time_' + tag for tag in tags] + #ccols = ['censored_time_' + tag for tag in tags] + res = (0, 0, 0) + for i in range(len(df)): + row = df.iloc[i] + x = min([assoc(row[ecol], eff, inf) for ecol in ecols]) + if x is not inf: + res = tupleadd(res, (1, x, 1)) + else: + x = min([row[ncol] for ncol in ncols]) + res = tupleadd(res, (0, x, 1)) + return res + +#def count_cond_effect(df, eff, ncol, ecol, cols, ignore): +# dic = {} +# for i in range(len(df)): +# row = df.iloc[i] +# x = assoc(row[ecol], eff, False) +# if x is not False: +# incr = (1, x) +# else: +# incr = (0, row[ncol]) +# increment_tuple_value(row, dic, cols, ignore, incr) +# return dic + +def make_inc_nic_diag_dict(df): + dic = {} + for i in range(len(df)): + nr=df['LopNr'][i] + if nr not in dic: + dic[nr] = set() + dia=df['DIAGNOS'][i] + if type(dia)==str: + dic[nr].update(set(dia.split(' '))) + return dic + +def count_inc_nic_diag(df, dic, pair, selg, tags, eff): + (dic1, dic2, dick) = get_cond_stats_daydiff_lopnr(df, pair, [], selg, tags, eff) + resdic1 = {} + resdic2 = {} + for nr in dic1['']: + if nr in dic: + for diag in dic[nr]: + if eff in diag: + dictincr(resdic1, diag) + for nr in dic2['']: + if nr in dic: + for diag in dic[nr]: + if eff in diag: + dictincr(resdic2, diag) + return (resdic1, resdic2) + +def get_cond_stats_daydiff_old(df, pair, conds, selg, tags, eff): + # Vi får kuta igenom hela data, och för varje sample sortera in värden i rätt dict och key + # Regeln är att ett villkor är sant om eff-dag är senare än ev cond-dag + (dd, dummy) = 
select_data(df, [], False, selg) + kl = [""] + for cond in conds: + kl = [(k+"_"+str(cond[-1]), k+"_~"+str(cond[-1])) for k in kl] + kl = [k[0] for k in kl] + [k[1] for k in kl] + dick = { k : kl.index(k) for k in kl} + dic1 = { k : (0, 0, 0) for k in kl} + dic2 = { k : (0, 0, 0) for k in kl} + invk = { kl.index(k) : k for k in kl} + bits = [2**i for i in range(len(conds))] + ecols = ['event_time_' + tag for tag in tags] + ncols = ['no_event_time_' + tag for tag in tags] + for i in range(len(dd)): + row = dd.iloc[i] + x = min([assoc(row[ecol], eff, inf) for ecol in ecols]) + # here, sort it into correct key + # för varje cond och pair, hitta senaste dag före eff-dag (minus marginal) + ylst = [y if y x+c else "Block" for y in [assoc_a0(row[col], val, -inf) for col,val in conds]] + yval = assoc_a0(row[pair[0]], pair[1], -inf) if pair else 0 + td = dic2 if pair and yval==-inf else dic1 + yval = max(yval, 0) + if "Block" in condlst or yval >= x+c-marg: + continue + # sort it into correct key + k = invk[sum([b if v==-inf else 0 for (b,v) in zip(bits,condlst)])] + if x is not inf: + ymax = max(condlst + [yval]) if pair else max(condlst + [0]) + td[k] = tupleadd(td[k], (1, x-max(ymax+marg-c, 0), 1)) + else: + x,c = min([(row[ncol],row[ccol]) for ncol,ccol in zip(ncols,ccols)], key=lambda p:p[0]) + td[k] = tupleadd(td[k], (0, max(x-max(marg-c, 0), 0), 1)) + return (dic1, dic2, dick) + +def get_cond_stats_daydiff_lopnr(df, pair, conds, selg, tags, eff): + (dd, dummy) = select_data(df, [], False, selg) + kl = [""] + for cond in conds: + kl = [(k+"_"+str(cond[-1]), k+"_~"+str(cond[-1])) for k in kl] + kl = [k[0] for k in kl] + [k[1] for k in kl] + dick = { k : kl.index(k) for k in kl} + dic1 = { k : [] for k in kl} + dic2 = { k : [] for k in kl} + invk = { kl.index(k) : k for k in kl} + bits = [2**i for i in range(len(conds))] + ecols = ['event_time_' + tag for tag in tags] + ncols = ['no_event_time_' + tag for tag in tags] + for i in range(len(dd)): + row = dd.iloc[i] + x = min([assoc(row[ecol], eff, inf) for ecol in ecols]) + ylst = [y if y 0 and n2 > 0: + lf += min(n1,n2)*(log(n2/t2) - log(n1/t1)) + sn += min(n1,n2) + return exp(lf/sn) if sn>0 else 1.0 + +def dictolistwithprior(cdic1, cdic2, cdic0, tscale = 1.0): + n1,t1,s1 = (0,0,0) + n2,t2,s2 = (0,0,0) + for k in cdic0: + (tmp1, tmp2) = (cdic1[k], cdic2[k]) + n1 += tmp1[0] + t1 += tmp1[1]*tscale + s1 += tmp1[2] + n2 += tmp2[0] + t2 += tmp2[1]*tscale + s2 += tmp2[2] + if prior_style == 'noninfo': # Noninformative prior + prn = 1.0/len(cdic0) + lst1 = [ (cdic1[k][0] - prn, cdic1[k][1]*tscale) for k in cdic0 ] + lst2 = [ (cdic2[k][0] - prn, cdic2[k][1]*tscale) for k in cdic0 ] + elif prior_style == 'average': # Average prior + prn = 1.0/len(cdic0) + prt = prn*(t1+t2)/(n1+n2+1) + lst1 = [ (cdic1[k][0] + prn, cdic1[k][1]*tscale + prt) for k in cdic0 ] + lst2 = [ (cdic2[k][0] + prn, cdic2[k][1]*tscale + prt) for k in cdic0 ] + elif prior_style == 'partaverage': # separate priors per condition + prn = 1.0/len(cdic0) + lst1 = [] + lst2 = [] + for k in cdic0: + (nn1, tt1, ss1) = cdic1[k] + (nn2, tt2, ss2) = cdic2[k] + prt = prn*(tt1+tt2)/(nn1+nn2+prn) + lst1.append((nn1+prn, (tt1+prt)*tscale)) + lst2.append((nn2+prn, (tt2+prt)*tscale)) + elif prior_style == 'partaveragezero': # inget extra event + prn = 1.0/len(cdic0) + lst1 = [] + lst2 = [] + for k in cdic0: + (nn1, tt1, ss1) = cdic1[k] + (nn2, tt2, ss2) = cdic2[k] + if nn1+nn2 > 0: + prt = prn*(tt1+tt2)/(nn1+nn2) + lst1.append((nn1+prn, (tt1+prt)*tscale)) + lst2.append((nn2+prn, (tt2+prt)*tscale)) + 
+
+def dictolistwithprior(cdic1, cdic2, cdic0, tscale = 1.0):
+ # Pool the per-key (n, t, s) statistics of the two groups and turn them
+ # into lists of (n, t) pairs, with the prior chosen by the global
+ # prior_style added in.
+ n1,t1,s1 = (0,0,0)
+ n2,t2,s2 = (0,0,0)
+ for k in cdic0:
+ (tmp1, tmp2) = (cdic1[k], cdic2[k])
+ n1 += tmp1[0]
+ t1 += tmp1[1]*tscale
+ s1 += tmp1[2]
+ n2 += tmp2[0]
+ t2 += tmp2[1]*tscale
+ s2 += tmp2[2]
+ if prior_style == 'noninfo': # Noninformative prior
+ prn = 1.0/len(cdic0)
+ lst1 = [ (cdic1[k][0] - prn, cdic1[k][1]*tscale) for k in cdic0 ]
+ lst2 = [ (cdic2[k][0] - prn, cdic2[k][1]*tscale) for k in cdic0 ]
+ elif prior_style == 'average': # Average prior
+ prn = 1.0/len(cdic0)
+ prt = prn*(t1+t2)/(n1+n2+1)
+ lst1 = [ (cdic1[k][0] + prn, cdic1[k][1]*tscale + prt) for k in cdic0 ]
+ lst2 = [ (cdic2[k][0] + prn, cdic2[k][1]*tscale + prt) for k in cdic0 ]
+ elif prior_style == 'partaverage': # separate priors per condition
+ prn = 1.0/len(cdic0)
+ lst1 = []
+ lst2 = []
+ for k in cdic0:
+ (nn1, tt1, ss1) = cdic1[k]
+ (nn2, tt2, ss2) = cdic2[k]
+ prt = prn*(tt1+tt2)/(nn1+nn2+prn)
+ lst1.append((nn1+prn, (tt1+prt)*tscale))
+ lst2.append((nn2+prn, (tt2+prt)*tscale))
+ elif prior_style == 'partaveragezero': # no extra event where none was observed
+ prn = 1.0/len(cdic0)
+ lst1 = []
+ lst2 = []
+ for k in cdic0:
+ (nn1, tt1, ss1) = cdic1[k]
+ (nn2, tt2, ss2) = cdic2[k]
+ if nn1+nn2 > 0:
+ prt = prn*(tt1+tt2)/(nn1+nn2)
+ lst1.append((nn1+prn, (tt1+prt)*tscale))
+ lst2.append((nn2+prn, (tt2+prt)*tscale))
+ else:
+ lst1.append((nn1, tt1*tscale))
+ lst2.append((nn2, tt2*tscale))
+ else: # No prior
+ lst1 = [ (cdic1[k][0], cdic1[k][1]*tscale) for k in cdic0 ]
+ lst2 = [ (cdic2[k][0], cdic2[k][1]*tscale) for k in cdic0 ]
+ return (lst1, lst2, (n1,t1,s1), (n2,t2,s2))
+
+def analysediff(cdic1, cdic2, cdic0):
+ (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+
+ hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+ fact = estimatefactor(dlst1, dlst2)
+ if hist is not False:
+ mean, var = hist_mean_var(hist)
+ sig = hist_significance(hist, 0.0, mean)
+ else:
+ mean = 0.0
+ var = 0.0
+ sig = 1.0
+ nn1 = sum(list(map(lambda p:p[0], dlst1)))
+ tt1 = sum(list(map(lambda p:p[1], dlst1)))
+ nn2 = sum(list(map(lambda p:p[0], dlst2)))
+ tt2 = sum(list(map(lambda p:p[1], dlst2)))
+ hist1 = LambdaHist((nn1, tt1))
+ if hist1 is not False:
+ mean1, var1 = hist_mean_var(hist1)
+ else:
+ mean1 = 0.0
+ hist2 = LambdaHist((nn2, tt2))
+ if hist2 is not False:
+ mean2, var2 = hist_mean_var(hist2)
+ else:
+ mean2 = 0.0
+ return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact, 'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1, 'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2, 'hist':hist, 'hist1':hist1, 'hist2':hist2}
+
+def analysediff01(cdic1, cdic2, cdic0):
+ # As analysediff, but with the absolute rates taken directly from the
+ # pooled counts instead of from separate histograms.
+ (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+
+ hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+ fact = estimatefactor(dlst1, dlst2)
+ if hist is not False:
+ mean, var = hist_mean_var(hist)
+ sig = hist_significance(hist, 0.0, mean)
+ else:
+ mean = 0.0
+ var = 0.0
+ sig = 1.0
+ nn1 = sum(list(map(lambda p:p[0], dlst1)))
+ tt1 = sum(list(map(lambda p:p[1], dlst1)))
+ nn2 = sum(list(map(lambda p:p[0], dlst2)))
+ tt2 = sum(list(map(lambda p:p[1], dlst2)))
+ mean1 = nn1/tt1 if tt1 > 0 else 0.0
+ mean2 = nn2/tt2 if tt2 > 0 else 0.0
+ return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact, 'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1, 'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2, 'hist':hist, 'hist1':False, 'hist2':False}
+
+def analysediff0(cdic1, cdic2, cdic0):
+ # Reduced variant: only the difference histogram and its significance.
+ #dlst1 = cdictolist(cdic1, cdic0, 1.0/36525)
+ #dlst2 = cdictolist(cdic2, cdic0, 1.0/36525)
+ #if prior_style == 'noninfo':
+ # pr = (-1, 0) # Noninformative prior
+ #elif prior_style in ['average','partaverage','partaveragezero']:
+ # n1 = sum(list(map(lambda p:p[0], dlst1)))
+ # t1 = sum(list(map(lambda p:p[1], dlst1)))
+ # n2 = sum(list(map(lambda p:p[0], dlst2)))
+ # t2 = sum(list(map(lambda p:p[1], dlst2)))
+ # pr = (1, 1*(t1+t2)/(n1+n2+1)) # Average prior
+ #else:
+ # pr = (0, 0) # No prior
+ #hist = DifflambdaHist(dlst1, dlst2, pr)
+ (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+ hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+ if hist is not False:
+ mean, var = hist_mean_var(hist)
+ sig = hist_significance(hist, 0.0, mean)
+ else:
+ mean = 0.0
+ var = 0.0
+ sig = 1.0
+ return {'mean':mean, 'std':sqrt(var), 'sig':sig}
+
+def analyseeffects1(df, selg, coldic, tags, eff):
+ resdic = {}
+ for col in coldic:
+ for val in coldic[col]:
+ #print((col,val))
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, (col, val), [], selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, (col, val), [], selg, tags, eff)
+ resdic[(col,val)] = analysediff(dic2, dic1, kdic)
+ return resdic
+
+def analyseeffects_back(df, selg, pair, tags, efflst):
+ resdic = {}
+ for eff in efflst:
+ col,val = pair
+ (dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+ resdic[eff] = analysediff(dic2, dic1, kdic)
+ return resdic
+
+def analyseeffects2(df, selg, resdic1, sig, tags, eff):
+ resdic2 = {}
+ lst = []
+ for pair in resdic1:
+ if resdic1[pair]['sig'] <= sig:
+ lst.append(pair)
+ lst.sort(key=lambda x: resdic1[x]['sig'])
+ for (ind,pair1) in reversed(list(enumerate(lst))):
+ mnsig = 0.0
+ mntmp = resdic1[pair1]
+ for pair2 in lst:
+ if pair1 != pair2:
+ #print(pair1,pair2)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair1, [pair2], selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair1, [pair2], selg, tags, eff)
+ tmp = analysediff(dic2, dic1, kdic)
+ if tmp['sig'] > mnsig:
+ mnsig = tmp['sig']
+ tmp['cond'] = pair2
+ mntmp = tmp
+ resdic2[pair1] = mntmp
+ if mnsig > sig:
+ del lst[ind]
+ return resdic2
+
+def movetosaved(remaining, saved, removed, dic):
+ # Move to saved every pair that is not cancelled by anything other than
+ # already removed pairs.
+ changed = False
+ for (ind,pair) in reversed(list(enumerate(remaining))):
+ lst = dic[pair]
+ ok = True
+ for conds,sig in lst:
+ ok = False
+ for cond in conds:
+ if cond in removed:
+ ok = True
+ if not ok:
+ break
+ if ok:
+ changed = True
+ saved.append(pair)
+ del remaining[ind]
+ return changed
+
+def movetoremoved(remaining, saved, removed, dic):
+ # Move to removed every pair that is cancelled by some condition drawn
+ # entirely from saved.
+ changed = False
+ for (ind,pair) in reversed(list(enumerate(remaining))):
+ lst = dic[pair]
+ ok = False
+ for conds,sig in lst:
+ ok = True
+ for cond in conds:
+ if cond not in saved:
+ ok = False
+ if ok:
+ break
+ if ok:
+ changed = True
+ removed.append(pair)
+ del remaining[ind]
+ return changed
+
+def allexcept(lst, ele):
+ return [e for e in lst if e != ele]
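+
+# A minimal sketch (not part of the analysis itself) of the save/remove rules
+# above: 'a' is not cancelled by anything, so it is saved; 'b' is cancelled by
+# the now saved 'a', so it is removed.
+def _demo_move():
+ remaining = ['a', 'b']
+ saved = []
+ removed = []
+ ciidic = {'a': [], 'b': [(['a'], 0.5)]}
+ movetosaved(remaining, saved, removed, ciidic) # saved == ['a']
+ movetoremoved(remaining, saved, removed, ciidic) # removed == ['b']
+ return saved, removed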
+
+def analyseeffects2new(df, selg, resdic1, sig, tags, eff):
+ ciidic = {}
+ lst = []
+ # select the pairs with a significant effect
+ for pair in resdic1:
+ if resdic1[pair]['sig'] <= sig:
+ lst.append(pair)
+ ciidic[pair] = []
+ # In the first pass, condition each pair on each of the others and save a
+ # list of which ones make it insignificant. Remove those that
+ # 1) are made insignificant by some pair (which they do not themselves
+ # make insignificant)
+ # 2) and are not needed to make any other pair insignificant.
+ # In a loop: save those that are cancelled by nothing, drop those that are
+ # cancelled by the saved ones, and iterate with the rest, i.e. of the rest
+ # save those not cancelled by the remaining ones.
+ # In the next pass, condition on pairs (and later triples) of the
+ # remaining ones, and remove by the analogous principle.
+ remaining = lst
+ saved = []
+ totsaved = []
+ for order in [1,2,3]:
+ remaining = saved + remaining
+ conds = subsets(remaining, order)
+ for pair in lst:
+ for cond in conds:
+ if not pair in cond:
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+ tmp = analysediff0(dic2, dic1, kdic)
+ if tmp['sig'] > sig:
+ ciidic[pair].append((cond,tmp['sig']))
+ remaining = lst.copy()
+ saved = []
+ removed = []
+ changed = True
+ while changed:
+ # Pairs cancelled by nothing but removed ones go to saved; pairs
+ # cancelled by someone in saved go to removed.
+ changed = False
+ if movetosaved(remaining, saved, removed, ciidic):
+ changed = True
+ if movetoremoved(remaining, saved, removed, ciidic):
+ changed = True
+ if remaining:
+ alternatives = []
+ origsaved = saved.copy()
+ origremaining = remaining.copy()
+ akeys = []
+ for pair in remaining:
+ for sp in ciidic[pair]:
+ sp2 = listminus(sp[0], saved)
+ if sp2 and sp2 not in akeys:
+ akeys.append(sp2)
+ for sp in akeys:
+ # also check, for each member of the list sp, whether it is
+ # cancelled by the others in sp plus saved
+ if len(sp) > 1:
+ ok = True
+ for p in sp:
+ ll = ciidic[p]
+ tset = listminus(sp,p) + saved
+ for ele,sig in ll:
+ if issubset(ele, tset):
+ ok = False
+ break
+ if not ok:
+ continue
+ saved = origsaved + sp
+ remaining = listminus(origremaining, sp)
+ removed = []
+ changed = movetoremoved(remaining, saved, removed, ciidic)
+ while changed:
+ changed = False
+ if movetosaved(remaining, saved, removed, ciidic):
+ changed = True
+ if movetoremoved(remaining, saved, removed, ciidic):
+ changed = True
+ if not remaining:
+ s = set(saved)
+ if s not in alternatives:
+ alternatives.append(s)
+ if not alternatives:
+ alternatives = [set(saved)]
+ print("Failed to find clean condition alternatives")
+ print("Saved: ", saved)
+ print("Remaining: ", remaining)
+ print("Dict: ", ciidic)
+ else:
+ alternatives = [set(saved)]
+ resdic2lst = []
+ for saved in alternatives:
+ # prepare resdic2 from saved, i.e. condition each pair on the others
+ saved = list(saved)
+ resdic2 = {}
+ for pair in lst:
+ if pair in saved:
+ cond = allexcept(saved, pair)
+ else:
+ mx = ((),0.0)
+ for sp in ciidic[pair]:
+ if issubset(sp[0], saved) and sp[1]>mx[1]:
+ mx = sp
+ if mx[1] > 0.0:
+ cond = mx[0]
+ else:
+ cond = allexcept(saved, pair)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+ #tmp = analysediff(dic2, dic1, kdic)
+ tmp = analysediff01(dic2, dic1, kdic)
+ tmp['cond'] = cond
+ resdic2[pair] = tmp
+ for s in saved:
+ if not s in totsaved:
+ totsaved.append(s)
+ resdic2lst.append(resdic2)
+ return resdic2lst, totsaved
+
+def analyse_one_effect(df, selg, pair, cond, eff):
+ tags = []
+ for tag in ["death", "inc", "nic"]:
+ if eff in valsincolumn(df, 'event_time_' + tag):
+ tags.append(tag)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [], selg, tags, eff)
+ res0 = analysediff(dic2, dic1, kdic)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+ res1 = analysediff(dic2, dic1, kdic)
+ print("Correlates to " + eff)
+ display_analysis_one_row(pair, res1, res0)
+ print()
+ #return { 'mean0': res0['mean'], 'sig0':res0['sig'], 'mean1': res1['mean'], 'sig1': res1['sig']}
+
+def showeffects1(win, df, selg, coldic, tags, eff):
+ name = "Correlates of " + str(list(coldic.keys())) + " to " + eff
+ resdic = analyseeffects1(df, selg, coldic, tags, eff)
+ show_analysis_rows_dict(win, name, resdic, False)
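+
+# Typical top-level use of the show_all_effects entry point defined next (not
+# runnable here, since the register data frame df is not distributed with the
+# code; the late-effect code 'E10' is made up):
+#
+# win = Wind('effects', 1000, 600)
+# show_all_effects(win, df, [('sex', 'flicka')], 0.01, 'E10')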
+
+def show_all_effects(win, df, selg, sig, eff):
+ coldic = {}
+ selcols = [s[0] for s in selg if s[0] is not False]
+ if 'sex' not in selcols:
+ coldic['sex'] = ['flicka'] # special case: using both sexes would be redundant
+ for col in ['other_dia','diagnosis_class','surgery_diff','radio_diff','cytoclass_diff','stemcell_diff']:
+ if col not in selcols:
+ coldic[col] = valsincolumn(df, col)
+ tags = []
+ for tag in ["death", "inc", "nic"]:
+ if eff in valsincolumn(df, 'event_time_' + tag):
+ tags.append(tag)
+ name = "Direct correlates to " + eff
+ resdic1 = analyseeffects1(df, selg, coldic, tags, eff)
+ resdic2lst,remaining = analyseeffects2new(df, selg, resdic1, sig, tags, eff)
+ if len(resdic2lst) == 1:
+ show_analysis_rows_dict(win, name, resdic2lst[0], resdic1)
+ else:
+ print("There are %d alternatives. Press return to switch." % len(resdic2lst))
+ for i,resdic2 in enumerate(resdic2lst):
+ show_analysis_rows_dict(win, name, resdic2lst[i], resdic1)
+ if (i < len(resdic2lst)-1):
+ input()
+
+def get_list_stats2(df, efftag, eff):
+ # Earliest and latest event day for eff in one register column, plus the
+ # total follow-up time and the number of events.
+ mn = inf
+ mx = -inf
+ tsum_f = 0
+ nsum_f = 0
+ for i in range(len(df)):
+ row = df.iloc[i]
+ r = row['no_event_time_' + efftag]
+ ok = False
+ for ele in row['event_time_' + efftag]:
+ if ele[0] == eff:
+ if ele[1] < mn:
+ mn = ele[1]
+ if ele[1] > mx:
+ mx = ele[1]
+ tsum_f += ele[1]
+ nsum_f += 1
+ ok = True
+ if not ok:
+ tsum_f += r if not isnan(r) else 0
+ return (mn, mx, tsum_f, nsum_f)
+
+def show_profile_data(win, df, selg, pair, cond, eff):
+ # first without conditioning
+ (d1, d2) = select_data(df, [pair], False, selg)
+ stats = {}
+ for efftag in ['nic', 'inc', 'death']:
+ stats[(1,efftag)] = get_list_stats2(d1, efftag, eff)
+ stats[(2,efftag)] = get_list_stats2(d2, efftag, eff)
+ mn = min([stats[k][0] for k in stats])
+ mx = max([stats[k][1] for k in stats])
+ win.fig.clear()
+ show_profile_axes(win, 100, 200, 800, 100, mn, mx, 365)
+ for (dind, dd, c1, c2, c3) in zip([False,True], [d1,d2], ['pink','lightgreen'], ['red','#00CD00'], ['#8B0000','#006400']):
+ tsum = 0
+ nsum = 0
+ lst1 = []
+ lst2 = []
+ lst3 = []
+ for i in range(len(dd)):
+ s = dd.iloc[i]
+ tacc = max([s['no_event_time_nic'], s['no_event_time_inc'], s['no_event_time_death']])
+ nacc = 0
+ for ele in s['event_time_nic']:
+ if (ele[0] == eff):
+ lst1.append(ele[1])
+ if ele[1] < tacc:
+ tacc = ele[1]
+ nacc = 1
+ for ele in s['event_time_inc']:
+ if (ele[0] == eff):
+ lst2.append(ele[1])
+ if ele[1] < tacc:
+ tacc = ele[1]
+ nacc = 1
+ for ele in s['event_time_death']:
+ if (ele[0] == eff):
+ lst3.append(ele[1])
+ if ele[1] < tacc:
+ tacc = ele[1]
+ nacc = 1
+ tsum += tacc
+ nsum += nacc
+ ft = tsum / nsum if nsum > 0 else None
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst1, c1, c1, dind, 0)
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst2, c2, c2, dind, 0)
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst3, c3, c3, dind, 0)
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, [ft], c2, c2, dind, 1)
+
+
+def printsorteddicts(diclst):
+ iset = set()
+ for dic in diclst:
+ iset = iset.union(dic.keys())
+ ilst = sorted(list(iset))
+ fstr = "%s" + ("\t%d" * len(diclst))
+ for ele in ilst:
+ tmp = [ele] + [dic[ele] if ele in dic else 0 for dic in diclst]
+ print(fstr % tuple(tmp))
+
+# Full walkthrough: relative to a late effect eff and the competing root
+# causes in pairlist, show for every occurring combination of pair:
+# how many got eff out of how many got the treatment, and how long the total
+# observation time is.
+
+# visualize the individual times and the mean frequency
+
+def display_effect_all_comb(df, selg, conds, eff, tags = False):
+ effdic = {tag : valsincolumn(df, 'event_time_' + tag) for tag in (["death", "inc", "nic"] if tags is False else tags)}
+ taglst = []
+ for tag in effdic:
+ if eff in effdic[tag]:
+ taglst.append(tag)
+ name = "Statistics of " + eff
+ #(dic, datadic) = get_cond_stats_comb(df, conds, selg, taglst, eff)
+ (dic, dummy1, dummy2) = get_cond_stats_daydiff(df, [], conds, selg, taglst, eff)
+ print(name)
+ tscale = 1.0/36525
+ sum0 = 0
+ sum1 = 0
+ sum2 = 0
+ for k in dic:
+ sum0 += dic[k][0]
+ sum1 += dic[k][1]
+ sum2 += dic[k][2]
+ pr1 = sum1/sum0
+ pr2 = sum2/sum0
+ for k in dic:
+ if dic[k][2] > 0:
+ resstr = ""
+ for kk in k.split('_'):
+ if kk:
+ resstr += " "+kk
+ resstr += " "
+ f1 = (dic[k][0] + 1)/((dic[k][1] + pr1)*tscale)
+ f2 = (dic[k][0] + 1)*pr1/(dic[k][1] + pr1)
+ resstr += "%c %f %.2f ( %d / %d : %.2f )" % ('+' if f2 >= 1.5 else ' ', f1, f2, dic[k][0], dic[k][2], dic[k][1]*tscale)
+ #if dic[k][1] > 0:
+ # resstr += "%f ( %d / %d : %.2f )" % (dic[k][0]/(dic[k][1]*tscale),dic[k][0],dic[k][2],dic[k][1]*tscale)
+ #else:
+ # resstr += "%f ( %d / %d : %.2f )" % (0.0, dic[k][0],dic[k][2],dic[k][1]*tscale)
+ print(resstr)
+ n = 0
+ for kk in k.split('_'):
+ if kk:
+ n += len(kk)+1
+ resstr = " average " + " "*max(0, n-3)
+ resstr += "%f %.2f ( %d / %d : %.2f )" % (1/(pr1*tscale), 1.0, 1, round(pr2), pr1*tscale)
+ print(resstr)
+ print()
+
+def display_condensed_data(df, selg, eff):
+ effcols = ['event_time_' + tag for tag in ["death", "inc", "nic"]]
+ df = select_data_alt(df, [(col, eff) for col in effcols], selg)
+ for i in range(len(df)):
+ vec = df.iloc[i]
+ resid = [c[0] for c in vec['event_time_inc'] if c[0][0]=='C']
+ resstr = "%s\t%s\t%s" % (vec['LopNr'],vec['diagnosis'],("("+",".join(resid)+")") if resid else "")
+ for col in ['cytoclass_diff','radio_diff','surgery_diff','stemcell_diff','other_dia']:
+ for b in vec[col]:
+ if type(b)==list:
+ resstr += "\t" + b[0]
+ else:
+ resstr += "\t" + b
+ print(resstr)
+
+def collect_days(df, col):
+ # Average day per code over all lists in the column.
+ dic={}
+ #cols = ['surgery_diff','cytoclass_diff','stemcell_diff','radio_diff']
+ for lst in df[col]:
+ for ele in lst:
+ if not ele[0] in dic:
+ dic[ele[0]] = (0,0)
+ dic[ele[0]] = tupleadd(dic[ele[0]], (1,ele[1]))
+ for key in dic:
+ pair = dic[key]
+ dic[key] = pair[1]/pair[0] if pair[0]>0 else 0
+ return dic
+
+def ddate_hist_dict(df, mn, mx):
+ hdict = { k : 0 for k in range(mn, mx+1)}
+ for i in range(len(df)):
+ dd = df.iloc[i]['DiagnosDat']
+ if dd in hdict:
+ hdict[dd] += 1
+ else:
+ hdict[dd] = 1
+ return hdict
+
+def show_dd_axes(win, x, y, wdt, hgt, labels, count):
+ win.add_line(x, y, x+wdt, y, 0.75, 'black')
+ win.add_line(x, y+hgt+1, x+wdt, y+hgt+1, 0.75, 'black')
+ for pr in [0.25, 0.5, 0.75]:
+ ln = win.add_line(x, y+pr*hgt+1, x+wdt, y+pr*hgt+1, 0.75, 'lightgray')
+ ln.set_zorder(1)
+ onewdt = min(wdt/len(labels), hgt)
+ for ind in range(len(labels)):
+ win.add_text(x+onewdt/2+ind*onewdt, y-20, labels[ind], 12, 'center')
+ win.add_text(x-8, y-4, "0", 9, 'right')
+ win.add_text(x-8, y+hgt-4, "%d" % count, 9, 'right')
+
+def show_dd_bar(win, x, y, wdt, hgt, num, ind, maxcount, val, col):
+ onewdt = min(wdt/num, hgt)
+ bwdt = onewdt-2
+ boff = 1
+ win.add_rect(x + onewdt*ind + boff, y, bwdt, hgt*val/maxcount, 0, None, col)
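+
+# Illustrative sketch (not runnable without the register data frame df, which
+# is not distributed): draw a histogram of diagnosis years with the two
+# helpers above, assuming 'DiagnosDat' holds the year as an integer.
+#
+# win = Wind('diagnosis dates', 900, 300)
+# hdict = ddate_hist_dict(df, 1970, 2015)
+# show_dd_axes(win, 60, 50, 800, 200, ['1970','2015'], 100)
+# for k in hdict:
+#     show_dd_bar(win, 60, 50, 800, 200, 46, int(k)-1970, 100, hdict[k], 'orange')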
+
+def show_dd_data(win, dic, x1, x2, mx, pos):
+ win.fig.clear()
+ show_histogram_axes(win, 60, 50+pos*250, 800, 200, ['1970','2015'], 100)
+ for k in dic:
+ show_dd_bar(win, 60, 50+pos*250, 800, 200, 46, int(k)-1970, 100, dic[k], 'orange')
+
+
+
+def show_dd_data__(win, dic, sel1, sel3, selg, letters):
+ dd = select_data3(df, sel1, sel3, selg)
+ win.fig.clear()
+ show_histogram_axes(win, 100, 100, 800, 200, letters, len(dd[0]))
+ cols = ['#8B0000','red','pink',None]
+ #alpha = 0.0
+ #counts = [alpha/4, alpha/4, alpha/4, alpha/4]
+ for ind in range(len(letters)):
+ letter = letters[ind]
+ counts = [0.0, 0.0, 0.0, 0.0]
+ for i in range(len(dd[0])):
+ s = dd[0].iloc[i]
+ ok = False
+ for ele in s['event_time_death']:
+ if (letter is False or ele[0] == letter):
+ counts[0] += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_inc']:
+ if (letter is False or ele[0] == letter):
+ counts[1] += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_nic']:
+ if (letter is False or ele[0] == letter):
+ counts[2] += 1
+ ok = True
+ if not ok:
+ counts[3] += 1
+ total = sum(counts)
+ normcount = 0.0
+ normtotal = 0.0
+ for i in range(len(dd[2])):
+ s = dd[2].iloc[i]
+ ok = False
+ for ele in s['event_time_death']:
+ if (letter is False or ele[0] == letter):
+ normcount += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_inc']:
+ if (letter is False or ele[0] == letter):
+ normcount += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_nic']:
+ if (letter is False or ele[0] == letter):
+ normcount += 1
+ ok = True
+ normtotal += 1
+ show_histogram_bars(win, 100, 100, 800, 200, len(letters), ind, [x/total for x in counts], normcount/normtotal,cols)
+
+def add_complex_attribute(df, col, name, oldlst):
+ # Append a derived attribute [name, day] to each row's list in col, where
+ # day is the earliest day of any of the attributes in oldlst.
+ for i in range(len(df)):
+ lst = df.iloc[i][col]
+ x = [ ele[1] for ele in lst if ele[0] in oldlst]
+ if x:
+ lst.append([name, min(x)])
diff --git a/dataanalysis/hist.py b/dataanalysis/hist.py
new file mode 100644
index 0000000..d5f555e
--- /dev/null
+++ b/dataanalysis/hist.py
@@ -0,0 +1,105 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: hist.py
+
+Author: anders.holst@ri.se
+
+"""
+
+from math import *
+
+def seconddegree(a, b, c, fmin, fmax):
+ # solves a x^2 + b x + c = 0, and returns the closest real value in the interval [fmin, fmax]
+ if a==0.0:
+ f = -c/b
+ return max(min(f, fmax), fmin)
+ else:
+ f0 = -b/(2*a)
+ fd2 = f0*f0 - c/a
+ if fd2<0:
+ f = f0
+ elif f0 < (fmin+fmax)/2:
+ f = f0 + sqrt(fd2)
+ else:
+ f = f0 - sqrt(fd2)
+ return max(min(f, fmax), fmin)
+
+def hist_peak(hist):
+ mx = 0.0
+ mxind = False
+ for i in range(len(hist[0])):
+ if mx < hist[1][i]:
+ mx = hist[1][i]
+ mxind = hist[0][i]
+ return mxind
+
+def hist_significance(hist, val, peak):
+ # Probability mass on the far side of val, as seen from the peak.
+ sum = 0.0
+ if val > peak:
+ for i in range(len(hist[0])):
+ if hist[0][i] >= val:
+ sum += hist[1][i]
+ else:
+ for i in range(len(hist[0])):
+ if hist[0][i] <= val:
+ sum += hist[1][i]
+ return sum
+
+def hist_mean_var(hist):
+ sum = 0
+ sqsum = 0
+ for i in range(len(hist[0])):
+ sum += hist[0][i]*hist[1][i]
+ sqsum += hist[0][i]*hist[0][i]*hist[1][i]
+ return (sum, sqsum-sum*sum)
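+
+# A minimal sketch (not part of the original code): every function here takes
+# a histogram as a pair (values, probabilities), with the probabilities
+# summing to one over equally spaced values.
+def _demo_hist():
+ toy = ([0.0, 1.0, 2.0, 3.0], [0.1, 0.4, 0.4, 0.1])
+ peak = hist_peak(toy) # 1.0, the first of the two equally high bins
+ mean, var = hist_mean_var(toy) # (1.5, 0.65)
+ sig = hist_significance(toy, 0.0, peak) # mass at or below 0.0, i.e. 0.1
+ return peak, mean, var, sig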
+
+def hist_maxinterval(hist, mass):
+ # Smallest interval containing the given probability mass: shrink from the
+ # lower-density end while the enclosed mass stays above mass, then
+ # interpolate within the boundary bins.
+ i = 0
+ j = len(hist[0])-1
+ while i < j:
+ a1 = hist[1][i-1] if i>0 else 0.0
+ a2 = hist[1][i]
+ b1 = hist[1][j+1] if j<len(hist[0])-1 else 0.0
+ b2 = hist[1][j]
+ if a2 < b2:
+ ma = (a2+a1)/2
+ mb = (a2*a2 - b1*b1)/(2*(b2-b1)) if b2>b1 else 0.0
+ if mass + ma + mb > 1.0:
+ break
+ mass += ma
+ i += 1
+ else:
+ mb = (b2+b1)/2
+ ma = (b2*b2 - a1*a1)/(2*(a2-a1)) if a2>a1 else 0.0
+ if mass + ma + mb > 1.0:
+ break
+ mass += mb
+ j -= 1
+ if i == 0:
+ if j == len(hist[0])-1:
+ return (hist[0][i], hist[0][j])
+ else:
+ b1 = hist[1][j+1]
+ b2 = hist[1][j]
+ delta = (b1+b2)/2 + mass - 1.0
+ xb = (seconddegree(1.0, 0.0, -2*delta*(b1-b2) -b2*b2, b1, b2) - b2)/(b1 - b2)
+ return (hist[0][i], hist[0][j]*(1.0-xb) + hist[0][j+1]*xb)
+ else:
+ if j == len(hist[0])-1:
+ a1 = hist[1][i-1]
+ a2 = hist[1][i]
+ delta = (a1+a2)/2 + mass - 1.0
+ xa = (seconddegree(1.0, 0.0, -2*delta*(a1-a2) -a2*a2, a1, a2) - a2)/(a1 - a2)
+ return (hist[0][i]*(1.0-xa) + hist[0][i-1]*xa, hist[0][j])
+ else:
+ a1 = hist[1][i-1]
+ a2 = hist[1][i]
+ b1 = hist[1][j+1]
+ b2 = hist[1][j]
+ delta = (a1+a2+b1+b2)/2 + mass - 1.0
+ f = seconddegree(b1-b2+a1-a2, 0.0, -a2*a2*(b1-b2) - b2*b2*(a1-a2) - 2*(a1-a2)*(b1-b2)*delta, max(a1, b1), min(a2, b2))
+ xa = (f - a2)/(a1 - a2)
+ xb = (f - b2)/(b1 - b2)
+ return (hist[0][i]*(1.0-xa) + hist[0][i-1]*xa, hist[0][j]*(1.0-xb) + hist[0][j+1]*xb)
diff --git a/dataanalysis/visual.py b/dataanalysis/visual.py
new file mode 100644
index 0000000..b48f3e9
--- /dev/null
+++ b/dataanalysis/visual.py
@@ -0,0 +1,394 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: visual.py
+
+Author: anders.holst@ri.se
+
+"""
+
+from math import *
+from hist import *
+from window import *
+from Color import *
+
+def winresize(win, width, height):
+ win.width = width
+ win.height = height
+ win.fig.set_size_inches((width/100.0, height/100.0))
+ win.trans = win.fig.transFigure.inverted()
+
+def calctickdist(span, pixdist, goaldist):
+ # tick distance of the form 1, 2 or 5 times a power of ten, giving ticks
+ # closest to goaldist pixels apart
+ onespan = span * goaldist / pixdist
+ lsp = log(onespan)/log(10)+0.15
+ fact = [1.0, 2.0, 5.0][int(floor((lsp - floor(lsp))*3))]
+ return pow(10, floor(lsp))*fact
+
+def show_hist(win, x, y, wdt, hgt, hist, hx1, hx2, col):
+ ymax = max(hist[1]) * 1.05
+ yscale = hgt/ymax if ymax > 0 else 0
+ xscale = wdt/(hx2 - hx1)
+ hlist = list(map(lambda xx,yy: (x + (xx-hx1)*xscale, y + yy*yscale), hist[0], hist[1]))
+ hlist = [[x + (hist[0][0]-hx1)*xscale, y]] + hlist + [[x + (hist[0][-1]-hx1)*xscale, y]]
+ win.add_polygon(hlist, 0, False, col)
+
+def show_hist_axes(win, x, y, wdt, hgt, desc, hx1, hx2, tickdist):
+ mx = floor(hx2/tickdist)
+ mn = ceil(hx1/tickdist)
+ xscale = wdt/(hx2 - hx1)
+ win.add_line(x, y, x+wdt, y, 0.75, 'black')
+ x0 = x - xscale*hx1
+ for i in range(mn, mx+1):
+ xx = x0 + i*tickdist*xscale
+ win.add_line(xx, y-desc, xx, y+hgt if i==0 else y, 0.75, 'black')
+ win.add_text(xx, y-desc-12, "%g" % (i*tickdist), 9, 'center')
+
+def show_hist_axes_v2(win, x, y, wdt, hgt, desc, hx1, hx2, tickdist):
+ mx = floor(hx2/tickdist)
+ mn = ceil(hx1/tickdist)
+ xscale = wdt/(hx2 - hx1)
+ win.add_line(x, y, x+wdt, y, 0.75, 'black')
+ x0 = x - xscale*hx1
+ for i in range(mn, mx+1):
+ xx = x0 + i*tickdist*xscale
+ win.add_line(xx, y-desc, xx, y+hgt if i==0 else y, 0.75, 'black')
+ win.add_text(xx, y-desc-18, "%.1f" % (i*tickdist), 12, 'center')
+
+def show_hist_stats(win, x, y, wdt, hgt, hist, hx1, hx2, eps, col):
+ xscale = wdt/(hx2 - hx1)
+ mean, var = hist_mean_var(hist)
+ xmin, xmax = hist_maxinterval(hist, 1.0-eps)
+ win.add_line(x + (xmin-hx1)*xscale, y+int(hgt/3), x + (xmax-hx1)*xscale, y+int(hgt/3), 0.75, col)
+ win.add_line(x + (xmin-hx1)*xscale, y+int(hgt/6), x + (xmin-hx1)*xscale + 0.01, y+int(hgt/2), 0.75, col)
+ win.add_line(x + (xmax-hx1)*xscale, y+int(hgt/6), x + (xmax-hx1)*xscale + 0.01, y+int(hgt/2), 0.75, col)
+ win.add_line(x + (mean-hx1)*xscale, y, x + (mean-hx1)*xscale+0.01, y+int(hgt*2/3), 0.75, col)
+
+
+# 1) analysis row: Effect key, signif, diff hist, abs hist
+# 2) analysis cond row: Effect key, signif, diff hist and cond, abs hist
+# 3) analysis grid: signif and mean
+# 4) analysis cond grid: signif and mean and cond
+# Input: straight or cond stats
+
+lowsign = 0.01
+highsign = 0.0001 + +def set_sign_level(low, high): + global lowsign + global highsign + lowsign = low + highsign = high + +def signcolor(sign): + global lowsign + global highsign + if sign > lowsign: + return 'white' + elif sign < highsign: + return 'black' + else: + llow = log(lowsign) + lhigh = log(highsign) + gr = (log(sign) - llow)/(lhigh - llow) + return hsl_color(0.0, 0.0, 0.5-1.5*gr) + +def show_analysis_rows(win, title, dic, dic0 = False): + win.fig.clear() + maxd = max([abs(dic[k]['mean']) + 2*dic[k]['std'] for k in dic]) + maxl = max([max(dic[k]['mean1'], dic[k]['mean2']) + 2*dic[k]['std'] for k in dic]) + if dic0 is not False: + maxd = max(maxd, max([abs(dic0[k]['mean']) + 2*dic0[k]['std'] for k in dic0])) + maxl = max(maxl, max([max(dic0[k]['mean1'], dic0[k]['mean2']) + 2*dic0[k]['std'] for k in dic0])) + num = len(dic) + rowhgt = int(win.height / (num + 1)) + histwdt1 = int((win.width - 240)*2*maxd/(2*maxd+maxl)) + histwdt2 = int((win.width - 240)*maxl/(2*maxd+maxl)) + tickdist = pow(10, floor(log(maxl/2)/log(10))) + cold = hsl_color(0.35, 1.0, -0.1) + cold0 = hsl_color(0.35, 1.0, 0.4) + col1 = hsl_color(0.16, 1.0, 0.2) + col2 = hsl_color(0.58, 1.0, 0.2) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + show_hist_axes(win, 160, rowhgt/2, histwdt1, win.height - rowhgt, 6, -maxd, maxd, tickdist) + show_hist_axes(win, 200+histwdt1, rowhgt/2, histwdt2, win.height - rowhgt, 6, 0.0, maxl, tickdist) + ind = 0 + for k in sorted(dic): + win.add_text(20, win.height - rowhgt*(1+ind) - 8, str(k), 12) + if dic0 is not False: + win.add_ellipse(58, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic0[k]['sig'])) + win.add_ellipse(50, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic[k]['sig'])) + win.add_text(80, win.height - rowhgt*(1+ind) - 6, "%.6f" % dic[k]['sig'], 10) + if dic[k]['hist'] is not False: + (mn, mx) = hist_maxinterval(dic[k]['hist'], 1.0-lowsign) + pk = hist_peak(dic[k]['hist']) + win.add_text(160 + histwdt1*(pk/maxd + 1.0)*0.5, win.height - rowhgt*(1+ind) - 6, "[%.2f,%.2f,%.2f]" % (mn, pk, mx), 10, 'center') + if dic0 is not False: + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic0[k]['hist'], -maxd, maxd, cold0) + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic[k]['hist'], -maxd, maxd, cold) + if dic[k]['hist1'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist1'], 0.0, maxl, col1) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n1'],dic[k]['t1']), 10, 'left' if dic[k]['mean1']>dic[k]['mean2'] else 'right') + if dic[k]['hist2'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist2'], 0.0, maxl, col2) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n2'],dic[k]['t2']), 10, 'right' if dic[k]['mean1']>dic[k]['mean2'] else 'left') + ind += 1 + +def show_analysis_rows_dict(win, title, dic, dic0 = False): + win.fig.clear() + num = len(dic) + rowhgt = int(win.height / (num + 1)) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + if num == 0: + return + maxd = max([abs(dic[k]['mean']) + 2*dic[k]['std'] for k in dic]) + maxl = max([max(dic[k]['mean1'], dic[k]['mean2']) + 2*dic[k]['std'] for k in dic]) + if dic0 is not False: + maxd = max(maxd, max([abs(dic0[k]['mean']) + 2*dic0[k]['std'] 
for k in dic0])) + maxl = max(maxl, max([max(dic0[k]['mean1'], dic0[k]['mean2']) + 2*dic0[k]['std'] for k in dic0])) + histwdt1 = int((win.width - 240)*2*maxd/(2*maxd+maxl)) + histwdt2 = int((win.width - 240)*maxl/(2*maxd+maxl)) + tickdist = pow(10, floor(log(maxl/2)/log(10))) + cold = hsl_color(0.35, 1.0, -0.1) + cold0 = hsl_color(0.35, 1.0, 0.4) + col1 = hsl_color(0.16, 1.0, 0.2) + col2 = hsl_color(0.58, 1.0, 0.2) + show_hist_axes(win, 160, rowhgt/2, histwdt1, win.height - rowhgt, 6, -maxd, maxd, tickdist) + show_hist_axes(win, 200+histwdt1, rowhgt/2, histwdt2, win.height - rowhgt, 6, 0.0, maxl, tickdist) + ind = 0 + for k in sorted(dic): + if dic0 is not False and k in dic0: + win.add_ellipse(28, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic0[k]['sig'])) + win.add_ellipse(20, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic[k]['sig'])) + win.add_text(50, win.height - rowhgt*(1+ind) - 6, "%.4f" % dic[k]['sig'], 10) + nm = str(k[1]) if type(k)==tuple else str(k) + if 'cond' in dic[k]: + k2 = dic[k]['cond'] + nm += " | " + (k2[1] if type(k2)==tuple else str(k2)) + win.add_text(120, win.height - rowhgt*(1+ind) - 8, nm, 12) + if dic[k]['hist'] is not False: + (mn, mx) = hist_maxinterval(dic[k]['hist'], 1.0-lowsign) + pk = hist_peak(dic[k]['hist']) + win.add_text(160 + histwdt1*(pk/maxd + 1.0)*0.5, win.height - rowhgt*(1+ind) - 6, "[%.2f,%.2f,%.2f]" % (mn, pk, mx), 10, 'center') + if dic0 is not False and k in dic0: + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic0[k]['hist'], -maxd, maxd, cold0) + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic[k]['hist'], -maxd, maxd, cold) + if dic[k]['hist1'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist1'], 0.0, maxl, col1) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n1'],dic[k]['t1']), 10, 'left' if dic[k]['mean1']>dic[k]['mean2'] else 'right') + if dic[k]['hist2'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist2'], 0.0, maxl, col2) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n2'],dic[k]['t2']), 10, 'right' if dic[k]['mean1']>dic[k]['mean2'] else 'left') + ind += 1 + +def show_analysis_rows_dict_v2(win, title, dic, dic0 = False, params = {}): + paramdict = {'rowheight':50, 'namesize': 120, 'marg0': 0, 'marg1': 60, 'marg2' : 40, 'shownum':True, 'col1': 0.16, 'col2': 0.58, 'legendY': False, 'legendX1': False, 'legendX2': False } + paramdict.update(params) + win.fig.clear() + num = len(dic) + rowhgt = paramdict['rowheight'] # int(win.height / (num + 1)) + marg = paramdict['namesize'] + paramdict['marg0'] + paramdict['marg1'] + paramdict['marg2'] + 80 + if num == 0: + return + if 'maxd' in paramdict and 'maxl' in paramdict: + maxd = paramdict['maxd'] + maxl = paramdict['maxl'] + else: + maxd = max([abs(dic[k]['mean']) + 2*dic[k]['std'] for k in dic]) + maxl = max([max(dic[k]['mean1'], dic[k]['mean2']) + 2*dic[k]['std'] for k in dic]) + if dic0 is not False: + maxd = max(maxd, max([abs(dic0[k]['mean']) + 2*dic0[k]['std'] for k in dic0])) + maxl = max(maxl, max([max(dic0[k]['mean1'], dic0[k]['mean2']) + 2*dic0[k]['std'] for k in dic0])) + if 'fixscale' in paramdict: + newwidth = int(paramdict['fixscale']*(2*maxd+maxl) + marg) + newheight = int(rowhgt*(num + 2.5)) + winresize(win, newwidth, 
newheight) + histwdt1 = int((win.width - marg)*2*maxd/(2*maxd+maxl)) + histwdt2 = int((win.width - marg)*maxl/(2*maxd+maxl)) + tickdist = calctickdist(maxl, histwdt2, 80) #pow(10, floor(log(maxl/2)/log(10))) + lpos = 20 + paramdict['marg0'] + bpos = lpos + paramdict['namesize'] + hpos1 = bpos + 20 + paramdict['marg1'] + hpos2 = hpos1 + paramdict['marg2'] + histwdt2 + cold = hsl_color(0.35, 1.0, -0.1) + cold0 = hsl_color(0.35, 1.0, 0.4) + col1 = hsl_color(paramdict['col1'], 1.0, 0.2) + col2 = hsl_color(paramdict['col2'], 1.0, 0.2) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + show_hist_axes_v2(win, hpos2, 2*rowhgt, histwdt1, win.height - rowhgt*2.5, 6, -maxd, maxd, tickdist) + show_hist_axes_v2(win, hpos1, 2*rowhgt, histwdt2, win.height - rowhgt*2.5, 6, 0.0, maxl, tickdist) + if paramdict['legendY']: + txt_y = win.add_text(36, win.height/2, paramdict['legendY'], 12, 'center') + txt_y.set_rotation_mode('anchor') + txt_y.set_rotation('vertical') + if paramdict['legendX1']: + txt_x1 = win.add_text(hpos1, 12, paramdict['legendX1'], 12, 'left') + if paramdict['legendX2']: + txt_x2 = win.add_text(hpos2+histwdt1/2, 12, paramdict['legendX2'], 12, 'center') + ind = 0 + for k in sorted(dic): + if dic0 is not False and k in dic0: + win.add_ellipse(28+paramdict['namesize'], win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic0[k]['sig'])) + win.add_ellipse(bpos, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic[k]['sig'])) + #win.add_text(50, win.height - rowhgt*(1+ind) - 6, "%.4f" % dic[k]['sig'], 10) + nm = str(k[1]) if type(k)==tuple else str(k) + #if 'cond' in dic[k]: + # k2 = dic[k]['cond'] + # nm += " | " + (k2[1] if type(k2)==tuple else str(k2)) + win.add_text(lpos, win.height - rowhgt*(1+ind) - 8, nm, 12) + if dic[k]['hist'] is not False: + (mn, mx) = hist_maxinterval(dic[k]['hist'], 1.0-lowsign) + pk = hist_peak(dic[k]['hist']) + if paramdict['shownum']: + win.add_text(hpos2 + histwdt1*(pk/maxd + 1.0)*0.5, win.height - rowhgt*(1+ind) - 6, "[%.2f,%.2f,%.2f]" % (mn, pk, mx), 10, 'center') + if dic0 is not False and k in dic0: + show_hist(win, hpos2, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic0[k]['hist'], -maxd, maxd, cold0) + show_hist(win, hpos2, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic[k]['hist'], -maxd, maxd, cold) + if dic[k]['hist1'] is not False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist1'], 0.0, maxl, col1) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n1'],dic[k]['t1']), 10, 'left' if dic[k]['mean1']>dic[k]['mean2'] else 'right') + elif dic0 is not False and k in dic0 and dic0[k]['hist1'] is not False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic0[k]['hist1'], 0.0, maxl, col1) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic0[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic0[k]['n1'],dic0[k]['t1']), 10, 'left' if dic0[k]['mean1']>dic0[k]['mean2'] else 'right') + if dic[k]['hist2'] is not False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist2'], 0.0, maxl, col2) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n2'],dic[k]['t2']), 10, 'right' if dic[k]['mean1']>dic[k]['mean2'] else 'left') + elif dic0 is not False and k in dic0 and dic0[k]['hist2'] is not 
False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic0[k]['hist2'], 0.0, maxl, col2) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic0[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic0[k]['n2'],dic0[k]['t2']), 10, 'right' if dic0[k]['mean1']>dic0[k]['mean2'] else 'left') + ind += 1 + +def display_analysis_one_row(pair, res, res0 = False): + resstr = "" + nm = str(pair[1]) if type(pair)==tuple else str(pair) + if 'cond' in res: + k2 = res['cond'] + if type(k2) == list: + nm += " | " + ",".join([kk[1] if type(kk)==tuple else str(kk) for kk in k2]) + elif type(k2) == tuple: + nm += " | " + k2[1] + else: + nm += " | " + str(k2) + sig = res['sig'] + mean = res['mean'] + fact = res['fact'] + fact00 = res['mean2']/res['mean1'] + #if res['hist'] is not False: + # pk = hist_peak(res['hist']) + if sig < 0.0001: + resstr += "***" + elif sig < 0.001: + resstr += "**" + elif sig < 0.01: + resstr += "*" + else: + resstr += "" + resstr += "\t%.4f\t%.4f\t%.2f" % (sig, mean, fact) + if res0 is not False: + sig0 = res0['sig'] + mean0 = res0['mean'] + fact0 = res0['fact'] + resstr += "\t%.4f\t%.4f\t%.2f" % (sig0, mean0, fact0) + else: + resstr += "\t\t\t" + resstr += "\t%s" % (nm) + #sp = max(2, 50-len(resstr)) + resstr += "\t(%d/%d:%.0f)\t(%d/%d:%.0f)" % (res['n1'],res['s1'],res['t1'],res['n2'],res['s2'],res['t2']) + print(resstr) + +def display_analysis_rows_dict(title, dic, dic0 = False): + num = len(dic) + print(title) + if num == 0: + return + for k in sorted(dic): + display_analysis_one_row(k, dic[k], dic0[k] if dic0 is not False and k in dic0 else False) + +def show_analysis_grid(win, title, gdic, gdic0 = False): + win.fig.clear() + numc = len(gdic) + cols = sorted(gdic) + numr = len(gdic[cols[0]]) + rows = sorted(gdic[cols[0]]) + rowhgt = int(win.height / (numr + 2)) + colwdt = int((win.width - 50)/ (numc if numc > 0 else 1)) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + indc = 0 + for g in cols: + win.add_text(50+indc*colwdt, win.height - rowhgt - 8, str(g), 12) + indc +=1 + indr = 0 + for k in rows: + win.add_text(20, win.height - rowhgt*(2+indr) - 8, str(k), 12) + indc = 0 + for g in cols: + if gdic0 is not False: + win.add_ellipse(58+indc*colwdt, win.height - rowhgt*(2+indr) - 8, 16, 16, 1, 'gray', signcolor(gdic0[g][k]['sig'])) + win.add_ellipse(50+indc*colwdt, win.height - rowhgt*(2+indr) - 8, 16, 16, 1, 'gray', signcolor(gdic[g][k]['sig'])) + indc += 1 + indr += 1 + +def show_profile_axes(win, x, y, wdt, hgt, hx1, hx2, tickdist): + mx = floor(hx2/tickdist) + mn = ceil(hx1/tickdist) + xscale = wdt/(hx2 - hx1) + win.add_line(x, y, x+wdt, y, 0.75, 'black') + x0 = x - xscale*hx1 + for i in range(mn, mx+1): + xx = x0 + i*tickdist*xscale + if i==0: + win.add_line(xx, y-hgt, xx, y+hgt, 0.75, 'black') + else: + win.add_line(xx, y-4, xx, y+4, 0.75, 'black') +# win.add_text(xx, y-desc-12, "%g" % (i*tickdist), 9, 'center') + +def show_profile_bars(win, x, y, wdt, hgt, hx1, hx2, lst, col1, col2, below, full): + xscale = wdt/(hx2 - hx1) + first = True + lst = [x for x in lst if x is not None] + for val in lst: + xx = x + xscale*(val - hx1) + if first and val > 0: + col = col1 + first = False + else: + col = col2 + ihgt = hgt*2/3 if not full else hgt + if below: + win.add_line(xx, y-4, xx, y-ihgt+4, 0.75, col) + else: + win.add_line(xx, y+4, xx, y+ihgt-4, 0.75, col) + +def show_km_axes(win, x, y, wdt, hgt, hx1, hx2, xtickdist, ymx, ytickdist): + mx = floor(hx2/xtickdist) + mn = ceil(hx1/xtickdist) + xscale 
= wdt/(hx2 - hx1) + yscale = hgt/ymx + win.add_line(x-4, y, x+wdt, y, 1, 'black') + win.add_text(x-6, y-6, str(0), 12, 'right') + for i in range(1,floor(ymx/ytickdist)+1): + yy = y+yscale*ytickdist*i + win.add_line(x-4, yy, x+wdt, yy, 0.75, 'lightgray') + win.add_text(x-6, yy-6, str(int(ytickdist*i*100)), 12, 'right') + x0 = x - xscale*hx1 + for i in range(mn, mx+1): + xx = x0 + i*xtickdist*xscale + if i==0: + win.add_line(xx, y-4, xx, y+hgt+20, 1, 'black') + else: + win.add_line(xx, y-4, xx, y+4, 1, 'black') + win.add_text(xx, y-20, str(i*xtickdist), 12, 'center') + +def show_km_data(win, x, y, wdt, hgt, hx1, hx2, ymx, off, lst, col): + xscale = wdt/(hx2 - hx1) + yscale = hgt/ymx + x0 = x - xscale*hx1 + xscale*off + points = [(x0 + xscale*i, y + yscale*yy) for i,yy in enumerate(lst)] + obj = win.add_polygon(points, 1, col, None) + obj.set_closed(False) + + + diff --git a/dataanalysis/window.py b/dataanalysis/window.py new file mode 100644 index 0000000..5a18ac1 --- /dev/null +++ b/dataanalysis/window.py @@ -0,0 +1,253 @@ +""" +Copyright (C) 2018-2021 RISE Research Institute of Sweden AB + +File: window.py + +Author: anders.holst@ri.se + +""" + + +# %matplotlib notebook + +import matplotlib.pyplot as plt +import matplotlib as mpl + +mpl.interactive(True) +mpl.rcParams['toolbar']='None' + +class Wind(): + def __init__(self, name, width, height): + self.width = width + self.height = height + self.fig = plt.figure(name, (width/100.0, height/100.0)) + self.trans = self.fig.transFigure.inverted() + self.bpfid = -1 + self.brfid = -1 + self.whfid = -1 + self.mfid = -1 + self.kfid = -1 + self.buttons = {} + self.scrolls = {} + self.keys = {} + self.drags = {} + self.objs = [] + self.objtps = [] + self.bpressed = False + + def add_line(self, x1, y1, x2, y2, wdt, col): + fr = self.trans.transform((x1, y1)) + to = self.trans.transform((x2, y2)) + obj = plt.Line2D((fr[0], to[0]), (fr[1], to[1]), linewidth=wdt*0.75, color=col) + self.fig.add_artist(obj) + return obj + + def add_rect(self, x1, y1, wdt, hgt, brd, fg, bg): + fr = self.trans.transform((x1, y1)) + to = self.trans.transform((x1+wdt, y1+hgt)) + if bg: + obj = plt.Rectangle(fr, to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), facecolor=bg) + else: + obj = plt.Rectangle(fr, to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), fill=False) + self.fig.add_artist(obj) + return obj + + def add_ellipse(self, x1, y1, wdt, hgt, brd, fg, bg, angle = 0): + fr = self.trans.transform((x1, y1)) + to = self.trans.transform((x1+wdt, y1+hgt)) + if bg: + obj = mpl.patches.Ellipse(((fr[0]+to[0])*0.5, (fr[1]+to[1])*0.5), to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), facecolor=bg, angle = angle) + else: + obj = mpl.patches.Ellipse(fr, to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), fill=False, angle = angle) + self.fig.add_artist(obj) + return obj + + def add_polygon(self, xylst, brd, fg, bg): + xytr = list(map(self.trans.transform, xylst)) + if bg: + obj = plt.Polygon(xytr, linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), facecolor=bg) + else: + obj = plt.Polygon(xytr, linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), fill=False) + self.fig.add_artist(obj) + return obj + + def add_text(self, x1, y1, txt, fontsz = None, align = None): + fr = self.trans.transform((x1, y1)) + obj = plt.Text(fr[0], fr[1], txt, fontsize=fontsz or 18, ha=align or 'left') + self.fig.add_artist(obj) + return obj + + def locate_object(self, event, objs): + for obj in 
objs:
+ if obj.contains(event)[0]:
+ return obj
+ return False
+
+ def locate_object_type(self, event, types):
+ for obj in self.fig.artists:
+ if True in types or type(obj) in types:
+ if obj.contains(event)[0]:
+ return obj
+ return False
+
+ def button_press_callback(self, event):
+ # check whether the event is over an object in objs or one of the bound
+ # types; look up the binding in the order (ev, mod, obj), (ev, -1, obj),
+ # (ev, mod, T), (ev, -1, T), (ev, mod), (ev, -1); if there is a release
+ # callback, save it away; then run the press callback
+ obj = self.locate_object(event, self.objs)
+ if obj is False:
+ obj = self.locate_object_type(event, self.objtps)
+ func = False
+ if obj is not False:
+ if (event.guiEvent.num, event.guiEvent.state, obj) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, event.guiEvent.state, obj)]
+ elif (event.guiEvent.num, -1, obj) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, -1, obj)]
+ elif (event.guiEvent.num, event.guiEvent.state, True) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, event.guiEvent.state, True)]
+ elif (event.guiEvent.num, -1, True) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, -1, True)]
+ if func is not False:
+ if self.bpressed is False:
+ if type(func)==tuple:
+ self.bpressed = (False, func[1], func[2] if len(func)==3 else False, obj)
+ func = func[0]
+ func(self, event, obj)
+ else:
+ if (event.guiEvent.num, event.guiEvent.state) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, event.guiEvent.state)]
+ elif (event.guiEvent.num, -1) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, -1)]
+ if func is not False:
+ if self.bpressed is False:
+ if type(func)==tuple:
+ self.bpressed = (False, func[1], func[2] if len(func)==3 else False, False)
+ func = func[0]
+ func(self, event)
+
+ def button_release_callback(self, event):
+ if self.bpressed is not False:
+ func = self.bpressed[1]
+ obj = self.bpressed[3]
+ self.bpressed = False
+ if obj is False:
+ func(self, event)
+ else:
+ func(self, event, obj)
+
+ def button_motion_callback(self, event):
+ if self.bpressed is not False:
+ func = self.bpressed[2]
+ obj = self.bpressed[3]
+ if obj is False:
+ func(self, event)
+ else:
+ func(self, event, obj)
+
+ def bind_button(self, button, func, obj = False):
+ # func called as func(win, ev) or func(win, ev, obj) if object is not False
+ if (self.bpfid == -1):
+ self.bpfid = self.fig.canvas.mpl_connect('button_press_event', self.button_press_callback)
+ if (self.brfid == -1):
+ self.brfid = self.fig.canvas.mpl_connect('button_release_event', self.button_release_callback)
+ if (self.mfid == -1 and type(func)==tuple and len(func)==3):
+ self.mfid = self.fig.canvas.mpl_connect('motion_notify_event', self.button_motion_callback)
+ but = button if type(button)==tuple else (button, -1)
+ if obj is not False:
+ but = but + (obj,)
+ if obj is True or type(obj)==type:
+ if not obj in self.objtps:
+ self.objtps.append(obj)
+ elif not obj in self.objs:
+ self.objs.append(obj)
+ self.buttons[but] = func
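+
+ # Usage sketch (not part of the original code): bind a left-button handler
+ # to a rectangle. The callbacks above read Tk-specific event fields
+ # (event.guiEvent.num / .state), so a Tk-based matplotlib backend is assumed.
+ #
+ # win = Wind('demo', 400, 300)
+ # box = win.add_rect(150, 100, 100, 80, 1, 'black', 'orange')
+ # def hit(w, ev, obj):
+ #     print('rectangle pressed')
+ # win.bind_button(1, hit, box)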
+ def key_press_callback(self, event):
+ # check whether the event is over an object in objs or one of the bound
+ # types; look up the binding in the order (ev, mod, obj), (ev, -1, obj),
+ # (ev, mod, T), (ev, -1, T), (ev, mod), (ev, -1); then run the callback
+ obj = self.locate_object(event, self.objs)
+ if obj is False:
+ obj = self.locate_object_type(event, self.objtps)
+ func = False
+ if obj is not False:
+ if (event.guiEvent.keysym, event.guiEvent.state, obj) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, event.guiEvent.state, obj)]
+ elif (event.guiEvent.keysym, -1, obj) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, -1, obj)]
+ elif (event.guiEvent.keysym, event.guiEvent.state, True) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, event.guiEvent.state, True)]
+ elif (event.guiEvent.keysym, -1, True) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, -1, True)]
+ if func is not False:
+ func(self, event, obj)
+ else:
+ if (event.guiEvent.keysym, event.guiEvent.state) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, event.guiEvent.state)]
+ elif (event.guiEvent.keysym, -1) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, -1)]
+ if func is not False:
+ func(self, event)
+
+ def bind_key(self, keysym, func, obj = False):
+ # func called as func(win, ev) or func(win, ev, obj) if object is not False
+ if (self.kfid == -1):
+ self.kfid = self.fig.canvas.mpl_connect('key_press_event', self.key_press_callback)
+ key = keysym if type(keysym)==tuple else (keysym, -1)
+ if obj is not False:
+ key = key + (obj,)
+ if obj is True or type(obj)==type:
+ if not obj in self.objtps:
+ self.objtps.append(obj)
+ elif not obj in self.objs:
+ self.objs.append(obj)
+ self.keys[key] = func
+
+ def scroll_callback(self, event):
+ # check whether the event is over an object in objs or one of the bound
+ # types; look up the binding as for buttons, with (True, -1, ...) as a
+ # catch-all direction; then run the scroll callback
+ obj = self.locate_object(event, self.objs)
+ if obj is False:
+ obj = self.locate_object_type(event, self.objtps)
+ func = False
+ if obj is not False:
+ if (event.button, event.guiEvent.state, obj) in self.scrolls:
+ func = self.scrolls[(event.button, event.guiEvent.state, obj)]
+ elif (event.button, -1, obj) in self.scrolls:
+ func = self.scrolls[(event.button, -1, obj)]
+ elif (event.button, event.guiEvent.state, True) in self.scrolls:
+ func = self.scrolls[(event.button, event.guiEvent.state, True)]
+ elif (event.button, -1, True) in self.scrolls:
+ func = self.scrolls[(event.button, -1, True)]
+ elif (True, -1, obj) in self.scrolls:
+ func = self.scrolls[(True, -1, obj)]
+ elif (True, -1, True) in self.scrolls:
+ func = self.scrolls[(True, -1, True)]
+ if func is not False:
+ func(self, event, obj)
+ else:
+ if (event.button, event.guiEvent.state) in self.scrolls:
+ func = self.scrolls[(event.button, event.guiEvent.state)]
+ elif (event.button, -1) in self.scrolls:
+ func = self.scrolls[(event.button, -1)]
+ elif (True, -1) in self.scrolls:
+ func = self.scrolls[(True, -1)]
+ if func is not False:
+ func(self, event)
+
+ def bind_scroll(self, dir, func, obj = False):
+ # dir can be "up", "down", "left", "right", or True
+ # func called as func(win, ev) or func(win, ev, obj) if object is not False
+ if (self.whfid == -1):
+ self.whfid = self.fig.canvas.mpl_connect('scroll_event', self.scroll_callback)
+ dd = dir if type(dir)==tuple else (dir, -1)
+ if obj is not False:
+ dd = dd + (obj,)
+ if obj is True or type(obj)==type:
+ if not obj in self.objtps:
+ self.objtps.append(obj)
+ elif not obj in self.objs:
+ self.objs.append(obj)
+ self.scrolls[dd] = func
+
+
diff --git a/datapreparation/analyze.py b/datapreparation/analyze.py
new file mode 100644
index 0000000..f7192c7
--- /dev/null
+++ b/datapreparation/analyze.py
@@ -0,0 +1,438 @@
+#! /usr/bin/env python3
+
+
+""" -------------------------------
+
+ analyze.py
+
+ Copyright (C) 2018 RISE
+ This code was produced by RISE
+ The 2013-04-10 version
+
+ bonsai/src_v02/analyze.py
+
+ simple analysis of pandas dataframes,
+ such as
+
+ 1. find duplicated rows
+
+ 2. number of unique values in a column
+
+ 3. 
number of unique values in common + between two columns in two different + files + + 4. + +------------------------------------""" + + +import global_settings as gs +import numpy as np +import pandas as pd +import bonsai_io as bio +import common +import copy + +def nr_of_unique_rows(df): + d = df.drop_duplicates() + return len(d) + +def nr_of_unique_values_in_cols(df, cols): + c = df.drop_duplicates(subset = cols) + return len(c) + + +def nr_of_unique_values(df, col): + c = df[col].dropna() + c = c.drop_duplicates() + return len(c) + +""" +def nr_of_unique_numeric_values(df, col): + + c = df[col].dropna() + c = c.drop_duplicates() + c = c.str.isnumeric() + c = c[c].index.values +""" + + +def nr_of_nonnan_values(df, col): + + c = df[col].dropna() + return len(c) + +def nr_of_unique_digital_values(df, col): + + c = df[col].dropna() + c = c.drop_duplicates() + c = c.str.isdigit() + c = c[c].index.values + # df = df.drop_duplicates(subset = col) + # df = df[ df[col].dropna().str.isdigit() ] + # df = df[ df[col].str.contains('\d', regex=True) ] + return len(c) + +def duplicated_rows(df): + df['dup'] = df.duplicated() + df = df[df['dup'] == True] + return df + +def print_duplicated_rows(df, nr): + dup = duplicated_rows(df) + print('Nr of rows in total', len(df)) + print('Nr of duplicated rows', len(dup)) + nr = min( nr,len(dup) ) + if nr > 0: + print('the first', nr,' of them') + print(dup[0:nr]) + return dup + +def unique_number_values(df, col): + df = df.drop_duplicates(subset = col) + df = df[ df[col].str.contains('\d', regex=True) ] + return df + + +def info(df, name = ''): + print() + if name != '': + print() + print('--------------------------------------------------') + print() + print('\tInfo on the file\n\t' + name) + print() + print('--------------------------------------------------') + print() + df_unique_nr = nr_of_unique_rows(df) + print(' shape', df.shape) + print(' unique rows', df_unique_nr) + + for c in df.columns: + print() + print('\tInfo on non-nan values of column', c) + print() + nonnan_nr = nr_of_nonnan_values(df, c) + unique_nr = nr_of_unique_values(df, c) + digital_nr = nr_of_unique_digital_values(df, c) + # numeric_nr = nr_of_unique_numeric_values(df, c) + print('non-nan values', nonnan_nr) + print(' unique values', unique_nr) + print('digital values', digital_nr) + # print('numeric values', unique_nr) + + print() + # return unique_number_values(df, 'ICD10') + +# df = df[ df[c].str.contains('\d', regex=True) ] + + + +def readall(): + dia = bio.read_generated_dia() + dgr = bio.read_diagroups() + per = bio.readperson() + ctr = bio.readcontrol() + inc = bio.readincare() + nic = bio.readnicare() + dru = bio.readdrug() + dcl = bio.readdrugclasses() + tre = bio.readtreatment() + sur = bio.readsurgery() + cau = bio.readcause() + + data = [ + dia, + dgr, + per, + ctr, + inc, + nic, + dru, + dcl, + tre, + sur, + cau +] + + name = [ + 'diagnos ', + 'diagnosgrupp ', + 'person ', + 'kontrollgrupp ', + 'sluten v_rd ', + '_ppen v_rd ', + 'l_kemedel ', + 'l_kemedelsgrupper', + 'behandling ', + 'kirurgi ', + 'orsak ', + ] + + return data, name + + +def info_on_all(): + + data, name = readall() + + for i in range(0, len(name)): + info(data[i], name[i]) + + +def compare_lopnr(dfx, dfy, namex = 'data 1', namey = 'data 2'): + + xs = list(dfx['LopNr'].values) + ys = list(dfy['LopNr'].values) + + sx = set(xs) + sy = set(ys) + cut = sx & sy + ux = sx - sy + uy = sy - sx + + print() + # print('shape ' + namex + '\t\t', dfx.shape) + # print('shape ' + namey + '\t\t', dfy.shape) + # 
print('unique Lopnr ' + namex + '\t', len(xs)) + # print('unique Lopnr ' + namey + '\t', len(ys)) + + print('common Lopnr\t\t\t', len(cut)) + print('Lopnr in ' + namex + ' only\t', len(ux)) + print('Lopnr in ' + namey + ' only\t', len(uy)) + print() + + ux = list(ux) + uy = list(uy) + ux.sort + uy.sort + return ux, uy + + +def readlopnr(): + dia = bio.read_generated_dia() + per = bio.readperson() + ctr = bio.readcontrol() + inc = bio.readincare() + nic = bio.readnicare() + dru = bio.readdrug() + tre = bio.readtreatment() + sur = bio.readsurgery() + cau = bio.readcause() + + data = [dia, per, ctr, inc, nic, dru, tre, sur, cau] + + name = [ + 'diagnos ', + 'person ', + 'kontrollgrupp', + 'sluten v_rd ', + '_ppen v_rd ', + 'l_kemedel ', + 'behandling ', + 'kirurgi ', + 'orsak ', + ] + + return data, name + + +def pairwise_lopnr_comparisions(): + + data, name = readlopnr() + + for i in range(0, len(name)): + for j in range(i+1, len(name)): + print() + print('--------------------------------------------------') + print() + print('\tComparing ' + name[i] + ' with ' + name[j]) + print() + print('--------------------------------------------------') + print() + + compare_lopnr(data[i], data[j], name[i], name[j]) + + + + + +""" ------------------------------- + + 4. count amd list various types of diagnosis + codes in care data + +------------------------------------""" + +""" +def is_icd10_class(x): + if not common.isstr(x): + return False + if common.is_icd10(x): + return False + if len(x) < 3: + return False + if not x[0].isupper(): + return False + return x[1].isdigit() and x[2].isdigit() +""" + + +def code_count(xs): + if not isinstance(xs, str): + return 0 + return len(xs.split()) + +def icd10_count(xs): + if not isinstance(xs, str): + return 0 + count = 0 + for x in xs.split(): + if common.is_icd10(x): + # print(x) + count += 1 + return count + +def not_icd10_count(xs): + if not isinstance(xs, str): + return 0 + count = 0 + for x in xs.split(): + if not common.is_icd10(x): + # print(x) + count += 1 + return count + +def icd10_class_count(xs): + if not isinstance(xs, str): + return 0 + count = 0 + for x in xs.split(): + if common.is_icd10_class(x): + # print(x) + count += 1 + return count + +""" +def code_list(xs): + if not isinstance(xs, str): + return 0 + return len(xs.split()) +""" + +def count_and_print(df, table = False): + dia = 'DIAGNOS' + dfc = copy.copy(df) + dfc['code_count'] = df[dia].apply(code_count) + dfc['icd10_count'] = df[dia].apply(icd10_count) + dfc['not_icd10_count'] = df[dia].apply(not_icd10_count) + dfc['icd10_class_count'] = df[dia].apply(icd10_class_count) + nr_of_codes = dfc['code_count'].sum() + nr_of_icd10 = dfc['icd10_count'].sum() + nr_of_not_icd10 = dfc['not_icd10_count'].sum() + nr_of_class_codes = dfc['icd10_class_count'].sum() + + if table: + print('nr_of_lines\t', len(df)) + print('nr_of_codes\t', nr_of_codes) + print('nr_of_icd10\t', nr_of_icd10) + print('nr_of_not_icd10\t', nr_of_not_icd10) + print('nr_of_icd10_class_codes\t', nr_of_class_codes) + + else: + + + print(' nr_of_lines', len(df)) + print(' nr_of_codes', nr_of_codes) + print(' nr_of_icd10', nr_of_icd10) + print(' nr_of_not_icd10', nr_of_not_icd10) + print(' nr_of_icd10_class_codes', nr_of_class_codes) + + + """ + for c in df1[dia].values: + print('\t', c) + """ + + +def print_dates(df, table = False): + date = 'INDATUM' + + if table: + + print('first date\t', df[date].min()) + print('last date\t', df[date].max()) + + else: + + print(' first date', df[date].min()) + print(' last date', 
df[date].max()) + + +def icd10_class_list(xs): + if not isinstance(xs, str): + return [] + codes = [] + for x in xs.split(): + if common.is_icd10_class(x): + codes += [x] + #print(codes) + return codes + +def flat(xs): + ys = [] + for x in xs: + ys += x + return ys + + + +def print_class_codes(df): + dia = 'DIAGNOS' + dfc = copy.copy(df) + dfc['icd10_class'] = df[dia].apply(icd10_class_list) + dfc['is_class'] = dfc['icd10_class'].apply(lambda x: x != []) + dfc = dfc[dfc['is_class']] + codes = np.unique(flat(list(dfc['icd10_class'].values))) + for c in codes: + print('\t', c) + + +def diagnosis_code_count(df, print_class = False, table = False): + + date = 'INDATUM' + nr = 'LopNr' + icd10_start = np.datetime64('1998-01-01') + + """ + size0 = len(df) + df = df.dropna().reset_index(drop=True) + print('nr of empty lines:', size0- len(df)) + """ + + df[date] = df[date].apply(bio.str2time) + df = df.sort_values(date).dropna().reset_index(drop=True) + + df1 = df[df[date] < icd10_start] + df2 = df[df[date] >= icd10_start] + + print() + print('code counts before 1998_01_01:') + print() + + print_dates(df1, table = table) + count_and_print(df1, table = table) + + print() + print('code counts from 1998_01_01') + print() + + print_dates(df2, table = table) + count_and_print(df2, table = table) + if print_class: + print() + print(' all icd10_class_codes:') + print_class_codes(df2) + + print() diff --git a/datapreparation/bonsai_io.py b/datapreparation/bonsai_io.py new file mode 100644 index 0000000..fabc876 --- /dev/null +++ b/datapreparation/bonsai_io.py @@ -0,0 +1,738 @@ +#! /usr/bin/env python3 + + +""" ------------------------------- + + bonsai_io.py + + Copyright (C) 2019 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/bonsai_io.py + +------------------------------------""" + + +import copy +import pandas as pd +import numpy as np +from datetime import datetime as time +from datetime import timedelta as delta +from dateutil import parser + + +import global_settings as gs +import common +import diagnose + + +""" ---------------------------------------- + + common support + +---------------------------------------""" + + + +def isstr(x): + return isinstance(x, str) + +def str2number(x): + s = str(x) + if (',' in s): + x = float(s.replace(',','.')) + return round(float(x), 5) + +def str2time(x): + # print(x) + if isstr(x): + return parser.parse(x) + return x + +def save_generated_file(df, name): + f = gs.places.generated_out(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + +def read_generated_file(name): + f = gs.places.generated_out(name) + print('reading file:', f) + df = pd.read_csv(f, sep='\t') + return df + + +def save_tmp_file(df, name): + f = gs.places.tmp_out(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + +def save_local_tmp_file(df, name): + f = gs.places.local_tmp_out(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + +def save_local_result(df, name): + f = gs.places.local_result(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + +def save_local_input(df, name): + f = gs.places.local_input + name + '.csv' + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + + +""" ---------------------------------------- + + lexicon + +---------------------------------------""" + +def 
read_code_lex(): + lex = pd.read_csv(gs.places.code_lex, dtype = str, sep='\t') + return lex + + +def read_incare_lex(): + name = 'incare_lex' + f = gs.places.local_input + name + '.csv' + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + + +def read_care_icd8_lex(): + f = gs.places.care_icd8_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + + +def read_care_icd9_lex(): + f = gs.places.care_icd9_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + +def read_cause_icd8_lex(): + f = gs.places.cause_icd8_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + + +def read_cause_icd9_lex(): + f = gs.places.cause_icd9_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + +""" ---------------------------------------- + + dia + + reading the diagnosis file, the original + and saved filled in versions + +---------------------------------------""" + + +def read_all_original_dia(): + """ + reading the the non-null LopNr rows from the + original file helena_lev_Diagnos_v2.txt + modified only by removing a ',' sign + """ + + f = gs.places.diagnose_m + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + # df = df[gs.places.diagnose_selection] + df = df.sort_values('LopNr').reset_index(drop=True) + return df + +def read_original_dia(): + """ + reading the non-null LopNr rows and the selected + columns from original file helena_lev_Diagnos_v2.txt + modified only by removing a ',' sign + """ + + f = gs.places.diagnose_m + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.diagnose_selection] + df = df.sort_values('LopNr').reset_index(drop=True) + return df + + +def save_generated_dia(dia): + print('saving generated diagnostics file to:') + print('\t', gs.places.generated_dia) + dia.to_csv(gs.places.generated_dia, index = False, sep = '\t') + +""" +def parse(s): + print(s, type(s)) + print(str(s)) + return parser.parse(str(s)) +""" + +def read_generated_dia(): + + f = gs.places.generated_dia + print('reading file:', f) + dia = pd.read_csv(f, dtype = str, sep='\t') + da = 'Diagnos_lder' + dia[da] = dia[da].apply(common.str2number) + """ + for col in gs.places.diagnose_time_cols: + dia[col] = dia[col].apply(str2time) + """ + return dia + +def save_tmp_dia(dia): + print('saving generated diagnostics groups file to:') + print('\t', gs.places.tmp_dia) + dia.to_csv(gs.places.tmp_dia, index = False, sep = '\t') + + + +""" ---------------------------------------- + + dia groups + + reading and saving versions of the + diagnosis groups file + +---------------------------------------""" + +def group_nr(s): + return int(s.split()[1][0:-1]) + +def rm_dot(s): + return s.replace('.','') + +def read_orig_diagroups(): + grp = gs.names.input_data_group + df = pd.read_csv(gs.places.orig_dia_groups, dtype = str, sep='\t') + cols = list(df.columns[4:6]) + list([df.columns[1]]) + df = df[cols] + df.columns = ['ICD10', 'SNOMED', grp] + df[grp] = df[grp].apply(group_nr) + df['ICD10'] = df['ICD10'].apply(rm_dot) + return df + +def save_diagroups(): + df = read_orig_diagroups() + df = df.sort_values(['ICD10', 'SNOMED']).reset_index(drop=True) + print('saving generated diagnostics groups file to:') + print('\t', gs.places.dia_groups) + df.to_csv(gs.places.dia_groups, index = False, sep = '\t') + + +def 
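+
+# Editor's sketch: the readers above all follow one recipe -- tab-separated
+# input read with dtype=str, rows without a LopNr dropped, then sorted.
+# A self-contained miniature with invented rows:
+import pandas as pd
+
+df = pd.DataFrame({'LopNr': ['2', None, '1'],
+                   'ICD10': ['C910', 'C915', 'C920']})
+df = df[~df['LopNr'].isnull()]
+df = df.sort_values('LopNr').reset_index(drop=True)
+print(df['LopNr'].tolist())   # ['1', '2']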
read_diagroups(): + f = gs.places.dia_groups + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + return df + + + + +""" ---------------------------------------- + + person + reading the person data file + +---------------------------------------""" + + +def k_n(x): + if not(x == 'flicka') and not(x == 'pojke'): + return 'sex unknown' + return x + +def vital(x): + return { + 'levande':'alive', + 'Död':'dead', + 'utvandrade':'emigrated', + 'okänt':'emigrated' + }[x] + + + +def readperson(): + time_cols = gs.places.person_time_cols + cols = gs.places.person_cols + + f = gs.places.person + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.person_selection] + for c in time_cols: + df[c] = df[c].apply(str2time) # may destroy the values of other cols + df['K_n'] = df['K_n'].apply(k_n) + + # df['last_date'] = df['Registerdata_inh_mtade__CR_Bef__'] + df['VitalStatus'] = df['VitalStatus'].apply(vital) + + df.columns = cols + return df + + + +""" ---------------------------------------- + + drug + this concerns the outcome, drugs for everyone + +---------------------------------------""" + + +def readdrug(n = 1): + f = gs.places.drug + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + # df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[ df['LopNr'].str.contains('\d', regex=True) ] + df = df[gs.places.drug_selection] + df['ATC'] = df['ATC'].str[:n].apply(str) # zzz + df = df.drop_duplicates() + return df + + +""" ---------------------------------------- + + cytostatica + +---------------------------------------""" + + +def readcytocodes(): + f = gs.places.cytocodes + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['BehNr'].isnull() ] + df = df[gs.places.cyto_selection] + df = df.drop_duplicates() + return df + + +def readdrugclasses(): + f = gs.places.drug_classes + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[gs.places.drug_classes_selection] + df = df.drop_duplicates() + return df + + + +""" ---------------------------------------- + + treatement + reading the tratement file + + not complete yet 2019-04-02 + +---------------------------------------""" + + +def readtreatment(): + f = gs.places.treatment + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + + + + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.treatment_selection] + + + # df = df.dropna() # will drop all but 18 LopNr:s + + + df = common.change_col_names(df, 'StartDatum_Cyt', 'cyto_date') + df = common.change_col_names(df, 'StartDatum_Stamcell', 'stem_date') + df = common.change_col_names(df, 'KirurDat', 'surg_date') + df = common.change_col_names(df, 'StartDatum_Radioterapi', 'radio_date') + df = df.drop_duplicates().reset_index(drop=True) + + # xs = np.unique(df['LopNr'].values) + # print('nr of LopNr', len(xs)) + + + return df + + + + + +""" ---------------------------------------- + + incare and nicare + institutional and non-institutional care + reading the files + +---------------------------------------""" + +def readorigincare(): + f = gs.places.orig_incare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.incare_selection] + return df + + +def readorignicare(): + f = gs.places.orig_nicare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, 
sep='\t', encoding='latin1') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.nicare_selection] + df = df.drop_duplicates() + return df + + + +def readincare(): + f = gs.places.incare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + return df + + +def readnicare(): + f = gs.places.nicare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + return df + + + +def modify_care(df): + + date = 'INDATUM' + dia = 'DIAGNOS' + icd10_start = np.datetime64('1998-01-01') + df[date] = df[date].apply(str2time) + df = df.sort_values(date).dropna().reset_index(drop=True) + df1 = df[df[date] < icd10_start] + df2 = df[df[date] >= icd10_start] + df2c = copy.copy(df2) + df2c[dia] = df2[dia].apply(common.icd10_fill) + df = pd.concat( [df1, df2c]) + return df + +def modify_incare(): + df = readorigincare() + df = modify_care(df) + f = gs.places.incare + print('saving to file:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + +def modify_nicare(): + df = readorignicare() + df = modify_care(df) + f = gs.places.nicare + print('saving to file:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + +""" ---------------------------------------- + + control group + +---------------------------------------""" + +def readcontrol(): + f = gs.places.control + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = df[gs.places.control_selection] + df.columns = gs.places.control_column_names + + # fc = 'FoddAr' + # df[fc] = df[fc].apply(parser.parse) + # df = df[ ~df['LopNr'].isnull() ] + # df = df[gs.places.control_selection] + return df + + +""" ---------------------------------------- + + cause + + because of sever problems with pandas: a datetime object + in one column interfers with operations on other columns, + the readcause function manages the cause list one letter + construction + +---------------------------------------""" + + + +def old_add_to_cause_list(d, c, lex8, lex9): + recidiv_icd10 = d['ICD10'] # do no add first letter of recidiv + + icd = d['ICD'] + L = d['life_events'] + x = d[c] + if L == 'null': # L is null from start + L = [] + + if not isinstance(x, str): + return L + + if x == recidiv_icd10: + return L + + if icd == '10': + L = list(np.unique( L + [x[0]] )) + return L + + if icd == '9': + cx = common.look_up_codes(x, lex9) + if icd == '8': + cx = common.look_up_codes(x, lex8) + + for c in cx: + L += [c[0]] + L = list(np.unique( L )) + return L + + +def add_class(L, x, rc, nr = 3): + xc = x[:nr] + if not xc == rc: + L = list(np.unique( L + [xc] )) + return L + + +def add_to_cause_list(d, c, lex8, lex9, nr = 3): + # nr is the nr of icd10 chars of concern + + recidiv_icd10 = d['ICD10'] # do no add recidiv class diagnosis + recidive_class = recidiv_icd10[:nr] + + icd = d['ICD'] + L = d['life_events'] + x = d[c] + + if L == 'null': # L is null from start + L = [] + + if not isinstance(x, str): + return L + + if icd == '10': + if common.is_icd10(x): + L = add_class(L, x, recidive_class, nr = nr) + elif not x[0].isalpha() and not x[0].isdigit() and common.is_icd10(x[1:]): + L = add_class(L, x[1:], recidive_class, nr = nr) + return L + + if icd == '9': + cx = common.look_up_codes_1(x, lex9, 'icd9') + + if icd == '8': + cx = common.look_up_codes_1(x, lex8, 'icd8') + + + if common.isarray(cx): + for c in cx: + L = add_class(L, c, recidive_class, nr = nr) + + elif common.isstr(cx): + L = add_class(L, cx, recidive_class, nr = nr) + + return L + + +def 
readcause(df_icd10): + + # read cause and add a column 'life_events' + # with a list of first letter of non-recidiv causes + + cfile = gs.places.cause + print('reading file:', cfile) + df = pd.read_csv(cfile, dtype = str, sep='\t', encoding='latin1') + df = df[gs.places.cause_selection] + df = diagnose.add_cols_or_zero(df, df_icd10, 'LopNr', ['ICD10']) + df['life_events'] = 'null' + + lex8 = read_cause_icd8_lex() + lex9 = read_cause_icd9_lex() + + for c in gs.places.morsak: + f = lambda d:add_to_cause_list(d, c, lex8, lex9) # use cause lex here + df['life_events'] = df.apply(f, axis = 1) + cols = ['LopNr'] + gs.places.cause_added_cols + df = df[cols] + """ + this destroys it all + + + !! + for col in gs.places.cause_time_cols: + df[col] = df[col].apply(str2time) + """ + + return df + + +""" ---------------------------------------- + + surgery + +---------------------------------------""" + +def readsurgery(): + f = gs.places.surgery + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', + keep_default_na = False, + na_values = ['']) + + + df = df[ ~df['LopNr'].isnull() ] + # remove the non-number LopNr in the surgery file + df = df[ df['LopNr'].str.contains('\d', regex=True) ] + df = df[gs.places.surgery_selection] + df = df.drop_duplicates() + df.columns = gs.places.surgery_column_names + df['Datum'] = df['Datum'].apply(str2time) + return df + +def read_generated_surgery(): + df = read_generated_file('surgery') + df['LopNr'] = df['LopNr'].apply(str) + df['surgery'] = df['surgery'].apply(common.str2list) + return df + + +""" ---------------------------------------- + + stralung + +---------------------------------------""" + + +def readradiocodes(): # BehNr > Target_Kod + f = gs.places.radio + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = df[['LopNr', 'BehNr', 'Target_Kod']] + return df + +def readradioclasses(): # Target_kod > Kategorier + f = gs.places.radio_classes + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = common.rmcol(df, 'Target') + + # df = df[['Target_Kod', 'k1', 'k2', 'k3', 'k4']] + return df + + + + +""" ------------------------------- + + special missions + + reading the columns Target_I and Target_II + otherwise not used, from the treatement file + +------------------------------------""" + +def readtarget(): + f = gs.places.treatment + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.treatment_tagets] + return df + + + + +""" ------------------------------- + + evolving reads + +------------------------------------""" + + +# use bio.read_generated_file(gs.names.outcome_file) instead + +def readoutcome(): + col = 'first_incare' + name = gs.names.outcome_file + df = read_generated_file(name) + df[col] = df[col].apply(common.str2list) + return df + + +def read_final(name): + ei_col = 'event_time_inc' + ci_col = 'event_time_death' + df = read_generated_file(name) + df['LopNr'] = df['LopNr'].apply(str) + df[ei_col] = df[ei_col].apply(common.str2list) + df[ci_col] = df[ci_col].apply(common.str2list) + return df + +def readgt(): + name = gs.names.treat_file + return read_final(name) + + +def readtotal(): + list_cols = [ + 'event_time_death', + 'event_time_inc', + 'event_time_nic', + 'surgery', + 'drug_class', + 'radio' + ] + + name = gs.names.total_file + df = read_final(name) + f = lambda x: [] if x == 0 else x + for c in list_cols: + df[c] = df[c].apply(str).apply(common.str2list).apply(f) + return df + +""" 
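+
+# Editor's sketch: read_final() and readtotal() above revive list-valued
+# columns that to_csv() flattened into strings, via common.str2list
+# (ast.literal_eval). The round trip, on an invented value:
+import ast
+
+row = [['C', 123], ['J', 45]]
+flat = str(row)                    # what ends up in the TSV file
+back = ast.literal_eval(flat)      # what read_final() hands back
+assert back == row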
+def readtimediff(): + name = gs.names.timediff_file + return read_final(name) +""" + + +def readtimediff(): + # no type conversions but for LopNr + name = gs.names.timediff_file + df = read_generated_file(name) + df['LopNr'] = df['LopNr'].apply(str) + return df + diff --git a/datapreparation/bonsai_time.py b/datapreparation/bonsai_time.py new file mode 100644 index 0000000..3e259fa --- /dev/null +++ b/datapreparation/bonsai_time.py @@ -0,0 +1,316 @@ +#! /home/jan/anaconda3/bin/python + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2013-03-26 version + + bonsai/src_v02/bonsai_time.py + + support for managing of time data + and time analysis + + +------------------------------------""" + + + +from datetime import datetime as timeclass +from datetime import timedelta as delta +from dateutil import parser +import pandas as pd +import numpy as np + +import common +import global_settings as gs +import bonsai_io as bio + + +def head(xs, lex8, lex9, n = 1): + ys = [] + for x in xs: + if isinstance(x, str): + cx = common.lookup_icd10(x, lex8, lex9) + for c in cx: + ys += [c[:n]] + return ys + + +def split_and_head(ys, recidiv_icd10_class, lex8, lex9, n = 1): + + cs = [] + for y in ys.values: + if not common.isnull(y): + cs = np.append(cs, y.split()) + cs = list(cs) + ys = np.unique(head(cs, lex8, lex9, n = n)) + ys = list(set(ys) - set([recidiv_icd10_class])) + return ys + +# here we can instead return the difference + +def after(x, y, d = 0): + if common.notnull(x) and common.notnull(y): + return parser.parse(x) > parser.parse(y) + delta(days = d) + return False + +# here we can instead return the time and difference + +def times_after(sz, y, d = 0): + L = [] + for z in sz: + if after(z, y, d): + L += [z] + return L + + +def first_time(zs): + L = list(map(common.str2time, zs)) + if L == []: + return '0' + # t = min(list(map(common.str2time, zs))) + return common.time2str(min(L)) + + +def first_after(zs, y, d = 0): + return first_time(times_after(zs, y, d)) + + +def first_time_compression(df, xcol, ycol, zcol, fcol, lex8, lex9, gap = 1826, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # fcol = 'first_incare' + + data = [] + df = df.dropna() + if df.empty: + return df + + xs = df[xcol].drop_duplicates() + + print() + print('first_time_compression') + print() + i = 0 + + print('nr of LopNr:s', len(xs)) + for x in xs: + i += 1 + if (i % 100) == 0: + print(i) + + dx = df[ df[xcol] == x ] + diagnos_dat = dx['DiagnosDat'].values[0] + recidiv_icd10 = dx['ICD10'].values[0] + recidiv_icd10_class = recidiv_icd10[:n] + sz = dx[zcol].drop_duplicates() + yz_list = [] + + for z in sz: + dz = dx[ dx[zcol] == z ] + ys = dz[ycol].drop_duplicates() + + + if split: + ys = split_and_head(ys, recidiv_icd10_class, lex8, lex9, n = n) + + for y in ys: + yz_list += [[y, z]] + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + yminz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = dy[zcol].values + z = first_after(times, diagnos_dat, gap) + if z != '0': + yminz_list += [[y, z]] + data = data + [ [x] + [yminz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, fcol] + return dz + +def first_time_aho(df, base, xcol, ycol, zcol, fcol, lex8, lex9, gap = 1826, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # fcol = 'first_incare' + + data = [] + + xs = base[xcol] + + print() + print('first_time_compression aho') + print() 
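+    # Editor's sketch of split_and_head() above, on invented codes: DIAGNOS
+    # strings are split, translated to ICD10 where a lexicon entry exists,
+    # cut to their first n characters (n = 1 here), de-duplicated, and the
+    # recidiv class is removed:
+    #     ['C911 Z001', 'J20'] with recidiv class 'C'
+    #         -> heads {'C', 'Z', 'J'} minus {'C'} -> ['J', 'Z']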
+ i = 0 + + df.set_index(xcol,drop=False,inplace=True) + + print('nr of LopNr:s', len(xs)) + for x in xs: + i += 1 + if (i % 100) == 0: + print(i) + + if x not in df.index: + continue + + dx = df.loc[x] + if type(dx) != pd.DataFrame: + dx = pd.DataFrame([dx]) + diagnos_dat = base['DiagnosDat'][x] + recidiv_icd10 = base['ICD10'][x] + recidiv_icd10_class = recidiv_icd10[:n] if isinstance(recidiv_icd10, str) else "" + + yz_list = [] + for j in range(len(dx)): + yz_list += split_and_head_1(dx.iloc[j], ycol, zcol, recidiv_icd10_class, lex8, lex9, n = n) + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + yminz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = dy[zcol].values + z = first_after(times, diagnos_dat, gap) + if z != '0': + yminz_list += [[y, z]] + data = data + [ [x] + [yminz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, fcol] + return dz + + + +def all_times(df, xcol, ycol, zcol, acol, lex8, lex9, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # acol = 'all_incare' + + data = [] + df = df.dropna() + if df.empty: + return df + + xs = df[xcol].drop_duplicates() + + print() + print('all_time_compression') + print() + i = 0 + + for x in xs: + + i += 1 + if (i % 100) == 0: + print(i) + + dx = df[ df[xcol] == x ] + diagnos_dat = dx['DiagnosDat'].values[0] + recidiv_icd10 = dx['ICD10'].values[0] + recidiv_icd10_class = recidiv_icd10[:n] + sz = dx[zcol].drop_duplicates() + yz_list = [] + + for z in sz: + dz = dx[ dx[zcol] == z ] + ys = dz[ycol].drop_duplicates() + + if split: + ys = split_and_head(ys, recidiv_icd10_class, lex8, lex9, n = n) + + for y in ys: + yz_list += [[y, z]] + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + + yallz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = list(dy[zcol].values) + yallz_list += [[y, times]] + data = data + [ [x] + [yallz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, acol] + return dz + +def split_and_head_1(row, ycol, zcol, recidiv_icd10_class, lex8, lex9, n = 1): + if common.isnull(row[ycol]) or common.isnull(row[zcol]): + return [] + cs = row[ycol].split() + ys = np.unique(head(cs, lex8, lex9, n = n)) + ys = list(set(ys) - set([recidiv_icd10_class])) + return [[y, row[zcol]] for y in ys] + +def all_times_aho(df, base, xcol, ycol, zcol, acol, lex8, lex9, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # acol = 'all_incare' + + data = [] + + xs = base[xcol] + + print() + print('all_time_compression aho') + print() + i = 0 + + df.set_index(xcol,drop=False,inplace=True) + + for x in xs: + + i += 1 + if (i % 100) == 0: + print(i) + + if x not in df.index: + continue + + dx = df.loc[x] + if type(dx) != pd.DataFrame: + dx = pd.DataFrame([dx]) + diagnos_dat = base['DiagnosDat'][x] + recidiv_icd10 = base['ICD10'][x] + recidiv_icd10_class = recidiv_icd10[:n] if isinstance(recidiv_icd10, str) else "" + + yz_list = [] + for j in range(len(dx)): + yz_list += split_and_head_1(dx.iloc[j], ycol, zcol, recidiv_icd10_class, lex8, lex9, n = n) + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + + yallz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = list(dy[zcol].values) + yallz_list += [[y, times]] + data = data + [ [x] + [yallz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, 
acol] + return dz diff --git a/datapreparation/common.py b/datapreparation/common.py new file mode 100644 index 0000000..12d89f9 --- /dev/null +++ b/datapreparation/common.py @@ -0,0 +1,465 @@ +#! /usr/bin/env python3 + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/common.py + common support + +------------------------------------""" + +import pandas as pd +import numpy as np +from datetime import datetime as timeclass +from datetime import timedelta as delta +from dateutil import parser +import ast +import copy + +from scipy.stats import beta +# from scipy.stats import expon +# from scipy.stats import gamma + + +""" --------------------------- + + boolean functions + +------------------------- """ + + +def isstr(x): + return isinstance(x, str) + + +def isnull(a): + return not(isinstance(a, str)) + + +def notnull(x): + if isstr(x): + return (x != '0') + return False + + +def isarray(x): + return (isinstance(x, list) or isinstance(x, np.ndarray)) + +# what is an icd 10 code, we require that it starts +# with an uppercase letter followed by three numbers + + +def is_icd10(x): + if not isstr(x): + return False + if len(x) < 4: + return False + if not x[0].isupper(): + return False + return x[1].isdigit() and x[2].isdigit() and x[3].isdigit() + + + + +""" --------------------------- + + conversions + +------------------------- """ + + + +def str2number(x, dec = 5): + s = str(x) + if (',' in s): + x = float(s.replace(',','.')) + return round(float(x), dec) + + +def str2list(s): + return ast.literal_eval(s) + +def bool2str(x): + if isinstance(x, bool) or isinstance(x, int): + return str(int(x)) + return x + +def df_bool2str(df, cols): + for c in cols: + df[c] = df[c].apply(bool2str) + return df + + + + +""" --------------------------- + + lists + + [[['JF', 'PJ'], 41], [['JE'], 0]] to + [['JF', 41], ['PJ', 41], ['JE', 0]] + + +------------------------- """ + +def code_compress(u): + ru = [] + for x in u: + rx = [] + [cs, d] = x + for c in cs: + rx += [[c, d]] + ru += rx + return ru + + +""" --------------------------- + + pandas + +------------------------- """ + +def rmcol(df, c): + return df.drop([c], axis=1) + +def rmcols(df, cs): + for c in cs: + df = df.drop([c], axis=1) + return df + + +def change_col_names(df, old_name, new_name): + df.columns = [new_name if x==old_name else x for x in df.columns] + return df + + +def to_each_col_apply(df, cols, f): + for c in cols: + df[c] = df[c].apply(f) + + + +def compression(df, xcol, vcol): + data = [] + xs = df[xcol].drop_duplicates() + for x in xs: + dx = df[ df[xcol] == x ] + codes = [] + for line in dx[vcol].values: + if isinstance(line, list): + codes += line + codes = np.unique(codes) + data += [[x, codes]] + + d = pd.DataFrame(data) + d.columns = [xcol, vcol] + d[vcol] = d[vcol].apply(list) + return d + + + +def compression_with_dates(df, xcol, vcol, dcol): + data = [] + xs = df[xcol].drop_duplicates() + for x in xs: + + dx = df[ df[xcol] == x ] + L0 = len(dx) + dx = dx[dx[dcol] == dx[dcol]] # remove NaT values + L1 = len(dx) + if L0 != L1: + print(L0-L1, 'non-valid surgery dates for this LopNr') + print(dx) + print() + + codes = [] + dxcopy = copy.copy(dx) + dxcopy['Datum'] = dx['Datum'].apply(lambda x: x.strftime('%Y-%m-%d')) + dx = dxcopy + for values_and_date in dx[[vcol, dcol]].values: + values_and_date = [values_and_date[0]] + [values_and_date[1]] + + if isinstance(values_and_date, list): + codes += [values_and_date] + data += [[x, codes]] + + 
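+    # Editor's sketch of what compression() above computes, on invented
+    # rows, before compression_with_dates() adds the date handling:
+    #     df = pd.DataFrame({'LopNr': ['1', '1', '2'],
+    #                        'code':  [['C91'], ['Z00'], ['C92']]})
+    #     compression(df, 'LopNr', 'code')
+    #         -> LopNr '1' holds ['C91', 'Z00'], LopNr '2' holds ['C92']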
+    d = pd.DataFrame(data)
+    d.columns = [xcol, vcol]
+    d[vcol] = d[vcol].apply(list)
+    return d
+
+
+def compression_date_limit(df, xcol, vcol, dcol, date_dict):
+    data = []
+    xs = df[xcol].drop_duplicates()
+    for x in xs:
+        # elementwise mask over both conditions
+        dx = df[ (df[xcol] == x) & (df[dcol] <= date_dict[x]) ]
+        codes = []
+        for line in dx[vcol].values:
+            if isinstance(line, list):
+                codes += line
+        codes = np.unique(codes)
+        data += [[x, codes]]
+
+    d = pd.DataFrame(data)
+    d.columns = [xcol, vcol]
+    d[vcol] = d[vcol].apply(list)
+    return d
+
+
+""" ---------------------------
+
+    lexicon (same code as in merge.py)
+
+------------------------- """
+
+
+def look_up_entry(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return dfe[value_col].values[0]
+    return str(0)
+
+
+def look_up_value(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return dfe[value_col].values[0]
+    return np.nan
+
+
+def look_up_list_value(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return str2list( dfe[value_col].values[0] )
+    return str(0)
+
+
+def look_up_codes_1(x, lex, col):
+    keys = set(lex[col].values)
+    if x in keys:
+        return look_up_list_value(x, lex, col, 'icd10')
+    elif x[0].isalpha() and x[1:] in keys:
+        return look_up_list_value(x[1:], lex, col, 'icd10')
+    else:
+        return []
+
+def look_up_codes(x, lex, col):
+    keys = set(lex[col].values)
+    if x in keys:
+        return look_up_list_value(x, lex, col, 'icd10')
+    return []
+
+
+def look_up_list(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return dfe[value_col].values[0]
+    return []
+
+
+#def lookup_icd10(x, lex8, lex9):
+#    lex8.columns = ['code', 'ICD10']
+#    lex9.columns = ['code', 'ICD10']
+#    lex = pd.concat( [lex8, lex9])
+#    return look_up_codes(x, lex)
+
+def lookup_icd10(x, lex8, lex9):
+    res = look_up_codes(x, lex8, 'icd8') if lex8 is not False else []
+    if res:
+        return res
+    res = look_up_codes(x, lex9, 'icd9') if lex9 is not False else []
+    if res:
+        return res
+    if is_icd10(x) or is_icd10_class(x):
+        return [x]
+    else:
+        return []
+
+def add_list_cols(df1, df2, xc, ycs):
+
+    # print(df1[:5]) # zzz
+    # print(df2[:5]) # zzz
+
+    for yc in ycs:
+        df1[yc] = df1[xc].apply(lambda x:look_up_list(x, df2, xc, yc))
+    return df1
+
+
+def look_up_all_values(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if dfe.empty:
+        return []
+    return list(dfe[value_col].values)
+
+"""
+def look_up_all_values_and_dates(entry, df, entry_col, value_col, date_col):
+    dfe = df[df[entry_col] == entry]
+    return dfe[[value_col, date_col]]
+"""
+
+
+def look_up_all_values_date_limit(entry, df, entry_col, value_col, date_col, date_dict):
+    # elementwise mask over both conditions
+    dfe = df[ (df[entry_col] == entry) & (df[date_col] <= date_dict[entry]) ]
+    if dfe.empty:
+        return []
+    return list(dfe[value_col].values)
+
+
+def look_up_nonnan(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if dfe.empty:
+        return []
+    v = list( set(dfe[value_col].values) - set([np.nan]) )
+    return v
+
+
+""" ---------------------------
+
+    Time
+
+    t: timeclass
+    x: time as a string (str)
+
+    Here we use the string '0' as an undefined value
+
+------------------------- """
+
+def is_time(t):
+    # return isinstance(t, datetime.datetime)
+    return isinstance(t, timeclass)
+
+def str2time(x):
+    if notnull(x):
+        return parser.parse(x)
+    return x
+
+"""
+def full_year2time(x):
+    return str2time(x + '-06-15')
+"""
+
+def time2str(t):
+    if is_time(t):
+        return
t.strftime("%Y-%m-%d") + return t + + +def time_diff(t1, t0): + if is_time(t1) and is_time(t0): + return (t1-t0).total_seconds() + return np.nan + + +def time_diff_in_seconds(x, y): + if notnull(x) and notnull(y): + if x[-2:]=='00': + x = x[:-2] + "15" + if y[-2:]=='00': + y = y[:-2] + "15" + tx = parser.parse(x) + ty = parser.parse(y) + return (tx-ty).total_seconds() + return np.nan + +def time_diff_in_days(x, y): + s = time_diff_in_seconds(x, y) + if np.isnan(s) or (s == '0'): + return s + return round( s/ (60*60*24) ) + + +def time_diff_in_years(x, y): + s = time_diff_in_days(x, y) + if np.isnan(s) or (s == '0'): + return s + return round( s/ 365.24, 2 ) + +def time_min(x, y): + return time2str(min(parser.parse(x), parser.parse(y))) + +def time_max(x, y): + return time2str(max(parser.parse(x), parser.parse(y))) + +def add_days(x, nr_of_days): + # print(time2str(parser.parse(x))) + return time2str(parser.parse(x) + delta(days = nr_of_days)) + +def add_years(x, nr_of_years): + nr_of_days = 365.24 * nr_of_years + return add_days(x, nr_of_days) + +def add_years_to_full_year(x, nr_of_years): + x = x + '-07-02' + return add_years(x, nr_of_years) + + +""" --------------------------- + + two sample test + +------------------------- """ + + +def exp2sample_prob(nx, ny, sx, sy): + if sx == 0 or sy == 0 or nx == 0 or ny == 0: + return 0.5 + h = sy / (sx + sy) + return beta.cdf(h, ny, nx) + + +def exp2sample_logprob(nx, ny, sx, sy): + if sx == 0 or sy == 0 or nx == 0 or ny == 0: + return 0.5 + h = sy / (sx + sy) + return beta.logcdf(h, ny, nx) + + +def exp2sample_logsf(nx, ny, sx, sy): + if sx == 0 or sy == 0 or nx == 0 or ny == 0: + return 0.5 + h = sy / (sx + sy) + return beta.logsf(h, ny, nx) + + +""" --------------------------- + + fill in icd10 class codes + +------------------------- """ + + + + +def is_icd10_class(x): + if not isstr(x): + return False + if is_icd10(x): + return False + if len(x) < 3: + return False + if not x[0].isupper(): + return False + return x[1].isdigit() and x[2].isdigit() + +def icd10_fill(xs, s = '0*'): + if not isinstance(xs, str): + return xs + ys = '' + for x in xs.split(): + if is_icd10_class(x): + ys += ' ' + x[:3] + s + x[3:] + # print('\t', x, '\t', x[:3] + s + x[3:]) + else: + ys += ' ' + x + return ys[1:] diff --git a/datapreparation/diagnose.py b/datapreparation/diagnose.py new file mode 100644 index 0000000..1010f6e --- /dev/null +++ b/datapreparation/diagnose.py @@ -0,0 +1,472 @@ +#! 
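+
+# Editor's sketch: exp2sample_prob(nx, ny, sx, sy) above compares two
+# exponential rates through a single Beta cdf (if Gx ~ Gamma(a, 1) and
+# Gy ~ Gamma(b, 1), then Gy/(Gx+Gy) ~ Beta(b, a)). Editor's reading of the
+# value, assuming flat priors: the posterior probability that sample y has
+# the lower event rate. Invented numbers:
+from scipy.stats import beta
+
+nx, sx = 10, 50.0   # 10 events over 50 person-years, rate about 0.20
+ny, sy = 2, 40.0    #  2 events over 40 person-years, rate about 0.05
+print(beta.cdf(sy / (sx + sy), ny, nx))   # near 1: y is likely the slower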
/usr/bin/env python3 + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2013-03-26 version + + bonsai/src_v02/diagnose.py + processing the diagnosis data + + Notice: This file is not imported + using the name dia, since dia is + often used for a dataframe with + diagnosis data content + +------------------------------------""" + +import pandas as pd +import numpy as np +import copy + +import bonsai_io as bio +import common +import lexicon +import global_settings as gs +import merge + + + + + +""" ---------------------------------------- + + generate version 1 dia + +---------------------------------------""" + + + +def special(ICD10): + if common.isnull(ICD10): # not needed after filled in + return 0 + if 'G' in str(ICD10): + return 1 + if 'H' in str(ICD10): + return 1 + if 'K' in str(ICD10): + return 1 + if 'L' in str(ICD10): + return 1 + if 'O' in str(ICD10): + return 1 + if 'Q' in str(ICD10): + return 1 + if 'R' in str(ICD10): + return 1 + if 'T' in str(ICD10): + return 1 + if 'Z' in str(ICD10): + return 1 + if str(ICD10) == 'D469': + return 0 + if str(ICD10) == 'D761': + return 0 + if str(ICD10) == 'D459': + return 0 + if 'D' in str(ICD10): + return 1 + return 0 + + +def generate_dia(): + """ + constructing the file stored as generated_dia, + see places.py + """ + + xcols = ['ICD7', 'ICD9', 'text_C24_'] + + dia = bio.read_original_dia() + dia = dia.sort_values(by = xcols) + print('orig shape:', dia.shape) + + # (1) select the unique rows + dia = dia.drop_duplicates() + print('non duplicate shape:', dia.shape) + + # (2) select first diagnoses + dia = dia[dia['DiagnosNr_Diagnos'] == '1'] + dia = dia.drop(['DiagnosNr_Diagnos'], axis=1) + print('first dia shape:',dia.shape) + + # (3) fill in codes + dia = lexicon.fill_in_by_compiled_lex(dia) + dia = dia.drop(xcols, axis=1) + print('filled dia shape:',dia.shape) + + # (4) remove the special cases and the not needed columns + dia['special'] = dia['ICD10'].apply(special) + dia = dia[dia['special'] == 0] + dia = dia.drop(['special'], axis=1) + print('no special shape:',dia.shape) + + # (5) take care of numbers + if 'Diagnos_lder' in gs.places.diagnose_selection: + dia['Diagnos_lder'] = dia['Diagnos_lder'].apply(common.str2number) + + + return dia + + +def rm_dia_cols(dia): + cols = gs.places.diagnose_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add dia groups + +---------------------------------------""" + + + +def look_up_group_by_codes(groups, ICD10, SNOMED): + g = gs.names.input_data_group + row = groups[(groups['ICD10'] == ICD10) & (groups['SNOMED'] == SNOMED)] + if not row.empty: + return row[g].values[0] + return str(0) + +def look_up_group(df, groups): + g = look_up_group_by_codes(groups, df['ICD10'], df['SNOMED']) + return g + +def add_group(dia, groups): + g = gs.names.dia_group + dia[g] = dia.apply(lambda d: look_up_group(d, groups), axis = 1) + return dia + +def rm_group_col(dia): + g = gs.names.dia_group + dia = common.rmcol(dia, g) + return dia + + +""" ---------------------------------------- + + to one data frame df1 add columns + from another data frame df2 + + Note: in add_cols + df1[yc] = 0 when there is no entry x in df2 + df1[yc] = NaN when df2[x] = NaN + +---------------------------------------""" + +def look_up_entry(entry, df, entry_col, value_col): + dfe = df[df[entry_col] == entry] + if not dfe.empty: + return dfe[value_col].values[0] + return str(0) + +def add_cols(df1, df2, xc, ycs): + for yc in ycs: 
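+        # Editor's sketch of the lookup performed here, on invented rows:
+        #     df2 = pd.DataFrame({'LopNr': ['1'], 'K_n': ['pojke']})
+        #     look_up_entry('1', df2, 'LopNr', 'K_n')   -> 'pojke'
+        #     look_up_entry('9', df2, 'LopNr', 'K_n')   -> '0'  (no entry)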
+ df1[yc] = df1[xc].apply(lambda x:look_up_entry(x, df2, xc, yc)) + return df1 + + + +def look_up_or_zero(entry, df, entry_col, value_col, verb = False): + dfe = df[df[entry_col] == entry] + if not dfe.empty: + val = dfe[value_col].values[0] + if verb: + print('val =', val, type(val)) + if isinstance(val, str): + return val + return str(0) + +def add_cols_or_zero(df1, df2, xc, ycs, verb = False): + df1_copy = copy.copy(df1) + for yc in ycs: + df1_copy[yc] = df1[xc].apply(lambda x:look_up_or_zero(x, df2, xc, yc, verb)) + return df1_copy + +""" +def add_cols_or_zero(df1, df2, xc, ycs, verb = False): + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x:look_up_or_zero(x, df2, xc, yc, verb)) + return df1 +""" + +def look_up_aho(df, x, col, zero = False): + return df[col][x] if x in df.index and (not zero or isinstance(df[col][x], str)) else str(0) + +def add_cols_aho(df1, df2, xc, ycs, zero = False): + if zero: + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x: df2[yc][x] if x in df2.index and isinstance(df2[yc][x], str) else str(0)) + else: + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x: df2[yc][x] if x in df2.index else str(0)) + return df1 + +""" ---------------------------------------- + + add person data + +---------------------------------------""" + + +def add_pers(dia): + cols = gs.places.person_cols + copy_cols = copy.copy(cols) + copy_cols.remove('LopNr') + pers = bio.readperson() + dia = add_cols(dia, pers, 'LopNr', copy_cols) + return dia + +def add_pers_ohe(dia): + cols = gs.places.person_ohe + df = ones_x(dia, cols) + return df + +def rm_pers_cols(dia): + cols = gs.places.person_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add incare data (sluten vaard) + +---------------------------------------""" + +def add_incare(dia, nr = 0): + + xcol = 'LopNr' + ycol = gs.places.incare_ohe[0] # only one column is used so far + + inc = bio.readincare() + inc = inc.sort_values([xcol]).reset_index(drop=True) + if nr > 0: + inc = inc[0:nr] # an initial part of incare + L = list(dia[xcol].values) + inc = inc[ inc[xcol].isin(L) ] # part of inc with LopNr in dia + inc = name_compression(inc, xcol, ycol) # first letter set for each LopNr + dia = add_cols(dia, inc, xcol, [ycol]) # add compressed inc cols to dia + return dia + +# the following functions are just for test since the incare compression +# lists are merged with the nicare and causes lists before unfolding ohe + + +def add_incare_ohe(dia): + ycol = gs.places.incare_ohe[0] # only one column is used so far + dia = one_general(dia, ycol) # mk first letter one hot + return dia + +def rm_incare_cols(dia): + cols = gs.places.incare_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add nicare data (oppen vaard) + +---------------------------------------""" + + +def add_nicare(dia, nr = 0): + + xcol = 'LopNr' + ycol = gs.places.nicare_ohe[0] # only one column is used so far + + nic = bio.readnicare() + nic = nic.sort_values([xcol]).reset_index(drop=True) + if nr > 0: + nic = nic[0:nr] # an initial part of incare + L = list(dia[xcol].values) + nic = nic[ nic[xcol].isin(L) ] # part of nic with LopNr in dia + nic = name_compression(nic, xcol, ycol) # first letter set for each LopNr + dia = add_cols(dia, nic, xcol, [ycol]) # add compressed nic cols to dia + return dia + +# the following functions are just for test since the nicare compression +# lists are merged with the nicare and causes lists before unfolding ohe + +def 
add_nicare_ohe(dia): + ycol = gs.places.nicare_ohe[0] # only one column is used so far + dia = one_general(dia, ycol) # mk first letter one hot + return dia + +def rm_nicare_cols(dia): + cols = gs.places.nicare_ohe + dia = common.rmcols(dia, cols) + return dia + + +""" ---------------------------------------- + + add drug data + +---------------------------------------""" + + +def add_drug(dia): + cols = gs.places.drug_selection.copy() + cols.remove('LopNr') + drug = bio.readdrug() + dia = add_cols(dia, drug, 'LopNr', cols) + return dia + + +def add_drug_ohe(dia): + cols = gs.places.drug_ohe + df = ones_x(dia, cols) + return df + + +def rm_drug_cols(dia): + cols = gs.places.drug_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add one hot encodings for names column + with unique names (LopNr) and a single + code in each row in the codes column + +---------------------------------------""" + +""" + +def equal_str(a, b): + if not (isinstance(a, str) and isinstance(b, str)): + return 0 + return int(a == b) + +""" + + +def one(df, c): + for x in df[c].dropna().drop_duplicates(): + df[x] = (df[c] == x).astype(int) + return df + +""" +def one_x(df, c): + for x in df[c].drop_duplicates(): + if isinstance(x, str): + df[c + '_' + x] = df[c].apply(lambda z: equal_str(z, x)) + return df +""" + +def one_x(df, c): + for x in df[c].dropna().drop_duplicates(): + df[c + '_' + x] = (df[c] == x).astype(int) + return df + + + +def ones_x(df, cs): + for c in cs: + df = one_x(df, c) + return df + + +def to_int(x): + if isinstance(x, str): + return int(x) + if isinstance(x, int): + return x + return -1 + +def nr_sort(xs): + """ + sort a list of numbers on str type + """ + if not common.isarray(xs): + return [] + ixs = list(map(to_int, xs)) + ixs.sort() + xs = list(map(str, ixs)) + return xs + +def one_sorted(df, c): + xs = list(df[c].drop_duplicates()) + xs = nr_sort(xs) + for x in xs: + df[c + '_' + x] = (df[c] == x).astype(int) + return df + +def add_one_hot_groups(dia): + grp = gs.names.dia_group + dia = one_sorted(dia, grp) + return dia + + + +""" ---------------------------------------- + + add one hot encodings for names column with + non-unique names and possibly several space + separated codes in each row in the codes + column + +---------------------------------------""" + +def head(xs): + ys = [] + for x in xs: + if isinstance(x, str): + ys += [x[0]] + return ys + +def split_and_head(ys): + cs = [] + for y in ys.values: + if not common.isnull(y): + cs = np.append(cs, y.split()) + return np.unique(head(cs)) + + +def name_compression(df, xcol, ycol): + data = [] + xs = df[xcol].drop_duplicates() + for x in xs: + dx = df[ df[xcol] == x ] + ys = dx[ycol].drop_duplicates() + ys = split_and_head(ys) + data = data + [ [x] + [ys] ] + ds = pd.DataFrame(data) + ds.columns = [xcol, ycol] + return ds + + + +def unique_values(ys): + ys1 = [] + for y in ys.values: + ys1 = np.append(ys1, np.array(y)) + return np.unique(ys1) + +def in_col_list(d, col, y): + if isinstance(d[col], str): + return 0 + if common.isarray(d[col]): + return int(y in list(d[col])) + print('ERROR ? 
in_col_list: d[col] is neither a str or an array') + print('d[col] =', d[col], 'type =', type(d[col]),'\ty=', y) + return 0 + +def one_general(df, ycol): + # f = lambda d: ( int(y in d[ycol])) + ys = unique_values(df[ycol]) + for y in ys: + df[ycol + '_' +y] = df.apply(lambda d:in_col_list(d, ycol, y), axis = 1) + return df diff --git a/datapreparation/generate_bonsai.py b/datapreparation/generate_bonsai.py new file mode 100644 index 0000000..2bba939 --- /dev/null +++ b/datapreparation/generate_bonsai.py @@ -0,0 +1,95 @@ +import sys +import time +import pandas as pd +import numpy as np +from datetime import datetime as timeclass +from dateutil import parser +import copy + +import global_settings as gs +import bonsai_io as bio +import diagnose +import separate_tasks as sep +import total +import analyze +import lexicon +import common +import merge +import diagnose +import bonsai_time as btime + + +dia = diagnose.generate_dia() +dia = diagnose.add_pers(dia) +dia['DIA'] = 1 +dia.drop_duplicates('LopNr',keep='first',inplace=True) + +ctr = total.control_base() +b = gs.places.base_columns +c = ctr.columns +d = dia.columns +for col in list(set(b) - set(c)): + ctr[col] = np.nan + +for col in list(set(b) - set(d)): + dia[col] = np.nan + +base = pd.concat([dia[b], ctr[b]]) +base.set_index('LopNr',drop=False,inplace=True) + + +base = total.add_cause(base) + +inc = bio.readorigincare() +nic = bio.readorignicare() + +base = total.add_care(base, inc, 'first_incare', 'all_incare', n = 3, n2 = 0, n1 = 0) +base = total.add_care(base, nic, 'first_nicare', 'all_nicare', n = 3, n2 = 0, n1 = 0) +base = total.add_drugs(base, n=4) + + +base['DODSDAT'] = base['DODSDAT'].apply(str) +base = base[(base['VitalStatus'] != 'emigrated') | base['last_date'].notnull()] +base['K_n']['5251'] = 'pojke' + +base = total.add_no_event_time_death(base, 'no_event_time_death') + +for col in ['first_incare','all_incare','first_nicare','all_nicare','first_drug','all_drug','life_events']: + base[col] = base[col].apply(str) + +def fixdate00(t): + if t[-2:]=='00': + t = t[:-2] + "15" + return t + +base['DODSDAT'] = base['DODSDAT'].apply(fixdate00) + +base = total.add_time_differences(base, gs.incspan, 'event_time_inc', 'first_incare') +base = total.add_time_differences(base, gs.nicspan, 'event_time_nic', 'first_nicare') +base = total.add_time_differences(base, gs.drugspan, 'event_time_drug', 'first_drug') + +base = total.add_all_timedifferences(base, gs.incspan, 'all_event_time_inc', 'all_incare') +base = total.add_all_timedifferences(base, gs.nicspan, 'all_event_time_nic', 'all_nicare') +base = total.add_all_timedifferences(base, gs.drugspan, 'all_event_time_drug', 'all_drug') + +base = total.add_censored_times(base) +base = total.add_other_str(base) + +gs.places.surgery = gs.places.local_input + 'kodad_kirurgi_20200318.csv' +sur=bio.readsurgery() +sur = sur.sort_values('LopNr').reset_index(drop=True) +sur['surgery'] = sur.apply(total.surg_codes, axis = 1) +sur = common.compression_with_dates(sur, 'LopNr', 'surgery', 'Datum') +cols = list(sur.columns) +cols.remove('LopNr') +base = common.add_list_cols(base, sur, 'LopNr', cols) +base['surgery_diff'] = base.apply(total.surgery_diff, axis = 1) + +treat = bio.readtreatment() +base = total.add_cytostatica(base, treat) +base = total.add_stemcell(base, treat) +base = total.add_radio(base, treat) + +base = total.modify_total(base) + +bio.save_generated_file(base, "total_210522") diff --git a/datapreparation/global_settings.py b/datapreparation/global_settings.py new file mode 100644 index 
0000000..4c5188e --- /dev/null +++ b/datapreparation/global_settings.py @@ -0,0 +1,63 @@ +#! /home/jan/anaconda3/bin/python + + + +""" ------------------------------- + + Copyright (C) 2019 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/global_settings.py + +------------------------------------""" + + + +import places + + +class long_names: + input_data_group = 'group' + dia_group = 'Diagnostic_Group' + out_p1 = 'outcome_p1_2019_12_05' + out_p2 = 'outcome_p2_2019_10_16' + outcome_file = 'outcome_2019_12_05' + timediff_file = 'timediff_2019_12_09' + treat_file = 'treat_2019_05_17' + total_file = 'total_2020_05_06' + +class short_names: + input_data_group = 'group' + dia_group = 'group' + outcome_file = 'outcome' + timediff_file = 'timediff' + treat_file = 'treat' + total_file = 'total' + + +names = long_names + + + +outcome_timegap = 0 # 1826 # (nr of days in 5 years) +mean_diagnosis_age = 9.42 # (years, converted to days: d = y * 365.25) + + +diaspan = ['1970-02-15', '2015-12-29'] +incspan = ['1970-01-12', '2016-12-30'] # sluten vard, changed 2019-12-04 +lifespan = ['1952-01-02', '2017-12-15'] +nicspan = ['1997-01-02', '2016-12-31'] +drugspan = ['2005-06-28', '2017-12-31'] + +future = '5000-01-01' # any future date + + +""" +incare span: 1970-01-12 2016-12-30 +nicare span: 1997-01-02 2016-12-31 +dia span: 1970-02-15 2015-12-29 +oc span: 1961-12-02 2024-12-01 +life span: 1975-11-02 2017-12-15 +drug span: 2005-06-28 2017-12-31 +""" diff --git a/datapreparation/lexicon.py b/datapreparation/lexicon.py new file mode 100644 index 0000000..23bf478 --- /dev/null +++ b/datapreparation/lexicon.py @@ -0,0 +1,191 @@ +#! /home/jan/anaconda3/bin/python + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2013-03-26 version + + bonsai/src/lexicon.py + +------------------------------------""" + + + +import pandas as pd +import numpy as np +import bonsai_io as bio +import common +import copy + + +xcols = ['ICD7', 'ICD9', 'text_C24_'] +ycols = ['ICD10', 'SNOMED'] +lex_cols = xcols + ycols +tw_cols = ['tw_icd10', 'tw_snomed'] +dia_cols = ['LopNr'] + lex_cols + + + + +""" + + 1 Common lexicon support for translation from + ICD7, ICD9 to ICD10 and SNOMED + +""" + + +def lookup(x, lex): + for z in lex.values: + if (x[0] == z[0]) and (x[1] == z[1]) and common.isnull(x[2]) and common.isnull(z[2]): + return z[3:5] + if (x[0] == z[0]) and (x[1] == z[1]) and (x[2] == z[2]): + return z[3:5] + return [np.nan, np.nan] + + +def translate(df, lex): + x = df[xcols] # change dfrom df[1:4] + y = lookup(x, lex) + return y + + + +""" + + 2 Apply lexicon for translation from + ICD7, ICD9 to ICD10 and SNOMED + +""" + + +def apply_lex(df, lex, label): + icd10 = 'icd10'+ label + snomed = 'snomed'+ label + df[icd10] = df.apply(lambda d: translate(d, lex)[0], axis = 1) + df[snomed] = df.apply(lambda d: translate(d, lex)[1], axis = 1) + return df + + +def select_icd10(df, icd10): + if common.isnull(df['ICD10']): + return df[icd10] + return df['ICD10'] + +def select_snomed(df, snomed): + if common.isnull(df['SNOMED']): + return df[snomed] + return df['SNOMED'] + +""" + + 3 Apply saved lexicon + +""" + + + +def fill_in_by_compiled_lex(df): + lex = bio.read_code_lex() + label = 'fill' + icd10 = 'icd10'+ label + snomed = 'snomed'+ label + df = apply_lex(df, lex, label) + df['ICD10'] = df.apply(lambda d:select_icd10(d, icd10), axis = 1) + df['SNOMED'] = df.apply(lambda d:select_snomed(d, snomed), axis = 1) + df = df.drop([icd10, snomed], axis=1) + 
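+    # Editor's sketch of the fallback implemented above, invented values:
+    # select_icd10 keeps an existing code and only falls back on the
+    # lexicon translation when the original is missing:
+    #     ICD10    icd10fill         resulting ICD10
+    #     'C911'   'C910'       ->   'C911'  (original kept)
+    #     NaN      'C910'       ->   'C910'  (filled from the lexicon)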
return df + + + +""" + + 4 Prune + +""" + +no_code = ['finns ej', 'finns ej som icd8kod', 'oklar kod'] + +def is_icd10(x, verb = False): + if len(x) < 3: + if verb: print(x, 'has less than 3 chars') + return False + if not x[0].isupper(): + if verb: print('\'' + x[0] + '\'' + ' is not an upper case letter') + return False + if not x[1].isdigit(): + if verb: print('\'' + x[1] + '\'' + ' is not a digit') + return False + if not x[2].isdigit(): + if verb: print('\'' + x[2] +'\'' + ' is not a digit') + return False + if len(x) > 6: + if verb: print(x, 'has more than 6 chars') + return False + return True + + +def prune_icd10(c): + if not isinstance(c, str) or c in no_code: + return np.nan + code = c + if code == 'F': + code = 'F00' + if code == 'O': + code = 'O50' + if code == 'S': + code = 'S83' + code = code.replace('.', '') + code = code.replace(' ', '') + code = code[:3] + # if not is_icd10(c, verb = True): + if not is_icd10(c): + print('\t' + c + '\t replaced by \t' + code) + return code + +def prune_icd10_lex(df, col): + df = df[df.columns[:2]] + df.columns = [col, 'icd10'] + dfc = copy.copy(df) + dfc['icd10'] = df['icd10'].apply(prune_icd10) + dfc= dfc.sort_values(col).dropna().reset_index(drop=True) + return dfc + +def prune_icd10_lex_files(raw, lex, code): + df = pd.read_csv(raw, sep=',', dtype = str) + + print('File:' + lex + '\n\tspecial icd10 replacements') + print() + df = prune_icd10_lex(df, code) + print('saving file to:') + print('\t', lex) + df.to_csv(lex, index = False, sep = '\t') + + +def add_brackets(x): + x = str(x) + x = x.replace(' ', '') + x = x.replace(',', '\',\'') + s = '[\'' + x + '\']' + return s + + +def add_brackets_to_file(f1, f2, col): + df = pd.read_csv(f1, sep='\t', dtype = str) + df[col] = df[col].apply(add_brackets) + df.to_csv(f2, index = False, sep = '\t') + + +""" +def prune_icd10_lex_filesx(raw, lex, code): + df = pd.read_csv(raw, sep=',', dtype = str) + c1 = df.columns[1] + df[c1] = df[c1].apply(str) + print(df[0:3]) + df = df[df.columns[:2]] + print(df[0:3]) +""" + + diff --git a/datapreparation/merge.py b/datapreparation/merge.py new file mode 100644 index 0000000..7b5b969 --- /dev/null +++ b/datapreparation/merge.py @@ -0,0 +1,73 @@ +#! 
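+
+# Editor's sketch: prune_icd10() above normalises a raw lexicon entry to a
+# bare three-character ICD10 class -- apply the hard-coded single-letter
+# repairs, drop dots and blanks, truncate. Restated compactly (and without
+# the NaN/no_code guard) on invented inputs:
+def _prune(code):
+    fixes = {'F': 'F00', 'O': 'O50', 'S': 'S83'}
+    code = fixes.get(code, code)
+    return code.replace('.', '').replace(' ', '')[:3]
+
+print(_prune('C91.1'))   # 'C91'
+print(_prune('S'))       # 'S83'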
/usr/bin/env python3 + + +""" ------------------------------- + + merge.py + + Copyright (C) 2018 RISE + This code was produced by RISE + + First version created 2013-04-05 + based on code from other files + + bonsai/src_v02/merge.py + +------------------------------------""" + +import pandas as pd +import numpy as np + +import bonsai_io as bio +import common + +import global_settings as gs + + + + + +""" ---------------------------------------- + + to one data frame df1 add columns + from another data frame df2 + +---------------------------------------""" + +def look_up_entry(entry, df, entry_col, value_col): + dfe = df[df[entry_col] == entry] + if not dfe.empty: + return dfe[value_col].values[0] + return str(0) + +def add_cols(df1, df2, xc, ycs): + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x:look_up_entry(x, df2, xc, yc)) + return df1 + + +""" ---------------------------------------- + + add person data + +---------------------------------------""" + + +def add_pers(df): + cols = gs.places.person_selection.copy() + cols.remove('LopNr') + pers = bio.readperson() + df = add_cols(df, pers, 'LopNr', cols) + # cols = gs.places.person_ohe + return df + +def add_pers_ohe(df): + cols = gs.places.person_ohe + df = ones_x(df, cols) + return df + +def rm_pers_cols(df): + cols = gs.places.person_ohe + df = common.rmcols(df, cols) + return df + diff --git a/datapreparation/separate_tasks.py b/datapreparation/separate_tasks.py new file mode 100644 index 0000000..f8770ee --- /dev/null +++ b/datapreparation/separate_tasks.py @@ -0,0 +1,175 @@ +#! /usr/bin/env python3 + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + + bonsai/src_v02/separate_tasks.py + + +------------------------------------""" + +import numpy as np + +from datetime import datetime as timeclass +from dateutil import parser +import pandas as pd + +import bonsai_io as bio +import bonsai_time as btime +import common + + + +""" ------------------------------- + + finding zero dia group codes + +------------------------------------""" + +def zero_grp(): + base = total.base_set() + base = base[base[gs.names.dia_group] == '0'] + bio.save_tmp_file(base, 'base_grp_0') + + + + +""" ------------------------------- + + mean diagnosis age + +------------------------------------""" + +def mean_diagnosis_age(): + da = 'Diagnos_lder' + dia = bio.read_generated_dia() + return dia[da].mean() + + +""" ------------------------------- + + finding the data time spans + +------------------------------------""" + + +def spans(): + care_col = 'INDATUM' + dia_col = 'DiagnosDat' + cause_col = 'DODSDAT' + drug_col = 'EDATUM' + + inc = bio.readincare() + t_inc = inc[care_col].dropna() + t_inc = t_inc.sort_values() # .apply(parser.parse) + + nic = bio.readnicare() + t_nic = nic[care_col].dropna() + t_nic = t_nic.sort_values() # .apply(parser.parse) + + dia = bio.read_generated_dia() + t_dia = dia[dia_col].dropna() # .apply(parser.parse) + + oc = bio.readoutcome() + t_oc = oc[dia_col].dropna() # .apply(parser.parse) + + oc = oc[oc[cause_col] != 0] + t_life = oc[cause_col].dropna().apply(str).apply(parser.parse).apply(common.time2str) + # t_life = t_life[t_life[cause_col] != '0'] + + drug = bio.readdrug() + t_drug = drug[drug_col].dropna()# .apply(parser.parse).apply(common.time2str) + + + print() + print('incare span: ', t_inc.min(), '\t', t_inc.max()) + print('nicare span: ', t_nic.min(), '\t', t_nic.max()) + print('dia span: ', t_dia.min(), '\t', t_dia.max()) + print('oc span: ', t_oc.min(), '\t', 
t_oc.max()) + print('life span: ', t_life.min(), '\t', t_life.max()) + print('drug span: ', t_drug.min(), '\t', t_drug.max()) + print() + + #t = list(inc[care_col].values) + + +""" ------------------------------- + + generation of diagnosis codes lexicon for incare + +------------------------------------""" + +xdir = '/home/jan/bonsai/data/extracted/' +ks68_file = xdir + 'diagnoskoder_20190425_sv_70_86.csv' +icd9_file = xdir + 'diagnoskoder_20190425_sv_86_97.csv' + + +def heads_list(c): + nc = [] + for ci in c: + if common.isstr(ci): + nc += [ci[0]] + return nc + +def icd10_heads_list(df): + c1 = df['ICD10_1'] + c2 = df['ICD10_2'] + c = [c1] + [c2] + return heads_list(c) + +def gen_incare_lex(): + c1 = 'ICD10_1' + c2 = 'ICD10_2' + cols = ['code', c1, c2] + f1 = ks68_file + f2 = icd9_file + df1 = pd.read_csv(f1, dtype = str, sep='\t') + df2 = pd.read_csv(f2, dtype = str, sep='\t') + df2 = df2[ ~df2[c1].isnull() | ~df2[c2].isnull() ] + df1.columns = cols + df2.columns = cols + + df1['ICD10'] = df1.apply(icd10_heads_list, axis = 1) + df2['ICD10'] = df2.apply(icd10_heads_list, axis = 1) + df = pd.concat([df1, df2]) + df = common.rmcols(df, [c1, c2]) + + bio.save_local_input(df, 'incare_lex') + + + +""" ------------------------------- + + generation of diagnosis codes lexicon for causes + +------------------------------------""" + + +xdir = '/home/jan/bonsai/data/extracted/' +cause_file = xdir + 'diagnoskoder_20190425_orsak.csv' + + +def gen_cause_lex(): + + c1 = 'ICD10_1' + c2 = 'ICD10_2' + cols = ['code', c1, c2] + + f = cause_file + df = pd.read_csv(f, dtype = str, sep=',') + df = df[ ~df[c1].isnull() | ~df[c2].isnull() ] + df.columns = cols + df['ICD10'] = df.apply(icd10_heads_list, axis = 1) + df = common.rmcols(df, [c1, c2]) + + df8 = df[ df['code'].str.contains('#')] + df9 = df[ ~df['code'].str.contains('#')] + + df8.is_copy = False + df8['code'] = df8['code'].str[1:] + + bio.save_local_input(df8, 'cause_icd8_lex') + bio.save_local_input(df9, 'cause_icd9_lex') diff --git a/datapreparation/total.py b/datapreparation/total.py new file mode 100755 index 0000000..62d20ec --- /dev/null +++ b/datapreparation/total.py @@ -0,0 +1,1304 @@ +#! 
/home/jan/anaconda3/bin/python + + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/total.py + + +------------------------------------""" + +# global imports + +import pandas as pd +import numpy as np +from datetime import datetime as timeclass +from dateutil import parser +import copy + +# local imports + +import global_settings as gs +import bonsai_io as bio +import common +import merge +import diagnose +import bonsai_time as btime + + +# import time +import analyze + + +""" ---------------------------------------- + + 1 dia + +---------------------------------------""" + +def dia_base(): + dia = bio.read_generated_dia() + grp = bio.read_diagroups() + dia = diagnose.add_group(dia, grp) + # dia = common.rmcols(dia, ['ICD10', 'SNOMED']) # real + dia = common.rmcols(dia, ['SNOMED']) # real + dia = diagnose.add_pers(dia) + dia['DIA'] = 1 + return dia + + +""" ---------------------------------------- + + 2 control group and base set + +---------------------------------------""" + + + +def control_base(): + ctr = bio.readcontrol() + ctr['DIA'] = 0 + m = gs.mean_diagnosis_age + f = lambda x: common.add_years_to_full_year(x, m) + ctr['DiagnosDat'] = ctr['FoddAr'].apply(f) + return ctr + + +def base_set(): + dia = dia_base() + ctr = control_base() + + b = gs.places.base_columns # real + c = ctr.columns + d = dia.columns + + """ + print('base 1') + for z in c: print ('\t', z) + print() + print('base 2') + for z in d: print ('\t', z) + print() + print('base 3') + for z in b: print ('\t', z) + print() + """ + + + for col in list(set(b) - set(c)): + ctr[col] = np.nan + for col in list(set(b) - set(d)): + dia[col] = np.nan + + + + + ctr = ctr[b] + dia = dia[b] + + return pd.concat([dia, ctr]) + + + +""" ---------------------------------------- + + 3 cause + +---------------------------------------""" + +def add_cause(df): # 0 for empty list is ok! 
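+    # Editor's sketch of the synthetic control-group DiagnosDat built in
+    # control_base() above: birth year anchored at mid-year plus the mean
+    # diagnosis age of 9.42 years (gs.mean_diagnosis_age), e.g.
+    #     common.add_years_to_full_year('1990', 9.42)
+    #     -> parse('1990-07-02') + 9.42 * 365.24 days -> '1999-12-02'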
+ cols = gs.places.cause_added_cols # 'ICD', 'life_events', 'DODSDAT' + df_icd10 = df[['LopNr', 'ICD10']] # The LopNr and the recidive code + cause = bio.readcause(df_icd10) # don't read the recidive class causes + df = diagnose.add_cols(df, cause, 'LopNr', cols) + + c0 = 'DiagnosDat' + c1 = 'DODSDAT' + # df['Life(s)'] = df.apply(lambda d: date_diff_in_seconds(d, c1, c0), axis = 1) + # df['Life(days)'] = df['Life(s)'] / (60*60*24) + df['life'] = df.apply(lambda d: date_diff_in_days(d, c1, c0), axis = 1) + + return df + + +""" ---------------------------------------- + + 4 care + +---------------------------------------""" + + +def first_list(df1, df2, fcol, lex8, lex9, xcol = 'LopNr', ycol = 'DIAGNOS', zcol = 'INDATUM', split = True, n = 1,): + df2 = btime.first_time_aho(df2, df1, xcol, ycol, zcol, fcol, lex8, lex9, split = split, n = n) + return df2 + + +def all_list(df1, df2, acol, lex8, lex9, xcol = 'LopNr', ycol = 'DIAGNOS', zcol = 'INDATUM', split = True, n = 1,): + df2 = btime.all_times_aho(df2, df1, xcol, ycol, zcol, acol, lex8, lex9, split = split, n = n) + return df2 + + +def add_care(df, care, fcol, acol, n = 1, n2 = 0, n1 = 0): + xcol = 'LopNr' + + if n2 > 0: + care = care[n1:n2] # an initial part of incare + + lex8 = bio.read_care_icd8_lex() + lex9 = bio.read_care_icd9_lex() + + + #print('add ICD10 & DiagnosDat') + #care = diagnose.add_cols_aho(care, df, xcol, ['ICD10','DiagnosDat'], True) + + L = list(df[xcol].values) + care = care[ care[xcol].isin(L) ] # part of care with LopNr in df + + print(' creating first_list') + f = first_list(df, care, fcol, lex8, lex9, n = n) + + print(' adding first_list') + df = common.add_list_cols(df, f, xcol, [fcol]) + + print(' creating all_list') + a = all_list(df, care, acol, lex8, lex9, n = n) + + print(' adding all_list') + df = common.add_list_cols(df, a, xcol, [acol]) + return df + +def add_incare(df, n = 1, n2 = 0, n1 = 0): + fcol = 'first_incare' + acol = 'all_incare' + inc = bio.readincare() + df = add_care(df, inc, fcol, acol, n = n, n2 = n2, n1 = n1) + return df + + +def add_nicare(df, n = 1, n2 = 0, n1 = 0): + fcol = 'first_nicare' + acol = 'all_nicare' + nic = bio.readnicare() + df = add_care(df, nic, fcol, acol, n = n, n2 = n2, n1 = n1) + return df + +""" ---------------------------------------- + + 5 drugs + +---------------------------------------""" + + +def add_drugs(df, n = 1, n2 = 0, n1 = 0): + xcol = 'LopNr' + fcol = 'first_drug' + acol = 'all_drug' + ycol = 'ATC' + zcol = 'EDATUM' + drug = bio.readdrug(n) # read n characters of the ATC code !!! 
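+    # Descriptive note (added): further down, lex8/lex9 are passed as False
+    # because ATC drug codes need no ICD8/ICD9 translation, and n1:n2
+    # (when n2 > 0) restricts the run to a slice of the drug register,
+    # which is useful for quick test runs.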
+
+    print('drug.shape', drug.shape)
+    if n2 > 0:
+        drug = drug[n1:n2]   # an initial part of drugs
+
+    #ilex = []
+    #if n == 1:
+    #    ilex = bio.read_incare_lex()
+
+
+    # Instead of letting both first_list and all_list add the columns
+    # 'ICD10' and 'DiagnosDat', it should be done here, with
+    # df = df1 and drug = df2
+
+    L = list(df[xcol].values)
+    drug = drug[ drug[xcol].isin(L) ]   # part of drug with LopNr in df
+
+    #print('add ICD10 & DiagnosDat')
+    #drug = diagnose.add_cols_aho(drug, df, xcol, ['ICD10','DiagnosDat'], True)
+
+    print(' creating first_list')
+    f = first_list(df, drug, fcol, False, False, xcol=xcol ,ycol=ycol, zcol=zcol, n = n)
+
+    print(' creating all_list')
+    a = all_list(df, drug, acol, False, False, xcol=xcol , ycol=ycol, zcol=zcol, n = n)
+
+    print(' adding first_list')
+    df = common.add_list_cols(df, f, xcol, [fcol])
+
+    print(' adding all_list')
+    df = common.add_list_cols(df, a, xcol, [acol])
+
+    # drug.columns = ['LopNr', 'DIAGNOS', 'INDATUM']
+    # drug = first_list(df, drug, fcol, split = False)
+    # drug = first_list(df, drug, fcol, xcol=xcol , ycol=ycol, zcol=zcol)
+    # df = common.add_list_cols(df, drug, xcol, [fcol])
+
+    return df
+
+
+""" ----------------------------------------
+
+    care and cause merged
+
+    not used in the present version
+
+---------------------------------------"""
+
+
+
+def concat(d, cs):
+    L = []
+    for c in cs:
+        x = d[c]
+        if common.isarray(x):
+            L += list(x)
+    return list(np.unique(L))
+
+
+def merge_care_and_cause(df, diacol):
+    nr = 0
+    sv = 'DIA_SV'
+    ov = 'DIA_OV'
+    cc = 'life_events'
+    cols = [sv, ov, cc]
+    df = diagnose.add_incare(df, nr)
+    df.columns = [sv if x=='DIAGNOS' else x for x in df.columns]
+    df = diagnose.add_nicare(df, nr)
+    df.columns = [ov if x=='DIAGNOS' else x for x in df.columns]
+    df = add_cause(df)
+    # df[diacol] = df.apply(lambda d:concat(d, cols), axis = 1)
+    # df = common.rmcols(df, cols)
+    # print(df.columns)
+    return df
+
+""" ----------------------------------------
+
+    6 outcome
+
+    The outcome relates the input data to the output data.
+    The output is given by care, cause and drug.
+    outcome_01 gives us the time of cause, if it occurred, and the
+    first time of institutional care for each class of cause and
+    cause diagnosis, as given by the first letter of the ICD10 code.
+
+
+    obs!!! the times produced by listdiff_*** are obsolete
+    and not used, so the assignments of base['Inc(s)'] and
+    base['Inc(days)'] in outcome_01 may be removed.
+    They are replaced by the values given to df[ecol] and
+    df[ne] in add_time_differences.
+
+---------------------------------------"""
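+
+# A hedged sketch (values invented, date format as in the registers) of the
+# event-list encoding assumed below: a 'first_*' column holds a stringified
+# list of [icd10_class, event_date] pairs, e.g.
+#
+#   f_str = "[['C', '20010315'], ['J', '20030102']]"
+#   common.str2list(f_str)   # -> [['C', '20010315'], ['J', '20030102']]
+#
+# '[]' and '0' both mark persons without any registered event.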
+
+def date_diff_in_seconds(df, c1, c0):
+    return common.time_diff_in_seconds(df[c1], df[c0])
+
+def date_diff_in_days(df, c1, c0):
+    return common.time_diff_in_days(df[c1], df[c0])
+
+def date_diff_in_years(df, c1, c0):
+    return common.time_diff_in_years(df[c1], df[c0])
+
+def list_diff_in_seconds(df, c1, c0):
+    inc_list = df[c1]
+    if not isinstance(inc_list, list):
+        return inc_list
+    L = []
+    dtime = df[c0]
+    for z in inc_list:
+        if not isinstance(z, list):
+            print (z)
+        else:
+            diff = common.time_diff_in_seconds(z[1], dtime)
+            L = L + [[z[0], diff]]
+    return L
+
+def list_diff_in_days(df, c1, c0):
+    inc_list = df[c1]
+    if not isinstance(inc_list, list):
+        return inc_list
+    L = []
+    dtime = df[c0]
+    for z in inc_list:
+        if not isinstance(z, list):
+            print (z)
+        else:
+            diff = common.time_diff_in_days(z[1], dtime)
+            L = L + [[z[0], diff]]
+    return L
+
+
+
+def outcome(n = 1, save = False, n2 = 0, n1 = 0):
+    # n is the number of considered characters of the icd10 code
+
+    name = gs.names.outcome_file
+    base = base_set()
+    base = add_cause(base)                             # recidiv not added
+    print('\n add incare\n')
+    base = add_incare(base, n = n, n2 = n2, n1 = n1)   # recidiv not added
+    print('\n add nicare\n')
+    base = add_nicare(base, n = n, n2 = n2, n1 = n1)   # recidiv not added
+    base = add_drugs(base, n = n, n2 = n2, n1 = n1)    # just a single letter for each order
+
+    if save:
+        bio.save_generated_file(base, name)
+
+    return base
+
+# Instead of running all of outcome at once, we may divide it into parts,
+# storing and reading each partial computation.
+
+
+def out_p1(n = 3, n2 = 0, n1 = 0):
+
+    out_p1 = gs.names.out_p1
+    base = base_set()
+    base = add_incare(base, n = n, n2 = n2, n1 = n1)
+    base = add_nicare(base, n = n, n2 = n2, n1 = n1)
+    bio.save_generated_file(base, out_p1)
+
+
+# The drugs are not used at present, so we may skip them.
+
+def out_no_drugs(n = 3, n2 = 0, n1 = 0):
+    out_p1 = gs.names.out_p1
+    out_file = gs.names.outcome_file
+    df = bio.read_generated_file(out_p1)
+    df['LopNr'] = df['LopNr'].apply(str)
+    df = add_cause(df)
+    bio.save_generated_file(df, out_file)
+
+# With the drugs we have three parts.
+
+def out_p2(n = 3, n2 = 0, n1 = 0):
+
+    out_p1 = gs.names.out_p1
+    out_p2 = gs.names.out_p2
+    df = bio.read_generated_file(out_p1)
+    df['LopNr'] = df['LopNr'].apply(str)
+    df = add_drugs(df, n = n, n2 = n2, n1 = n1)
+    bio.save_generated_file(df, out_p2)
+
+def out_p3(n = 3, n2 = 0, n1 = 0):
+    out_p2 = gs.names.out_p2
+    out_file = gs.names.outcome_file
+    df = bio.read_generated_file(out_p2)
+    df['LopNr'] = df['LopNr'].apply(str)
+    df = add_cause(df)
+    bio.save_generated_file(df, out_file)
+
+
+""" ----------------------------------------
+
+    7 time differences
+
+---------------------------------------"""
+
+
+
+def time_differences(df, span, fcol):
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+
+    [r1, r2] = span
+    p1 = common.add_days(dia, gap)
+    p2 = df['DODSDAT']
+    p3 = df['last_date']
+    f_str = df[fcol]
+    emigrated = df['VitalStatus'] == 'emigrated'
+
+    start = common.time_max(r1, p1)
+    ne_end = r2
+    if p2 != '0':
+        ne_end = common.time_min(p2, r2)
+    elif emigrated and common.isstr(p3):   # zzz
+        ne_end = common.time_min(p3, r2)
+
+
+    no_event_time = common.time_diff_in_days(ne_end, start)
+    no_event_time = max(0, no_event_time)
+    event_times = []
+
+    if f_str == '[]' or f_str == '0':
+        return [event_times, no_event_time]
+
+    f_list = common.str2list(f_str)
+
+    for event in f_list:
+        [icd10_class, event_date] = event
+        class_time = common.time_diff_in_days(event_date, start)   # cannot be negative !
+        if class_time < 0: print('event_time < 0')
+        event_times += [[icd10_class, class_time]]
+
+    return [event_times, no_event_time]
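+
+# Example (numbers invented): time_differences returns a pair such as
+#
+#   [[['C', 412], ['J', 95]], 1680]
+#
+# i.e. the events as [icd10_class, days from start] and, second, the
+# no-event time in days from start to death, emigration or span end.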
+
+
+def no_event_time_death(df):
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+    [r1, r2] = gs.lifespan   # register start and end
+    p1 = common.add_days(dia, gap)
+    p2 = df['DODSDAT']       # DODSDAT must already have been converted to a str here
+    p3 = df['last_date']
+    emigrated = df['VitalStatus'] == 'emigrated'
+
+    if not common.isstr(p2):
+        print('DODSDAT needs to be converted to a str')
+
+    start = common.time_max(r1, p1)   # with a complete register it's p1
+    end = r2
+    if p2 != '0':
+        end = p2
+
+    elif emigrated and common.isstr(p3):   # zzz
+        end = common.time_min(p3, r2)
+    time = common.time_diff_in_days(end, start)
+    return max(0, time)
+
+
+
+def add_no_event_time_death(df, scol):   # scol = 'no_event_time_death'
+    df[scol] = df.apply(no_event_time_death, axis = 1)
+    return df
+
+def add_time_differences(df, span, ecol, fcol):   # zzz
+    ne = 'no_' + ecol
+    df['tmp'] = df.apply(lambda d: time_differences(d, span, fcol), axis = 1)
+    df[ecol] = df['tmp'].apply(lambda x: x[0])
+    df[ne] = df['tmp'].apply(lambda x: x[1])
+    df = df.drop(['tmp'], axis=1)
+    return df
+
+def all_timedifferences(df, span, acol):
+
+    if isinstance(df[acol], str):
+        a = common.str2list(df[acol])
+    else:
+        a = []
+        print("Bad string: ", df[acol])
+
+    if a == []:
+        return []
+
+    dia = df['DiagnosDat']
+    gap = gs.outcome_timegap
+    gap_date = common.add_days(dia, gap)
+    [start, end] = span
+    z = common.time_max(gap_date, start)
+    data = []
+
+    for x in a:
+        # y = x[0]
+        dates = x[1]
+        dates.sort()
+        diffs = []
+
+        for date in dates:
+            diff = common.time_diff_in_days(date, z)
+            diffs += [diff]
+        data += [[x[0], diffs]]
+
+    return data
+
+
+
+def add_all_timedifferences(df, span, dcol, acol):
+    df[dcol] = df.apply(lambda d: all_timedifferences(d, span, acol), axis = 1)
+    return df
+
+
+def censored_time(df, spanstart):
+    """
+    censored time = max(0, min(r1, p2, p3) - p1)
+    """
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+
+    r1 = spanstart
+    p1 = common.add_days(dia, gap)
+    p2 = df['DODSDAT']
+    p3 = df['last_date']
+    emigrated = df['VitalStatus'] == 'emigrated'
+
+    # start = p1
+    end = r1
+    if p2 != '0':
+        end = common.time_min(p2, r1)
+
+    elif emigrated and common.isstr(p3):   # zzz
+        end = common.time_min(p3, r1)
+
+    time = common.time_diff_in_days(end, p1)
+
+    return max(0, time)
+
+
+
+
+def add_censored_times(df):
+    [incstart, incend] = gs.incspan
+    [nicstart, nicend] = gs.nicspan
+    [drugstart, drugend] = gs.drugspan
+    df['censored_time_inc'] = df.apply(lambda d: censored_time(d, incstart), axis = 1)
+    df['censored_time_nic'] = df.apply(lambda d: censored_time(d, nicstart), axis = 1)
+    df['censored_time_drug'] = df.apply(lambda d: censored_time(d, drugstart), axis = 1)
+    df['censored_time_death'] = 0
+    return df
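+
+# Worked example (dates invented, format as in the registers) of the
+# censored_time formula max(0, min(r1, p2, p3) - p1): with span start
+# r1 = '20051231', p2 = '0' (still alive), p3 irrelevant for a
+# non-emigrant, and p1 = DiagnosDat + gap = '20010510', the censored
+# time is the number of days from '20010510' to '20051231'.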
+
+
+def timediff():
+    name = gs.names.outcome_file
+
+    ispan = gs.incspan
+    nspan = gs.nicspan
+    dspan = gs.drugspan
+
+    ficol = 'first_incare'
+    fncol = 'first_nicare'
+    fdcol = 'first_drug'
+
+    eicol = 'event_time_inc'
+    encol = 'event_time_nic'
+    edcol = 'event_time_drug'
+
+    aicol = 'all_incare'
+    ancol = 'all_nicare'
+    adcol = 'all_drug'
+
+    dicol = 'diff_incare'
+    dncol = 'diff_nicare'
+    ddcol = 'diff_drug'
+
+    scol = 'no_event_time_death'
+    df = bio.read_generated_file(name)
+
+    # 'DODSDAT' is read as an int
+    df['DODSDAT'] = df['DODSDAT'].apply(str)
+
+    # The three emigrated persons without a last date are removed
+    df = df[(df['VitalStatus'] != 'emigrated') | df['last_date'].notnull()]
+
+    df = add_no_event_time_death(df, scol)
+    df = add_time_differences(df, ispan, eicol, ficol)
+    df = add_time_differences(df, nspan, encol, fncol)
+
+    df = add_censored_times(df)
+    # df = add_time_differences(df, dspan, edcol, fdcol)
+
+    # df = add_all_timedifferences(df, ispan, dicol, aicol)
+    # df = add_all_timedifferences(df, nspan, dncol, ancol)
+    # df = add_all_timedifferences(df, dspan, ddcol, adcol)
+
+
+    #df = df[gs.places.out_selection_01]
+    df = add_other_str(df)
+    bio.save_generated_file(df, gs.names.timediff_file)
+
+
+
+""" ----------------------------------------
+
+    8 counts and frequencies
+    (replaced by 9)
+
+---------------------------------------"""
+
+
+def capitals():
+    c = []
+    for x in range(65, 91):
+        c += [chr(x)]
+    return c
+
+def event_types(events):
+    t = []
+    for event in events:
+        t = t + [event[0]]
+    return set(t)
+
+def event_data(df, nr = 0):
+
+    ev_col = 'event_time_inc'
+    nev_col = 'no_event_time_inc'
+    caps = set(capitals())
+    ev = {}
+    nev = {}
+    for c in caps:
+        ev[c] = []
+        nev[c] = []
+    data = df[[ev_col, nev_col]].values
+    if nr > 0:
+        data = data[:nr]
+    for line in data:
+        events = line[0]
+        nev_time = line[1]
+        nev_types = caps - event_types(events)
+        for c in nev_types:
+            nev[c] += [nev_time]
+        for event in events:
+            [c, ev_time] = event
+            if c.isupper():
+                ev[c] += [ev_time]
+    return ev, nev
+
+
+def extract(d):
+    xd = {}
+    for k in d:
+        t = d[k]
+        xd[k] = {}
+        xd[k]['nr'] = len(t)
+        xd[k]['days'] = sum(t)
+    return xd
+
+def xdata(df, nr = 0):
+    ev, nev = event_data(df, nr = nr)
+    xev = extract(ev)
+    xnev = extract(nev)
+
+    total = {}
+    for c in capitals():
+        total[c] = {}
+        nr = xev[c]['nr']
+        days = xev[c]['days'] + xnev[c]['days']
+
+        total[c]['nr'] = nr
+        total[c]['days'] = days
+        # total[c]['freq'] = round((nr / days) * int(1e6), 2)
+
+    return total
+
+
+def print_xd(xd):
+    # print('type\t nr\t 1000 years\t nr / (1000 years)')
+    print('type\t nr\t nr / (1000 years)')
+    print()
+    for c in capitals():
+        nr = xd[c]['nr']
+        y = xd[c]['days'] / (365.24 * 1000)
+        if y > 1e-10:
+            f = nr / y
+        else:
+            f = -1
+        if nr > 0:
+            print(c + '\t', nr, '\t', round(y, 1), ' \t', round(f,3))
+
+
+
+
+""" ----------------------------------------
+
+    9 frequencies and probabilities
+
+    compare the diagnosis group and the control group by the
+    probability that the institutional care frequency is higher
+    for the first category
+
+---------------------------------------"""
+
+
+def compute_prob(xd, xc):
+
+    sf_lim = np.log(1e-7)
+    data = []
+    for c in capitals():
+        nd = xd[c]['nr']
+        nc = xc[c]['nr']
+        sd = xd[c]['days']
+        sc = xc[c]['days']
+        if nd > 0 and nc > 0:
+            fd = round(1e6 * nd/sd, 3)
+            fc = round(1e6 * nc/sc, 3)
+            sf = common.exp2sample_logsf(nd, nc, sd, sc)
+            H1 = int(sf < sf_lim)
+            nsf = round(-sf, 2)
+            data = data + [ [c, nd, nc, sd, sc, fd, fc, nsf, H1] ]
+
+    df = pd.DataFrame(data)
+    df.columns = [
+        'group',
+        'nr_dia',
+        'nr_ctr',
+        'days_dia',
+        'days_ctr',
+        'freq_dia',
+        'freq_ctr',
+        '-log(1-p)',
+        '1-p < 1e-7',
+    ]
+    return df
+
+
+selected_cols = [
+    'group',
+    'nr_dia',
+    'nr_ctr',
+    'freq_dia',
+    'freq_ctr',
+    '-log(1-p)',
+    '1-p < 1e-7',
+    ]
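+
+# Hedged usage sketch: the freq_* values are events per million person-days
+# at risk, and '-log(1-p)' comes from common.exp2sample_logsf, whose name
+# suggests a log survival value for a two-sample comparison of rates.
+#
+#   df = compute_prob(xdata(dia), xdata(ctr))
+#   print(df[selected_cols])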
+
+
+def compare_inc_group(group = 0):
+    group_col = 'Diagnostic_Group'
+    td = bio.readtimediff()
+
+    # td = td[td['oth'] == 0]   # remove all other diseases
+    dia = td[td['DIA'] == 1]
+    if group > 0:
+        dia = dia[dia[group_col] == group]
+
+    ctr = td[td['DIA'] == 0]
+    xd = xdata(dia)
+    xc = xdata(ctr)
+    df = compute_prob(xd, xc)   # add a measure of strength (effect size)?
+    # df = df[selected_cols]
+    df['M days_dia'] = df['days_dia']/1e6
+    df['M days_ctr'] = df['days_ctr']/1e6
+    df = common.rmcols(df, ['days_dia', 'days_ctr'])
+    return df
+
+
+
+def compare_inc():
+    df = compare_inc_group()
+    bio.save_local_result(df, 'sluten_vard_5y_alla')
+    for group in range(1, 13):
+        df = compare_inc_group(group)
+        bio.save_local_result(df, 'sluten_vard_5y_grupp_' + str(group))
+
+
+
+
+""" ----------------------------------------
+
+    10 groups of other diseases
+
+---------------------------------------"""
+
+
+
+def selection(df, Take, Leave = []):
+    annat = 'AnnatSpecDiagnos'
+    a = df[annat]
+    if not common.isstr(a):
+        return np.nan
+    for x in Leave:
+        if (x in a):
+            return False
+    for x in Take:
+        if (x in a):
+            return True
+    return False
+
+
+
+def add_other(df):
+
+    bw = lambda d: selection(d, Take = ['Beck'])
+    ep = lambda d: selection(d, Take = ['pilep'])
+    men = lambda d: selection(d, Take = ['Sip', 'MEN'])
+    men2 = lambda d: selection(d, Take = ['Sip', 'MEN'], Leave = ['1'])
+    nf = lambda d: selection(d, Take = ['eckling', 'NF', 'euro'])
+    nf1 = lambda d: selection(d,
+                              Take = ['eckling', 'NF', 'euro'],
+                              Leave = ['typ 2', 'typ II', 'NF2', 'NF 2'])
+    t21 = lambda d: selection(d, Take = ['21', 'own'])
+    tub = lambda d: selection(d, Take = ['Tub'])
+
+    df['bw'] = df.apply(bw, axis = 1)
+    df['ep'] = df.apply(ep, axis = 1)
+    df['men'] = df.apply(men, axis = 1)
+    df['nf'] = df.apply(nf1, axis = 1)
+    df['t21'] = df.apply(t21, axis = 1)
+    df['tub'] = df.apply(tub, axis = 1)
+    df['spe'] = df['bw'] + df['ep'] + df['men'] + df['nf'] + df['t21']
+    df['oth'] = ~df['AnnatSpecDiagnos'].isnull()
+
+    return df
+
+def add_other_str(df):
+    cols = ['bw', 'ep', 'men', 'nf', 't21', 'tub', 'oth']
+    df = add_other(df)
+    df = common.df_bool2str(df, cols)
+    return df
+
+
+
+""" ----------------------------------------
+
+    11 surgery
+
+---------------------------------------"""
+
+def surg_codes(df):
+    kod = df['Kod']
+    sub = df['Sub']
+
+    if common.isnull(kod):
+        return np.nan
+
+    kod = kod.replace('+', ' ')
+    kod = kod.replace('/', ' ')
+    kod = kod.split()
+
+    if not common.isnull(sub):
+
+        sub = sub.replace('+', ' ')
+        sub = sub.split()
+        for k in kod:
+            if not k[0] == sub[0][0]:
+                sub += [k]   # append the whole code ('sub += k' added it character by character)
+        return sub
+    return kod
+
+
+def generate_surgery():
+    df = bio.readsurgery()
+    df = df.sort_values('LopNr').reset_index(drop=True)
+    df['surgery'] = df.apply(surg_codes, axis = 1)
+    df = common.compression_with_dates(df, 'LopNr', 'surgery', 'Datum')
+    bio.save_generated_file(df, 'surgery')
+
+
+
+def surgery_diff(df):
+    dia = df['DiagnosDat']
+    surgery = df['surgery']
+    surgeries_and_diffs = []
+    for [codes, date] in surgery:
+        diff = common.time_diff_in_days(date, dia)
+        surgeries_and_diffs += [[codes, diff]]
+    return common.code_compress(surgeries_and_diffs)
+
+
+def add_surgery(df):
+    surg = bio.read_generated_surgery()
+    cols = list(surg.columns)
+    cols.remove('LopNr')
+    df = common.add_list_cols(df, surg, 'LopNr', cols)
+    df['surgery_diff'] = df.apply(surgery_diff, axis = 1)
+    return df
+
+
+
+def surgery_ohe(df):
+    c = 'kirurgi'
+    df = diagnose.one_x(df, c)
+    df = common.rmcols(df, [c])
+    return df
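+
+# Illustrative sketch (codes invented): surg_codes turns a 'Kod' string such
+# as 'TA10+TB20' into ['TA10', 'TB20'], and surgery_diff then yields
+# [[codes, days from DiagnosDat]] pairs, compressed by common.code_compress.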
+
+
+""" ----------------------------------------
+
+    12 cytostatica
+
+---------------------------------------"""
+
+
+
+def drug_classes(x, treatment, cyto_code, drug_class):
+
+    tx = treatment[ treatment['LopNr'] == x ]
+    if tx.empty:
+        return []
+
+    tx = tx[['BehNr', 'cyto_date']]
+    tx = tx.sort_values('cyto_date')
+    bs = np.unique(tx['BehNr'].values)   # all the 'BehNr' for x, max 2
+    category_date_list = []
+
+    for b in bs:
+        codes = common.look_up_all_values(b, cyto_code, 'BehNr', 'DrogKod')
+        categories = []
+        for c in np.unique(codes):
+            categories += common.look_up_all_values(c, drug_class, 'Drogkod', 'Kategori')
+        categories = np.unique(categories)
+        txb = tx[tx['BehNr'] == b]
+        dates = np.unique(txb['cyto_date'].values)
+        d0 = dates[0]
+        for c in categories:
+            category_date_list += [[c, d0]]
+        """
+        for d in dates:
+            for c in categories:
+                category_date_list += [[c, d]]
+        """
+
+    return category_date_list
+
+
+
+def drug_diff(df):
+    dia = df['DiagnosDat']
+    drugs = df['cytoclass']
+    drugs_and_diffs = []
+    for [code, date] in drugs:
+        diff = common.time_diff_in_days(date, dia)
+        drugs_and_diffs += [[code, diff]]
+    return drugs_and_diffs
+
+
+
+def add_cytostatica(df, treat):
+
+    treat = treat[['LopNr', 'BehNr', 'cyto_date']]
+    treat = treat.dropna()
+
+    xs = np.unique(treat['LopNr'].values)
+    print(' Nr of LopNr:s in treatment data with cyto', len(xs))
+
+
+    # treat = bio.readtreatment()    # LopNr > BehNr
+    codes = bio.readcytocodes()      # BehNr > DrogKod
+    classes = bio.readdrugclasses()  # Drogkod > Kategori
+
+    f = lambda x: drug_classes(x, treat, codes, classes)
+
+    df['cytoclass'] = df['LopNr'].apply(f)
+    df['cytoclass_diff'] = df.apply(drug_diff, axis = 1)
+
+    return df
+
+
+
+
+""" ----------------------------------------
+
+    13 stemcell
+
+---------------------------------------"""
+
+"""
+def stemcell(d, treat):
+    x = d['LopNr']
+    allo = common.look_up_all_values(x, treat, 'LopNr', 'Allogen')
+    d['allo'] = 0
+    d['auto'] = 0
+    if 'Ja' in allo:
+        d['allo'] = 1
+    if 'Nej' in allo:
+        d['auto'] = 1
+    return d
+"""
+
+def stemcell(x, treat):
+    tx = treat[ treat['LopNr'] == x ]
+    if tx.empty:
+        return []
+    tx = tx[['Allogen', 'stem_date']]
+    allo = tx[tx['Allogen'] == 'Ja']    # 'Ja' = yes, i.e. allogeneic
+    auto = tx[tx['Allogen'] == 'Nej']   # 'Nej' = no, i.e. autologous
+    allo_dates = np.unique(allo['stem_date'].values)
+    auto_dates = np.unique(auto['stem_date'].values)
+
+    stem = []
+    for date in allo_dates:
+        stem += [['allo', date]]
+    for date in auto_dates:
+        stem += [['auto', date]]
+
+    return stem
+
+
+def stem_diff(df):
+    dia = df['DiagnosDat']
+    stem = df['stemcell']
+    stem_and_diffs = []
+    for [code, date] in stem:
+        diff = common.time_diff_in_days(date, dia)
+        stem_and_diffs += [[code, diff]]
+    return stem_and_diffs
+
+
+
+def add_stemcell(df, treat):
+
+    treat = treat[['LopNr', 'Allogen', 'stem_date']]
+    treat = treat.dropna()
+    xs = np.unique(treat['LopNr'].values)
+    print(' Nr of LopNr:s in treatment data with stem', len(xs))
+    df['stemcell'] = df['LopNr'].apply(lambda x: stemcell(x, treat))
+    df['stemcell_diff'] = df.apply(stem_diff, axis = 1)
+    return df
+
+
+
+def analyze_stemcell_data(df):   # not used for production of total
+    cols = gs.places.stemcell_columns
+    cell = bio.readtreatment()
+    cell = cell[['LopNr'] + cols]
+    cell = cell.drop_duplicates()
+    cell = cell[ ~( cell['Allogen'].isnull() & cell['Autolog'].isnull()) ]
+    print(cell.shape)
+    print(cell[0:20])
+
+    analyze.compare_lopnr(df, cell, 'base', 'cell')
+
+    df = diagnose.add_cols(df, cell, 'LopNr', cols)
+    return df
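+
+# Example (dates invented): for a person with one allogeneic and one
+# autologous transplant, stemcell() returns
+# [['allo', '20020110'], ['auto', '20030522']], and stem_diff converts
+# each date into days from DiagnosDat.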
+
+
+""" ----------------------------------------
+
+    14 stralung (radiation)
+
+---------------------------------------"""
+
+# def look_up_
+
+def radio_datum(treat, x, b):
+    pass
+
+def radio_classes(x, treat, codes, classes):
+
+    cx = codes[codes['LopNr'] == x]
+
+    if cx.empty:
+        return []
+
+    cx = cx[['BehNr','Target_Kod']]
+    tx = treat[treat['LopNr'] == x]
+    bs = np.unique(cx['BehNr'].values)   # all the 'BehNr' for x, max 2
+
+    category_date_list = []
+    for b in bs:
+        cxb = cx[cx['BehNr'] == b]
+        txb = tx[tx['BehNr'] == b]
+        dates = np.unique(txb['radio_date'].values)
+        codes = np.unique(cxb['Target_Kod'].values)
+
+        d0 = dates[0]
+
+        for c in codes:
+            for k in classes.columns[1:]:
+                categories = common.look_up_nonnan(c, classes, 'Target_Kod', k)
+                if not categories == []:
+                    t = categories[0]
+                    if [t, d0] not in category_date_list:
+                        category_date_list += [[t, d0]]
+
+        """
+        for d in dates:
+            for c in codes:
+                for k in classes.columns[1:]:
+                    categories = common.look_up_nonnan(c, classes, 'Target_Kod', k)
+                    if not categories == []:
+                        t = categories[0]
+                        if [t, d] not in category_date_list:
+                            category_date_list += [[t, d]]
+        """
+
+    return category_date_list
+
+
+def radio_diff(df):
+    dia = df['DiagnosDat']
+    radio = df['stralung']
+    radio_and_diffs = []
+    for [code, date] in radio:
+        diff = common.time_diff_in_days(date, dia)
+        radio_and_diffs += [[code, diff]]
+    return radio_and_diffs
+
+
+
+def add_radio(df, treat):
+
+    treat = treat[['LopNr', 'BehNr', 'radio_date']]
+    treat = treat.dropna()
+    xs = np.unique(treat['LopNr'].values)
+    print(' Nr of LopNr:s in treatment data with radio', len(xs))
+
+
+    codes = bio.readradiocodes()      # LopNr > Target_Kod (radioterapi)
+    codes = codes.dropna()
+    classes = bio.readradioclasses()  # Target_kod > "Kategorier"
+    f = lambda x: radio_classes(x, treat, codes, classes)
+    df['stralung'] = df['LopNr'].apply(f)
+
+    df['radio_diff'] = df.apply(radio_diff, axis = 1)
+
+    return df
+
+
+
+
+""" ----------------------------------------
+
+    15 total
+
+---------------------------------------"""
+
+
+def stem(df):
+    z = []
+    if df['allo'] == 1: z += ['allo']
+    if df['auto'] == 1: z += ['auto']
+    return z
+
+def other(df):
+    z = []
+    for s in ['bw', 'ep', 'men', 'nf', 't21', 'tub']:
+        if df[s] == '1': z += [s]
+    return z
+
+
+def event_time_death(df):
+    events = df['life_events']
+    if events == '0':
+        return []
+
+    events = common.str2list(events)
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+    start = common.add_days(dia, gap)
+    end = str(df['DODSDAT'])
+    time = common.time_diff_in_days(end, start)
+
+    # t = int(df['life'])   # redo this
+    z = []
+    for e in events:
+        z += [[e, time]]
+
+    return z
+
+def kon(x):   # K_n (Swedish 'Kön', sex): 'pojke' = boy, 'flicka' = girl
+    if x == '1': return 'pojke'
+    if x == '2': return 'flicka'
+    return x
+
+
+def place_in_order(s, y, z):
+    # place z directly after y in the list s
+    t = []
+    for x in s:
+        if not x == z:
+            t += [x]
+        if x == y:
+            t += [z]
+    return t
+
+
+
+cols_to_remove = [
+    'AnnatSpecDiagnos',
+    'Diagnostic_Group',
+    'K_n',
+    'Diagnos_lder',
+#    'DiagnosDat',
+    'SlutBehDat',
+    'FoddAr',
+    'first_incare',
+    'first_nicare',
+    'all_incare',
+    'all_nicare',
+    'first_drug',
+    'all_drug',
+    'surgery',
+    'cytoclass',
+    'stemcell',
+    'radio',
+    'ICD',
+    'DODSDAT',
+    'life_events',
+    'life',
+    'oth',
+    'bw',
+    'ep',
+    'men',
+    'nf',
+    't21',
+    'tub',
+    'spe',
+#    'allo',
+#    'auto',
+]
+
+
+
+def diagnosis_class(x):
+    if isinstance(x, str):
+        return x[:3]
+    return x
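+
+# Examples (illustrative): diagnosis_class('C911') -> 'C91', and
+# place_in_order(['a', 'b', 'c'], 'a', 'c') -> ['a', 'c', 'b'], i.e. z is
+# removed from its old position and re-inserted directly after y.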
+
+def modify_total(df):
+
+    # 1 change column names
+
+
+    df = common.change_col_names(df, 'DIA', 'has_dia')
+    # df = common.change_col_names(df, 'oth', 'has_other_dia')   # other is removed
+    df = common.change_col_names(df, 'stralung', 'radio')
+    df = common.change_col_names(df, 'ICD10', 'diagnosis')
+
+
+    # 2 define other_dia columns
+
+    dfcopy = copy.copy(df)   # alt. = copy.deepcopy(df)
+    # dfcopy['stemcell'] = df.apply(lambda d: stem(d), axis = 1)
+    dfcopy['other_dia'] = df.apply(lambda d: other(d), axis = 1)
+    df = dfcopy
+
+    # 3 define event_time_death
+
+    df['event_time_death'] = df.apply(lambda d: event_time_death(d), axis = 1)
+    df['all_event_time_death'] = df['event_time_death'].apply(lambda lst: [[x[0], [x[1]]] for x in lst])
+
+    # 4 add diagnosis class
+
+    df['diagnosis_class'] = df['diagnosis'].apply(diagnosis_class)
+
+    # 5 in col. K_n, replace 1 with pojke and 2 with flicka
+
+    df['sex'] = df['K_n'].apply(kon)
+
+    # 6 rearrange columns
+
+    c = list(df.columns)
+    c = place_in_order(c, 'Diagnostic_Group', 'sex')
+    c = place_in_order(c, 'DODSDAT', 'event_time_death')
+    c = place_in_order(c, 'diagnosis', 'diagnosis_class')
+    # c = place_after('diff_nic', c, 'no_event_time_nic')
+    df = df[c]
+
+
+    # 7 remove the columns not to be used
+
+    df = common.rmcols(df, cols_to_remove)
+
+    return df
+
+
+def generate_total():
+    df = bio.readtimediff()
+    treat = bio.readtreatment()
+    df = add_surgery(df)
+    df = add_cytostatica(df, treat)
+    df = add_stemcell(df, treat)
+    df = add_radio(df, treat)
+    df = modify_total(df)
+    bio.save_generated_file(df, gs.names.total_file)
+
+    print('total columns')
+    for c in df.columns:
+        print('\t', c)
+    print()
+
+    # NB: several of the columns selected below are listed in cols_to_remove,
+    # so these tmp selections assume common.rmcols leaves them in place.
+    t = df[['DiagnosDat', 'cytoclass', 'cytoclass_diff', 'stemcell', 'stemcell_diff', 'radio', 'radio_diff']]
+    bio.save_tmp_file(t, 'total_dsr_2020_05_06')
+
+    t = df[['surgery_diff', 'cytoclass_diff', 'stemcell_diff', 'radio_diff']]   # 'stemcell_diff' is the column; 'stem_diff' is the function
+    bio.save_tmp_file(t, 'total_treatment_2020_05_06')
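+
+# A minimal end-to-end usage sketch (assumption: global_settings and the
+# bonsai_io register paths are configured as in the project; function names
+# as defined above). Each step reads the file written by the one before it,
+# so the steps can be run in separate sessions:
+#
+#   outcome(n = 3, save = True)   # or, in parts: out_p1(); out_p2(); out_p3()
+#   timediff()                    # event, no-event and censored times
+#   generate_total()              # one line per person, with treatments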