diff --git a/LICENCE b/LICENCE
new file mode 100644
index 0000000..6d95953
--- /dev/null
+++ b/LICENCE
@@ -0,0 +1,23 @@
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/README b/README
new file mode 100644
index 0000000..6172d66
--- /dev/null
+++ b/README
@@ -0,0 +1,35 @@
+This repository contains the code used for data analysis in the BONSAI
+project (financed by the Swedish funding agency Vinnova, 2018-2020).
+
+The objective of the project was to explore causal inference methods
+for finding causal links between childhood cancer treatment and late
+effects. The idea is to use a constraint-based causal inference
+method adapted to small data sets (most medical data sets are small
+from a machine learning perspective) to distinguish whether a late
+effect is caused by personal predisposition, the primary cancer
+diagnosis, or one of the treatments for the cancer.
+
+This repository contains two parts. One part contains the code for
+arranging the data from several registers (person info, control person
+info, diagnosis, treatments, in-patient care registry, out-patient
+care registry, death registry, and drug prescriptions) into a common
+file with all information about a patient or control person on one
+line each. The other part contains the analysis and visualization code
+based on that common file.
+
+The actual data is not included here, since it contains sensitive
+personal information and is therefore not publicly available.
+
+All the code is "research project quality", not production quality,
+which means that it is patchy, inconsistent, unclear, not well
+documented, and thus not easy to use by someone else. Nevertheless, it
+is included here for transparency about the methods used in the project.
+
+For questions about the code and how to reuse snippets of it in
+other projects, contact Anders Holst: anders.holst@ri.se
+
+For questions about availability of data, contact Helena Linge:
+helena.linge@med.lu.se
+
+For suggestions about future research collaboration around these issues,
+you can contact any of the above.
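+
+A minimal sketch of how the analysis part is meant to be driven (the
+file name, the selection, the factor value 'CNS' and the late-effect
+code 'C81' below are placeholders, since the data is not included; the
+function names are from dataanalysis/exptest3.py):
+
+    from exptest3 import *
+
+    df = read_total_file('total.tsv')   # the common one-line-per-person file
+    set_prstyle('average')              # prior style used by the rate tests
+    # Does the factor ('radio_diff', 'CNS') correlate with late effect
+    # 'C81' among girls, with and without conditioning on other factors?
+    analyse_one_effect(df, [('sex', 'flicka')], ('radio_diff', 'CNS'), [], 'C81')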
diff --git a/dataanalysis/Color.py b/dataanalysis/Color.py
new file mode 100644
index 0000000..a23ce37
--- /dev/null
+++ b/dataanalysis/Color.py
@@ -0,0 +1,67 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: Color.py
+
+Author: anders.holst@ri.se
+
+"""
+
+import numpy as np
+
+def colorgamma(x):
+    if x < 0.0031308:
+        return x*12.92
+    else:
+        return 1.055 * np.power(x, 0.416667) - 0.055
+
+def invcolorgamma(x):
+    if x < 0.04045:
+        return x / 12.92
+    else:
+        return np.power((x + 0.055)/1.055, 2.4)
+
+def rgb_color(r, g, b):
+    return tuple(list(map(colorgamma, [r,g,b])))
+
+# mimics color.scm with 6color, bluebased, and smooth
+def hsl_color(h, s, l):
+    hramp = [0, 1/12, 1/6, 1/3, 2/3, 7/9, 1]
+    iramp = [(2.410996, 0.16, 0, 0, 1), (0.862552, 0.79, 0, 1, 1),
+             (-0.252442, 0.63, 0, 1, 0), (-1.981885, 0.84, 1, 1, 0),
+             (1.786451, 0.21, 1, 0, 0), (1.571190, 0.37, 1, 0, 1),
+             (2.410996, 0.16, 0, 0, 1)]
+    i = 0
+    while h > hramp[i+1]:
+        i += 1
+    p = (h - hramp[i])/(hramp[i+1] - hramp[i])
+    (a, br, r, g, b) = tuple(map(lambda x1, x2: p*(x2 - x1) + x1, iramp[i], iramp[i+1]))
+    ll0 = (l + 1.0)*0.5
+    ll = (np.exp(a*ll0) - 1.0)/(np.exp(a) - 1.0) if not a==0 else ll0
+    if ll < br:
+        t1 = s * ll / br
+        t2 = ll0 * (1.0 - s)
+    else:
+        t1 = s * (1.0 - ll) / (1.0 - br)
+        t2 = ll0 * (1.0 - s) + s * (ll - br) / (1.0 - br)
+    return rgb_color(r*t1+t2, g*t1+t2, b*t1+t2)
+
+def gencolor(light=0.0, starthue=0.0, startsat=1.0, unsat=False):
+    h = starthue
+    s = startsat
+    if unsat:
+        ss = s*s
+    l = light
+    gs1 = (3.0 - np.sqrt(5))/2.0
+    gs2 = gs1/np.sqrt(3)
+    while True:
+        yield hsl_color(h, s, l)
+        h -= gs1
+        if h < 0.0:
+            h += 1.0
+        if unsat:
+            ss -= gs2
+            if ss <= 0:
+                ss += 1.0
+            s = np.sqrt(ss)
+
diff --git a/dataanalysis/exptest3.py b/dataanalysis/exptest3.py
new file mode 100644
index 0000000..c3c85a8
--- /dev/null
+++ b/dataanalysis/exptest3.py
@@ -0,0 +1,1255 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: exptest3.py
+
+Author: anders.holst@ri.se
+
+"""
+
+import pandas as pd
+from math import *
+from scipy.special import hyp1f1
+
+from hist import *
+from visual import *
+
+eps = 0.001
+
+# Brute force summation, in logarithms to avoid overflow
+def log_hyper_1f1_negintab(a, b, x):
+    m = -a
+    lr = 0.0
+    lhg = 0.0
+    for k in range(m):
+        lr += log(x*(a+k)/((k+1)*(b+k)))
+        lhg += log(1 + exp(lr - lhg))
+    return lhg
+
+# r *= (x*(a+k) / ((k+1)*(b+k)))
+# hg += r
+
+def log_hyper_1f1(a, b, x):
+    # b not a negative integer -> log(hyp1f1)
+    # b a negative integer -> interpolate between b +- eps
+    eps = 0.0001
+    if b<0 and b==int(b):
+        ret = (hyp1f1(a, b-eps, x) + hyp1f1(a, b+eps, x))/2
+    else:
+        ret = hyp1f1(a, b, x)
+    if isinf(ret):
+        print("overflow at hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    elif ret<=0:
+        print("negative hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    return log(ret)
+
+def log_hyper_1f1_interpol(a, b, x):
+    # a not a negative integer -> interpolate log(hyp1f1)
+    if a==int(a):
+        return log_hyper_1f1_negintab(int(a), b, x)
+    else:
+        r1 = log_hyper_1f1_negintab(floor(a), b, x)
+        r2 = log_hyper_1f1_negintab(ceil(a), b, x)
+        prop = a - floor(a)
+        return r2*prop + r1*(1.0-prop)
+
+# pa = (n, t)
+def LogLambdaProb_un(pa1, ll):
+    (n1, t1) = pa1
+    if ll <= 0:
+        return -inf if n1 > 0 else 0.0
+    return n1 * log(ll) - t1*ll
+
+def LogLambdaProb_pr_un(pa1, ll, pr):
+    (n1, t1) = pa1
+    (n0, t0) = pr
+    if ll <= 0:
+        return -inf if n1+n0 > 0 else 0.0
+    return (n1+n0) * log(ll) - (t1+t0)*ll
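+
+# Note on the probability functions here: pa1 = (n1, t1) and pa2 = (n2, t2)
+# are (event count, exposure time) observations from two Poisson processes,
+# ll is an intensity and dl a difference between two intensities. Each
+# intensity has a Gamma-shaped (log-)posterior, and integrating out the
+# common rate level from the product of two such posteriors yields a
+# confluent hypergeometric function 1F1 with negative integer parameters,
+# which is why the brute-force log summation above is used.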
+
+def LogDifflambdaProbOne_un(pa1, pa2, dl):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl * t2) + log_hyper_1f1_negintab(-n2, -n1 - n2, dl*(t1 + t2))
+    else:
+        ret = (dl * t1) + log_hyper_1f1_negintab(-n1, -n1 - n2, -dl*(t1 + t2))
+    return ret
+
+def LogDifflambdaProbOne_pr_un_old(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+    else:
+        ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0 or t1+t2 == 0.0 or n1+n2 == 0:
+        ret = 0.0
+    else:
+        if prior_style == 'partaverage': # separate priors per condition
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2+pr[0]))
+        elif prior_style == 'partaveragezero': # no extra event
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2))
+        else:
+            (n0, t0) = pr
+        if dl > 0.0:
+            ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+        else:
+            ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_num1_old(pa1, pa2, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_pr_un(pa1, ll, pr)
+    func2 = lambda ll: LogLambdaProb_pr_un(pa2, ll, pr)
+    (rng, vals) = find_calc_hrange(func1, (n1+n0+1)/(t1+t0), (n1+n0+1)/(5*(t1+t0)), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+n0+1)/(t2+t0), (n2+n0+1)/(5*(t2+t0)), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num1(pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1+t2==0.0 or n1+n2==0:
+        return ()
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_un(pa1, ll)
+    func2 = lambda ll: LogLambdaProb_un(pa2, ll)
+    (rng, vals) = find_calc_hrange(func1, (n1+1)/t1, (n1+1)/(5*t1), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+1)/t2, (n2+1)/(5*t2), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num2(tup, dl):
+    if not tup:
+        return 0.0
+    (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2) = tup
+    # then sweep through for fixed dl and sum up (numerical convolution of
+    # the two posteriors over their overlapping range)
+    dd = min(d1, d2)
+    aa = max(a1, a2-dl)
+    bb = min(b1, b2-dl)
+    sm = 0.0
+    lst = []
+    x = aa
+    while x <= bb:
+        lst.append(func1(x) + func2(x+dl) - mx1 - mx2)
+        x += dd
+    for v in lst:
+        sm += exp(v)
+    return mx1 + mx2 + log(sm*dd) if sm>0.0 else -inf
+
+#def LogDifflambdaProb_un(da1, da2, dl):
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_un(pa1, pa2, dl), da1, da2)))
+
+#def LogDifflambdaProb_pr_un(da1, da2, dl, pr):
+#    pr = (pr[0]/len(da1), pr[1]/len(da1))
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr), da1, da2)))
+
+def LogDifflambdaProb_num1(da1, da2):
+    return list(map(lambda pa1,pa2: LogDifflambdaProbOne_num1(pa1, pa2), da1, da2))
+
+def LogDifflambdaProb_num2(tupl, dl):
+    return sum(list(map(lambda tup: LogDifflambdaProbOne_num2(tup, dl), tupl)))
+
+prior_style = None
+
+def set_prstyle(st):
+    global prior_style
+    prior_style = st
+
+def DifflambdaHist(da1, da2, pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1 == 0.0 or t2 == 0.0 or (n1 == 0 and n2 == 0):
+        return False
+    tupl = LogDifflambdaProb_num1(da1, da2)
+    func = lambda dl: LogDifflambdaProb_num2(tupl, dl)
+    (xres, yres) = find_calc_hrange(func, 0.0, (n1+n2+1)/(t1+t2), 0.1, 20)
+    return [xres, normalize_logprob(yres)]
+
+def LambdaHist(pa1):
+    (n1, t1) = pa1
+    if t1 == 0.0 or n1 == 0:
+        return False
+    func = lambda dl: LogLambdaProb_un(pa1, dl)
+    (xres, yres) = find_calc_hrange(func, (n1+1)/t1, (n1+1)/(5*t1), 0.2, 20)
+    return [xres, normalize_logprob(yres)]
+
+def normalize_logprob(vec):
+    mx = max(vec)
+    sm = 0.0
+    for i in range(len(vec)):
+        sm += exp(vec[i] - mx)
+    norm = log(sm) + mx
+    return [exp(x-norm) for x in vec]
+
+def find_calc_hrange(func, start, step, mindiff, maxdiff):
+    def zpush(v):
+        return 0.0 if abs(v)<2e-11 else v
+    fa = nan
+    fb = nan
+    a = nan
+    b = nan
+    m1 = zpush(start - 0.5*step)
+    fm1 = func(m1)
+    m2 = zpush(start + 0.5*step)
+    fm2 = func(m2)
+    left = []
+    right = []
+    while True:
+        #print(a, fa, m1, fm1, m2, fm2, b, fb)
+        if fm1 > fm2:
+            if not isnan(b):
+                right = [(b, fb)] + right
+            (m, fm) = (m1, fm1)
+            (b, fb) = (m2, fm2)
+        else:
+            if not isnan(a):
+                left = [(a, fa)] + left
+            (a, fa) = (m1, fm1)
+            (m, fm) = (m2, fm2)
+        if isnan(a):
+            m1 = zpush(3*m - 2*b)
+            fm1 = func(m1)
+            (m2, fm2) = (m, fm)
+        elif isnan(b):
+            (m1, fm1) = (m, fm)
+            m2 = zpush(3*m - 2*a)
+            fm2 = func(m2)
+        else:
+            tmp = [fa, fm, fb]
+            mx = max(tmp)
+            mn = min(tmp)
+            if mx-mn < mindiff or (b-a) < 1e-10:
+                break
+            if (m-a) > (b-m)*1.5:
+                dir = 0
+            elif (m-a)*1.5 < (b-m):
+                dir = 1
+            else:
+                dir = 0 if fa > fb else 1
+            if dir == 0:
+                m1 = zpush(0.5*(a+m))
+                fm1 = func(m1)
+                (m2, fm2) = (m, fm)
+            else:
+                (m1, fm1) = (m, fm)
+                m2 = zpush(0.5*(m+b))
+                fm2 = func(m2)
+    delta = min(m-a, b-m)
+    left = [(a, fa)] + left
+    right = [(b, fb)] + right
+    if m-a < b-m:
+        right = [(m, fm)] + right
+    else:
+        left = [(m, fm)] + left
+    tr = mx - maxdiff
+    #print("^", mn, mx, tr)
+    resx = []
+    resy = []
+    leftind = 0
+    if delta < 1e-10:
+        delta = 1e-3
+    while leftind < len(left) and left[leftind][1] > tr:
+        resx = [left[leftind][0]] + resx
+        resy = [left[leftind][1]] + resy
+        a = zpush(left[leftind][0] - delta)
+        leftind += 1
+    while leftind >= len(left) or a > left[leftind][0] + delta/2:
+        fa = func(a)
+        #print("L", a, fa)
+        if isnan(fa) or isinf(fa) or fa < tr:
+            break
+        resx = [a] + resx
+        resy = [fa] + resy
+        a = zpush(a-delta)
+    rightind = 0
+    while rightind < len(right) and right[rightind][1] > tr:
+        resx = resx + [right[rightind][0]]
+        resy = resy + [right[rightind][1]]
+        b = zpush(right[rightind][0] + delta)
+        rightind += 1
+    while rightind >= len(right) or b < right[rightind][0] - delta/2:
+        fb = func(b)
+        #print("R", b, fb)
+        if isnan(fb) or isinf(fb) or fb < tr:
+            break
+        resx = resx + [b]
+        resy = resy + [fb]
+        b = zpush(b+delta)
+    #print(a, m, b)
+    return (resx, resy)
+
+def selfunc(entry, val):
+    if type(entry) == list:
+        if len(entry)>0 and type(entry[0])==list:
+            entry = list(map(lambda x:x[0], entry))
+        if type(val) == list:
+            for x in val:
+                if x in entry:
+                    return True
+            return False
+        else:
+            return val in entry
+    else:
+        if type(val) == list:
+            return entry in val
+        else:
+            return val == entry
+
+def selassoc(sel, key):
+    if sel is False:
+        return []
+    else:
+        return [x[-1] for x in sel if key == x[0] or (len(x)==3 and key == x[1])]
+
+def assoc(lst, key, defl):
+    for ele in lst:
+        if ele[0]==key:
+            return ele[1]
+    return defl
+
+def assoc_a0(lst, key, defl):
+    if type(lst)==list:
+        for ele in lst:
+            if type(ele)==list:
+                if ele[0]==key:
+                    return ele[1]
+            else:
+                if ele==key:
+                    return 0
+        return defl
+    else:
+        return 0 if lst==key else defl
+
+def subsets(lst, ord):
+    if ord==0:
+        return [[]]
+    elif ord > len(lst):
+        return []
+    else:
+        rest = subsets(lst[1:], ord-1)
+        return list(map(lambda l:[lst[0]]+l, rest)) + subsets(lst[1:],ord)
+
+def issubset(l1, l2):
+    for e in l1:
+        if not e in l2:
+            return False
+    return True
+
+def tupleadd(t1, t2):
+    return tuple((t1[i]+t2[i] for i in range(min(len(t1),len(t2)))))
+    #return (t1[0]+t2[0],t1[1]+t2[1])
+
+def tuplescale(t1, scale):
+    return tuple((t1[i]*scale for i in range(len(t1))))
+    #return (t1[0]*scale,t1[1]*scale)
+
+def listminus(l1, l2):
+    return [e for e in l1 if not e in l2]
+
+def read_total_file(file):
+    df = pd.read_csv(file, sep='\t')
+    for c in df:
+        if type(df[c][0])==str and df[c][0][0] == '[':
+            df[c] = df[c].apply(eval)
+    return df
+
+def valsincolumn_old(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if not e in res:
+                    res.append(e)
+        else:
+            if not val in res:
+                res.append(val)
+    return sorted(res)
+
+def valsincolumn(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if type(e)==list:
+                    if not e[0] in res:
+                        res.append(e[0])
+                else:
+                    if not e in res:
+                        res.append(e)
+        else:
+            if not val in res and not (type(val)==float and isnan(val)):
+                res.append(val)
+    return sorted(res)
+
+#--------------------------
+
+# New approach to avoid conditional dilution: For each late effect, in a
+# suitable subset of the data (patients, sex, etc), check whether a given
+# factor (characteristics, diagnosis, treatment) is significantly
+# correlated. Then condition on each other factor that is also correlated,
+# and see whether the significance disappears completely, or the frequency
+# changes significantly. List those that do not disappear/change, in order
+# of significance. Keep track of whether all of them disappear under
+# cross-wise conditioning, or whether several remain. Ideally, each
+# significant correlation should be explained by one factor, or by several
+# factors of the same type.
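+
+# A minimal illustrative sketch (not part of the original analysis pipeline)
+# of the rate-difference machinery above, on hypothetical toy numbers:
+# group 1 has 5 events over 40 person-years, group 2 has 1 event over 60.
+# DifflambdaHist returns a discretized posterior over the rate difference,
+# and hist_mean_var / hist_significance (from hist.py, as used in
+# analysediff below) summarize it.
+def _demo_rate_difference():
+    da1 = [(5, 40.0)]    # one (event count, exposure time) pair per stratum
+    da2 = [(1, 60.0)]
+    hist = DifflambdaHist(da1, da2, (5, 40.0), (1, 60.0))
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+        print("rate difference %.4f +- %.4f, significance %.4f"
+              % (mean, sqrt(var), sig))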
+
+def select_data(df, sel1, sel2, selg):
+    # sel = [(col, val)]   selg = [('otherdia', 't21')]  [('DIA',1)]  [False, 'stralung','ANSIKTE']
+    # cond = [col1, col2, ]
+    if df.empty:
+        return (df,df)
+    if selg:
+        test = df[df.columns[0]].apply(lambda entry: True)
+        for ele in selg:
+            if len(ele) == 3:
+                test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+            else:
+                test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+        df = df[test]
+    test = df[df.columns[0]].apply(lambda entry: True)
+    for ele in sel1:
+        if len(ele) == 3:
+            test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+        else:
+            test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+    d1 = df[test]
+    if sel2 is False:
+        d2 = df[~test]
+    else:
+        test = df[df.columns[0]].apply(lambda entry: True)
+        for ele in sel2:
+            if len(ele) == 3:
+                test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+            else:
+                test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+        d2 = df[test]
+    return (d1, d2)
+
+def select_data_alt(df, selalt, selg):
+    if df.empty:
+        return (df,df)
+    if selg:
+        test = df[df.columns[0]].apply(lambda entry: True)
+        for ele in selg:
+            if len(ele) == 3:
+                test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+            else:
+                test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+        df = df[test]
+    test = df[df.columns[0]].apply(lambda entry: False)
+    for ele in selalt:
+        if len(ele) == 3:
+            test = test | df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2]))
+        else:
+            test = test | df[ele[0]].apply(lambda entry: selfunc(entry, ele[1]))
+    df = df[test]
+    return df
+
+def count_effect(df, eff, ncol, ecol):
+    n = 0
+    t = 0
+    for i in range(len(df)):
+        row = df.iloc[i]
+        x = assoc(row[ecol], eff, False)
+        if x is not False:
+            n += 1
+            t += x
+        else:
+            t += row[ncol]
+    return (n, t)
+
+def count_effect_mtag(df, eff, tags):
+    ecols = ['event_time_' + tag for tag in tags]
+    ncols = ['no_event_time_' + tag for tag in tags]
+    #ccols = ['censored_time_' + tag for tag in tags]
+    res = (0, 0, 0)
+    for i in range(len(df)):
+        row = df.iloc[i]
+        x = min([assoc(row[ecol], eff, inf) for ecol in ecols])
+        if x is not inf:
+            res = tupleadd(res, (1, x, 1))
+        else:
+            x = min([row[ncol] for ncol in ncols])
+            res = tupleadd(res, (0, x, 1))
+    return res
+
+#def count_cond_effect(df, eff, ncol, ecol, cols, ignore):
+#    dic = {}
+#    for i in range(len(df)):
+#        row = df.iloc[i]
+#        x = assoc(row[ecol], eff, False)
+#        if x is not False:
+#            incr = (1, x)
+#        else:
+#            incr = (0, row[ncol])
+#        increment_tuple_value(row, dic, cols, ignore, incr)
+#    return dic
+
+def get_cond_stats_daydiff(df, pair, conds, selg, tags, eff):
+    # We have to run through all the data, and for each sample sort the
+    # values into the right dict and key.
+    # The rule is that a condition is true if the eff day is later than any cond day.
+    (dd, dummy) = select_data(df, [], False, selg)
+    kl = [""]
+    for cond in conds:
+        kl = [(k+"_"+str(cond[-1]), k+"_~"+str(cond[-1])) for k in kl]
+        kl = [k[0] for k in kl] + [k[1] for k in kl]
+    dick = { k : kl.index(k) for k in kl}
+    dic1 = { k : (0, 0, 0) for k in kl}
+    dic2 = { k : (0, 0, 0) for k in kl}
+    invk = { kl.index(k) : k for k in kl}
+    bits = [2**i for i in range(len(conds))]
+    ecols = ['event_time_' + tag for tag in tags]
+    ncols = ['no_event_time_' + tag for tag in tags]
+    for i in range(len(dd)):
+        row = dd.iloc[i]
+        x = min([assoc(row[ecol], eff, inf) for ecol in ecols])
+        # here, sort it into correct key
+        # for each cond and pair, find the latest day before the eff day (minus a margin)
+        ylst = [y if y
+
+def estimatefactor(da1, da2):
+    lf = 0.0
+    sn = 0
+    for ((n1, t1), (n2, t2)) in zip(da1, da2):
+        if n1 > 0 and n2 > 0:
+            lf += min(n1,n2)*(log(n2/t2) - log(n1/t1))
+            sn += min(n1,n2)
+    return exp(lf/sn) if sn>0 else 1.0
+
+def dictolistwithprior(cdic1, cdic2, cdic0, tscale = 1.0):
+    n1,t1,s1 = (0,0,0)
+    n2,t2,s2 = (0,0,0)
+    for k in cdic0:
+        (tmp1, tmp2) = (cdic1[k], cdic2[k])
+        n1 += tmp1[0]
+        t1 += tmp1[1]*tscale
+        s1 += tmp1[2]
+        n2 += tmp2[0]
+        t2 += tmp2[1]*tscale
+        s2 += tmp2[2]
+    if prior_style == 'noninfo': # Noninformative prior
+        prn = 1.0/len(cdic0)
+        lst1 = [ (cdic1[k][0] - prn, cdic1[k][1]*tscale) for k in cdic0 ]
+        lst2 = [ (cdic2[k][0] - prn, cdic2[k][1]*tscale) for k in cdic0 ]
+    elif prior_style == 'average': # Average prior
+        prn = 1.0/len(cdic0)
+        prt = prn*(t1+t2)/(n1+n2+1)
+        lst1 = [ (cdic1[k][0] + prn, cdic1[k][1]*tscale + prt) for k in cdic0 ]
+        lst2 = [ (cdic2[k][0] + prn, cdic2[k][1]*tscale + prt) for k in cdic0 ]
+    elif prior_style == 'partaverage': # separate priors per condition
+        prn = 1.0/len(cdic0)
+        lst1 = []
+        lst2 = []
+        for k in cdic0:
+            (nn1, tt1, ss1) = cdic1[k]
+            (nn2, tt2, ss2) = cdic2[k]
+            prt = prn*(tt1+tt2)/(nn1+nn2+prn)
+            lst1.append((nn1+prn, (tt1+prt)*tscale))
+            lst2.append((nn2+prn, (tt2+prt)*tscale))
+    elif prior_style == 'partaveragezero': # no extra event
+        prn = 1.0/len(cdic0)
+        lst1 = []
+        lst2 = []
+        for k in cdic0:
+            (nn1, tt1, ss1) = cdic1[k]
+            (nn2, tt2, ss2) = cdic2[k]
+            if nn1+nn2 > 0:
+                prt = prn*(tt1+tt2)/(nn1+nn2)
+                lst1.append((nn1+prn, (tt1+prt)*tscale))
+                lst2.append((nn2+prn, (tt2+prt)*tscale))
+            else:
+                lst1.append((nn1, tt1*tscale))
+                lst2.append((nn2, tt2*tscale))
+    else: # No prior
+        lst1 = [ (cdic1[k][0], cdic1[k][1]*tscale) for k in cdic0 ]
+        lst2 = [ (cdic2[k][0], cdic2[k][1]*tscale) for k in cdic0 ]
+    return (lst1, lst2, (n1,t1,s1), (n2,t2,s2))
+
+def analysediff(cdic1, cdic2, cdic0):
+    (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+    hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+    fact = estimatefactor(dlst1, dlst2)
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+    else:
+        mean = 0.0
+        var = 0.0
+        sig = 1.0
+    nn1 = sum(list(map(lambda p:p[0], dlst1)))
+    tt1 = sum(list(map(lambda p:p[1], dlst1)))
+    nn2 = sum(list(map(lambda p:p[0], dlst2)))
+    tt2 = sum(list(map(lambda p:p[1], dlst2)))
+    hist1 = LambdaHist((nn1, tt1))
+    if hist1 is not False:
+        mean1, var1 = hist_mean_var(hist1)
+    else:
+        mean1 = 0.0
+    hist2 = LambdaHist((nn2, tt2))
+    if hist2 is not False:
+        mean2, var2 = hist_mean_var(hist2)
+    else:
+        mean2 = 0.0
+    return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact,
+            'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1,
+            'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2,
+            'hist':hist, 'hist1':hist1, 'hist2':hist2}
+
+def analysediff01(cdic1, cdic2, cdic0):
+    (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+    hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+    fact = estimatefactor(dlst1, dlst2)
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+    else:
+        mean = 0.0
+        var = 0.0
+        sig = 1.0
+    nn1 = sum(list(map(lambda p:p[0], dlst1)))
+    tt1 = sum(list(map(lambda p:p[1], dlst1)))
+    nn2 = sum(list(map(lambda p:p[0], dlst2)))
+    tt2 = sum(list(map(lambda p:p[1], dlst2)))
+    mean1 = nn1/tt1 if tt1 > 0 else 0.0
+    mean2 = nn2/tt2 if tt2 > 0 else 0.0
+    return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact,
+            'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1,
+            'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2,
+            'hist':hist, 'hist1':False, 'hist2':False}
+
+def analysediff0(cdic1, cdic2, cdic0):
+    #dlst1 = cdictolist(cdic1, cdic0, 1.0/36525)
+    #dlst2 = cdictolist(cdic2, cdic0, 1.0/36525)
+    #if prior_style == 'noninfo':
+    #    pr = (-1, 0)  # Noninformative prior
+    #elif prior_style in ['average','partaverage','partaveragezero']:
+    #    n1 = sum(list(map(lambda p:p[0], dlst1)))
+    #    t1 = sum(list(map(lambda p:p[1], dlst1)))
+    #    n2 = sum(list(map(lambda p:p[0], dlst2)))
+    #    t2 = sum(list(map(lambda p:p[1], dlst2)))
+    #    pr = (1, 1*(t1+t2)/(n1+n2+1))  # Average prior
+    #else:
+    #    pr = (0, 0)  # No prior
+    #hist = DifflambdaHist(dlst1, dlst2, pr)
+    (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+    hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+    if hist is not False:
+        mean, var = hist_mean_var(hist)
+        sig = hist_significance(hist, 0.0, mean)
+    else:
+        mean = 0.0
+        var = 0.0
+        sig = 1.0
+    return {'mean':mean, 'std':sqrt(var), 'sig':sig}
+
+def analyseeffects1(df, selg, coldic, tags, eff):
+    resdic = {}
+    for col in coldic:
+        for val in coldic[col]:
+            #print((col,val))
+            #(dic1, dic2, kdic) = get_cond_stats_one(df, (col, val), [], selg, tags, eff)
+            (dic1, dic2, kdic) = get_cond_stats_daydiff(df, (col, val), [], selg, tags, eff)
+            resdic[(col,val)] = analysediff(dic2, dic1, kdic)
+    return resdic
+
+def analyseeffects_back(df, selg, pair, tags, efflst):
+    resdic = {}
+    for eff in efflst:
+        col,val = pair
+        (dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+        resdic[eff] = analysediff(dic2, dic1, kdic)
+    return resdic
+
+def analyseeffects2(df, selg, resdic1, sig, tags, eff):
+    resdic2 = {}
+    lst = []
+    for pair in resdic1:
+        if resdic1[pair]['sig'] <= sig:
+            lst.append(pair)
+    lst.sort(key=lambda x: resdic1[x]['sig'])
+    for (ind,pair1) in reversed(list(enumerate(lst))):
+        mnsig = 0.0
+        mntmp = resdic1[pair1]
+        for pair2 in lst:
+            if pair1 != pair2:
+                #print(pair1,pair2)
+                #(dic1, dic2, kdic) = get_cond_stats_one(df, pair1, [pair2], selg, tags, eff)
+                (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair1, [pair2], selg, tags, eff)
+                tmp = analysediff(dic2, dic1, kdic)
+                if tmp['sig'] > mnsig:
+                    mnsig = tmp['sig']
+                    tmp['cond'] = pair2
+                    mntmp = tmp
+        resdic2[pair1] = mntmp
+        if mnsig > sig:
+            del lst[ind]
+    return resdic2
+
+def movetosaved(remaining, saved, removed, dic):
+    # everyone not removed by anything other than 'removed' is moved to 'saved'
+    changed = False
+    for (ind,pair) in reversed(list(enumerate(remaining))):
+        lst = dic[pair]
+        ok = True
+        for conds,sig in lst:
+            ok = False
+            for cond in conds:
+                if cond in removed:
+                    ok = True
+            if not ok:
+                break
+        if ok:
+            changed = True
+            saved.append(pair)
+            del remaining[ind]
+    return changed
+
+def movetoremoved(remaining, saved, removed, dic):
+    # everyone removed by someone in 'saved' is moved to 'removed'
+    changed = False
+    for (ind,pair) in reversed(list(enumerate(remaining))):
+        lst = dic[pair]
+        ok = False
+        for conds,sig in lst:
+            ok = True
+            for cond in conds:
+                if cond not in saved:
+                    ok = False
+            if ok:
+                break
+        if ok:
+            changed = True
+            removed.append(pair)
+            del remaining[ind]
+    return changed
+
+def allexcept(lst, ele):
+    return [e for e in lst if e != ele]
+
+def analyseeffects2new(df, selg, resdic1, sig, tags, eff):
+    ciidic = {}
+    lst = []
+    # select those with a significant indirect effect
+    for pair in resdic1:
+        if resdic1[pair]['sig'] <= sig:
+            lst.append(pair)
+            ciidic[pair] = []
+    # in the first pass, condition each one on each of the others,
+    # and save a list of which ones make it insignificant
+    # remove those that
+    # 1) are made insignificant by something (that they do not themselves make insignificant)
+    # 2) and are not needed to make anything else insignificant
+    # In a loop: keep those that are not removed by anything, drop those removed by them,
+    # iterate with the rest, i.e. of the rest keep those not removed by the remaining ones
+    # in the next pass, condition on pairs (and later triples) of the remaining ones
+    # and remove according to the analogous principle
+    remaining = lst
+    saved = []
+    totsaved = []
+    for order in [1,2,3]:
+        remaining = saved + remaining
+        conds = subsets(remaining, order)
+        for pair in lst:
+            for cond in conds:
+                if not pair in cond:
+                    #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+                    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+                    tmp = analysediff0(dic2, dic1, kdic)
+                    if tmp['sig'] > sig:
+                        ciidic[pair].append((cond,tmp['sig']))
+        remaining = lst.copy()
+        saved = []
+        removed = []
+        changed = True
+        while changed:
+            # everyone not removed by anything other than 'removed' is moved to 'saved'
+            # everyone removed by someone in 'saved' is moved to 'removed'
+            changed = False
+            if movetosaved(remaining, saved, removed, ciidic):
+                changed = True
+            if movetoremoved(remaining, saved, removed, ciidic):
+                changed = True
+    if remaining:
+        alternatives = []
+        origsaved = saved.copy()
+        origremaining = remaining.copy()
+        akeys = []
+        for pair in remaining:
+            for sp in ciidic[pair]:
+                sp2 = listminus(sp[0], saved)
+                if sp2 and sp2 not in akeys:
+                    akeys.append(sp2)
+        for sp in akeys:
+            # also check, for each one in the list sp, whether it is nullified
+            # by the others in sp plus saved
+            if len(sp) > 1:
+                ok = True
+                for p in sp:
+                    ll = ciidic[p]
+                    tset = listminus(sp,p) + saved
+                    for ele,sig in ll:
+                        if issubset(ele, tset):
+                            ok = False
+                            break
+                if not ok:
+                    continue
+            saved = origsaved + sp
+            remaining = listminus(origremaining, sp)
+            removed = []
+            changed = movetoremoved(remaining, saved, removed, ciidic)
+            while changed:
+                changed = False
+                if movetosaved(remaining, saved, removed, ciidic):
+                    changed = True
+                if movetoremoved(remaining, saved, removed, ciidic):
+                    changed = True
+            if not remaining:
+                s = set(saved)
+                if s not in alternatives:
+                    alternatives.append(s)
+        if not alternatives:
+            alternatives = [set(saved)]
+            print("Failed to find clean condition alternatives")
+            print("Saved: ", saved)
+            print("Remaining: ", remaining)
+            print("Dict: ", ciidic)
+    else:
+        alternatives = [set(saved)]
+    resdic2lst = []
+    for saved in alternatives:
+        # prepare resdic2 from saved, i.e. condition each one on the others
+        saved = list(saved)
+        resdic2 = {}
+        for pair in lst:
+            if pair in saved:
+                cond = allexcept(saved, pair)
+            else:
+                mx = ((),0.0)
+                for sp in ciidic[pair]:
+                    if issubset(sp[0], saved) and sp[1]>mx[1]:
+                        mx = sp
+                if mx[1] > 0.0:
+                    cond = mx[0]
+                else:
+                    cond = allexcept(saved, pair)
+            #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+            (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+            #tmp = analysediff(dic2, dic1, kdic)
+            tmp = analysediff01(dic2, dic1, kdic)
+            tmp['cond'] = cond
+            resdic2[pair] = tmp
+        for s in saved:
+            if not s in totsaved:
+                totsaved.append(s)
+        resdic2lst.append(resdic2)
+    return resdic2lst, totsaved
+
+def analyse_one_effect(df, selg, pair, cond, eff):
+    tags = []
+    for tag in ["death", "inc", "nic"]:
+        if eff in valsincolumn(df, 'event_time_' + tag):
+            tags.append(tag)
+    #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [], selg, tags, eff)
+    res0 = analysediff(dic2, dic1, kdic)
+    #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+    res1 = analysediff(dic2, dic1, kdic)
+    print("Correlates to " + eff)
+    display_analysis_one_row(pair, res1, res0)
+    print()
+    #return { 'mean0': res0['mean'], 'sig0':res0['sig'], 'mean1': res1['mean'], 'sig1': res1['sig']}
+
+def showeffects1(win, df, selg, coldic, tags, eff):
+    name = "Correlates of " + str(list(coldic.keys())) + " to " + eff
+    resdic = analyseeffects1(df, selg, coldic, tags, eff)
+    show_analysis_rows_dict(win, name, resdic, False)
+
+def show_all_effects(win, df, selg, sig, eff):
+    coldic = {}
+    selcols = [s[0] for s in selg if s[0] is not False]
+    if 'sex' not in selcols:
+        coldic['sex'] = ['flicka'] # special since using both are redundant
+    for col in ['other_dia','diagnosis_class','surgery_diff','radio_diff','cytoclass_diff','stemcell_diff']:
+        if col not in selcols:
+            coldic[col] = valsincolumn(df, col)
+    tags = []
+    for tag in ["death", "inc", "nic"]:
+        if eff in valsincolumn(df, 'event_time_' + tag):
+            tags.append(tag)
+    name = "Direct correlates to " + eff
+    resdic1 = analyseeffects1(df, selg, coldic, tags, eff)
+    resdic2lst,remaining = analyseeffects2new(df, selg, resdic1, sig, tags, eff)
+    if len(resdic2lst) == 1:
+        show_analysis_rows_dict(win, name, resdic2lst[0], resdic1)
+    else:
+        print("There are %d alternatives. Press return to switch." % len(resdic2lst))
+        for i,resdic2 in enumerate(resdic2lst):
+            show_analysis_rows_dict(win, name, resdic2lst[i], resdic1)
+            if (i < len(resdic2lst)-1):
+                input()
+
+def get_list_stats2(dd, efftag, eff):
+    mn = inf
+    mx = -inf
+    tsum_f = 0
+    nsum_f = 0
+    for i in range(len(dd)):
+        s = dd.iloc[i]
+        r = s['no_event_time_' + efftag]
+        ok = False
+        for ele in s['event_time_' + efftag]:
+            if ele[0] == eff:
+                if ele[1] < mn:
+                    mn = ele[1]
+                if ele[1] > mx:
+                    mx = ele[1]
+                tsum_f += ele[1]
+                nsum_f += 1
+                ok = True
+        if not ok:
+            tsum_f += r if not isnan(r) else 0
+    return (mn, mx, tsum_f, nsum_f)
+
+def show_profile_data(win, df, selg, pair, cond, eff):
+    # first without conditioning
+    (d1, d2) = select_data(df, [pair], False, selg)
+    stats = {}
+    for efftag in ['nic', 'inc', 'death']:
+        stats[(1,efftag)] = get_list_stats2(d1, efftag, eff)
+        stats[(2,efftag)] = get_list_stats2(d2, efftag, eff)
+    mn = min([stats[k][0] for k in stats])
+    mx = max([stats[k][1] for k in stats])
+    win.fig.clear()
+    show_profile_axes(win, 100, 200, 800, 100, mn, mx, 365)
+    for (dind, dd, c1, c2, c3) in zip([False,True], [d1,d2], ['pink','lightgreen'], ['red','#00CD00'], ['#8B0000','#006400']):
+        tsum = 0
+        nsum = 0
+        lst1 = []
+        lst2 = []
+        lst3 = []
+        for i in range(len(dd)):
+            s = dd.iloc[i]
+            tacc = max([s['no_event_time_nic'], s['no_event_time_inc'], s['no_event_time_death']])
+            nacc = 0
+            for ele in s['event_time_nic']:
+                if (ele[0] == eff):
+                    lst1.append(ele[1])
+                    if ele[1] < tacc:
+                        tacc = ele[1]
+                        nacc = 1
+            for ele in s['event_time_inc']:
+                if (ele[0] == eff):
+                    lst2.append(ele[1])
+                    if ele[1] < tacc:
+                        tacc = ele[1]
+                        nacc = 1
+            for ele in s['event_time_death']:
+                if (ele[0] == eff):
+                    lst3.append(ele[1])
+                    if ele[1] < tacc:
+                        tacc = ele[1]
+                        nacc = 1
+            tsum += tacc
+            nsum += nacc
+        ft = tsum / nsum if nsum > 0 else None
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst1, c1, c1, dind, 0)
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst2, c2, c2, dind, 0)
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst3, c3, c3, dind, 0)
+        show_profile_bars(win, 100, 200, 800, 100, mn, mx, [ft], c2, c2, dind, 1)
+
+def printsorteddicts(diclst):
+    iset = set()
+    for dic in diclst:
+        iset = iset.union(dic.keys())
+    ilst = sorted(list(iset))
+    fstr = "%s" + ("\t%d" * len(diclst))
+    for ele in ilst:
+        tmp = [ele] + [dic[ele] if ele in dic else 0 for dic in diclst]
+        print(fstr % tuple(tmp))
+
+# Full walk-through:
+# relative to the late effect eff and the competing root causes pairlist,
+# for each occurring combination of pair, show:
+# how many got eff out of how many got the treatment, and how long the total
+# observation time is.
+# visualize the individual times and the mean frequency
+
+def display_effect_all_comb(df, selg, conds, eff):
+    effdic = {tag : valsincolumn(df, 'event_time_' + tag) for tag in ["death", "inc", "nic"]}
+    tags = []
+    for tag in effdic:
+        if eff in effdic[tag]:
+            tags.append(tag)
+    name = "Statistics of " + eff
+    #(dic, datadic) = get_cond_stats_comb(df, conds, selg, tags, eff)
+    (dic, dummy1, dummy2) = get_cond_stats_daydiff(df, [], conds, selg, tags, eff)
+    print(name)
+    tscale = 1.0/36525
+    sum0 = 0
+    sum1 = 0
+    sum2 = 0
+    for k in dic:
+        sum0 += dic[k][0]
+        sum1 += dic[k][1]
+        sum2 += dic[k][2]
+    pr1 = sum1/sum0
+    pr2 = sum2/sum0
+    for k in dic:
+        if dic[k][2] > 0:
+            resstr = ""
+            for kk in k.split('_'):
+                if kk:
+                    if kk[0]=='~':
+                        resstr += " "+kk
+                    else:
+                        resstr += " "+kk
+            resstr += " "
+            f1 = (dic[k][0] + 1)/((dic[k][1] + pr1)*tscale)
+            f2 = (dic[k][0] + 1)*pr1/(dic[k][1] + pr1)
+            resstr += "%c %f %.2f ( %d / %d : %.2f )" % ('+' if f2 >= 1.5 else ' ', f1, f2, dic[k][0], dic[k][2], dic[k][1]*tscale)
+            #if dic[k][1] > 0:
+            #    resstr += "%f ( %d / %d : %.2f )" % (dic[k][0]/(dic[k][1]*tscale),dic[k][0],dic[k][2],dic[k][1]*tscale)
+            #else:
+            #    resstr += "%f ( %d / %d : %.2f )" % (0.0, dic[k][0],dic[k][2],dic[k][1]*tscale)
+            print(resstr)
+    n = 0
+    for kk in k.split('_'):
+        if kk:
+            n += len(kk)+1
+    resstr = " average " + " "*max(0, n-3)
+    resstr += "%f %.2f ( %d / %d : %.2f )" % (1/(pr1*tscale), 1.0, 1, round(pr2), pr1*tscale)
+    print(resstr)
+    print()
+
+def display_condensed_data(df, selg, eff):
+    effcols = ['event_time_' + tag for tag in ["death", "inc", "nic"]]
+    df = select_data_alt(df, [(col, eff) for col in effcols], selg)
+    for i in range(len(df)):
+        vec = df.iloc[i]
+        resid = [c[0] for c in vec['event_time_inc'] if c[0][0]=='C']
+        resstr = "%s\t%s\t%s" % (vec['LopNr'],vec['diagnosis'],("("+",".join(resid)+")") if resid else "")
+        for col in ['cytoclass_diff','radio_diff','surgery_diff','stemcell_diff','other_dia']:
+            for b in vec[col]:
+                if type(b)==list:
+                    resstr += "\t" + b[0]
+                else:
+                    resstr += "\t" + b
+        print(resstr)
+
+def collect_days(df, col):
+    dic={}
+    #cols = ['surgery_diff','cytoclass_diff','stemcell_diff','radio_diff']
+    for lst in df[col]:
+        for ele in lst:
+            if not ele[0] in dic:
+                dic[ele[0]] = (0,0)
+            dic[ele[0]] = tupleadd(dic[ele[0]], (1,ele[1]))
+    for key in dic:
+        pair = dic[key]
+        dic[key] = pair[1]/pair[0] if pair[0]>0 else 0
+    return dic
+
+def ddate_hist_dict(df, mn, mx):
+    hdict = { k : 0 for k in range(mn, mx+1)}
+    for i in range(len(df)):
+        dd = df.iloc[i]['DiagnosDat']
+        if dd in hdict:
+            hdict[dd] += 1
+        else:
+            hdict[dd] = 1
+    return hdict
+
+def show_dd_axes(win, x, y, wdt, hgt, labels, count):
+    win.add_line(x, y, x+wdt, y, 0.75, 'black')
+    win.add_line(x, y+hgt+1, x+wdt, y+hgt+1, 0.75, 'black')
+    for pr in [0.25, 0.5, 0.75]:
+        ln = win.add_line(x, y+pr*hgt+1, x+wdt, y+pr*hgt+1, 0.75, 'lightgray')
+        ln.set_zorder(1)
+    onewdt = min(wdt/len(labels), hgt)
+    for ind in range(len(labels)):
+        win.add_text(x+onewdt/2+ind*onewdt, y-20, labels[ind], 12, 'center')
+    win.add_text(x-8, y-4, "0", 9, 'right')
+    win.add_text(x-8, y+hgt-4, "%d" % count, 9, 'right')
+
+def show_dd_bar(win, x, y, wdt, hgt, num, ind, maxcount, val, col):
+    onewdt = min(wdt/num, hgt)
+    bwdt = onewdt-2
+    boff = 1
+    win.add_rect(x + onewdt*ind + boff, y, bwdt, hgt*val/maxcount, 0, None, col)
+
+def show_dd_data(win, dic, x1, x2, mx, pos):
+    win.fig.clear()
+    show_histogram_axes(win, 60, 50+pos*250, 800, 200, ['1970','2015'], 100)
+    for k in dic:
+        show_dd_bar(win, 60, 50+pos*250, 800, 200, 46, int(k)-1970, 100, dic[k], 'orange')
+
+def show_dd_data__(win, dic, sel1, sel3, selg, letters):
+    dd = select_data3(df, sel1, sel3, selg)
+    win.fig.clear()
+    show_histogram_axes(win, 100, 100, 800, 200, letters, len(dd[0]))
+    cols = ['#8B0000','red','pink',None]
+    #alpha = 0.0
+    #counts = [alpha/4, alpha/4, alpha/4, alpha/4]
+    for ind in range(len(letters)):
+        letter = letters[ind]
+        counts = [0.0, 0.0, 0.0, 0.0]
+        for i in range(len(dd[0])):
+            s = dd[0].iloc[i]
+            ok = False
+            for ele in s['event_time_death']:
+                if (letter is False or ele[0] == letter):
+                    counts[0] += 1
+                    ok = True
+            if not ok:
+                for ele in s['event_time_inc']:
+                    if (letter is False or ele[0] == letter):
+                        counts[1] += 1
+                        ok = True
+            if not ok:
+                for ele in s['event_time_nic']:
+                    if (letter is False or ele[0] == letter):
+                        counts[2] += 1
+                        ok = True
+            if not ok:
+                counts[3] += 1
+        total = sum(counts)
+        normcount = 0.0
+        normtotal = 0.0
+        for i in range(len(dd[2])):
+            s = dd[2].iloc[i]
+            ok = False
+            for ele in s['event_time_death']:
+                if (letter is False or ele[0] == letter):
+                    normcount += 1
+                    ok = True
+            if not ok:
+                for ele in s['event_time_inc']:
+                    if (letter is False or ele[0] == letter):
+                        normcount += 1
+                        ok = True
+            if not ok:
+                for ele in s['event_time_nic']:
+                    if (letter is False or ele[0] == letter):
+                        normcount += 1
+                        ok = True
+            normtotal += 1
+        show_histogram_bars(win, 100, 100, 800, 200, len(letters), ind, [x/total for x in counts], normcount/normtotal, cols)
+
+def add_complex_attribute(df, col, name, oldlst):
+    for i in range(len(df)):
+        lst = df.iloc[i][col]
+        x = [ ele[1] for ele in lst if ele[0] in oldlst]
+        if x:
+            lst.append([name, min(x)])
diff --git a/dataanalysis/exptest4.py b/dataanalysis/exptest4.py
new file mode 100644
index 0000000..98c374a
--- /dev/null
+++ b/dataanalysis/exptest4.py
@@ -0,0 +1,1357 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: exptest4.py
+
+Author: anders.holst@ri.se
+
+"""
+
+import pandas as pd
+from math import *
+from scipy.special import hyp1f1
+
+from hist import *
+from visual import *
+
+eps = 0.001
+
+# Brute force summation, in logarithms to avoid overflow
+def log_hyper_1f1_negintab(a, b, x):
+    m = -a
+    lr = 0.0
+    lhg = 0.0
+    for k in range(m):
+        lr += log(x*(a+k)/((k+1)*(b+k)))
+        lhg += log(1 + exp(lr - lhg))
+    return lhg
+
+# r *= (x*(a+k) / ((k+1)*(b+k)))
+# hg += r
+
+def log_hyper_1f1(a, b, x):
+    # b not a negative integer -> log(hyp1f1)
+    # b a negative integer -> interpolate between b +- eps
+    eps = 0.0001
+    if b<0 and b==int(b):
+        ret = (hyp1f1(a, b-eps, x) + hyp1f1(a, b+eps, x))/2
+    else:
+        ret = hyp1f1(a, b, x)
+    if isinf(ret):
+        print("overflow at hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    elif ret<=0:
+        print("negative hyp1f1(%f, %f, %f)" % (a,b,x))
+        return -inf
+    return log(ret)
+
+def log_hyper_1f1_interpol(a, b, x):
+    # a not a negative integer -> interpolate log(hyp1f1)
+    if a==int(a):
+        return log_hyper_1f1_negintab(int(a), b, x)
+    else:
+        r1 = log_hyper_1f1_negintab(floor(a), b, x)
+        r2 = log_hyper_1f1_negintab(ceil(a), b, x)
+        prop = a - floor(a)
+        return r2*prop + r1*(1.0-prop)
+
+# pa = (n, t)
+def LogLambdaProb_un(pa1, ll):
+    (n1, t1) = pa1
+    if ll <= 0:
+        return -inf if n1 > 0 else 0.0
+    return n1 * log(ll) - t1*ll
+
+def LogLambdaProb_pr_un(pa1, ll, pr):
+    (n1, t1) = pa1
+    (n0, t0) = pr
+    if ll <= 0:
+        return -inf if n1+n0 > 0 else 0.0
+    return (n1+n0) * log(ll) - (t1+t0)*ll
+
+def LogDifflambdaProbOne_un(pa1, pa2, dl):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl * t2) + log_hyper_1f1_negintab(-n2, -n1 - n2, dl*(t1 + t2))
+    else:
+        ret = (dl * t1) + log_hyper_1f1_negintab(-n1, -n1 - n2, -dl*(t1 + t2))
+    return ret
+
+def LogDifflambdaProbOne_pr_un_old(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    if dl == 0:
+        ret = 0.0
+    elif dl > 0.0:
+        ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+    else:
+        ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if dl == 0 or t1+t2 == 0.0 or n1+n2 == 0:
+        ret = 0.0
+    else:
+        if prior_style == 'partaverage': # separate priors per condition
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2+pr[0]))
+        elif prior_style == 'partaveragezero': # no extra event
+            (n0, t0) = (pr[0], pr[0]*(t1+t2)/(n1+n2))
+        else:
+            (n0, t0) = pr
+        if dl > 0.0:
+            ret = (-dl*(t2+t0)) + log_hyper_1f1_interpol(-n2-n0, -n1-n2-2*n0, dl*(t1+t2+2*t0))
+        else:
+            ret = (dl*(t1+t0)) + log_hyper_1f1_interpol(-n1-n0, -n1-n2-2*n0, -dl*(t1+t2+2*t0))
+    return ret
+
+def LogDifflambdaProbOne_num1_old(pa1, pa2, pr):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    (n0, t0) = pr
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_pr_un(pa1, ll, pr)
+    func2 = lambda ll: LogLambdaProb_pr_un(pa2, ll, pr)
+    (rng, vals) = find_calc_hrange(func1, (n1+n0+1)/(t1+t0), (n1+n0+1)/(5*(t1+t0)), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+n0+1)/(t2+t0), (n2+n0+1)/(5*(t2+t0)), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num1(pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1+t2==0.0 or n1+n2==0:
+        return ()
+    # find the ranges for pa1 and pa2 separately
+    func1 = lambda ll: LogLambdaProb_un(pa1, ll)
+    func2 = lambda ll: LogLambdaProb_un(pa2, ll)
+    (rng, vals) = find_calc_hrange(func1, (n1+1)/t1, (n1+1)/(5*t1), 0.1, 25)
+    mx1 = max(vals)
+    a1 = rng[0]
+    b1 = rng[-1]
+    d1 = (b1 - a1)/(len(rng)-1)
+    (rng, vals) = find_calc_hrange(func2, (n2+1)/t2, (n2+1)/(5*t2), 0.1, 25)
+    mx2 = max(vals)
+    a2 = rng[0]
+    b2 = rng[-1]
+    d2 = (b2 - a2)/(len(rng)-1)
+    return (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2)
+
+def LogDifflambdaProbOne_num2(tup, dl):
+    if not tup:
+        return 0.0
+    (mx1, a1, b1, d1, func1, mx2, a2, b2, d2, func2) = tup
+    # then sweep through for fixed dl and sum up (numerical convolution of
+    # the two posteriors over their overlapping range)
+    dd = min(d1, d2)
+    aa = max(a1, a2-dl)
+    bb = min(b1, b2-dl)
+    sm = 0.0
+    lst = []
+    x = aa
+    while x <= bb:
+        lst.append(func1(x) + func2(x+dl) - mx1 - mx2)
+        x += dd
+    for v in lst:
+        sm += exp(v)
+    return mx1 + mx2 + log(sm*dd) if sm>0.0 else -inf
+
+#def LogDifflambdaProb_un(da1, da2, dl):
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_un(pa1, pa2, dl), da1, da2)))
+
+#def LogDifflambdaProb_pr_un(da1, da2, dl, pr):
+#    pr = (pr[0]/len(da1), pr[1]/len(da1))
+#    return sum(list(map(lambda pa1,pa2: LogDifflambdaProbOne_pr_un(pa1, pa2, dl, pr), da1, da2)))
+
+def LogDifflambdaProb_num1(da1, da2):
+    return list(map(lambda pa1,pa2: LogDifflambdaProbOne_num1(pa1, pa2), da1, da2))
+
+def LogDifflambdaProb_num2(tupl, dl):
+    return sum(list(map(lambda tup: LogDifflambdaProbOne_num2(tup, dl), tupl)))
+
+prior_style = 'partaveragezero'
+
+def set_prstyle(st):
+    global prior_style
+    prior_style = st
+
+def DifflambdaHist(da1, da2, pa1, pa2):
+    (n1, t1) = pa1
+    (n2, t2) = pa2
+    if t1 == 0.0 or t2 == 0.0 or (n1 == 0 and n2 == 0):
+        return False
+    tupl = LogDifflambdaProb_num1(da1, da2)
+    func = lambda dl: LogDifflambdaProb_num2(tupl, dl)
+    (xres, yres) = find_calc_hrange(func, 0.0, (n1+n2+1)/(t1+t2), 0.1, 20)
+    return [xres, normalize_logprob(yres)]
+
+def LambdaHist(pa1):
+    (n1, t1) = pa1
+    if t1 == 0.0 or n1 == 0:
+        return False
+    func = lambda dl: LogLambdaProb_un(pa1, dl)
+    (xres, yres) = find_calc_hrange(func, (n1+1)/t1, (n1+1)/(5*t1), 0.2, 20)
+    return [xres, normalize_logprob(yres)]
+
+def normalize_logprob(vec):
+    mx = max(vec)
+    sm = 0.0
+    for i in range(len(vec)):
+        sm += exp(vec[i] - mx)
+    norm = log(sm) + mx
+    return [exp(x-norm) for x in vec]
+
+def find_calc_hrange(func, start, step, mindiff, maxdiff):
+    def zpush(v):
+        return 0.0 if abs(v)<2e-11 else v
+    fa = nan
+    fb = nan
+    a = nan
+    b = nan
+    m1 = zpush(start - 0.5*step)
+    fm1 = func(m1)
+    m2 = zpush(start + 0.5*step)
+    fm2 = func(m2)
+    left = []
+    right = []
+    while True:
+        #print(a, fa, m1, fm1, m2, fm2, b, fb)
+        if fm1 > fm2:
+            if not isnan(b):
+                right = [(b, fb)] + right
+            (m, fm) = (m1, fm1)
+            (b, fb) = (m2, fm2)
+        else:
+            if not isnan(a):
+                left = [(a, fa)] + left
+            (a, fa) = (m1, fm1)
+            (m, fm) = (m2, fm2)
+        if isnan(a):
+            m1 = zpush(3*m - 2*b)
+            fm1 = func(m1)
+            (m2, fm2) = (m, fm)
+        elif isnan(b):
+            (m1, fm1) = (m, fm)
+            m2 = zpush(3*m - 2*a)
+            fm2 = func(m2)
+        else:
+            tmp = [fa, fm, fb]
+            mx = max(tmp)
+            mn = min(tmp)
+            if mx-mn < mindiff or (b-a) < 1e-10:
+                break
+            if (m-a) > (b-m)*1.5:
+                dir = 0
+            elif (m-a)*1.5 < (b-m):
+                dir = 1
+            else:
+                dir = 0 if fa > fb else 1
+            if dir == 0:
+                m1 = zpush(0.5*(a+m))
+                fm1 = func(m1)
+                (m2, fm2) = (m, fm)
+            else:
+                (m1, fm1) = (m, fm)
+                m2 = zpush(0.5*(m+b))
+                fm2 = func(m2)
+    delta = min(m-a, b-m)
+    left = [(a, fa)] + left
+    right = [(b, fb)] + right
+    if m-a < b-m:
+        right = [(m, fm)] + right
+    else:
+        left = [(m, fm)] + left
+    tr = mx - maxdiff
+    #print("^", mn, mx, tr)
+    resx = []
+    resy = []
+    leftind = 0
+    if delta < 1e-10:
+        delta = 1e-3
+    while leftind < len(left) and left[leftind][1] > tr:
+        resx = [left[leftind][0]] + resx
+        resy = [left[leftind][1]] + resy
+        a = zpush(left[leftind][0] - delta)
+        leftind += 1
+    while leftind >= len(left) or a > left[leftind][0] + delta/2:
+        fa = func(a)
+        #print("L", a, fa)
+        if isnan(fa) or isinf(fa) or fa < tr:
+            break
+        resx = [a] + resx
+        resy = [fa] + resy
+        a = zpush(a-delta)
+    rightind = 0
+    while rightind < len(right) and right[rightind][1] > tr:
+        resx = resx + [right[rightind][0]]
+        resy = resy + [right[rightind][1]]
+        b = zpush(right[rightind][0] + delta)
+        rightind += 1
+    while rightind >= len(right) or b < right[rightind][0] - delta/2:
+        fb = func(b)
+        #print("R", b, fb)
+        if isnan(fb) or isinf(fb) or fb < tr:
+            break
+        resx = resx + [b]
+        resy = resy + [fb]
+        b = zpush(b+delta)
+    #print(a, m, b)
+    return (resx, resy)
+
+def selfunc(entry, val):
+    if type(entry) == list:
+        if len(entry)>0 and type(entry[0])==list:
+            entry = list(map(lambda x:x[0], entry))
+        if type(val) == list:
+            for x in val:
+                if x in entry:
+                    return True
+            return False
+        else:
+            return val in entry
+    else:
+        if type(val) == list:
+            return entry in val
+        else:
+            return val == entry
+
+def dictincr(dic, key, incr=1):
+    if key not in dic:
+        dic[key] = incr
+    else:
+        dic[key] += incr
+
+def selassoc(sel, key):
+    if sel is False:
+        return []
+    else:
+        return [x[-1] for x in sel if key == x[0] or (len(x)==3 and key == x[1])]
+
+def assoc(lst, key, defl):
+    for ele in lst:
+        if ele[0]==key:
+            return ele[1]
+    return defl
+
+def assoc_a0(lst, key, defl):
+    if type(lst)==list:
+        for ele in lst:
+            if type(ele)==list:
+                if ele[0]==key:
+                    return ele[1]
+            else:
+                if ele==key:
+                    return 0
+        return defl
+    else:
+        return 0 if lst==key else defl
+
+def subsets(lst, ord):
+    if ord==0:
+        return [[]]
+    elif ord > len(lst):
+        return []
+    else:
+        rest = subsets(lst[1:], ord-1)
+        return list(map(lambda l:[lst[0]]+l, rest)) + subsets(lst[1:],ord)
+
+def issubset(l1, l2):
+    for e in l1:
+        if not e in l2:
+            return False
+    return True
+
+def tupleadd(t1, t2):
+    return tuple((t1[i]+t2[i] for i in range(min(len(t1),len(t2)))))
+    #return (t1[0]+t2[0],t1[1]+t2[1])
+
+def tuplescale(t1, scale):
+    return tuple((t1[i]*scale for i in range(len(t1))))
+    #return (t1[0]*scale,t1[1]*scale)
+
+def listminus(l1, l2):
+    return [e for e in l1 if not e in l2]
+
+def read_total_file(file):
+    df = pd.read_csv(file, sep='\t')
+    for c in df:
+        if type(df[c][0])==str and df[c][0][0] == '[':
+            df[c] = df[c].apply(eval)
+    return df
+
+def valsincolumn_old(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if not e in res:
+                    res.append(e)
+        else:
+            if not val in res:
+                res.append(val)
+    return sorted(res)
+
+def valsincolumn(df, col):
+    d = df[col]
+    res = []
+    for val in d:
+        if type(val)==list:
+            for e in val:
+                if type(e)==list:
+                    if not e[0] in res:
+                        res.append(e[0])
+                else:
+                    if not e in res:
+                        res.append(e)
+        else:
+            if not val in res and not (type(val)==float and isnan(val)):
+                res.append(val)
+    return sorted(res)
+
+#--------------------------
+
+# New approach to avoid conditional dilution: For each late effect, in a
+# suitable subset of the data (patients, sex, etc), check whether a given
+# factor (characteristics, diagnosis, treatment) is significantly
+# correlated. Then condition on each other factor that is also correlated,
+# and see whether the significance disappears completely, or the frequency
+# changes significantly. List those that do not disappear/change, in order
+# of significance. Keep track of whether all of them disappear under
+# cross-wise conditioning, or whether several remain. Ideally, each
+# significant correlation should be explained by one factor, or by several
+# factors of the same type.
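+
+# A minimal illustrative sketch (not part of the original pipeline) of the
+# conditioning step described above: test whether the correlation of one
+# (column, value) factor with a late effect survives conditioning on another
+# factor, which is what analyseeffects2new below does systematically for all
+# combinations. All arguments are placeholders to be taken from real data.
+def _demo_conditioning(df, selg, tags, eff, pair, other):
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [], selg, tags, eff)
+    raw = analysediff0(dic2, dic1, kdic)
+    (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [other], selg, tags, eff)
+    cond = analysediff0(dic2, dic1, kdic)
+    # if raw['sig'] is small but cond['sig'] is not, then 'other' explains
+    # the correlation of 'pair' with the late effect
+    return (raw['sig'], cond['sig'])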
+ +def select_data(df, sel1, sel2, selg): + # sel = [(col, val)] selg = [('otherdia', 't21')] [('DIA',1)] [False, 'stralung','ANSIKTE'] + # cond = [col1, col2, ] + if df.empty: + return (df,df) + if selg: + test = df[df.columns[0]].apply(lambda entry: True) + for ele in selg: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + df = df[test] + test = df[df.columns[0]].apply(lambda entry: True) + for ele in sel1: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + d1 = df[test] + if sel2 is False: + d2 = df[~test] + else: + test = df[df.columns[0]].apply(lambda entry: True) + for ele in sel2: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + d2 = df[test] + return (d1, d2) + +def select_data_alt(df, selalt, selg): + if df.empty: + return (df,df) + if selg: + test = df[df.columns[0]].apply(lambda entry: True) + for ele in selg: + if len(ele) == 3: + test = test & df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test & df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + df = df[test] + test = df[df.columns[0]].apply(lambda entry: False) + for ele in selalt: + if len(ele) == 3: + test = test | df[ele[1]].apply(lambda entry: not selfunc(entry, ele[2])) + else: + test = test | df[ele[0]].apply(lambda entry: selfunc(entry, ele[1])) + df = df[test] + return df + +def count_effect(df, eff, ncol, ecol): + n = 0 + t = 0 + for i in range(len(df)): + row = df.iloc[i] + x = assoc(row[ecol], eff, False) + if x is not False: + n += 1 + t += x + else: + t += row[ncol] + return (n, t) + +def count_effect_mtag(df, eff, tags): + ecols = ['event_time_' + tag for tag in tags] + ncols = ['no_event_time_' + tag for tag in tags] + #ccols = ['censored_time_' + tag for tag in tags] + res = (0, 0, 0) + for i in range(len(df)): + row = df.iloc[i] + x = min([assoc(row[ecol], eff, inf) for ecol in ecols]) + if x is not inf: + res = tupleadd(res, (1, x, 1)) + else: + x = min([row[ncol] for ncol in ncols]) + res = tupleadd(res, (0, x, 1)) + return res + +#def count_cond_effect(df, eff, ncol, ecol, cols, ignore): +# dic = {} +# for i in range(len(df)): +# row = df.iloc[i] +# x = assoc(row[ecol], eff, False) +# if x is not False: +# incr = (1, x) +# else: +# incr = (0, row[ncol]) +# increment_tuple_value(row, dic, cols, ignore, incr) +# return dic + +def make_inc_nic_diag_dict(df): + dic = {} + for i in range(len(df)): + nr=df['LopNr'][i] + if nr not in dic: + dic[nr] = set() + dia=df['DIAGNOS'][i] + if type(dia)==str: + dic[nr].update(set(dia.split(' '))) + return dic + +def count_inc_nic_diag(df, dic, pair, selg, tags, eff): + (dic1, dic2, dick) = get_cond_stats_daydiff_lopnr(df, pair, [], selg, tags, eff) + resdic1 = {} + resdic2 = {} + for nr in dic1['']: + if nr in dic: + for diag in dic[nr]: + if eff in diag: + dictincr(resdic1, diag) + for nr in dic2['']: + if nr in dic: + for diag in dic[nr]: + if eff in diag: + dictincr(resdic2, diag) + return (resdic1, resdic2) + +def get_cond_stats_daydiff_old(df, pair, conds, selg, tags, eff): + # Vi får kuta igenom hela data, och för varje sample sortera in värden i rätt dict och key + # Regeln är att ett villkor är sant om eff-dag är senare än ev cond-dag + (dd, dummy) = 
select_data(df, [], False, selg) + kl = [""] + for cond in conds: + kl = [(k+"_"+str(cond[-1]), k+"_~"+str(cond[-1])) for k in kl] + kl = [k[0] for k in kl] + [k[1] for k in kl] + dick = { k : kl.index(k) for k in kl} + dic1 = { k : (0, 0, 0) for k in kl} + dic2 = { k : (0, 0, 0) for k in kl} + invk = { kl.index(k) : k for k in kl} + bits = [2**i for i in range(len(conds))] + ecols = ['event_time_' + tag for tag in tags] + ncols = ['no_event_time_' + tag for tag in tags] + for i in range(len(dd)): + row = dd.iloc[i] + x = min([assoc(row[ecol], eff, inf) for ecol in ecols]) + # here, sort it into correct key + # för varje cond och pair, hitta senaste dag före eff-dag (minus marginal) + ylst = [y if y x+c else "Block" for y in [assoc_a0(row[col], val, -inf) for col,val in conds]] + yval = assoc_a0(row[pair[0]], pair[1], -inf) if pair else 0 + td = dic2 if pair and yval==-inf else dic1 + yval = max(yval, 0) + if "Block" in condlst or yval >= x+c-marg: + continue + # sort it into correct key + k = invk[sum([b if v==-inf else 0 for (b,v) in zip(bits,condlst)])] + if x is not inf: + ymax = max(condlst + [yval]) if pair else max(condlst + [0]) + td[k] = tupleadd(td[k], (1, x-max(ymax+marg-c, 0), 1)) + else: + x,c = min([(row[ncol],row[ccol]) for ncol,ccol in zip(ncols,ccols)], key=lambda p:p[0]) + td[k] = tupleadd(td[k], (0, max(x-max(marg-c, 0), 0), 1)) + return (dic1, dic2, dick) + +def get_cond_stats_daydiff_lopnr(df, pair, conds, selg, tags, eff): + (dd, dummy) = select_data(df, [], False, selg) + kl = [""] + for cond in conds: + kl = [(k+"_"+str(cond[-1]), k+"_~"+str(cond[-1])) for k in kl] + kl = [k[0] for k in kl] + [k[1] for k in kl] + dick = { k : kl.index(k) for k in kl} + dic1 = { k : [] for k in kl} + dic2 = { k : [] for k in kl} + invk = { kl.index(k) : k for k in kl} + bits = [2**i for i in range(len(conds))] + ecols = ['event_time_' + tag for tag in tags] + ncols = ['no_event_time_' + tag for tag in tags] + for i in range(len(dd)): + row = dd.iloc[i] + x = min([assoc(row[ecol], eff, inf) for ecol in ecols]) + ylst = [y if y 0 and n2 > 0: + lf += min(n1,n2)*(log(n2/t2) - log(n1/t1)) + sn += min(n1,n2) + return exp(lf/sn) if sn>0 else 1.0 + +def dictolistwithprior(cdic1, cdic2, cdic0, tscale = 1.0): + n1,t1,s1 = (0,0,0) + n2,t2,s2 = (0,0,0) + for k in cdic0: + (tmp1, tmp2) = (cdic1[k], cdic2[k]) + n1 += tmp1[0] + t1 += tmp1[1]*tscale + s1 += tmp1[2] + n2 += tmp2[0] + t2 += tmp2[1]*tscale + s2 += tmp2[2] + if prior_style == 'noninfo': # Noninformative prior + prn = 1.0/len(cdic0) + lst1 = [ (cdic1[k][0] - prn, cdic1[k][1]*tscale) for k in cdic0 ] + lst2 = [ (cdic2[k][0] - prn, cdic2[k][1]*tscale) for k in cdic0 ] + elif prior_style == 'average': # Average prior + prn = 1.0/len(cdic0) + prt = prn*(t1+t2)/(n1+n2+1) + lst1 = [ (cdic1[k][0] + prn, cdic1[k][1]*tscale + prt) for k in cdic0 ] + lst2 = [ (cdic2[k][0] + prn, cdic2[k][1]*tscale + prt) for k in cdic0 ] + elif prior_style == 'partaverage': # separate priors per condition + prn = 1.0/len(cdic0) + lst1 = [] + lst2 = [] + for k in cdic0: + (nn1, tt1, ss1) = cdic1[k] + (nn2, tt2, ss2) = cdic2[k] + prt = prn*(tt1+tt2)/(nn1+nn2+prn) + lst1.append((nn1+prn, (tt1+prt)*tscale)) + lst2.append((nn2+prn, (tt2+prt)*tscale)) + elif prior_style == 'partaveragezero': # inget extra event + prn = 1.0/len(cdic0) + lst1 = [] + lst2 = [] + for k in cdic0: + (nn1, tt1, ss1) = cdic1[k] + (nn2, tt2, ss2) = cdic2[k] + if nn1+nn2 > 0: + prt = prn*(tt1+tt2)/(nn1+nn2) + lst1.append((nn1+prn, (tt1+prt)*tscale)) + lst2.append((nn2+prn, (tt2+prt)*tscale)) + 
+
+def dictolistwithprior(cdic1, cdic2, cdic0, tscale = 1.0):
+ # Pool the per-key (n, t, s) statistics of the two groups and turn them
+ # into lists of (n, t) pairs, with the prior chosen by the global
+ # prior_style added in.
+ n1,t1,s1 = (0,0,0)
+ n2,t2,s2 = (0,0,0)
+ for k in cdic0:
+ (tmp1, tmp2) = (cdic1[k], cdic2[k])
+ n1 += tmp1[0]
+ t1 += tmp1[1]*tscale
+ s1 += tmp1[2]
+ n2 += tmp2[0]
+ t2 += tmp2[1]*tscale
+ s2 += tmp2[2]
+ if prior_style == 'noninfo': # Noninformative prior
+ prn = 1.0/len(cdic0)
+ lst1 = [ (cdic1[k][0] - prn, cdic1[k][1]*tscale) for k in cdic0 ]
+ lst2 = [ (cdic2[k][0] - prn, cdic2[k][1]*tscale) for k in cdic0 ]
+ elif prior_style == 'average': # Average prior
+ prn = 1.0/len(cdic0)
+ prt = prn*(t1+t2)/(n1+n2+1)
+ lst1 = [ (cdic1[k][0] + prn, cdic1[k][1]*tscale + prt) for k in cdic0 ]
+ lst2 = [ (cdic2[k][0] + prn, cdic2[k][1]*tscale + prt) for k in cdic0 ]
+ elif prior_style == 'partaverage': # separate priors per condition
+ prn = 1.0/len(cdic0)
+ lst1 = []
+ lst2 = []
+ for k in cdic0:
+ (nn1, tt1, ss1) = cdic1[k]
+ (nn2, tt2, ss2) = cdic2[k]
+ prt = prn*(tt1+tt2)/(nn1+nn2+prn)
+ lst1.append((nn1+prn, (tt1+prt)*tscale))
+ lst2.append((nn2+prn, (tt2+prt)*tscale))
+ elif prior_style == 'partaveragezero': # no extra event where none was observed
+ prn = 1.0/len(cdic0)
+ lst1 = []
+ lst2 = []
+ for k in cdic0:
+ (nn1, tt1, ss1) = cdic1[k]
+ (nn2, tt2, ss2) = cdic2[k]
+ if nn1+nn2 > 0:
+ prt = prn*(tt1+tt2)/(nn1+nn2)
+ lst1.append((nn1+prn, (tt1+prt)*tscale))
+ lst2.append((nn2+prn, (tt2+prt)*tscale))
+ else:
+ lst1.append((nn1, tt1*tscale))
+ lst2.append((nn2, tt2*tscale))
+ else: # No prior
+ lst1 = [ (cdic1[k][0], cdic1[k][1]*tscale) for k in cdic0 ]
+ lst2 = [ (cdic2[k][0], cdic2[k][1]*tscale) for k in cdic0 ]
+ return (lst1, lst2, (n1,t1,s1), (n2,t2,s2))
+
+def analysediff(cdic1, cdic2, cdic0):
+ (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+
+ hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+ fact = estimatefactor(dlst1, dlst2)
+ if hist is not False:
+ mean, var = hist_mean_var(hist)
+ sig = hist_significance(hist, 0.0, mean)
+ else:
+ mean = 0.0
+ var = 0.0
+ sig = 1.0
+ nn1 = sum(list(map(lambda p:p[0], dlst1)))
+ tt1 = sum(list(map(lambda p:p[1], dlst1)))
+ nn2 = sum(list(map(lambda p:p[0], dlst2)))
+ tt2 = sum(list(map(lambda p:p[1], dlst2)))
+ hist1 = LambdaHist((nn1, tt1))
+ if hist1 is not False:
+ mean1, var1 = hist_mean_var(hist1)
+ else:
+ mean1 = 0.0
+ hist2 = LambdaHist((nn2, tt2))
+ if hist2 is not False:
+ mean2, var2 = hist_mean_var(hist2)
+ else:
+ mean2 = 0.0
+ return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact, 'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1, 'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2, 'hist':hist, 'hist1':hist1, 'hist2':hist2}
+
+def analysediff01(cdic1, cdic2, cdic0):
+ # As analysediff, but with the absolute rates taken directly from the
+ # pooled counts instead of from separate histograms.
+ (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+
+ hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+ fact = estimatefactor(dlst1, dlst2)
+ if hist is not False:
+ mean, var = hist_mean_var(hist)
+ sig = hist_significance(hist, 0.0, mean)
+ else:
+ mean = 0.0
+ var = 0.0
+ sig = 1.0
+ nn1 = sum(list(map(lambda p:p[0], dlst1)))
+ tt1 = sum(list(map(lambda p:p[1], dlst1)))
+ nn2 = sum(list(map(lambda p:p[0], dlst2)))
+ tt2 = sum(list(map(lambda p:p[1], dlst2)))
+ mean1 = nn1/tt1 if tt1 > 0 else 0.0
+ mean2 = nn2/tt2 if tt2 > 0 else 0.0
+ return {'mean':mean, 'std':sqrt(var), 'sig':sig, 'fact': fact, 'mean1':mean1, 'n1':n1, 't1':t1, 's1':s1, 'mean2':mean2, 'n2':n2, 't2':t2, 's2':s2, 'hist':hist, 'hist1':False, 'hist2':False}
+
+def analysediff0(cdic1, cdic2, cdic0):
+ # Reduced variant: only the difference histogram and its significance.
+ #dlst1 = cdictolist(cdic1, cdic0, 1.0/36525)
+ #dlst2 = cdictolist(cdic2, cdic0, 1.0/36525)
+ #if prior_style == 'noninfo':
+ # pr = (-1, 0) # Noninformative prior
+ #elif prior_style in ['average','partaverage','partaveragezero']:
+ # n1 = sum(list(map(lambda p:p[0], dlst1)))
+ # t1 = sum(list(map(lambda p:p[1], dlst1)))
+ # n2 = sum(list(map(lambda p:p[0], dlst2)))
+ # t2 = sum(list(map(lambda p:p[1], dlst2)))
+ # pr = (1, 1*(t1+t2)/(n1+n2+1)) # Average prior
+ #else:
+ # pr = (0, 0) # No prior
+ #hist = DifflambdaHist(dlst1, dlst2, pr)
+ (dlst1, dlst2, (n1, t1, s1), (n2, t2, s2)) = dictolistwithprior(cdic1, cdic2, cdic0, 1.0/36525)
+ hist = DifflambdaHist(dlst1, dlst2, (n1, t1), (n2, t2))
+ if hist is not False:
+ mean, var = hist_mean_var(hist)
+ sig = hist_significance(hist, 0.0, mean)
+ else:
+ mean = 0.0
+ var = 0.0
+ sig = 1.0
+ return {'mean':mean, 'std':sqrt(var), 'sig':sig}
+
+def analyseeffects1(df, selg, coldic, tags, eff):
+ resdic = {}
+ for col in coldic:
+ for val in coldic[col]:
+ #print((col,val))
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, (col, val), [], selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, (col, val), [], selg, tags, eff)
+ resdic[(col,val)] = analysediff(dic2, dic1, kdic)
+ return resdic
+
+def analyseeffects_back(df, selg, pair, tags, efflst):
+ resdic = {}
+ for eff in efflst:
+ col,val = pair
+ (dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+ resdic[eff] = analysediff(dic2, dic1, kdic)
+ return resdic
+
+def analyseeffects2(df, selg, resdic1, sig, tags, eff):
+ resdic2 = {}
+ lst = []
+ for pair in resdic1:
+ if resdic1[pair]['sig'] <= sig:
+ lst.append(pair)
+ lst.sort(key=lambda x: resdic1[x]['sig'])
+ for (ind,pair1) in reversed(list(enumerate(lst))):
+ mnsig = 0.0
+ mntmp = resdic1[pair1]
+ for pair2 in lst:
+ if pair1 != pair2:
+ #print(pair1,pair2)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair1, [pair2], selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair1, [pair2], selg, tags, eff)
+ tmp = analysediff(dic2, dic1, kdic)
+ if tmp['sig'] > mnsig:
+ mnsig = tmp['sig']
+ tmp['cond'] = pair2
+ mntmp = tmp
+ resdic2[pair1] = mntmp
+ if mnsig > sig:
+ del lst[ind]
+ return resdic2
+
+def movetosaved(remaining, saved, removed, dic):
+ # Move to saved every pair that is not cancelled by anything other than
+ # already removed pairs.
+ changed = False
+ for (ind,pair) in reversed(list(enumerate(remaining))):
+ lst = dic[pair]
+ ok = True
+ for conds,sig in lst:
+ ok = False
+ for cond in conds:
+ if cond in removed:
+ ok = True
+ if not ok:
+ break
+ if ok:
+ changed = True
+ saved.append(pair)
+ del remaining[ind]
+ return changed
+
+def movetoremoved(remaining, saved, removed, dic):
+ # Move to removed every pair that is cancelled by some condition drawn
+ # entirely from saved.
+ changed = False
+ for (ind,pair) in reversed(list(enumerate(remaining))):
+ lst = dic[pair]
+ ok = False
+ for conds,sig in lst:
+ ok = True
+ for cond in conds:
+ if cond not in saved:
+ ok = False
+ if ok:
+ break
+ if ok:
+ changed = True
+ removed.append(pair)
+ del remaining[ind]
+ return changed
+
+def allexcept(lst, ele):
+ return [e for e in lst if e != ele]
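+
+# A minimal sketch (not part of the analysis itself) of the save/remove rules
+# above: 'a' is not cancelled by anything, so it is saved; 'b' is cancelled by
+# the now saved 'a', so it is removed.
+def _demo_move():
+ remaining = ['a', 'b']
+ saved = []
+ removed = []
+ ciidic = {'a': [], 'b': [(['a'], 0.5)]}
+ movetosaved(remaining, saved, removed, ciidic) # saved == ['a']
+ movetoremoved(remaining, saved, removed, ciidic) # removed == ['b']
+ return saved, removed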
+
+def analyseeffects2new(df, selg, resdic1, sig, tags, eff):
+ ciidic = {}
+ lst = []
+ # select the pairs with a significant effect
+ for pair in resdic1:
+ if resdic1[pair]['sig'] <= sig:
+ lst.append(pair)
+ ciidic[pair] = []
+ # In the first pass, condition each pair on each of the others and save a
+ # list of which ones make it insignificant. Remove those that
+ # 1) are made insignificant by some pair (which they do not themselves
+ # make insignificant)
+ # 2) and are not needed to make any other pair insignificant.
+ # In a loop: save those that are cancelled by nothing, drop those that are
+ # cancelled by the saved ones, and iterate with the rest, i.e. of the rest
+ # save those not cancelled by the remaining ones.
+ # In the next pass, condition on pairs (and later triples) of the
+ # remaining ones, and remove by the analogous principle.
+ remaining = lst
+ saved = []
+ totsaved = []
+ for order in [1,2,3]:
+ remaining = saved + remaining
+ conds = subsets(remaining, order)
+ for pair in lst:
+ for cond in conds:
+ if not pair in cond:
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+ tmp = analysediff0(dic2, dic1, kdic)
+ if tmp['sig'] > sig:
+ ciidic[pair].append((cond,tmp['sig']))
+ remaining = lst.copy()
+ saved = []
+ removed = []
+ changed = True
+ while changed:
+ # Pairs cancelled by nothing but removed ones go to saved; pairs
+ # cancelled by someone in saved go to removed.
+ changed = False
+ if movetosaved(remaining, saved, removed, ciidic):
+ changed = True
+ if movetoremoved(remaining, saved, removed, ciidic):
+ changed = True
+ if remaining:
+ alternatives = []
+ origsaved = saved.copy()
+ origremaining = remaining.copy()
+ akeys = []
+ for pair in remaining:
+ for sp in ciidic[pair]:
+ sp2 = listminus(sp[0], saved)
+ if sp2 and sp2 not in akeys:
+ akeys.append(sp2)
+ for sp in akeys:
+ # also check, for each member of the list sp, whether it is
+ # cancelled by the others in sp plus saved
+ if len(sp) > 1:
+ ok = True
+ for p in sp:
+ ll = ciidic[p]
+ tset = listminus(sp,p) + saved
+ for ele,sig in ll:
+ if issubset(ele, tset):
+ ok = False
+ break
+ if not ok:
+ continue
+ saved = origsaved + sp
+ remaining = listminus(origremaining, sp)
+ removed = []
+ changed = movetoremoved(remaining, saved, removed, ciidic)
+ while changed:
+ changed = False
+ if movetosaved(remaining, saved, removed, ciidic):
+ changed = True
+ if movetoremoved(remaining, saved, removed, ciidic):
+ changed = True
+ if not remaining:
+ s = set(saved)
+ if s not in alternatives:
+ alternatives.append(s)
+ if not alternatives:
+ alternatives = [set(saved)]
+ print("Failed to find clean condition alternatives")
+ print("Saved: ", saved)
+ print("Remaining: ", remaining)
+ print("Dict: ", ciidic)
+ else:
+ alternatives = [set(saved)]
+ resdic2lst = []
+ for saved in alternatives:
+ # prepare resdic2 from saved, i.e. condition each pair on the others
+ saved = list(saved)
+ resdic2 = {}
+ for pair in lst:
+ if pair in saved:
+ cond = allexcept(saved, pair)
+ else:
+ mx = ((),0.0)
+ for sp in ciidic[pair]:
+ if issubset(sp[0], saved) and sp[1]>mx[1]:
+ mx = sp
+ if mx[1] > 0.0:
+ cond = mx[0]
+ else:
+ cond = allexcept(saved, pair)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+ #tmp = analysediff(dic2, dic1, kdic)
+ tmp = analysediff01(dic2, dic1, kdic)
+ tmp['cond'] = cond
+ resdic2[pair] = tmp
+ for s in saved:
+ if not s in totsaved:
+ totsaved.append(s)
+ resdic2lst.append(resdic2)
+ return resdic2lst, totsaved
+
+def analyse_one_effect(df, selg, pair, cond, eff):
+ tags = []
+ for tag in ["death", "inc", "nic"]:
+ if eff in valsincolumn(df, 'event_time_' + tag):
+ tags.append(tag)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, [], selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, [], selg, tags, eff)
+ res0 = analysediff(dic2, dic1, kdic)
+ #(dic1, dic2, kdic) = get_cond_stats_one(df, pair, cond, selg, tags, eff)
+ (dic1, dic2, kdic) = get_cond_stats_daydiff(df, pair, cond, selg, tags, eff)
+ res1 = analysediff(dic2, dic1, kdic)
+ print("Correlates to " + eff)
+ display_analysis_one_row(pair, res1, res0)
+ print()
+ #return { 'mean0': res0['mean'], 'sig0':res0['sig'], 'mean1': res1['mean'], 'sig1': res1['sig']}
+
+def showeffects1(win, df, selg, coldic, tags, eff):
+ name = "Correlates of " + str(list(coldic.keys())) + " to " + eff
+ resdic = analyseeffects1(df, selg, coldic, tags, eff)
+ show_analysis_rows_dict(win, name, resdic, False)
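+
+# Typical top-level use of the show_all_effects entry point defined next (not
+# runnable here, since the register data frame df is not distributed with the
+# code; the late-effect code 'E10' is made up):
+#
+# win = Wind('effects', 1000, 600)
+# show_all_effects(win, df, [('sex', 'flicka')], 0.01, 'E10')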
+
+def show_all_effects(win, df, selg, sig, eff):
+ coldic = {}
+ selcols = [s[0] for s in selg if s[0] is not False]
+ if 'sex' not in selcols:
+ coldic['sex'] = ['flicka'] # special case: using both sexes would be redundant
+ for col in ['other_dia','diagnosis_class','surgery_diff','radio_diff','cytoclass_diff','stemcell_diff']:
+ if col not in selcols:
+ coldic[col] = valsincolumn(df, col)
+ tags = []
+ for tag in ["death", "inc", "nic"]:
+ if eff in valsincolumn(df, 'event_time_' + tag):
+ tags.append(tag)
+ name = "Direct correlates to " + eff
+ resdic1 = analyseeffects1(df, selg, coldic, tags, eff)
+ resdic2lst,remaining = analyseeffects2new(df, selg, resdic1, sig, tags, eff)
+ if len(resdic2lst) == 1:
+ show_analysis_rows_dict(win, name, resdic2lst[0], resdic1)
+ else:
+ print("There are %d alternatives. Press return to switch." % len(resdic2lst))
+ for i,resdic2 in enumerate(resdic2lst):
+ show_analysis_rows_dict(win, name, resdic2lst[i], resdic1)
+ if (i < len(resdic2lst)-1):
+ input()
+
+def get_list_stats2(df, efftag, eff):
+ # Earliest and latest event day for eff in one register column, plus the
+ # total follow-up time and the number of events.
+ mn = inf
+ mx = -inf
+ tsum_f = 0
+ nsum_f = 0
+ for i in range(len(df)):
+ row = df.iloc[i]
+ r = row['no_event_time_' + efftag]
+ ok = False
+ for ele in row['event_time_' + efftag]:
+ if ele[0] == eff:
+ if ele[1] < mn:
+ mn = ele[1]
+ if ele[1] > mx:
+ mx = ele[1]
+ tsum_f += ele[1]
+ nsum_f += 1
+ ok = True
+ if not ok:
+ tsum_f += r if not isnan(r) else 0
+ return (mn, mx, tsum_f, nsum_f)
+
+def show_profile_data(win, df, selg, pair, cond, eff):
+ # first without conditioning
+ (d1, d2) = select_data(df, [pair], False, selg)
+ stats = {}
+ for efftag in ['nic', 'inc', 'death']:
+ stats[(1,efftag)] = get_list_stats2(d1, efftag, eff)
+ stats[(2,efftag)] = get_list_stats2(d2, efftag, eff)
+ mn = min([stats[k][0] for k in stats])
+ mx = max([stats[k][1] for k in stats])
+ win.fig.clear()
+ show_profile_axes(win, 100, 200, 800, 100, mn, mx, 365)
+ for (dind, dd, c1, c2, c3) in zip([False,True], [d1,d2], ['pink','lightgreen'], ['red','#00CD00'], ['#8B0000','#006400']):
+ tsum = 0
+ nsum = 0
+ lst1 = []
+ lst2 = []
+ lst3 = []
+ for i in range(len(dd)):
+ s = dd.iloc[i]
+ tacc = max([s['no_event_time_nic'], s['no_event_time_inc'], s['no_event_time_death']])
+ nacc = 0
+ for ele in s['event_time_nic']:
+ if (ele[0] == eff):
+ lst1.append(ele[1])
+ if ele[1] < tacc:
+ tacc = ele[1]
+ nacc = 1
+ for ele in s['event_time_inc']:
+ if (ele[0] == eff):
+ lst2.append(ele[1])
+ if ele[1] < tacc:
+ tacc = ele[1]
+ nacc = 1
+ for ele in s['event_time_death']:
+ if (ele[0] == eff):
+ lst3.append(ele[1])
+ if ele[1] < tacc:
+ tacc = ele[1]
+ nacc = 1
+ tsum += tacc
+ nsum += nacc
+ ft = tsum / nsum if nsum > 0 else None
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst1, c1, c1, dind, 0)
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst2, c2, c2, dind, 0)
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, lst3, c3, c3, dind, 0)
+ show_profile_bars(win, 100, 200, 800, 100, mn, mx, [ft], c2, c2, dind, 1)
+
+
+def printsorteddicts(diclst):
+ iset = set()
+ for dic in diclst:
+ iset = iset.union(dic.keys())
+ ilst = sorted(list(iset))
+ fstr = "%s" + ("\t%d" * len(diclst))
+ for ele in ilst:
+ tmp = [ele] + [dic[ele] if ele in dic else 0 for dic in diclst]
+ print(fstr % tuple(tmp))
+
+# Full walkthrough: relative to a late effect eff and the competing root
+# causes in pairlist, show for every occurring combination of pair:
+# how many got eff out of how many got the treatment, and how long the total
+# observation time is.
+
+# visualize the individual times and the mean frequency
+
+def display_effect_all_comb(df, selg, conds, eff, tags = False):
+ effdic = {tag : valsincolumn(df, 'event_time_' + tag) for tag in (["death", "inc", "nic"] if tags is False else tags)}
+ taglst = []
+ for tag in effdic:
+ if eff in effdic[tag]:
+ taglst.append(tag)
+ name = "Statistics of " + eff
+ #(dic, datadic) = get_cond_stats_comb(df, conds, selg, taglst, eff)
+ (dic, dummy1, dummy2) = get_cond_stats_daydiff(df, [], conds, selg, taglst, eff)
+ print(name)
+ tscale = 1.0/36525
+ sum0 = 0
+ sum1 = 0
+ sum2 = 0
+ for k in dic:
+ sum0 += dic[k][0]
+ sum1 += dic[k][1]
+ sum2 += dic[k][2]
+ pr1 = sum1/sum0
+ pr2 = sum2/sum0
+ for k in dic:
+ if dic[k][2] > 0:
+ resstr = ""
+ for kk in k.split('_'):
+ if kk:
+ resstr += " "+kk
+ resstr += " "
+ f1 = (dic[k][0] + 1)/((dic[k][1] + pr1)*tscale)
+ f2 = (dic[k][0] + 1)*pr1/(dic[k][1] + pr1)
+ resstr += "%c %f %.2f ( %d / %d : %.2f )" % ('+' if f2 >= 1.5 else ' ', f1, f2, dic[k][0], dic[k][2], dic[k][1]*tscale)
+ #if dic[k][1] > 0:
+ # resstr += "%f ( %d / %d : %.2f )" % (dic[k][0]/(dic[k][1]*tscale),dic[k][0],dic[k][2],dic[k][1]*tscale)
+ #else:
+ # resstr += "%f ( %d / %d : %.2f )" % (0.0, dic[k][0],dic[k][2],dic[k][1]*tscale)
+ print(resstr)
+ n = 0
+ for kk in k.split('_'):
+ if kk:
+ n += len(kk)+1
+ resstr = " average " + " "*max(0, n-3)
+ resstr += "%f %.2f ( %d / %d : %.2f )" % (1/(pr1*tscale), 1.0, 1, round(pr2), pr1*tscale)
+ print(resstr)
+ print()
+
+def display_condensed_data(df, selg, eff):
+ effcols = ['event_time_' + tag for tag in ["death", "inc", "nic"]]
+ df = select_data_alt(df, [(col, eff) for col in effcols], selg)
+ for i in range(len(df)):
+ vec = df.iloc[i]
+ resid = [c[0] for c in vec['event_time_inc'] if c[0][0]=='C']
+ resstr = "%s\t%s\t%s" % (vec['LopNr'],vec['diagnosis'],("("+",".join(resid)+")") if resid else "")
+ for col in ['cytoclass_diff','radio_diff','surgery_diff','stemcell_diff','other_dia']:
+ for b in vec[col]:
+ if type(b)==list:
+ resstr += "\t" + b[0]
+ else:
+ resstr += "\t" + b
+ print(resstr)
+
+def collect_days(df, col):
+ # Average day per code over all lists in the column.
+ dic={}
+ #cols = ['surgery_diff','cytoclass_diff','stemcell_diff','radio_diff']
+ for lst in df[col]:
+ for ele in lst:
+ if not ele[0] in dic:
+ dic[ele[0]] = (0,0)
+ dic[ele[0]] = tupleadd(dic[ele[0]], (1,ele[1]))
+ for key in dic:
+ pair = dic[key]
+ dic[key] = pair[1]/pair[0] if pair[0]>0 else 0
+ return dic
+
+def ddate_hist_dict(df, mn, mx):
+ hdict = { k : 0 for k in range(mn, mx+1)}
+ for i in range(len(df)):
+ dd = df.iloc[i]['DiagnosDat']
+ if dd in hdict:
+ hdict[dd] += 1
+ else:
+ hdict[dd] = 1
+ return hdict
+
+def show_dd_axes(win, x, y, wdt, hgt, labels, count):
+ win.add_line(x, y, x+wdt, y, 0.75, 'black')
+ win.add_line(x, y+hgt+1, x+wdt, y+hgt+1, 0.75, 'black')
+ for pr in [0.25, 0.5, 0.75]:
+ ln = win.add_line(x, y+pr*hgt+1, x+wdt, y+pr*hgt+1, 0.75, 'lightgray')
+ ln.set_zorder(1)
+ onewdt = min(wdt/len(labels), hgt)
+ for ind in range(len(labels)):
+ win.add_text(x+onewdt/2+ind*onewdt, y-20, labels[ind], 12, 'center')
+ win.add_text(x-8, y-4, "0", 9, 'right')
+ win.add_text(x-8, y+hgt-4, "%d" % count, 9, 'right')
+
+def show_dd_bar(win, x, y, wdt, hgt, num, ind, maxcount, val, col):
+ onewdt = min(wdt/num, hgt)
+ bwdt = onewdt-2
+ boff = 1
+ win.add_rect(x + onewdt*ind + boff, y, bwdt, hgt*val/maxcount, 0, None, col)
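+
+# Illustrative sketch (not runnable without the register data frame df, which
+# is not distributed): draw a histogram of diagnosis years with the two
+# helpers above, assuming 'DiagnosDat' holds the year as an integer.
+#
+# win = Wind('diagnosis dates', 900, 300)
+# hdict = ddate_hist_dict(df, 1970, 2015)
+# show_dd_axes(win, 60, 50, 800, 200, ['1970','2015'], 100)
+# for k in hdict:
+#     show_dd_bar(win, 60, 50, 800, 200, 46, int(k)-1970, 100, hdict[k], 'orange')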
+
+def show_dd_data(win, dic, x1, x2, mx, pos):
+ win.fig.clear()
+ show_histogram_axes(win, 60, 50+pos*250, 800, 200, ['1970','2015'], 100)
+ for k in dic:
+ show_dd_bar(win, 60, 50+pos*250, 800, 200, 46, int(k)-1970, 100, dic[k], 'orange')
+
+
+
+def show_dd_data__(win, dic, sel1, sel3, selg, letters):
+ dd = select_data3(df, sel1, sel3, selg)
+ win.fig.clear()
+ show_histogram_axes(win, 100, 100, 800, 200, letters, len(dd[0]))
+ cols = ['#8B0000','red','pink',None]
+ #alpha = 0.0
+ #counts = [alpha/4, alpha/4, alpha/4, alpha/4]
+ for ind in range(len(letters)):
+ letter = letters[ind]
+ counts = [0.0, 0.0, 0.0, 0.0]
+ for i in range(len(dd[0])):
+ s = dd[0].iloc[i]
+ ok = False
+ for ele in s['event_time_death']:
+ if (letter is False or ele[0] == letter):
+ counts[0] += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_inc']:
+ if (letter is False or ele[0] == letter):
+ counts[1] += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_nic']:
+ if (letter is False or ele[0] == letter):
+ counts[2] += 1
+ ok = True
+ if not ok:
+ counts[3] += 1
+ total = sum(counts)
+ normcount = 0.0
+ normtotal = 0.0
+ for i in range(len(dd[2])):
+ s = dd[2].iloc[i]
+ ok = False
+ for ele in s['event_time_death']:
+ if (letter is False or ele[0] == letter):
+ normcount += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_inc']:
+ if (letter is False or ele[0] == letter):
+ normcount += 1
+ ok = True
+ if not ok:
+ for ele in s['event_time_nic']:
+ if (letter is False or ele[0] == letter):
+ normcount += 1
+ ok = True
+ normtotal += 1
+ show_histogram_bars(win, 100, 100, 800, 200, len(letters), ind, [x/total for x in counts], normcount/normtotal,cols)
+
+def add_complex_attribute(df, col, name, oldlst):
+ # Append a derived attribute [name, day] to each row's list in col, where
+ # day is the earliest day of any of the attributes in oldlst.
+ for i in range(len(df)):
+ lst = df.iloc[i][col]
+ x = [ ele[1] for ele in lst if ele[0] in oldlst]
+ if x:
+ lst.append([name, min(x)])
diff --git a/dataanalysis/hist.py b/dataanalysis/hist.py
new file mode 100644
index 0000000..d5f555e
--- /dev/null
+++ b/dataanalysis/hist.py
@@ -0,0 +1,105 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: hist.py
+
+Author: anders.holst@ri.se
+
+"""
+
+from math import *
+
+def seconddegree(a, b, c, fmin, fmax):
+ # solves a x^2 + b x + c = 0, and returns the closest real value in the interval [fmin, fmax]
+ if a==0.0:
+ f = -c/b
+ return max(min(f, fmax), fmin)
+ else:
+ f0 = -b/(2*a)
+ fd2 = f0*f0 - c/a
+ if fd2<0:
+ f = f0
+ elif f0 < (fmin+fmax)/2:
+ f = f0 + sqrt(fd2)
+ else:
+ f = f0 - sqrt(fd2)
+ return max(min(f, fmax), fmin)
+
+def hist_peak(hist):
+ mx = 0.0
+ mxind = False
+ for i in range(len(hist[0])):
+ if mx < hist[1][i]:
+ mx = hist[1][i]
+ mxind = hist[0][i]
+ return mxind
+
+def hist_significance(hist, val, peak):
+ # Probability mass on the far side of val, as seen from the peak.
+ sum = 0.0
+ if val > peak:
+ for i in range(len(hist[0])):
+ if hist[0][i] >= val:
+ sum += hist[1][i]
+ else:
+ for i in range(len(hist[0])):
+ if hist[0][i] <= val:
+ sum += hist[1][i]
+ return sum
+
+def hist_mean_var(hist):
+ sum = 0
+ sqsum = 0
+ for i in range(len(hist[0])):
+ sum += hist[0][i]*hist[1][i]
+ sqsum += hist[0][i]*hist[0][i]*hist[1][i]
+ return (sum, sqsum-sum*sum)
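+
+# A minimal sketch (not part of the original code): every function here takes
+# a histogram as a pair (values, probabilities), with the probabilities
+# summing to one over equally spaced values.
+def _demo_hist():
+ toy = ([0.0, 1.0, 2.0, 3.0], [0.1, 0.4, 0.4, 0.1])
+ peak = hist_peak(toy) # 1.0, the first of the two equally high bins
+ mean, var = hist_mean_var(toy) # (1.5, 0.65)
+ sig = hist_significance(toy, 0.0, peak) # mass at or below 0.0, i.e. 0.1
+ return peak, mean, var, sig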
+
+def hist_maxinterval(hist, mass):
+ # Smallest interval containing the given probability mass: shrink from the
+ # lower-density end while the enclosed mass stays above mass, then
+ # interpolate within the boundary bins.
+ i = 0
+ j = len(hist[0])-1
+ while i < j:
+ a1 = hist[1][i-1] if i>0 else 0.0
+ a2 = hist[1][i]
+ b1 = hist[1][j+1] if j<len(hist[0])-1 else 0.0
+ b2 = hist[1][j]
+ if a2 < b2:
+ ma = (a2+a1)/2
+ mb = (a2*a2 - b1*b1)/(2*(b2-b1)) if b2>b1 else 0.0
+ if mass + ma + mb > 1.0:
+ break
+ mass += ma
+ i += 1
+ else:
+ mb = (b2+b1)/2
+ ma = (b2*b2 - a1*a1)/(2*(a2-a1)) if a2>a1 else 0.0
+ if mass + ma + mb > 1.0:
+ break
+ mass += mb
+ j -= 1
+ if i == 0:
+ if j == len(hist[0])-1:
+ return (hist[0][i], hist[0][j])
+ else:
+ b1 = hist[1][j+1]
+ b2 = hist[1][j]
+ delta = (b1+b2)/2 + mass - 1.0
+ xb = (seconddegree(1.0, 0.0, -2*delta*(b1-b2) -b2*b2, b1, b2) - b2)/(b1 - b2)
+ return (hist[0][i], hist[0][j]*(1.0-xb) + hist[0][j+1]*xb)
+ else:
+ if j == len(hist[0])-1:
+ a1 = hist[1][i-1]
+ a2 = hist[1][i]
+ delta = (a1+a2)/2 + mass - 1.0
+ xa = (seconddegree(1.0, 0.0, -2*delta*(a1-a2) -a2*a2, a1, a2) - a2)/(a1 - a2)
+ return (hist[0][i]*(1.0-xa) + hist[0][i-1]*xa, hist[0][j])
+ else:
+ a1 = hist[1][i-1]
+ a2 = hist[1][i]
+ b1 = hist[1][j+1]
+ b2 = hist[1][j]
+ delta = (a1+a2+b1+b2)/2 + mass - 1.0
+ f = seconddegree(b1-b2+a1-a2, 0.0, -a2*a2*(b1-b2) - b2*b2*(a1-a2) - 2*(a1-a2)*(b1-b2)*delta, max(a1, b1), min(a2, b2))
+ xa = (f - a2)/(a1 - a2)
+ xb = (f - b2)/(b1 - b2)
+ return (hist[0][i]*(1.0-xa) + hist[0][i-1]*xa, hist[0][j]*(1.0-xb) + hist[0][j+1]*xb)
diff --git a/dataanalysis/visual.py b/dataanalysis/visual.py
new file mode 100644
index 0000000..b48f3e9
--- /dev/null
+++ b/dataanalysis/visual.py
@@ -0,0 +1,394 @@
+"""
+Copyright (C) 2018-2021 RISE Research Institute of Sweden AB
+
+File: visual.py
+
+Author: anders.holst@ri.se
+
+"""
+
+from math import *
+from hist import *
+from window import *
+from Color import *
+
+def winresize(win, width, height):
+ win.width = width
+ win.height = height
+ win.fig.set_size_inches((width/100.0, height/100.0))
+ win.trans = win.fig.transFigure.inverted()
+
+def calctickdist(span, pixdist, goaldist):
+ # tick distance of the form 1, 2 or 5 times a power of ten, giving ticks
+ # closest to goaldist pixels apart
+ onespan = span * goaldist / pixdist
+ lsp = log(onespan)/log(10)+0.15
+ fact = [1.0, 2.0, 5.0][int(floor((lsp - floor(lsp))*3))]
+ return pow(10, floor(lsp))*fact
+
+def show_hist(win, x, y, wdt, hgt, hist, hx1, hx2, col):
+ ymax = max(hist[1]) * 1.05
+ yscale = hgt/ymax if ymax > 0 else 0
+ xscale = wdt/(hx2 - hx1)
+ hlist = list(map(lambda xx,yy: (x + (xx-hx1)*xscale, y + yy*yscale), hist[0], hist[1]))
+ hlist = [[x + (hist[0][0]-hx1)*xscale, y]] + hlist + [[x + (hist[0][-1]-hx1)*xscale, y]]
+ win.add_polygon(hlist, 0, False, col)
+
+def show_hist_axes(win, x, y, wdt, hgt, desc, hx1, hx2, tickdist):
+ mx = floor(hx2/tickdist)
+ mn = ceil(hx1/tickdist)
+ xscale = wdt/(hx2 - hx1)
+ win.add_line(x, y, x+wdt, y, 0.75, 'black')
+ x0 = x - xscale*hx1
+ for i in range(mn, mx+1):
+ xx = x0 + i*tickdist*xscale
+ win.add_line(xx, y-desc, xx, y+hgt if i==0 else y, 0.75, 'black')
+ win.add_text(xx, y-desc-12, "%g" % (i*tickdist), 9, 'center')
+
+def show_hist_axes_v2(win, x, y, wdt, hgt, desc, hx1, hx2, tickdist):
+ mx = floor(hx2/tickdist)
+ mn = ceil(hx1/tickdist)
+ xscale = wdt/(hx2 - hx1)
+ win.add_line(x, y, x+wdt, y, 0.75, 'black')
+ x0 = x - xscale*hx1
+ for i in range(mn, mx+1):
+ xx = x0 + i*tickdist*xscale
+ win.add_line(xx, y-desc, xx, y+hgt if i==0 else y, 0.75, 'black')
+ win.add_text(xx, y-desc-18, "%.1f" % (i*tickdist), 12, 'center')
+
+def show_hist_stats(win, x, y, wdt, hgt, hist, hx1, hx2, eps, col):
+ xscale = wdt/(hx2 - hx1)
+ mean, var = hist_mean_var(hist)
+ xmin, xmax = hist_maxinterval(hist, 1.0-eps)
+ win.add_line(x + (xmin-hx1)*xscale, y+int(hgt/3), x + (xmax-hx1)*xscale, y+int(hgt/3), 0.75, col)
+ win.add_line(x + (xmin-hx1)*xscale, y+int(hgt/6), x + (xmin-hx1)*xscale + 0.01, y+int(hgt/2), 0.75, col)
+ win.add_line(x + (xmax-hx1)*xscale, y+int(hgt/6), x + (xmax-hx1)*xscale + 0.01, y+int(hgt/2), 0.75, col)
+ win.add_line(x + (mean-hx1)*xscale, y, x + (mean-hx1)*xscale+0.01, y+int(hgt*2/3), 0.75, col)
+
+
+# 1) analysis row: Effect key, signif, diff hist, abs hist
+# 2) analysis cond row: Effect key, signif, diff hist and cond, abs hist
+# 3) analysis grid: signif and mean
+# 4) analysis cond grid: signif and mean and cond
+# Input: straight or cond stats
+
+lowsign = 0.01
+highsign = 0.0001 + +def set_sign_level(low, high): + global lowsign + global highsign + lowsign = low + highsign = high + +def signcolor(sign): + global lowsign + global highsign + if sign > lowsign: + return 'white' + elif sign < highsign: + return 'black' + else: + llow = log(lowsign) + lhigh = log(highsign) + gr = (log(sign) - llow)/(lhigh - llow) + return hsl_color(0.0, 0.0, 0.5-1.5*gr) + +def show_analysis_rows(win, title, dic, dic0 = False): + win.fig.clear() + maxd = max([abs(dic[k]['mean']) + 2*dic[k]['std'] for k in dic]) + maxl = max([max(dic[k]['mean1'], dic[k]['mean2']) + 2*dic[k]['std'] for k in dic]) + if dic0 is not False: + maxd = max(maxd, max([abs(dic0[k]['mean']) + 2*dic0[k]['std'] for k in dic0])) + maxl = max(maxl, max([max(dic0[k]['mean1'], dic0[k]['mean2']) + 2*dic0[k]['std'] for k in dic0])) + num = len(dic) + rowhgt = int(win.height / (num + 1)) + histwdt1 = int((win.width - 240)*2*maxd/(2*maxd+maxl)) + histwdt2 = int((win.width - 240)*maxl/(2*maxd+maxl)) + tickdist = pow(10, floor(log(maxl/2)/log(10))) + cold = hsl_color(0.35, 1.0, -0.1) + cold0 = hsl_color(0.35, 1.0, 0.4) + col1 = hsl_color(0.16, 1.0, 0.2) + col2 = hsl_color(0.58, 1.0, 0.2) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + show_hist_axes(win, 160, rowhgt/2, histwdt1, win.height - rowhgt, 6, -maxd, maxd, tickdist) + show_hist_axes(win, 200+histwdt1, rowhgt/2, histwdt2, win.height - rowhgt, 6, 0.0, maxl, tickdist) + ind = 0 + for k in sorted(dic): + win.add_text(20, win.height - rowhgt*(1+ind) - 8, str(k), 12) + if dic0 is not False: + win.add_ellipse(58, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic0[k]['sig'])) + win.add_ellipse(50, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic[k]['sig'])) + win.add_text(80, win.height - rowhgt*(1+ind) - 6, "%.6f" % dic[k]['sig'], 10) + if dic[k]['hist'] is not False: + (mn, mx) = hist_maxinterval(dic[k]['hist'], 1.0-lowsign) + pk = hist_peak(dic[k]['hist']) + win.add_text(160 + histwdt1*(pk/maxd + 1.0)*0.5, win.height - rowhgt*(1+ind) - 6, "[%.2f,%.2f,%.2f]" % (mn, pk, mx), 10, 'center') + if dic0 is not False: + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic0[k]['hist'], -maxd, maxd, cold0) + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic[k]['hist'], -maxd, maxd, cold) + if dic[k]['hist1'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist1'], 0.0, maxl, col1) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n1'],dic[k]['t1']), 10, 'left' if dic[k]['mean1']>dic[k]['mean2'] else 'right') + if dic[k]['hist2'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist2'], 0.0, maxl, col2) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n2'],dic[k]['t2']), 10, 'right' if dic[k]['mean1']>dic[k]['mean2'] else 'left') + ind += 1 + +def show_analysis_rows_dict(win, title, dic, dic0 = False): + win.fig.clear() + num = len(dic) + rowhgt = int(win.height / (num + 1)) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + if num == 0: + return + maxd = max([abs(dic[k]['mean']) + 2*dic[k]['std'] for k in dic]) + maxl = max([max(dic[k]['mean1'], dic[k]['mean2']) + 2*dic[k]['std'] for k in dic]) + if dic0 is not False: + maxd = max(maxd, max([abs(dic0[k]['mean']) + 2*dic0[k]['std'] 
for k in dic0])) + maxl = max(maxl, max([max(dic0[k]['mean1'], dic0[k]['mean2']) + 2*dic0[k]['std'] for k in dic0])) + histwdt1 = int((win.width - 240)*2*maxd/(2*maxd+maxl)) + histwdt2 = int((win.width - 240)*maxl/(2*maxd+maxl)) + tickdist = pow(10, floor(log(maxl/2)/log(10))) + cold = hsl_color(0.35, 1.0, -0.1) + cold0 = hsl_color(0.35, 1.0, 0.4) + col1 = hsl_color(0.16, 1.0, 0.2) + col2 = hsl_color(0.58, 1.0, 0.2) + show_hist_axes(win, 160, rowhgt/2, histwdt1, win.height - rowhgt, 6, -maxd, maxd, tickdist) + show_hist_axes(win, 200+histwdt1, rowhgt/2, histwdt2, win.height - rowhgt, 6, 0.0, maxl, tickdist) + ind = 0 + for k in sorted(dic): + if dic0 is not False and k in dic0: + win.add_ellipse(28, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic0[k]['sig'])) + win.add_ellipse(20, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic[k]['sig'])) + win.add_text(50, win.height - rowhgt*(1+ind) - 6, "%.4f" % dic[k]['sig'], 10) + nm = str(k[1]) if type(k)==tuple else str(k) + if 'cond' in dic[k]: + k2 = dic[k]['cond'] + nm += " | " + (k2[1] if type(k2)==tuple else str(k2)) + win.add_text(120, win.height - rowhgt*(1+ind) - 8, nm, 12) + if dic[k]['hist'] is not False: + (mn, mx) = hist_maxinterval(dic[k]['hist'], 1.0-lowsign) + pk = hist_peak(dic[k]['hist']) + win.add_text(160 + histwdt1*(pk/maxd + 1.0)*0.5, win.height - rowhgt*(1+ind) - 6, "[%.2f,%.2f,%.2f]" % (mn, pk, mx), 10, 'center') + if dic0 is not False and k in dic0: + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic0[k]['hist'], -maxd, maxd, cold0) + show_hist(win, 160, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic[k]['hist'], -maxd, maxd, cold) + if dic[k]['hist1'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist1'], 0.0, maxl, col1) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n1'],dic[k]['t1']), 10, 'left' if dic[k]['mean1']>dic[k]['mean2'] else 'right') + if dic[k]['hist2'] is not False: + show_hist(win, 200+histwdt1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist2'], 0.0, maxl, col2) + win.add_text(200+histwdt1+histwdt2*dic[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n2'],dic[k]['t2']), 10, 'right' if dic[k]['mean1']>dic[k]['mean2'] else 'left') + ind += 1 + +def show_analysis_rows_dict_v2(win, title, dic, dic0 = False, params = {}): + paramdict = {'rowheight':50, 'namesize': 120, 'marg0': 0, 'marg1': 60, 'marg2' : 40, 'shownum':True, 'col1': 0.16, 'col2': 0.58, 'legendY': False, 'legendX1': False, 'legendX2': False } + paramdict.update(params) + win.fig.clear() + num = len(dic) + rowhgt = paramdict['rowheight'] # int(win.height / (num + 1)) + marg = paramdict['namesize'] + paramdict['marg0'] + paramdict['marg1'] + paramdict['marg2'] + 80 + if num == 0: + return + if 'maxd' in paramdict and 'maxl' in paramdict: + maxd = paramdict['maxd'] + maxl = paramdict['maxl'] + else: + maxd = max([abs(dic[k]['mean']) + 2*dic[k]['std'] for k in dic]) + maxl = max([max(dic[k]['mean1'], dic[k]['mean2']) + 2*dic[k]['std'] for k in dic]) + if dic0 is not False: + maxd = max(maxd, max([abs(dic0[k]['mean']) + 2*dic0[k]['std'] for k in dic0])) + maxl = max(maxl, max([max(dic0[k]['mean1'], dic0[k]['mean2']) + 2*dic0[k]['std'] for k in dic0])) + if 'fixscale' in paramdict: + newwidth = int(paramdict['fixscale']*(2*maxd+maxl) + marg) + newheight = int(rowhgt*(num + 2.5)) + winresize(win, newwidth, 
newheight) + histwdt1 = int((win.width - marg)*2*maxd/(2*maxd+maxl)) + histwdt2 = int((win.width - marg)*maxl/(2*maxd+maxl)) + tickdist = calctickdist(maxl, histwdt2, 80) #pow(10, floor(log(maxl/2)/log(10))) + lpos = 20 + paramdict['marg0'] + bpos = lpos + paramdict['namesize'] + hpos1 = bpos + 20 + paramdict['marg1'] + hpos2 = hpos1 + paramdict['marg2'] + histwdt2 + cold = hsl_color(0.35, 1.0, -0.1) + cold0 = hsl_color(0.35, 1.0, 0.4) + col1 = hsl_color(paramdict['col1'], 1.0, 0.2) + col2 = hsl_color(paramdict['col2'], 1.0, 0.2) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + show_hist_axes_v2(win, hpos2, 2*rowhgt, histwdt1, win.height - rowhgt*2.5, 6, -maxd, maxd, tickdist) + show_hist_axes_v2(win, hpos1, 2*rowhgt, histwdt2, win.height - rowhgt*2.5, 6, 0.0, maxl, tickdist) + if paramdict['legendY']: + txt_y = win.add_text(36, win.height/2, paramdict['legendY'], 12, 'center') + txt_y.set_rotation_mode('anchor') + txt_y.set_rotation('vertical') + if paramdict['legendX1']: + txt_x1 = win.add_text(hpos1, 12, paramdict['legendX1'], 12, 'left') + if paramdict['legendX2']: + txt_x2 = win.add_text(hpos2+histwdt1/2, 12, paramdict['legendX2'], 12, 'center') + ind = 0 + for k in sorted(dic): + if dic0 is not False and k in dic0: + win.add_ellipse(28+paramdict['namesize'], win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic0[k]['sig'])) + win.add_ellipse(bpos, win.height - rowhgt*(1+ind) - 8, 16, 16, 1, 'gray', signcolor(dic[k]['sig'])) + #win.add_text(50, win.height - rowhgt*(1+ind) - 6, "%.4f" % dic[k]['sig'], 10) + nm = str(k[1]) if type(k)==tuple else str(k) + #if 'cond' in dic[k]: + # k2 = dic[k]['cond'] + # nm += " | " + (k2[1] if type(k2)==tuple else str(k2)) + win.add_text(lpos, win.height - rowhgt*(1+ind) - 8, nm, 12) + if dic[k]['hist'] is not False: + (mn, mx) = hist_maxinterval(dic[k]['hist'], 1.0-lowsign) + pk = hist_peak(dic[k]['hist']) + if paramdict['shownum']: + win.add_text(hpos2 + histwdt1*(pk/maxd + 1.0)*0.5, win.height - rowhgt*(1+ind) - 6, "[%.2f,%.2f,%.2f]" % (mn, pk, mx), 10, 'center') + if dic0 is not False and k in dic0: + show_hist(win, hpos2, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic0[k]['hist'], -maxd, maxd, cold0) + show_hist(win, hpos2, win.height - rowhgt*(1.45+ind), histwdt1, rowhgt*0.9, dic[k]['hist'], -maxd, maxd, cold) + if dic[k]['hist1'] is not False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist1'], 0.0, maxl, col1) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n1'],dic[k]['t1']), 10, 'left' if dic[k]['mean1']>dic[k]['mean2'] else 'right') + elif dic0 is not False and k in dic0 and dic0[k]['hist1'] is not False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic0[k]['hist1'], 0.0, maxl, col1) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic0[k]['mean1']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic0[k]['n1'],dic0[k]['t1']), 10, 'left' if dic0[k]['mean1']>dic0[k]['mean2'] else 'right') + if dic[k]['hist2'] is not False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic[k]['hist2'], 0.0, maxl, col2) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic[k]['n2'],dic[k]['t2']), 10, 'right' if dic[k]['mean1']>dic[k]['mean2'] else 'left') + elif dic0 is not False and k in dic0 and dic0[k]['hist2'] is not 
False: + show_hist(win, hpos1, win.height - rowhgt*(1.45+ind), histwdt2, rowhgt*0.9, dic0[k]['hist2'], 0.0, maxl, col2) + if paramdict['shownum']: + win.add_text(hpos1 + histwdt2*dic0[k]['mean2']/maxl, win.height - rowhgt*(1+ind) - 6, "(%d/%.0f)" % (dic0[k]['n2'],dic0[k]['t2']), 10, 'right' if dic0[k]['mean1']>dic0[k]['mean2'] else 'left') + ind += 1 + +def display_analysis_one_row(pair, res, res0 = False): + resstr = "" + nm = str(pair[1]) if type(pair)==tuple else str(pair) + if 'cond' in res: + k2 = res['cond'] + if type(k2) == list: + nm += " | " + ",".join([kk[1] if type(kk)==tuple else str(kk) for kk in k2]) + elif type(k2) == tuple: + nm += " | " + k2[1] + else: + nm += " | " + str(k2) + sig = res['sig'] + mean = res['mean'] + fact = res['fact'] + fact00 = res['mean2']/res['mean1'] + #if res['hist'] is not False: + # pk = hist_peak(res['hist']) + if sig < 0.0001: + resstr += "***" + elif sig < 0.001: + resstr += "**" + elif sig < 0.01: + resstr += "*" + else: + resstr += "" + resstr += "\t%.4f\t%.4f\t%.2f" % (sig, mean, fact) + if res0 is not False: + sig0 = res0['sig'] + mean0 = res0['mean'] + fact0 = res0['fact'] + resstr += "\t%.4f\t%.4f\t%.2f" % (sig0, mean0, fact0) + else: + resstr += "\t\t\t" + resstr += "\t%s" % (nm) + #sp = max(2, 50-len(resstr)) + resstr += "\t(%d/%d:%.0f)\t(%d/%d:%.0f)" % (res['n1'],res['s1'],res['t1'],res['n2'],res['s2'],res['t2']) + print(resstr) + +def display_analysis_rows_dict(title, dic, dic0 = False): + num = len(dic) + print(title) + if num == 0: + return + for k in sorted(dic): + display_analysis_one_row(k, dic[k], dic0[k] if dic0 is not False and k in dic0 else False) + +def show_analysis_grid(win, title, gdic, gdic0 = False): + win.fig.clear() + numc = len(gdic) + cols = sorted(gdic) + numr = len(gdic[cols[0]]) + rows = sorted(gdic[cols[0]]) + rowhgt = int(win.height / (numr + 2)) + colwdt = int((win.width - 50)/ (numc if numc > 0 else 1)) + win.add_text(win.width/2, win.height - rowhgt/4 - 8, title, 12, 'center') + indc = 0 + for g in cols: + win.add_text(50+indc*colwdt, win.height - rowhgt - 8, str(g), 12) + indc +=1 + indr = 0 + for k in rows: + win.add_text(20, win.height - rowhgt*(2+indr) - 8, str(k), 12) + indc = 0 + for g in cols: + if gdic0 is not False: + win.add_ellipse(58+indc*colwdt, win.height - rowhgt*(2+indr) - 8, 16, 16, 1, 'gray', signcolor(gdic0[g][k]['sig'])) + win.add_ellipse(50+indc*colwdt, win.height - rowhgt*(2+indr) - 8, 16, 16, 1, 'gray', signcolor(gdic[g][k]['sig'])) + indc += 1 + indr += 1 + +def show_profile_axes(win, x, y, wdt, hgt, hx1, hx2, tickdist): + mx = floor(hx2/tickdist) + mn = ceil(hx1/tickdist) + xscale = wdt/(hx2 - hx1) + win.add_line(x, y, x+wdt, y, 0.75, 'black') + x0 = x - xscale*hx1 + for i in range(mn, mx+1): + xx = x0 + i*tickdist*xscale + if i==0: + win.add_line(xx, y-hgt, xx, y+hgt, 0.75, 'black') + else: + win.add_line(xx, y-4, xx, y+4, 0.75, 'black') +# win.add_text(xx, y-desc-12, "%g" % (i*tickdist), 9, 'center') + +def show_profile_bars(win, x, y, wdt, hgt, hx1, hx2, lst, col1, col2, below, full): + xscale = wdt/(hx2 - hx1) + first = True + lst = [x for x in lst if x is not None] + for val in lst: + xx = x + xscale*(val - hx1) + if first and val > 0: + col = col1 + first = False + else: + col = col2 + ihgt = hgt*2/3 if not full else hgt + if below: + win.add_line(xx, y-4, xx, y-ihgt+4, 0.75, col) + else: + win.add_line(xx, y+4, xx, y+ihgt-4, 0.75, col) + +def show_km_axes(win, x, y, wdt, hgt, hx1, hx2, xtickdist, ymx, ytickdist): + mx = floor(hx2/xtickdist) + mn = ceil(hx1/xtickdist) + xscale 
= wdt/(hx2 - hx1) + yscale = hgt/ymx + win.add_line(x-4, y, x+wdt, y, 1, 'black') + win.add_text(x-6, y-6, str(0), 12, 'right') + for i in range(1,floor(ymx/ytickdist)+1): + yy = y+yscale*ytickdist*i + win.add_line(x-4, yy, x+wdt, yy, 0.75, 'lightgray') + win.add_text(x-6, yy-6, str(int(ytickdist*i*100)), 12, 'right') + x0 = x - xscale*hx1 + for i in range(mn, mx+1): + xx = x0 + i*xtickdist*xscale + if i==0: + win.add_line(xx, y-4, xx, y+hgt+20, 1, 'black') + else: + win.add_line(xx, y-4, xx, y+4, 1, 'black') + win.add_text(xx, y-20, str(i*xtickdist), 12, 'center') + +def show_km_data(win, x, y, wdt, hgt, hx1, hx2, ymx, off, lst, col): + xscale = wdt/(hx2 - hx1) + yscale = hgt/ymx + x0 = x - xscale*hx1 + xscale*off + points = [(x0 + xscale*i, y + yscale*yy) for i,yy in enumerate(lst)] + obj = win.add_polygon(points, 1, col, None) + obj.set_closed(False) + + + diff --git a/dataanalysis/window.py b/dataanalysis/window.py new file mode 100644 index 0000000..5a18ac1 --- /dev/null +++ b/dataanalysis/window.py @@ -0,0 +1,253 @@ +""" +Copyright (C) 2018-2021 RISE Research Institute of Sweden AB + +File: window.py + +Author: anders.holst@ri.se + +""" + + +# %matplotlib notebook + +import matplotlib.pyplot as plt +import matplotlib as mpl + +mpl.interactive(True) +mpl.rcParams['toolbar']='None' + +class Wind(): + def __init__(self, name, width, height): + self.width = width + self.height = height + self.fig = plt.figure(name, (width/100.0, height/100.0)) + self.trans = self.fig.transFigure.inverted() + self.bpfid = -1 + self.brfid = -1 + self.whfid = -1 + self.mfid = -1 + self.kfid = -1 + self.buttons = {} + self.scrolls = {} + self.keys = {} + self.drags = {} + self.objs = [] + self.objtps = [] + self.bpressed = False + + def add_line(self, x1, y1, x2, y2, wdt, col): + fr = self.trans.transform((x1, y1)) + to = self.trans.transform((x2, y2)) + obj = plt.Line2D((fr[0], to[0]), (fr[1], to[1]), linewidth=wdt*0.75, color=col) + self.fig.add_artist(obj) + return obj + + def add_rect(self, x1, y1, wdt, hgt, brd, fg, bg): + fr = self.trans.transform((x1, y1)) + to = self.trans.transform((x1+wdt, y1+hgt)) + if bg: + obj = plt.Rectangle(fr, to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), facecolor=bg) + else: + obj = plt.Rectangle(fr, to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), fill=False) + self.fig.add_artist(obj) + return obj + + def add_ellipse(self, x1, y1, wdt, hgt, brd, fg, bg, angle = 0): + fr = self.trans.transform((x1, y1)) + to = self.trans.transform((x1+wdt, y1+hgt)) + if bg: + obj = mpl.patches.Ellipse(((fr[0]+to[0])*0.5, (fr[1]+to[1])*0.5), to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), facecolor=bg, angle = angle) + else: + obj = mpl.patches.Ellipse(fr, to[0]-fr[0], to[1]-fr[1], linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), fill=False, angle = angle) + self.fig.add_artist(obj) + return obj + + def add_polygon(self, xylst, brd, fg, bg): + xytr = list(map(self.trans.transform, xylst)) + if bg: + obj = plt.Polygon(xytr, linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), facecolor=bg) + else: + obj = plt.Polygon(xytr, linewidth=brd*0.75 or 0, edgecolor=fg or (0,0,0,0), fill=False) + self.fig.add_artist(obj) + return obj + + def add_text(self, x1, y1, txt, fontsz = None, align = None): + fr = self.trans.transform((x1, y1)) + obj = plt.Text(fr[0], fr[1], txt, fontsize=fontsz or 18, ha=align or 'left') + self.fig.add_artist(obj) + return obj + + def locate_object(self, event, objs): + for obj in 
objs:
+ if obj.contains(event)[0]:
+ return obj
+ return False
+
+ def locate_object_type(self, event, types):
+ for obj in self.fig.artists:
+ if True in types or type(obj) in types:
+ if obj.contains(event)[0]:
+ return obj
+ return False
+
+ def button_press_callback(self, event):
+ # check whether the event is over an object in objs or one of the bound
+ # types; look up the binding in the order (ev, mod, obj), (ev, -1, obj),
+ # (ev, mod, T), (ev, -1, T), (ev, mod), (ev, -1); if there is a release
+ # callback, save it away; then run the press callback
+ obj = self.locate_object(event, self.objs)
+ if obj is False:
+ obj = self.locate_object_type(event, self.objtps)
+ func = False
+ if obj is not False:
+ if (event.guiEvent.num, event.guiEvent.state, obj) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, event.guiEvent.state, obj)]
+ elif (event.guiEvent.num, -1, obj) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, -1, obj)]
+ elif (event.guiEvent.num, event.guiEvent.state, True) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, event.guiEvent.state, True)]
+ elif (event.guiEvent.num, -1, True) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, -1, True)]
+ if func is not False:
+ if self.bpressed is False:
+ if type(func)==tuple:
+ self.bpressed = (False, func[1], func[2] if len(func)==3 else False, obj)
+ func = func[0]
+ func(self, event, obj)
+ else:
+ if (event.guiEvent.num, event.guiEvent.state) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, event.guiEvent.state)]
+ elif (event.guiEvent.num, -1) in self.buttons:
+ func = self.buttons[(event.guiEvent.num, -1)]
+ if func is not False:
+ if self.bpressed is False:
+ if type(func)==tuple:
+ self.bpressed = (False, func[1], func[2] if len(func)==3 else False, False)
+ func = func[0]
+ func(self, event)
+
+ def button_release_callback(self, event):
+ if self.bpressed is not False:
+ func = self.bpressed[1]
+ obj = self.bpressed[3]
+ self.bpressed = False
+ if obj is False:
+ func(self, event)
+ else:
+ func(self, event, obj)
+
+ def button_motion_callback(self, event):
+ if self.bpressed is not False:
+ func = self.bpressed[2]
+ obj = self.bpressed[3]
+ if obj is False:
+ func(self, event)
+ else:
+ func(self, event, obj)
+
+ def bind_button(self, button, func, obj = False):
+ # func called as func(win, ev) or func(win, ev, obj) if object is not False
+ if (self.bpfid == -1):
+ self.bpfid = self.fig.canvas.mpl_connect('button_press_event', self.button_press_callback)
+ if (self.brfid == -1):
+ self.brfid = self.fig.canvas.mpl_connect('button_release_event', self.button_release_callback)
+ if (self.mfid == -1 and type(func)==tuple and len(func)==3):
+ self.mfid = self.fig.canvas.mpl_connect('motion_notify_event', self.button_motion_callback)
+ but = button if type(button)==tuple else (button, -1)
+ if obj is not False:
+ but = but + (obj,)
+ if obj is True or type(obj)==type:
+ if not obj in self.objtps:
+ self.objtps.append(obj)
+ elif not obj in self.objs:
+ self.objs.append(obj)
+ self.buttons[but] = func
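+
+ # Usage sketch (not part of the original code): bind a left-button handler
+ # to a rectangle. The callbacks above read Tk-specific event fields
+ # (event.guiEvent.num / .state), so a Tk-based matplotlib backend is assumed.
+ #
+ # win = Wind('demo', 400, 300)
+ # box = win.add_rect(150, 100, 100, 80, 1, 'black', 'orange')
+ # def hit(w, ev, obj):
+ #     print('rectangle pressed')
+ # win.bind_button(1, hit, box)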
+ def key_press_callback(self, event):
+ # check whether the event is over an object in objs or one of the bound
+ # types; look up the binding in the order (ev, mod, obj), (ev, -1, obj),
+ # (ev, mod, T), (ev, -1, T), (ev, mod), (ev, -1); then run the callback
+ obj = self.locate_object(event, self.objs)
+ if obj is False:
+ obj = self.locate_object_type(event, self.objtps)
+ func = False
+ if obj is not False:
+ if (event.guiEvent.keysym, event.guiEvent.state, obj) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, event.guiEvent.state, obj)]
+ elif (event.guiEvent.keysym, -1, obj) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, -1, obj)]
+ elif (event.guiEvent.keysym, event.guiEvent.state, True) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, event.guiEvent.state, True)]
+ elif (event.guiEvent.keysym, -1, True) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, -1, True)]
+ if func is not False:
+ func(self, event, obj)
+ else:
+ if (event.guiEvent.keysym, event.guiEvent.state) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, event.guiEvent.state)]
+ elif (event.guiEvent.keysym, -1) in self.keys:
+ func = self.keys[(event.guiEvent.keysym, -1)]
+ if func is not False:
+ func(self, event)
+
+ def bind_key(self, keysym, func, obj = False):
+ # func called as func(win, ev) or func(win, ev, obj) if object is not False
+ if (self.kfid == -1):
+ self.kfid = self.fig.canvas.mpl_connect('key_press_event', self.key_press_callback)
+ key = keysym if type(keysym)==tuple else (keysym, -1)
+ if obj is not False:
+ key = key + (obj,)
+ if obj is True or type(obj)==type:
+ if not obj in self.objtps:
+ self.objtps.append(obj)
+ elif not obj in self.objs:
+ self.objs.append(obj)
+ self.keys[key] = func
+
+ def scroll_callback(self, event):
+ # check whether the event is over an object in objs or one of the bound
+ # types; look up the binding as for buttons, with (True, -1, ...) as a
+ # catch-all direction; then run the scroll callback
+ obj = self.locate_object(event, self.objs)
+ if obj is False:
+ obj = self.locate_object_type(event, self.objtps)
+ func = False
+ if obj is not False:
+ if (event.button, event.guiEvent.state, obj) in self.scrolls:
+ func = self.scrolls[(event.button, event.guiEvent.state, obj)]
+ elif (event.button, -1, obj) in self.scrolls:
+ func = self.scrolls[(event.button, -1, obj)]
+ elif (event.button, event.guiEvent.state, True) in self.scrolls:
+ func = self.scrolls[(event.button, event.guiEvent.state, True)]
+ elif (event.button, -1, True) in self.scrolls:
+ func = self.scrolls[(event.button, -1, True)]
+ elif (True, -1, obj) in self.scrolls:
+ func = self.scrolls[(True, -1, obj)]
+ elif (True, -1, True) in self.scrolls:
+ func = self.scrolls[(True, -1, True)]
+ if func is not False:
+ func(self, event, obj)
+ else:
+ if (event.button, event.guiEvent.state) in self.scrolls:
+ func = self.scrolls[(event.button, event.guiEvent.state)]
+ elif (event.button, -1) in self.scrolls:
+ func = self.scrolls[(event.button, -1)]
+ elif (True, -1) in self.scrolls:
+ func = self.scrolls[(True, -1)]
+ if func is not False:
+ func(self, event)
+
+ def bind_scroll(self, dir, func, obj = False):
+ # dir can be "up", "down", "left", "right", or True
+ # func called as func(win, ev) or func(win, ev, obj) if object is not False
+ if (self.whfid == -1):
+ self.whfid = self.fig.canvas.mpl_connect('scroll_event', self.scroll_callback)
+ dd = dir if type(dir)==tuple else (dir, -1)
+ if obj is not False:
+ dd = dd + (obj,)
+ if obj is True or type(obj)==type:
+ if not obj in self.objtps:
+ self.objtps.append(obj)
+ elif not obj in self.objs:
+ self.objs.append(obj)
+ self.scrolls[dd] = func
+
+
diff --git a/datapreparation/analyze.py b/datapreparation/analyze.py
new file mode 100644
index 0000000..f7192c7
--- /dev/null
+++ b/datapreparation/analyze.py
@@ -0,0 +1,438 @@
+#! /usr/bin/env python3
+
+
+""" -------------------------------
+
+ analyze.py
+
+ Copyright (C) 2018 RISE
+ This code was produced by RISE
+ The 2013-04-10 version
+
+ bonsai/src_v02/analyze.py
+
+ simple analysis of pandas dataframes,
+ such as
+
+ 1. find duplicated rows
+
+ 2. number of unique values in a column
+
+ 3. 
number of unique values in common + between two columns in two different + files + + 4. + +------------------------------------""" + + +import global_settings as gs +import numpy as np +import pandas as pd +import bonsai_io as bio +import common +import copy + +def nr_of_unique_rows(df): + d = df.drop_duplicates() + return len(d) + +def nr_of_unique_values_in_cols(df, cols): + c = df.drop_duplicates(subset = cols) + return len(c) + + +def nr_of_unique_values(df, col): + c = df[col].dropna() + c = c.drop_duplicates() + return len(c) + +""" +def nr_of_unique_numeric_values(df, col): + + c = df[col].dropna() + c = c.drop_duplicates() + c = c.str.isnumeric() + c = c[c].index.values +""" + + +def nr_of_nonnan_values(df, col): + + c = df[col].dropna() + return len(c) + +def nr_of_unique_digital_values(df, col): + + c = df[col].dropna() + c = c.drop_duplicates() + c = c.str.isdigit() + c = c[c].index.values + # df = df.drop_duplicates(subset = col) + # df = df[ df[col].dropna().str.isdigit() ] + # df = df[ df[col].str.contains('\d', regex=True) ] + return len(c) + +def duplicated_rows(df): + df['dup'] = df.duplicated() + df = df[df['dup'] == True] + return df + +def print_duplicated_rows(df, nr): + dup = duplicated_rows(df) + print('Nr of rows in total', len(df)) + print('Nr of duplicated rows', len(dup)) + nr = min( nr,len(dup) ) + if nr > 0: + print('the first', nr,' of them') + print(dup[0:nr]) + return dup + +def unique_number_values(df, col): + df = df.drop_duplicates(subset = col) + df = df[ df[col].str.contains('\d', regex=True) ] + return df + + +def info(df, name = ''): + print() + if name != '': + print() + print('--------------------------------------------------') + print() + print('\tInfo on the file\n\t' + name) + print() + print('--------------------------------------------------') + print() + df_unique_nr = nr_of_unique_rows(df) + print(' shape', df.shape) + print(' unique rows', df_unique_nr) + + for c in df.columns: + print() + print('\tInfo on non-nan values of column', c) + print() + nonnan_nr = nr_of_nonnan_values(df, c) + unique_nr = nr_of_unique_values(df, c) + digital_nr = nr_of_unique_digital_values(df, c) + # numeric_nr = nr_of_unique_numeric_values(df, c) + print('non-nan values', nonnan_nr) + print(' unique values', unique_nr) + print('digital values', digital_nr) + # print('numeric values', unique_nr) + + print() + # return unique_number_values(df, 'ICD10') + +# df = df[ df[c].str.contains('\d', regex=True) ] + + + +def readall(): + dia = bio.read_generated_dia() + dgr = bio.read_diagroups() + per = bio.readperson() + ctr = bio.readcontrol() + inc = bio.readincare() + nic = bio.readnicare() + dru = bio.readdrug() + dcl = bio.readdrugclasses() + tre = bio.readtreatment() + sur = bio.readsurgery() + cau = bio.readcause() + + data = [ + dia, + dgr, + per, + ctr, + inc, + nic, + dru, + dcl, + tre, + sur, + cau +] + + name = [ + 'diagnos ', + 'diagnosgrupp ', + 'person ', + 'kontrollgrupp ', + 'sluten v_rd ', + '_ppen v_rd ', + 'l_kemedel ', + 'l_kemedelsgrupper', + 'behandling ', + 'kirurgi ', + 'orsak ', + ] + + return data, name + + +def info_on_all(): + + data, name = readall() + + for i in range(0, len(name)): + info(data[i], name[i]) + + +def compare_lopnr(dfx, dfy, namex = 'data 1', namey = 'data 2'): + + xs = list(dfx['LopNr'].values) + ys = list(dfy['LopNr'].values) + + sx = set(xs) + sy = set(ys) + cut = sx & sy + ux = sx - sy + uy = sy - sx + + print() + # print('shape ' + namex + '\t\t', dfx.shape) + # print('shape ' + namey + '\t\t', dfy.shape) + # 
print('unique Lopnr ' + namex + '\t', len(xs)) + # print('unique Lopnr ' + namey + '\t', len(ys)) + + print('common Lopnr\t\t\t', len(cut)) + print('Lopnr in ' + namex + ' only\t', len(ux)) + print('Lopnr in ' + namey + ' only\t', len(uy)) + print() + + ux = list(ux) + uy = list(uy) + ux.sort + uy.sort + return ux, uy + + +def readlopnr(): + dia = bio.read_generated_dia() + per = bio.readperson() + ctr = bio.readcontrol() + inc = bio.readincare() + nic = bio.readnicare() + dru = bio.readdrug() + tre = bio.readtreatment() + sur = bio.readsurgery() + cau = bio.readcause() + + data = [dia, per, ctr, inc, nic, dru, tre, sur, cau] + + name = [ + 'diagnos ', + 'person ', + 'kontrollgrupp', + 'sluten v_rd ', + '_ppen v_rd ', + 'l_kemedel ', + 'behandling ', + 'kirurgi ', + 'orsak ', + ] + + return data, name + + +def pairwise_lopnr_comparisions(): + + data, name = readlopnr() + + for i in range(0, len(name)): + for j in range(i+1, len(name)): + print() + print('--------------------------------------------------') + print() + print('\tComparing ' + name[i] + ' with ' + name[j]) + print() + print('--------------------------------------------------') + print() + + compare_lopnr(data[i], data[j], name[i], name[j]) + + + + + +""" ------------------------------- + + 4. count amd list various types of diagnosis + codes in care data + +------------------------------------""" + +""" +def is_icd10_class(x): + if not common.isstr(x): + return False + if common.is_icd10(x): + return False + if len(x) < 3: + return False + if not x[0].isupper(): + return False + return x[1].isdigit() and x[2].isdigit() +""" + + +def code_count(xs): + if not isinstance(xs, str): + return 0 + return len(xs.split()) + +def icd10_count(xs): + if not isinstance(xs, str): + return 0 + count = 0 + for x in xs.split(): + if common.is_icd10(x): + # print(x) + count += 1 + return count + +def not_icd10_count(xs): + if not isinstance(xs, str): + return 0 + count = 0 + for x in xs.split(): + if not common.is_icd10(x): + # print(x) + count += 1 + return count + +def icd10_class_count(xs): + if not isinstance(xs, str): + return 0 + count = 0 + for x in xs.split(): + if common.is_icd10_class(x): + # print(x) + count += 1 + return count + +""" +def code_list(xs): + if not isinstance(xs, str): + return 0 + return len(xs.split()) +""" + +def count_and_print(df, table = False): + dia = 'DIAGNOS' + dfc = copy.copy(df) + dfc['code_count'] = df[dia].apply(code_count) + dfc['icd10_count'] = df[dia].apply(icd10_count) + dfc['not_icd10_count'] = df[dia].apply(not_icd10_count) + dfc['icd10_class_count'] = df[dia].apply(icd10_class_count) + nr_of_codes = dfc['code_count'].sum() + nr_of_icd10 = dfc['icd10_count'].sum() + nr_of_not_icd10 = dfc['not_icd10_count'].sum() + nr_of_class_codes = dfc['icd10_class_count'].sum() + + if table: + print('nr_of_lines\t', len(df)) + print('nr_of_codes\t', nr_of_codes) + print('nr_of_icd10\t', nr_of_icd10) + print('nr_of_not_icd10\t', nr_of_not_icd10) + print('nr_of_icd10_class_codes\t', nr_of_class_codes) + + else: + + + print(' nr_of_lines', len(df)) + print(' nr_of_codes', nr_of_codes) + print(' nr_of_icd10', nr_of_icd10) + print(' nr_of_not_icd10', nr_of_not_icd10) + print(' nr_of_icd10_class_codes', nr_of_class_codes) + + + """ + for c in df1[dia].values: + print('\t', c) + """ + + +def print_dates(df, table = False): + date = 'INDATUM' + + if table: + + print('first date\t', df[date].min()) + print('last date\t', df[date].max()) + + else: + + print(' first date', df[date].min()) + print(' last date', 
df[date].max()) + + +def icd10_class_list(xs): + if not isinstance(xs, str): + return [] + codes = [] + for x in xs.split(): + if common.is_icd10_class(x): + codes += [x] + #print(codes) + return codes + +def flat(xs): + ys = [] + for x in xs: + ys += x + return ys + + + +def print_class_codes(df): + dia = 'DIAGNOS' + dfc = copy.copy(df) + dfc['icd10_class'] = df[dia].apply(icd10_class_list) + dfc['is_class'] = dfc['icd10_class'].apply(lambda x: x != []) + dfc = dfc[dfc['is_class']] + codes = np.unique(flat(list(dfc['icd10_class'].values))) + for c in codes: + print('\t', c) + + +def diagnosis_code_count(df, print_class = False, table = False): + + date = 'INDATUM' + nr = 'LopNr' + icd10_start = np.datetime64('1998-01-01') + + """ + size0 = len(df) + df = df.dropna().reset_index(drop=True) + print('nr of empty lines:', size0- len(df)) + """ + + df[date] = df[date].apply(bio.str2time) + df = df.sort_values(date).dropna().reset_index(drop=True) + + df1 = df[df[date] < icd10_start] + df2 = df[df[date] >= icd10_start] + + print() + print('code counts before 1998_01_01:') + print() + + print_dates(df1, table = table) + count_and_print(df1, table = table) + + print() + print('code counts from 1998_01_01') + print() + + print_dates(df2, table = table) + count_and_print(df2, table = table) + if print_class: + print() + print(' all icd10_class_codes:') + print_class_codes(df2) + + print() diff --git a/datapreparation/bonsai_io.py b/datapreparation/bonsai_io.py new file mode 100644 index 0000000..fabc876 --- /dev/null +++ b/datapreparation/bonsai_io.py @@ -0,0 +1,738 @@ +#! /usr/bin/env python3 + + +""" ------------------------------- + + bonsai_io.py + + Copyright (C) 2019 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/bonsai_io.py + +------------------------------------""" + + +import copy +import pandas as pd +import numpy as np +from datetime import datetime as time +from datetime import timedelta as delta +from dateutil import parser + + +import global_settings as gs +import common +import diagnose + + +""" ---------------------------------------- + + common support + +---------------------------------------""" + + + +def isstr(x): + return isinstance(x, str) + +def str2number(x): + s = str(x) + if (',' in s): + x = float(s.replace(',','.')) + return round(float(x), 5) + +def str2time(x): + # print(x) + if isstr(x): + return parser.parse(x) + return x + +def save_generated_file(df, name): + f = gs.places.generated_out(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + +def read_generated_file(name): + f = gs.places.generated_out(name) + print('reading file:', f) + df = pd.read_csv(f, sep='\t') + return df + + +def save_tmp_file(df, name): + f = gs.places.tmp_out(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + +def save_local_tmp_file(df, name): + f = gs.places.local_tmp_out(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + +def save_local_result(df, name): + f = gs.places.local_result(name) + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + +def save_local_input(df, name): + f = gs.places.local_input + name + '.csv' + print('saving ' + name + ' file to:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + + +""" ---------------------------------------- + + lexicon + +---------------------------------------""" + +def 
read_code_lex(): + lex = pd.read_csv(gs.places.code_lex, dtype = str, sep='\t') + return lex + + +def read_incare_lex(): + name = 'incare_lex' + f = gs.places.local_input + name + '.csv' + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + + +def read_care_icd8_lex(): + f = gs.places.care_icd8_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + + +def read_care_icd9_lex(): + f = gs.places.care_icd9_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + +def read_cause_icd8_lex(): + f = gs.places.cause_icd8_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + + +def read_cause_icd9_lex(): + f = gs.places.cause_icd9_lex + print('reading file:', f) + lex = pd.read_csv(f, dtype = str, sep='\t') + return lex + +""" ---------------------------------------- + + dia + + reading the diagnosis file, the original + and saved filled in versions + +---------------------------------------""" + + +def read_all_original_dia(): + """ + reading the the non-null LopNr rows from the + original file helena_lev_Diagnos_v2.txt + modified only by removing a ',' sign + """ + + f = gs.places.diagnose_m + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + # df = df[gs.places.diagnose_selection] + df = df.sort_values('LopNr').reset_index(drop=True) + return df + +def read_original_dia(): + """ + reading the non-null LopNr rows and the selected + columns from original file helena_lev_Diagnos_v2.txt + modified only by removing a ',' sign + """ + + f = gs.places.diagnose_m + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.diagnose_selection] + df = df.sort_values('LopNr').reset_index(drop=True) + return df + + +def save_generated_dia(dia): + print('saving generated diagnostics file to:') + print('\t', gs.places.generated_dia) + dia.to_csv(gs.places.generated_dia, index = False, sep = '\t') + +""" +def parse(s): + print(s, type(s)) + print(str(s)) + return parser.parse(str(s)) +""" + +def read_generated_dia(): + + f = gs.places.generated_dia + print('reading file:', f) + dia = pd.read_csv(f, dtype = str, sep='\t') + da = 'Diagnos_lder' + dia[da] = dia[da].apply(common.str2number) + """ + for col in gs.places.diagnose_time_cols: + dia[col] = dia[col].apply(str2time) + """ + return dia + +def save_tmp_dia(dia): + print('saving generated diagnostics groups file to:') + print('\t', gs.places.tmp_dia) + dia.to_csv(gs.places.tmp_dia, index = False, sep = '\t') + + + +""" ---------------------------------------- + + dia groups + + reading and saving versions of the + diagnosis groups file + +---------------------------------------""" + +def group_nr(s): + return int(s.split()[1][0:-1]) + +def rm_dot(s): + return s.replace('.','') + +def read_orig_diagroups(): + grp = gs.names.input_data_group + df = pd.read_csv(gs.places.orig_dia_groups, dtype = str, sep='\t') + cols = list(df.columns[4:6]) + list([df.columns[1]]) + df = df[cols] + df.columns = ['ICD10', 'SNOMED', grp] + df[grp] = df[grp].apply(group_nr) + df['ICD10'] = df['ICD10'].apply(rm_dot) + return df + +def save_diagroups(): + df = read_orig_diagroups() + df = df.sort_values(['ICD10', 'SNOMED']).reset_index(drop=True) + print('saving generated diagnostics groups file to:') + print('\t', gs.places.dia_groups) + df.to_csv(gs.places.dia_groups, index = False, sep = '\t') + + +def 
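+
+# Editor's sketch: the readers above all follow one recipe -- tab-separated
+# input read with dtype=str, rows without a LopNr dropped, then sorted.
+# A self-contained miniature with invented rows:
+import pandas as pd
+
+df = pd.DataFrame({'LopNr': ['2', None, '1'],
+                   'ICD10': ['C910', 'C915', 'C920']})
+df = df[~df['LopNr'].isnull()]
+df = df.sort_values('LopNr').reset_index(drop=True)
+print(df['LopNr'].tolist())   # ['1', '2']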
read_diagroups(): + f = gs.places.dia_groups + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + return df + + + + +""" ---------------------------------------- + + person + reading the person data file + +---------------------------------------""" + + +def k_n(x): + if not(x == 'flicka') and not(x == 'pojke'): + return 'sex unknown' + return x + +def vital(x): + return { + 'levande':'alive', + 'Död':'dead', + 'utvandrade':'emigrated', + 'okänt':'emigrated' + }[x] + + + +def readperson(): + time_cols = gs.places.person_time_cols + cols = gs.places.person_cols + + f = gs.places.person + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.person_selection] + for c in time_cols: + df[c] = df[c].apply(str2time) # may destroy the values of other cols + df['K_n'] = df['K_n'].apply(k_n) + + # df['last_date'] = df['Registerdata_inh_mtade__CR_Bef__'] + df['VitalStatus'] = df['VitalStatus'].apply(vital) + + df.columns = cols + return df + + + +""" ---------------------------------------- + + drug + this concerns the outcome, drugs for everyone + +---------------------------------------""" + + +def readdrug(n = 1): + f = gs.places.drug + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + # df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[ df['LopNr'].str.contains('\d', regex=True) ] + df = df[gs.places.drug_selection] + df['ATC'] = df['ATC'].str[:n].apply(str) # zzz + df = df.drop_duplicates() + return df + + +""" ---------------------------------------- + + cytostatica + +---------------------------------------""" + + +def readcytocodes(): + f = gs.places.cytocodes + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['BehNr'].isnull() ] + df = df[gs.places.cyto_selection] + df = df.drop_duplicates() + return df + + +def readdrugclasses(): + f = gs.places.drug_classes + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[gs.places.drug_classes_selection] + df = df.drop_duplicates() + return df + + + +""" ---------------------------------------- + + treatement + reading the tratement file + + not complete yet 2019-04-02 + +---------------------------------------""" + + +def readtreatment(): + f = gs.places.treatment + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t') + + + + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.treatment_selection] + + + # df = df.dropna() # will drop all but 18 LopNr:s + + + df = common.change_col_names(df, 'StartDatum_Cyt', 'cyto_date') + df = common.change_col_names(df, 'StartDatum_Stamcell', 'stem_date') + df = common.change_col_names(df, 'KirurDat', 'surg_date') + df = common.change_col_names(df, 'StartDatum_Radioterapi', 'radio_date') + df = df.drop_duplicates().reset_index(drop=True) + + # xs = np.unique(df['LopNr'].values) + # print('nr of LopNr', len(xs)) + + + return df + + + + + +""" ---------------------------------------- + + incare and nicare + institutional and non-institutional care + reading the files + +---------------------------------------""" + +def readorigincare(): + f = gs.places.orig_incare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.incare_selection] + return df + + +def readorignicare(): + f = gs.places.orig_nicare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, 
sep='\t', encoding='latin1') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.nicare_selection] + df = df.drop_duplicates() + return df + + + +def readincare(): + f = gs.places.incare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + return df + + +def readnicare(): + f = gs.places.nicare + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + return df + + + +def modify_care(df): + + date = 'INDATUM' + dia = 'DIAGNOS' + icd10_start = np.datetime64('1998-01-01') + df[date] = df[date].apply(str2time) + df = df.sort_values(date).dropna().reset_index(drop=True) + df1 = df[df[date] < icd10_start] + df2 = df[df[date] >= icd10_start] + df2c = copy.copy(df2) + df2c[dia] = df2[dia].apply(common.icd10_fill) + df = pd.concat( [df1, df2c]) + return df + +def modify_incare(): + df = readorigincare() + df = modify_care(df) + f = gs.places.incare + print('saving to file:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + + +def modify_nicare(): + df = readorignicare() + df = modify_care(df) + f = gs.places.nicare + print('saving to file:') + print('\t', f) + df.to_csv(f, index = False, sep = '\t') + +""" ---------------------------------------- + + control group + +---------------------------------------""" + +def readcontrol(): + f = gs.places.control + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = df[gs.places.control_selection] + df.columns = gs.places.control_column_names + + # fc = 'FoddAr' + # df[fc] = df[fc].apply(parser.parse) + # df = df[ ~df['LopNr'].isnull() ] + # df = df[gs.places.control_selection] + return df + + +""" ---------------------------------------- + + cause + + because of sever problems with pandas: a datetime object + in one column interfers with operations on other columns, + the readcause function manages the cause list one letter + construction + +---------------------------------------""" + + + +def old_add_to_cause_list(d, c, lex8, lex9): + recidiv_icd10 = d['ICD10'] # do no add first letter of recidiv + + icd = d['ICD'] + L = d['life_events'] + x = d[c] + if L == 'null': # L is null from start + L = [] + + if not isinstance(x, str): + return L + + if x == recidiv_icd10: + return L + + if icd == '10': + L = list(np.unique( L + [x[0]] )) + return L + + if icd == '9': + cx = common.look_up_codes(x, lex9) + if icd == '8': + cx = common.look_up_codes(x, lex8) + + for c in cx: + L += [c[0]] + L = list(np.unique( L )) + return L + + +def add_class(L, x, rc, nr = 3): + xc = x[:nr] + if not xc == rc: + L = list(np.unique( L + [xc] )) + return L + + +def add_to_cause_list(d, c, lex8, lex9, nr = 3): + # nr is the nr of icd10 chars of concern + + recidiv_icd10 = d['ICD10'] # do no add recidiv class diagnosis + recidive_class = recidiv_icd10[:nr] + + icd = d['ICD'] + L = d['life_events'] + x = d[c] + + if L == 'null': # L is null from start + L = [] + + if not isinstance(x, str): + return L + + if icd == '10': + if common.is_icd10(x): + L = add_class(L, x, recidive_class, nr = nr) + elif not x[0].isalpha() and not x[0].isdigit() and common.is_icd10(x[1:]): + L = add_class(L, x[1:], recidive_class, nr = nr) + return L + + if icd == '9': + cx = common.look_up_codes_1(x, lex9, 'icd9') + + if icd == '8': + cx = common.look_up_codes_1(x, lex8, 'icd8') + + + if common.isarray(cx): + for c in cx: + L = add_class(L, c, recidive_class, nr = nr) + + elif common.isstr(cx): + L = add_class(L, cx, recidive_class, nr = nr) + + return L + + +def 
readcause(df_icd10): + + # read cause and add a column 'life_events' + # with a list of first letter of non-recidiv causes + + cfile = gs.places.cause + print('reading file:', cfile) + df = pd.read_csv(cfile, dtype = str, sep='\t', encoding='latin1') + df = df[gs.places.cause_selection] + df = diagnose.add_cols_or_zero(df, df_icd10, 'LopNr', ['ICD10']) + df['life_events'] = 'null' + + lex8 = read_cause_icd8_lex() + lex9 = read_cause_icd9_lex() + + for c in gs.places.morsak: + f = lambda d:add_to_cause_list(d, c, lex8, lex9) # use cause lex here + df['life_events'] = df.apply(f, axis = 1) + cols = ['LopNr'] + gs.places.cause_added_cols + df = df[cols] + """ + this destroys it all + + + !! + for col in gs.places.cause_time_cols: + df[col] = df[col].apply(str2time) + """ + + return df + + +""" ---------------------------------------- + + surgery + +---------------------------------------""" + +def readsurgery(): + f = gs.places.surgery + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', + keep_default_na = False, + na_values = ['']) + + + df = df[ ~df['LopNr'].isnull() ] + # remove the non-number LopNr in the surgery file + df = df[ df['LopNr'].str.contains('\d', regex=True) ] + df = df[gs.places.surgery_selection] + df = df.drop_duplicates() + df.columns = gs.places.surgery_column_names + df['Datum'] = df['Datum'].apply(str2time) + return df + +def read_generated_surgery(): + df = read_generated_file('surgery') + df['LopNr'] = df['LopNr'].apply(str) + df['surgery'] = df['surgery'].apply(common.str2list) + return df + + +""" ---------------------------------------- + + stralung + +---------------------------------------""" + + +def readradiocodes(): # BehNr > Target_Kod + f = gs.places.radio + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = df[['LopNr', 'BehNr', 'Target_Kod']] + return df + +def readradioclasses(): # Target_kod > Kategorier + f = gs.places.radio_classes + print('reading file:', f) + df = pd.read_csv(f, dtype = str, sep='\t', encoding='latin1') + df = common.rmcol(df, 'Target') + + # df = df[['Target_Kod', 'k1', 'k2', 'k3', 'k4']] + return df + + + + +""" ------------------------------- + + special missions + + reading the columns Target_I and Target_II + otherwise not used, from the treatement file + +------------------------------------""" + +def readtarget(): + f = gs.places.treatment + df = pd.read_csv(f, dtype = str, sep='\t') + df = df[ ~df['LopNr'].isnull() ] + df = df[gs.places.treatment_tagets] + return df + + + + +""" ------------------------------- + + evolving reads + +------------------------------------""" + + +# use bio.read_generated_file(gs.names.outcome_file) instead + +def readoutcome(): + col = 'first_incare' + name = gs.names.outcome_file + df = read_generated_file(name) + df[col] = df[col].apply(common.str2list) + return df + + +def read_final(name): + ei_col = 'event_time_inc' + ci_col = 'event_time_death' + df = read_generated_file(name) + df['LopNr'] = df['LopNr'].apply(str) + df[ei_col] = df[ei_col].apply(common.str2list) + df[ci_col] = df[ci_col].apply(common.str2list) + return df + +def readgt(): + name = gs.names.treat_file + return read_final(name) + + +def readtotal(): + list_cols = [ + 'event_time_death', + 'event_time_inc', + 'event_time_nic', + 'surgery', + 'drug_class', + 'radio' + ] + + name = gs.names.total_file + df = read_final(name) + f = lambda x: [] if x == 0 else x + for c in list_cols: + df[c] = df[c].apply(str).apply(common.str2list).apply(f) + return df + +""" 
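+
+# Editor's sketch: read_final() and readtotal() above revive list-valued
+# columns that to_csv() flattened into strings, via common.str2list
+# (ast.literal_eval). The round trip, on an invented value:
+import ast
+
+row = [['C', 123], ['J', 45]]
+flat = str(row)                    # what ends up in the TSV file
+back = ast.literal_eval(flat)      # what read_final() hands back
+assert back == row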
+def readtimediff(): + name = gs.names.timediff_file + return read_final(name) +""" + + +def readtimediff(): + # no type conversions but for LopNr + name = gs.names.timediff_file + df = read_generated_file(name) + df['LopNr'] = df['LopNr'].apply(str) + return df + diff --git a/datapreparation/bonsai_time.py b/datapreparation/bonsai_time.py new file mode 100644 index 0000000..3e259fa --- /dev/null +++ b/datapreparation/bonsai_time.py @@ -0,0 +1,316 @@ +#! /home/jan/anaconda3/bin/python + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2013-03-26 version + + bonsai/src_v02/bonsai_time.py + + support for managing of time data + and time analysis + + +------------------------------------""" + + + +from datetime import datetime as timeclass +from datetime import timedelta as delta +from dateutil import parser +import pandas as pd +import numpy as np + +import common +import global_settings as gs +import bonsai_io as bio + + +def head(xs, lex8, lex9, n = 1): + ys = [] + for x in xs: + if isinstance(x, str): + cx = common.lookup_icd10(x, lex8, lex9) + for c in cx: + ys += [c[:n]] + return ys + + +def split_and_head(ys, recidiv_icd10_class, lex8, lex9, n = 1): + + cs = [] + for y in ys.values: + if not common.isnull(y): + cs = np.append(cs, y.split()) + cs = list(cs) + ys = np.unique(head(cs, lex8, lex9, n = n)) + ys = list(set(ys) - set([recidiv_icd10_class])) + return ys + +# here we can instead return the difference + +def after(x, y, d = 0): + if common.notnull(x) and common.notnull(y): + return parser.parse(x) > parser.parse(y) + delta(days = d) + return False + +# here we can instead return the time and difference + +def times_after(sz, y, d = 0): + L = [] + for z in sz: + if after(z, y, d): + L += [z] + return L + + +def first_time(zs): + L = list(map(common.str2time, zs)) + if L == []: + return '0' + # t = min(list(map(common.str2time, zs))) + return common.time2str(min(L)) + + +def first_after(zs, y, d = 0): + return first_time(times_after(zs, y, d)) + + +def first_time_compression(df, xcol, ycol, zcol, fcol, lex8, lex9, gap = 1826, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # fcol = 'first_incare' + + data = [] + df = df.dropna() + if df.empty: + return df + + xs = df[xcol].drop_duplicates() + + print() + print('first_time_compression') + print() + i = 0 + + print('nr of LopNr:s', len(xs)) + for x in xs: + i += 1 + if (i % 100) == 0: + print(i) + + dx = df[ df[xcol] == x ] + diagnos_dat = dx['DiagnosDat'].values[0] + recidiv_icd10 = dx['ICD10'].values[0] + recidiv_icd10_class = recidiv_icd10[:n] + sz = dx[zcol].drop_duplicates() + yz_list = [] + + for z in sz: + dz = dx[ dx[zcol] == z ] + ys = dz[ycol].drop_duplicates() + + + if split: + ys = split_and_head(ys, recidiv_icd10_class, lex8, lex9, n = n) + + for y in ys: + yz_list += [[y, z]] + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + yminz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = dy[zcol].values + z = first_after(times, diagnos_dat, gap) + if z != '0': + yminz_list += [[y, z]] + data = data + [ [x] + [yminz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, fcol] + return dz + +def first_time_aho(df, base, xcol, ycol, zcol, fcol, lex8, lex9, gap = 1826, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # fcol = 'first_incare' + + data = [] + + xs = base[xcol] + + print() + print('first_time_compression aho') + print() 
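+    # Editor's sketch of split_and_head() above, on invented codes: DIAGNOS
+    # strings are split, translated to ICD10 where a lexicon entry exists,
+    # cut to their first n characters (n = 1 here), de-duplicated, and the
+    # recidiv class is removed:
+    #     ['C911 Z001', 'J20'] with recidiv class 'C'
+    #         -> heads {'C', 'Z', 'J'} minus {'C'} -> ['J', 'Z']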
+ i = 0 + + df.set_index(xcol,drop=False,inplace=True) + + print('nr of LopNr:s', len(xs)) + for x in xs: + i += 1 + if (i % 100) == 0: + print(i) + + if x not in df.index: + continue + + dx = df.loc[x] + if type(dx) != pd.DataFrame: + dx = pd.DataFrame([dx]) + diagnos_dat = base['DiagnosDat'][x] + recidiv_icd10 = base['ICD10'][x] + recidiv_icd10_class = recidiv_icd10[:n] if isinstance(recidiv_icd10, str) else "" + + yz_list = [] + for j in range(len(dx)): + yz_list += split_and_head_1(dx.iloc[j], ycol, zcol, recidiv_icd10_class, lex8, lex9, n = n) + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + yminz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = dy[zcol].values + z = first_after(times, diagnos_dat, gap) + if z != '0': + yminz_list += [[y, z]] + data = data + [ [x] + [yminz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, fcol] + return dz + + + +def all_times(df, xcol, ycol, zcol, acol, lex8, lex9, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # acol = 'all_incare' + + data = [] + df = df.dropna() + if df.empty: + return df + + xs = df[xcol].drop_duplicates() + + print() + print('all_time_compression') + print() + i = 0 + + for x in xs: + + i += 1 + if (i % 100) == 0: + print(i) + + dx = df[ df[xcol] == x ] + diagnos_dat = dx['DiagnosDat'].values[0] + recidiv_icd10 = dx['ICD10'].values[0] + recidiv_icd10_class = recidiv_icd10[:n] + sz = dx[zcol].drop_duplicates() + yz_list = [] + + for z in sz: + dz = dx[ dx[zcol] == z ] + ys = dz[ycol].drop_duplicates() + + if split: + ys = split_and_head(ys, recidiv_icd10_class, lex8, lex9, n = n) + + for y in ys: + yz_list += [[y, z]] + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + + yallz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = list(dy[zcol].values) + yallz_list += [[y, times]] + data = data + [ [x] + [yallz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, acol] + return dz + +def split_and_head_1(row, ycol, zcol, recidiv_icd10_class, lex8, lex9, n = 1): + if common.isnull(row[ycol]) or common.isnull(row[zcol]): + return [] + cs = row[ycol].split() + ys = np.unique(head(cs, lex8, lex9, n = n)) + ys = list(set(ys) - set([recidiv_icd10_class])) + return [[y, row[zcol]] for y in ys] + +def all_times_aho(df, base, xcol, ycol, zcol, acol, lex8, lex9, split = True, n = 1): + + # xcol = 'LopNr' + # ycol = 'DIAGNOS' + # acol = 'all_incare' + + data = [] + + xs = base[xcol] + + print() + print('all_time_compression aho') + print() + i = 0 + + df.set_index(xcol,drop=False,inplace=True) + + for x in xs: + + i += 1 + if (i % 100) == 0: + print(i) + + if x not in df.index: + continue + + dx = df.loc[x] + if type(dx) != pd.DataFrame: + dx = pd.DataFrame([dx]) + diagnos_dat = base['DiagnosDat'][x] + recidiv_icd10 = base['ICD10'][x] + recidiv_icd10_class = recidiv_icd10[:n] if isinstance(recidiv_icd10, str) else "" + + yz_list = [] + for j in range(len(dx)): + yz_list += split_and_head_1(dx.iloc[j], ycol, zcol, recidiv_icd10_class, lex8, lex9, n = n) + + if not (yz_list == []): + dyz = pd.DataFrame(yz_list) + dyz.columns = [ycol, zcol] + sy = dyz[ycol].drop_duplicates() + + yallz_list = [] + for y in sy: + dy = dyz[ dyz[ycol] == y ] + times = list(dy[zcol].values) + yallz_list += [[y, times]] + data = data + [ [x] + [yallz_list] ] + + dz = pd.DataFrame(data) + if not dz.empty: + dz.columns = [xcol, 
acol] + return dz diff --git a/datapreparation/common.py b/datapreparation/common.py new file mode 100644 index 0000000..12d89f9 --- /dev/null +++ b/datapreparation/common.py @@ -0,0 +1,465 @@ +#! /usr/bin/env python3 + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/common.py + common support + +------------------------------------""" + +import pandas as pd +import numpy as np +from datetime import datetime as timeclass +from datetime import timedelta as delta +from dateutil import parser +import ast +import copy + +from scipy.stats import beta +# from scipy.stats import expon +# from scipy.stats import gamma + + +""" --------------------------- + + boolean functions + +------------------------- """ + + +def isstr(x): + return isinstance(x, str) + + +def isnull(a): + return not(isinstance(a, str)) + + +def notnull(x): + if isstr(x): + return (x != '0') + return False + + +def isarray(x): + return (isinstance(x, list) or isinstance(x, np.ndarray)) + +# what is an icd 10 code, we require that it starts +# with an uppercase letter followed by three numbers + + +def is_icd10(x): + if not isstr(x): + return False + if len(x) < 4: + return False + if not x[0].isupper(): + return False + return x[1].isdigit() and x[2].isdigit() and x[3].isdigit() + + + + +""" --------------------------- + + conversions + +------------------------- """ + + + +def str2number(x, dec = 5): + s = str(x) + if (',' in s): + x = float(s.replace(',','.')) + return round(float(x), dec) + + +def str2list(s): + return ast.literal_eval(s) + +def bool2str(x): + if isinstance(x, bool) or isinstance(x, int): + return str(int(x)) + return x + +def df_bool2str(df, cols): + for c in cols: + df[c] = df[c].apply(bool2str) + return df + + + + +""" --------------------------- + + lists + + [[['JF', 'PJ'], 41], [['JE'], 0]] to + [['JF', 41], ['PJ', 41], ['JE', 0]] + + +------------------------- """ + +def code_compress(u): + ru = [] + for x in u: + rx = [] + [cs, d] = x + for c in cs: + rx += [[c, d]] + ru += rx + return ru + + +""" --------------------------- + + pandas + +------------------------- """ + +def rmcol(df, c): + return df.drop([c], axis=1) + +def rmcols(df, cs): + for c in cs: + df = df.drop([c], axis=1) + return df + + +def change_col_names(df, old_name, new_name): + df.columns = [new_name if x==old_name else x for x in df.columns] + return df + + +def to_each_col_apply(df, cols, f): + for c in cols: + df[c] = df[c].apply(f) + + + +def compression(df, xcol, vcol): + data = [] + xs = df[xcol].drop_duplicates() + for x in xs: + dx = df[ df[xcol] == x ] + codes = [] + for line in dx[vcol].values: + if isinstance(line, list): + codes += line + codes = np.unique(codes) + data += [[x, codes]] + + d = pd.DataFrame(data) + d.columns = [xcol, vcol] + d[vcol] = d[vcol].apply(list) + return d + + + +def compression_with_dates(df, xcol, vcol, dcol): + data = [] + xs = df[xcol].drop_duplicates() + for x in xs: + + dx = df[ df[xcol] == x ] + L0 = len(dx) + dx = dx[dx[dcol] == dx[dcol]] # remove NaT values + L1 = len(dx) + if L0 != L1: + print(L0-L1, 'non-valid surgery dates for this LopNr') + print(dx) + print() + + codes = [] + dxcopy = copy.copy(dx) + dxcopy['Datum'] = dx['Datum'].apply(lambda x: x.strftime('%Y-%m-%d')) + dx = dxcopy + for values_and_date in dx[[vcol, dcol]].values: + values_and_date = [values_and_date[0]] + [values_and_date[1]] + + if isinstance(values_and_date, list): + codes += [values_and_date] + data += [[x, codes]] + + 
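+    # Editor's sketch of what compression() above computes, on invented
+    # rows, before compression_with_dates() adds the date handling:
+    #     df = pd.DataFrame({'LopNr': ['1', '1', '2'],
+    #                        'code':  [['C91'], ['Z00'], ['C92']]})
+    #     compression(df, 'LopNr', 'code')
+    #         -> LopNr '1' holds ['C91', 'Z00'], LopNr '2' holds ['C92']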
+    d = pd.DataFrame(data)
+    d.columns = [xcol, vcol]
+    d[vcol] = d[vcol].apply(list)
+    return d
+
+
+def compression_date_limit(df, xcol, vcol, dcol, date_dict):
+    data = []
+    xs = df[xcol].drop_duplicates()
+    for x in xs:
+        # elementwise mask over both conditions
+        dx = df[ (df[xcol] == x) & (df[dcol] <= date_dict[x]) ]
+        codes = []
+        for line in dx[vcol].values:
+            if isinstance(line, list):
+                codes += line
+        codes = np.unique(codes)
+        data += [[x, codes]]
+
+    d = pd.DataFrame(data)
+    d.columns = [xcol, vcol]
+    d[vcol] = d[vcol].apply(list)
+    return d
+
+
+""" ---------------------------
+
+    lexicon (same code as in merge.py)
+
+------------------------- """
+
+
+def look_up_entry(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return dfe[value_col].values[0]
+    return str(0)
+
+
+def look_up_value(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return dfe[value_col].values[0]
+    return np.nan
+
+
+def look_up_list_value(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return str2list( dfe[value_col].values[0] )
+    return str(0)
+
+
+def look_up_codes_1(x, lex, col):
+    keys = set(lex[col].values)
+    if x in keys:
+        return look_up_list_value(x, lex, col, 'icd10')
+    elif x[0].isalpha() and x[1:] in keys:
+        return look_up_list_value(x[1:], lex, col, 'icd10')
+    else:
+        return []
+
+def look_up_codes(x, lex, col):
+    keys = set(lex[col].values)
+    if x in keys:
+        return look_up_list_value(x, lex, col, 'icd10')
+    return []
+
+
+def look_up_list(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if not dfe.empty:
+        return dfe[value_col].values[0]
+    return []
+
+
+#def lookup_icd10(x, lex8, lex9):
+#    lex8.columns = ['code', 'ICD10']
+#    lex9.columns = ['code', 'ICD10']
+#    lex = pd.concat( [lex8, lex9])
+#    return look_up_codes(x, lex)
+
+def lookup_icd10(x, lex8, lex9):
+    res = look_up_codes(x, lex8, 'icd8') if lex8 is not False else []
+    if res:
+        return res
+    res = look_up_codes(x, lex9, 'icd9') if lex9 is not False else []
+    if res:
+        return res
+    if is_icd10(x) or is_icd10_class(x):
+        return [x]
+    else:
+        return []
+
+def add_list_cols(df1, df2, xc, ycs):
+
+    # print(df1[:5]) # zzz
+    # print(df2[:5]) # zzz
+
+    for yc in ycs:
+        df1[yc] = df1[xc].apply(lambda x:look_up_list(x, df2, xc, yc))
+    return df1
+
+
+def look_up_all_values(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if dfe.empty:
+        return []
+    return list(dfe[value_col].values)
+
+"""
+def look_up_all_values_and_dates(entry, df, entry_col, value_col, date_col):
+    dfe = df[df[entry_col] == entry]
+    return dfe[[value_col, date_col]]
+"""
+
+
+def look_up_all_values_date_limit(entry, df, entry_col, value_col, date_col, date_dict):
+    # elementwise mask over both conditions
+    dfe = df[ (df[entry_col] == entry) & (df[date_col] <= date_dict[entry]) ]
+    if dfe.empty:
+        return []
+    return list(dfe[value_col].values)
+
+
+def look_up_nonnan(entry, df, entry_col, value_col):
+    dfe = df[df[entry_col] == entry]
+    if dfe.empty:
+        return []
+    v = list( set(dfe[value_col].values) - set([np.nan]) )
+    return v
+
+
+""" ---------------------------
+
+    Time
+
+    t: timeclass
+    x: time as a string (str)
+
+    Here we use the string '0' as an undefined value
+
+------------------------- """
+
+def is_time(t):
+    # return isinstance(t, datetime.datetime)
+    return isinstance(t, timeclass)
+
+def str2time(x):
+    if notnull(x):
+        return parser.parse(x)
+    return x
+
+"""
+def full_year2time(x):
+    return str2time(x + '-06-15')
+"""
+
+def time2str(t):
+    if is_time(t):
+        return
t.strftime("%Y-%m-%d") + return t + + +def time_diff(t1, t0): + if is_time(t1) and is_time(t0): + return (t1-t0).total_seconds() + return np.nan + + +def time_diff_in_seconds(x, y): + if notnull(x) and notnull(y): + if x[-2:]=='00': + x = x[:-2] + "15" + if y[-2:]=='00': + y = y[:-2] + "15" + tx = parser.parse(x) + ty = parser.parse(y) + return (tx-ty).total_seconds() + return np.nan + +def time_diff_in_days(x, y): + s = time_diff_in_seconds(x, y) + if np.isnan(s) or (s == '0'): + return s + return round( s/ (60*60*24) ) + + +def time_diff_in_years(x, y): + s = time_diff_in_days(x, y) + if np.isnan(s) or (s == '0'): + return s + return round( s/ 365.24, 2 ) + +def time_min(x, y): + return time2str(min(parser.parse(x), parser.parse(y))) + +def time_max(x, y): + return time2str(max(parser.parse(x), parser.parse(y))) + +def add_days(x, nr_of_days): + # print(time2str(parser.parse(x))) + return time2str(parser.parse(x) + delta(days = nr_of_days)) + +def add_years(x, nr_of_years): + nr_of_days = 365.24 * nr_of_years + return add_days(x, nr_of_days) + +def add_years_to_full_year(x, nr_of_years): + x = x + '-07-02' + return add_years(x, nr_of_years) + + +""" --------------------------- + + two sample test + +------------------------- """ + + +def exp2sample_prob(nx, ny, sx, sy): + if sx == 0 or sy == 0 or nx == 0 or ny == 0: + return 0.5 + h = sy / (sx + sy) + return beta.cdf(h, ny, nx) + + +def exp2sample_logprob(nx, ny, sx, sy): + if sx == 0 or sy == 0 or nx == 0 or ny == 0: + return 0.5 + h = sy / (sx + sy) + return beta.logcdf(h, ny, nx) + + +def exp2sample_logsf(nx, ny, sx, sy): + if sx == 0 or sy == 0 or nx == 0 or ny == 0: + return 0.5 + h = sy / (sx + sy) + return beta.logsf(h, ny, nx) + + +""" --------------------------- + + fill in icd10 class codes + +------------------------- """ + + + + +def is_icd10_class(x): + if not isstr(x): + return False + if is_icd10(x): + return False + if len(x) < 3: + return False + if not x[0].isupper(): + return False + return x[1].isdigit() and x[2].isdigit() + +def icd10_fill(xs, s = '0*'): + if not isinstance(xs, str): + return xs + ys = '' + for x in xs.split(): + if is_icd10_class(x): + ys += ' ' + x[:3] + s + x[3:] + # print('\t', x, '\t', x[:3] + s + x[3:]) + else: + ys += ' ' + x + return ys[1:] diff --git a/datapreparation/diagnose.py b/datapreparation/diagnose.py new file mode 100644 index 0000000..1010f6e --- /dev/null +++ b/datapreparation/diagnose.py @@ -0,0 +1,472 @@ +#! 
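+
+# Editor's sketch: exp2sample_prob(nx, ny, sx, sy) above compares two
+# exponential rates through a single Beta cdf (if Gx ~ Gamma(a, 1) and
+# Gy ~ Gamma(b, 1), then Gy/(Gx+Gy) ~ Beta(b, a)). Editor's reading of the
+# value, assuming flat priors: the posterior probability that sample y has
+# the lower event rate. Invented numbers:
+from scipy.stats import beta
+
+nx, sx = 10, 50.0   # 10 events over 50 person-years, rate about 0.20
+ny, sy = 2, 40.0    #  2 events over 40 person-years, rate about 0.05
+print(beta.cdf(sy / (sx + sy), ny, nx))   # near 1: y is likely the slower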
/usr/bin/env python3 + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2013-03-26 version + + bonsai/src_v02/diagnose.py + processing the diagnosis data + + Notice: This file is not imported + using the name dia, since dia is + often used for a dataframe with + diagnosis data content + +------------------------------------""" + +import pandas as pd +import numpy as np +import copy + +import bonsai_io as bio +import common +import lexicon +import global_settings as gs +import merge + + + + + +""" ---------------------------------------- + + generate version 1 dia + +---------------------------------------""" + + + +def special(ICD10): + if common.isnull(ICD10): # not needed after filled in + return 0 + if 'G' in str(ICD10): + return 1 + if 'H' in str(ICD10): + return 1 + if 'K' in str(ICD10): + return 1 + if 'L' in str(ICD10): + return 1 + if 'O' in str(ICD10): + return 1 + if 'Q' in str(ICD10): + return 1 + if 'R' in str(ICD10): + return 1 + if 'T' in str(ICD10): + return 1 + if 'Z' in str(ICD10): + return 1 + if str(ICD10) == 'D469': + return 0 + if str(ICD10) == 'D761': + return 0 + if str(ICD10) == 'D459': + return 0 + if 'D' in str(ICD10): + return 1 + return 0 + + +def generate_dia(): + """ + constructing the file stored as generated_dia, + see places.py + """ + + xcols = ['ICD7', 'ICD9', 'text_C24_'] + + dia = bio.read_original_dia() + dia = dia.sort_values(by = xcols) + print('orig shape:', dia.shape) + + # (1) select the unique rows + dia = dia.drop_duplicates() + print('non duplicate shape:', dia.shape) + + # (2) select first diagnoses + dia = dia[dia['DiagnosNr_Diagnos'] == '1'] + dia = dia.drop(['DiagnosNr_Diagnos'], axis=1) + print('first dia shape:',dia.shape) + + # (3) fill in codes + dia = lexicon.fill_in_by_compiled_lex(dia) + dia = dia.drop(xcols, axis=1) + print('filled dia shape:',dia.shape) + + # (4) remove the special cases and the not needed columns + dia['special'] = dia['ICD10'].apply(special) + dia = dia[dia['special'] == 0] + dia = dia.drop(['special'], axis=1) + print('no special shape:',dia.shape) + + # (5) take care of numbers + if 'Diagnos_lder' in gs.places.diagnose_selection: + dia['Diagnos_lder'] = dia['Diagnos_lder'].apply(common.str2number) + + + return dia + + +def rm_dia_cols(dia): + cols = gs.places.diagnose_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add dia groups + +---------------------------------------""" + + + +def look_up_group_by_codes(groups, ICD10, SNOMED): + g = gs.names.input_data_group + row = groups[(groups['ICD10'] == ICD10) & (groups['SNOMED'] == SNOMED)] + if not row.empty: + return row[g].values[0] + return str(0) + +def look_up_group(df, groups): + g = look_up_group_by_codes(groups, df['ICD10'], df['SNOMED']) + return g + +def add_group(dia, groups): + g = gs.names.dia_group + dia[g] = dia.apply(lambda d: look_up_group(d, groups), axis = 1) + return dia + +def rm_group_col(dia): + g = gs.names.dia_group + dia = common.rmcol(dia, g) + return dia + + +""" ---------------------------------------- + + to one data frame df1 add columns + from another data frame df2 + + Note: in add_cols + df1[yc] = 0 when there is no entry x in df2 + df1[yc] = NaN when df2[x] = NaN + +---------------------------------------""" + +def look_up_entry(entry, df, entry_col, value_col): + dfe = df[df[entry_col] == entry] + if not dfe.empty: + return dfe[value_col].values[0] + return str(0) + +def add_cols(df1, df2, xc, ycs): + for yc in ycs: 
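+        # Editor's sketch of the lookup performed here, on invented rows:
+        #     df2 = pd.DataFrame({'LopNr': ['1'], 'K_n': ['pojke']})
+        #     look_up_entry('1', df2, 'LopNr', 'K_n')   -> 'pojke'
+        #     look_up_entry('9', df2, 'LopNr', 'K_n')   -> '0'  (no entry)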
+ df1[yc] = df1[xc].apply(lambda x:look_up_entry(x, df2, xc, yc)) + return df1 + + + +def look_up_or_zero(entry, df, entry_col, value_col, verb = False): + dfe = df[df[entry_col] == entry] + if not dfe.empty: + val = dfe[value_col].values[0] + if verb: + print('val =', val, type(val)) + if isinstance(val, str): + return val + return str(0) + +def add_cols_or_zero(df1, df2, xc, ycs, verb = False): + df1_copy = copy.copy(df1) + for yc in ycs: + df1_copy[yc] = df1[xc].apply(lambda x:look_up_or_zero(x, df2, xc, yc, verb)) + return df1_copy + +""" +def add_cols_or_zero(df1, df2, xc, ycs, verb = False): + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x:look_up_or_zero(x, df2, xc, yc, verb)) + return df1 +""" + +def look_up_aho(df, x, col, zero = False): + return df[col][x] if x in df.index and (not zero or isinstance(df[col][x], str)) else str(0) + +def add_cols_aho(df1, df2, xc, ycs, zero = False): + if zero: + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x: df2[yc][x] if x in df2.index and isinstance(df2[yc][x], str) else str(0)) + else: + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x: df2[yc][x] if x in df2.index else str(0)) + return df1 + +""" ---------------------------------------- + + add person data + +---------------------------------------""" + + +def add_pers(dia): + cols = gs.places.person_cols + copy_cols = copy.copy(cols) + copy_cols.remove('LopNr') + pers = bio.readperson() + dia = add_cols(dia, pers, 'LopNr', copy_cols) + return dia + +def add_pers_ohe(dia): + cols = gs.places.person_ohe + df = ones_x(dia, cols) + return df + +def rm_pers_cols(dia): + cols = gs.places.person_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add incare data (sluten vaard) + +---------------------------------------""" + +def add_incare(dia, nr = 0): + + xcol = 'LopNr' + ycol = gs.places.incare_ohe[0] # only one column is used so far + + inc = bio.readincare() + inc = inc.sort_values([xcol]).reset_index(drop=True) + if nr > 0: + inc = inc[0:nr] # an initial part of incare + L = list(dia[xcol].values) + inc = inc[ inc[xcol].isin(L) ] # part of inc with LopNr in dia + inc = name_compression(inc, xcol, ycol) # first letter set for each LopNr + dia = add_cols(dia, inc, xcol, [ycol]) # add compressed inc cols to dia + return dia + +# the following functions are just for test since the incare compression +# lists are merged with the nicare and causes lists before unfolding ohe + + +def add_incare_ohe(dia): + ycol = gs.places.incare_ohe[0] # only one column is used so far + dia = one_general(dia, ycol) # mk first letter one hot + return dia + +def rm_incare_cols(dia): + cols = gs.places.incare_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add nicare data (oppen vaard) + +---------------------------------------""" + + +def add_nicare(dia, nr = 0): + + xcol = 'LopNr' + ycol = gs.places.nicare_ohe[0] # only one column is used so far + + nic = bio.readnicare() + nic = nic.sort_values([xcol]).reset_index(drop=True) + if nr > 0: + nic = nic[0:nr] # an initial part of incare + L = list(dia[xcol].values) + nic = nic[ nic[xcol].isin(L) ] # part of nic with LopNr in dia + nic = name_compression(nic, xcol, ycol) # first letter set for each LopNr + dia = add_cols(dia, nic, xcol, [ycol]) # add compressed nic cols to dia + return dia + +# the following functions are just for test since the nicare compression +# lists are merged with the nicare and causes lists before unfolding ohe + +def 
add_nicare_ohe(dia): + ycol = gs.places.nicare_ohe[0] # only one column is used so far + dia = one_general(dia, ycol) # mk first letter one hot + return dia + +def rm_nicare_cols(dia): + cols = gs.places.nicare_ohe + dia = common.rmcols(dia, cols) + return dia + + +""" ---------------------------------------- + + add drug data + +---------------------------------------""" + + +def add_drug(dia): + cols = gs.places.drug_selection.copy() + cols.remove('LopNr') + drug = bio.readdrug() + dia = add_cols(dia, drug, 'LopNr', cols) + return dia + + +def add_drug_ohe(dia): + cols = gs.places.drug_ohe + df = ones_x(dia, cols) + return df + + +def rm_drug_cols(dia): + cols = gs.places.drug_ohe + dia = common.rmcols(dia, cols) + return dia + + + +""" ---------------------------------------- + + add one hot encodings for names column + with unique names (LopNr) and a single + code in each row in the codes column + +---------------------------------------""" + +""" + +def equal_str(a, b): + if not (isinstance(a, str) and isinstance(b, str)): + return 0 + return int(a == b) + +""" + + +def one(df, c): + for x in df[c].dropna().drop_duplicates(): + df[x] = (df[c] == x).astype(int) + return df + +""" +def one_x(df, c): + for x in df[c].drop_duplicates(): + if isinstance(x, str): + df[c + '_' + x] = df[c].apply(lambda z: equal_str(z, x)) + return df +""" + +def one_x(df, c): + for x in df[c].dropna().drop_duplicates(): + df[c + '_' + x] = (df[c] == x).astype(int) + return df + + + +def ones_x(df, cs): + for c in cs: + df = one_x(df, c) + return df + + +def to_int(x): + if isinstance(x, str): + return int(x) + if isinstance(x, int): + return x + return -1 + +def nr_sort(xs): + """ + sort a list of numbers on str type + """ + if not common.isarray(xs): + return [] + ixs = list(map(to_int, xs)) + ixs.sort() + xs = list(map(str, ixs)) + return xs + +def one_sorted(df, c): + xs = list(df[c].drop_duplicates()) + xs = nr_sort(xs) + for x in xs: + df[c + '_' + x] = (df[c] == x).astype(int) + return df + +def add_one_hot_groups(dia): + grp = gs.names.dia_group + dia = one_sorted(dia, grp) + return dia + + + +""" ---------------------------------------- + + add one hot encodings for names column with + non-unique names and possibly several space + separated codes in each row in the codes + column + +---------------------------------------""" + +def head(xs): + ys = [] + for x in xs: + if isinstance(x, str): + ys += [x[0]] + return ys + +def split_and_head(ys): + cs = [] + for y in ys.values: + if not common.isnull(y): + cs = np.append(cs, y.split()) + return np.unique(head(cs)) + + +def name_compression(df, xcol, ycol): + data = [] + xs = df[xcol].drop_duplicates() + for x in xs: + dx = df[ df[xcol] == x ] + ys = dx[ycol].drop_duplicates() + ys = split_and_head(ys) + data = data + [ [x] + [ys] ] + ds = pd.DataFrame(data) + ds.columns = [xcol, ycol] + return ds + + + +def unique_values(ys): + ys1 = [] + for y in ys.values: + ys1 = np.append(ys1, np.array(y)) + return np.unique(ys1) + +def in_col_list(d, col, y): + if isinstance(d[col], str): + return 0 + if common.isarray(d[col]): + return int(y in list(d[col])) + print('ERROR ? 
in_col_list: d[col] is neither a str or an array') + print('d[col] =', d[col], 'type =', type(d[col]),'\ty=', y) + return 0 + +def one_general(df, ycol): + # f = lambda d: ( int(y in d[ycol])) + ys = unique_values(df[ycol]) + for y in ys: + df[ycol + '_' +y] = df.apply(lambda d:in_col_list(d, ycol, y), axis = 1) + return df diff --git a/datapreparation/generate_bonsai.py b/datapreparation/generate_bonsai.py new file mode 100644 index 0000000..2bba939 --- /dev/null +++ b/datapreparation/generate_bonsai.py @@ -0,0 +1,95 @@ +import sys +import time +import pandas as pd +import numpy as np +from datetime import datetime as timeclass +from dateutil import parser +import copy + +import global_settings as gs +import bonsai_io as bio +import diagnose +import separate_tasks as sep +import total +import analyze +import lexicon +import common +import merge +import diagnose +import bonsai_time as btime + + +dia = diagnose.generate_dia() +dia = diagnose.add_pers(dia) +dia['DIA'] = 1 +dia.drop_duplicates('LopNr',keep='first',inplace=True) + +ctr = total.control_base() +b = gs.places.base_columns +c = ctr.columns +d = dia.columns +for col in list(set(b) - set(c)): + ctr[col] = np.nan + +for col in list(set(b) - set(d)): + dia[col] = np.nan + +base = pd.concat([dia[b], ctr[b]]) +base.set_index('LopNr',drop=False,inplace=True) + + +base = total.add_cause(base) + +inc = bio.readorigincare() +nic = bio.readorignicare() + +base = total.add_care(base, inc, 'first_incare', 'all_incare', n = 3, n2 = 0, n1 = 0) +base = total.add_care(base, nic, 'first_nicare', 'all_nicare', n = 3, n2 = 0, n1 = 0) +base = total.add_drugs(base, n=4) + + +base['DODSDAT'] = base['DODSDAT'].apply(str) +base = base[(base['VitalStatus'] != 'emigrated') | base['last_date'].notnull()] +base['K_n']['5251'] = 'pojke' + +base = total.add_no_event_time_death(base, 'no_event_time_death') + +for col in ['first_incare','all_incare','first_nicare','all_nicare','first_drug','all_drug','life_events']: + base[col] = base[col].apply(str) + +def fixdate00(t): + if t[-2:]=='00': + t = t[:-2] + "15" + return t + +base['DODSDAT'] = base['DODSDAT'].apply(fixdate00) + +base = total.add_time_differences(base, gs.incspan, 'event_time_inc', 'first_incare') +base = total.add_time_differences(base, gs.nicspan, 'event_time_nic', 'first_nicare') +base = total.add_time_differences(base, gs.drugspan, 'event_time_drug', 'first_drug') + +base = total.add_all_timedifferences(base, gs.incspan, 'all_event_time_inc', 'all_incare') +base = total.add_all_timedifferences(base, gs.nicspan, 'all_event_time_nic', 'all_nicare') +base = total.add_all_timedifferences(base, gs.drugspan, 'all_event_time_drug', 'all_drug') + +base = total.add_censored_times(base) +base = total.add_other_str(base) + +gs.places.surgery = gs.places.local_input + 'kodad_kirurgi_20200318.csv' +sur=bio.readsurgery() +sur = sur.sort_values('LopNr').reset_index(drop=True) +sur['surgery'] = sur.apply(total.surg_codes, axis = 1) +sur = common.compression_with_dates(sur, 'LopNr', 'surgery', 'Datum') +cols = list(sur.columns) +cols.remove('LopNr') +base = common.add_list_cols(base, sur, 'LopNr', cols) +base['surgery_diff'] = base.apply(total.surgery_diff, axis = 1) + +treat = bio.readtreatment() +base = total.add_cytostatica(base, treat) +base = total.add_stemcell(base, treat) +base = total.add_radio(base, treat) + +base = total.modify_total(base) + +bio.save_generated_file(base, "total_210522") diff --git a/datapreparation/global_settings.py b/datapreparation/global_settings.py new file mode 100644 index 
0000000..4c5188e --- /dev/null +++ b/datapreparation/global_settings.py @@ -0,0 +1,63 @@ +#! /home/jan/anaconda3/bin/python + + + +""" ------------------------------- + + Copyright (C) 2019 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/global_settings.py + +------------------------------------""" + + + +import places + + +class long_names: + input_data_group = 'group' + dia_group = 'Diagnostic_Group' + out_p1 = 'outcome_p1_2019_12_05' + out_p2 = 'outcome_p2_2019_10_16' + outcome_file = 'outcome_2019_12_05' + timediff_file = 'timediff_2019_12_09' + treat_file = 'treat_2019_05_17' + total_file = 'total_2020_05_06' + +class short_names: + input_data_group = 'group' + dia_group = 'group' + outcome_file = 'outcome' + timediff_file = 'timediff' + treat_file = 'treat' + total_file = 'total' + + +names = long_names + + + +outcome_timegap = 0 # 1826 # (nr of days in 5 years) +mean_diagnosis_age = 9.42 # (years, converted to days: d = y * 365.25) + + +diaspan = ['1970-02-15', '2015-12-29'] +incspan = ['1970-01-12', '2016-12-30'] # sluten vard, changed 2019-12-04 +lifespan = ['1952-01-02', '2017-12-15'] +nicspan = ['1997-01-02', '2016-12-31'] +drugspan = ['2005-06-28', '2017-12-31'] + +future = '5000-01-01' # any future date + + +""" +incare span: 1970-01-12 2016-12-30 +nicare span: 1997-01-02 2016-12-31 +dia span: 1970-02-15 2015-12-29 +oc span: 1961-12-02 2024-12-01 +life span: 1975-11-02 2017-12-15 +drug span: 2005-06-28 2017-12-31 +""" diff --git a/datapreparation/lexicon.py b/datapreparation/lexicon.py new file mode 100644 index 0000000..23bf478 --- /dev/null +++ b/datapreparation/lexicon.py @@ -0,0 +1,191 @@ +#! /home/jan/anaconda3/bin/python + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2013-03-26 version + + bonsai/src/lexicon.py + +------------------------------------""" + + + +import pandas as pd +import numpy as np +import bonsai_io as bio +import common +import copy + + +xcols = ['ICD7', 'ICD9', 'text_C24_'] +ycols = ['ICD10', 'SNOMED'] +lex_cols = xcols + ycols +tw_cols = ['tw_icd10', 'tw_snomed'] +dia_cols = ['LopNr'] + lex_cols + + + + +""" + + 1 Common lexicon support for translation from + ICD7, ICD9 to ICD10 and SNOMED + +""" + + +def lookup(x, lex): + for z in lex.values: + if (x[0] == z[0]) and (x[1] == z[1]) and common.isnull(x[2]) and common.isnull(z[2]): + return z[3:5] + if (x[0] == z[0]) and (x[1] == z[1]) and (x[2] == z[2]): + return z[3:5] + return [np.nan, np.nan] + + +def translate(df, lex): + x = df[xcols] # change dfrom df[1:4] + y = lookup(x, lex) + return y + + + +""" + + 2 Apply lexicon for translation from + ICD7, ICD9 to ICD10 and SNOMED + +""" + + +def apply_lex(df, lex, label): + icd10 = 'icd10'+ label + snomed = 'snomed'+ label + df[icd10] = df.apply(lambda d: translate(d, lex)[0], axis = 1) + df[snomed] = df.apply(lambda d: translate(d, lex)[1], axis = 1) + return df + + +def select_icd10(df, icd10): + if common.isnull(df['ICD10']): + return df[icd10] + return df['ICD10'] + +def select_snomed(df, snomed): + if common.isnull(df['SNOMED']): + return df[snomed] + return df['SNOMED'] + +""" + + 3 Apply saved lexicon + +""" + + + +def fill_in_by_compiled_lex(df): + lex = bio.read_code_lex() + label = 'fill' + icd10 = 'icd10'+ label + snomed = 'snomed'+ label + df = apply_lex(df, lex, label) + df['ICD10'] = df.apply(lambda d:select_icd10(d, icd10), axis = 1) + df['SNOMED'] = df.apply(lambda d:select_snomed(d, snomed), axis = 1) + df = df.drop([icd10, snomed], axis=1) + 
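+    # Editor's sketch of the fallback implemented above, invented values:
+    # select_icd10 keeps an existing code and only falls back on the
+    # lexicon translation when the original is missing:
+    #     ICD10    icd10fill         resulting ICD10
+    #     'C911'   'C910'       ->   'C911'  (original kept)
+    #     NaN      'C910'       ->   'C910'  (filled from the lexicon)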
return df + + + +""" + + 4 Prune + +""" + +no_code = ['finns ej', 'finns ej som icd8kod', 'oklar kod'] + +def is_icd10(x, verb = False): + if len(x) < 3: + if verb: print(x, 'has less than 3 chars') + return False + if not x[0].isupper(): + if verb: print('\'' + x[0] + '\'' + ' is not an upper case letter') + return False + if not x[1].isdigit(): + if verb: print('\'' + x[1] + '\'' + ' is not a digit') + return False + if not x[2].isdigit(): + if verb: print('\'' + x[2] +'\'' + ' is not a digit') + return False + if len(x) > 6: + if verb: print(x, 'has more than 6 chars') + return False + return True + + +def prune_icd10(c): + if not isinstance(c, str) or c in no_code: + return np.nan + code = c + if code == 'F': + code = 'F00' + if code == 'O': + code = 'O50' + if code == 'S': + code = 'S83' + code = code.replace('.', '') + code = code.replace(' ', '') + code = code[:3] + # if not is_icd10(c, verb = True): + if not is_icd10(c): + print('\t' + c + '\t replaced by \t' + code) + return code + +def prune_icd10_lex(df, col): + df = df[df.columns[:2]] + df.columns = [col, 'icd10'] + dfc = copy.copy(df) + dfc['icd10'] = df['icd10'].apply(prune_icd10) + dfc= dfc.sort_values(col).dropna().reset_index(drop=True) + return dfc + +def prune_icd10_lex_files(raw, lex, code): + df = pd.read_csv(raw, sep=',', dtype = str) + + print('File:' + lex + '\n\tspecial icd10 replacements') + print() + df = prune_icd10_lex(df, code) + print('saving file to:') + print('\t', lex) + df.to_csv(lex, index = False, sep = '\t') + + +def add_brackets(x): + x = str(x) + x = x.replace(' ', '') + x = x.replace(',', '\',\'') + s = '[\'' + x + '\']' + return s + + +def add_brackets_to_file(f1, f2, col): + df = pd.read_csv(f1, sep='\t', dtype = str) + df[col] = df[col].apply(add_brackets) + df.to_csv(f2, index = False, sep = '\t') + + +""" +def prune_icd10_lex_filesx(raw, lex, code): + df = pd.read_csv(raw, sep=',', dtype = str) + c1 = df.columns[1] + df[c1] = df[c1].apply(str) + print(df[0:3]) + df = df[df.columns[:2]] + print(df[0:3]) +""" + + diff --git a/datapreparation/merge.py b/datapreparation/merge.py new file mode 100644 index 0000000..7b5b969 --- /dev/null +++ b/datapreparation/merge.py @@ -0,0 +1,73 @@ +#! 
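+
+# Editor's sketch: prune_icd10() above normalises a raw lexicon entry to a
+# bare three-character ICD10 class -- apply the hard-coded single-letter
+# repairs, drop dots and blanks, truncate. Restated compactly (and without
+# the NaN/no_code guard) on invented inputs:
+def _prune(code):
+    fixes = {'F': 'F00', 'O': 'O50', 'S': 'S83'}
+    code = fixes.get(code, code)
+    return code.replace('.', '').replace(' ', '')[:3]
+
+print(_prune('C91.1'))   # 'C91'
+print(_prune('S'))       # 'S83'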
/usr/bin/env python3 + + +""" ------------------------------- + + merge.py + + Copyright (C) 2018 RISE + This code was produced by RISE + + First version created 2013-04-05 + based on code from other files + + bonsai/src_v02/merge.py + +------------------------------------""" + +import pandas as pd +import numpy as np + +import bonsai_io as bio +import common + +import global_settings as gs + + + + + +""" ---------------------------------------- + + to one data frame df1 add columns + from another data frame df2 + +---------------------------------------""" + +def look_up_entry(entry, df, entry_col, value_col): + dfe = df[df[entry_col] == entry] + if not dfe.empty: + return dfe[value_col].values[0] + return str(0) + +def add_cols(df1, df2, xc, ycs): + for yc in ycs: + df1[yc] = df1[xc].apply(lambda x:look_up_entry(x, df2, xc, yc)) + return df1 + + +""" ---------------------------------------- + + add person data + +---------------------------------------""" + + +def add_pers(df): + cols = gs.places.person_selection.copy() + cols.remove('LopNr') + pers = bio.readperson() + df = add_cols(df, pers, 'LopNr', cols) + # cols = gs.places.person_ohe + return df + +def add_pers_ohe(df): + cols = gs.places.person_ohe + df = ones_x(df, cols) + return df + +def rm_pers_cols(df): + cols = gs.places.person_ohe + df = common.rmcols(df, cols) + return df + diff --git a/datapreparation/separate_tasks.py b/datapreparation/separate_tasks.py new file mode 100644 index 0000000..f8770ee --- /dev/null +++ b/datapreparation/separate_tasks.py @@ -0,0 +1,175 @@ +#! /usr/bin/env python3 + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + + bonsai/src_v02/separate_tasks.py + + +------------------------------------""" + +import numpy as np + +from datetime import datetime as timeclass +from dateutil import parser +import pandas as pd + +import bonsai_io as bio +import bonsai_time as btime +import common + + + +""" ------------------------------- + + finding zero dia group codes + +------------------------------------""" + +def zero_grp(): + base = total.base_set() + base = base[base[gs.names.dia_group] == '0'] + bio.save_tmp_file(base, 'base_grp_0') + + + + +""" ------------------------------- + + mean diagnosis age + +------------------------------------""" + +def mean_diagnosis_age(): + da = 'Diagnos_lder' + dia = bio.read_generated_dia() + return dia[da].mean() + + +""" ------------------------------- + + finding the data time spans + +------------------------------------""" + + +def spans(): + care_col = 'INDATUM' + dia_col = 'DiagnosDat' + cause_col = 'DODSDAT' + drug_col = 'EDATUM' + + inc = bio.readincare() + t_inc = inc[care_col].dropna() + t_inc = t_inc.sort_values() # .apply(parser.parse) + + nic = bio.readnicare() + t_nic = nic[care_col].dropna() + t_nic = t_nic.sort_values() # .apply(parser.parse) + + dia = bio.read_generated_dia() + t_dia = dia[dia_col].dropna() # .apply(parser.parse) + + oc = bio.readoutcome() + t_oc = oc[dia_col].dropna() # .apply(parser.parse) + + oc = oc[oc[cause_col] != 0] + t_life = oc[cause_col].dropna().apply(str).apply(parser.parse).apply(common.time2str) + # t_life = t_life[t_life[cause_col] != '0'] + + drug = bio.readdrug() + t_drug = drug[drug_col].dropna()# .apply(parser.parse).apply(common.time2str) + + + print() + print('incare span: ', t_inc.min(), '\t', t_inc.max()) + print('nicare span: ', t_nic.min(), '\t', t_nic.max()) + print('dia span: ', t_dia.min(), '\t', t_dia.max()) + print('oc span: ', t_oc.min(), '\t', 
t_oc.max()) + print('life span: ', t_life.min(), '\t', t_life.max()) + print('drug span: ', t_drug.min(), '\t', t_drug.max()) + print() + + #t = list(inc[care_col].values) + + +""" ------------------------------- + + generation of diagnosis codes lexicon for incare + +------------------------------------""" + +xdir = '/home/jan/bonsai/data/extracted/' +ks68_file = xdir + 'diagnoskoder_20190425_sv_70_86.csv' +icd9_file = xdir + 'diagnoskoder_20190425_sv_86_97.csv' + + +def heads_list(c): + nc = [] + for ci in c: + if common.isstr(ci): + nc += [ci[0]] + return nc + +def icd10_heads_list(df): + c1 = df['ICD10_1'] + c2 = df['ICD10_2'] + c = [c1] + [c2] + return heads_list(c) + +def gen_incare_lex(): + c1 = 'ICD10_1' + c2 = 'ICD10_2' + cols = ['code', c1, c2] + f1 = ks68_file + f2 = icd9_file + df1 = pd.read_csv(f1, dtype = str, sep='\t') + df2 = pd.read_csv(f2, dtype = str, sep='\t') + df2 = df2[ ~df2[c1].isnull() | ~df2[c2].isnull() ] + df1.columns = cols + df2.columns = cols + + df1['ICD10'] = df1.apply(icd10_heads_list, axis = 1) + df2['ICD10'] = df2.apply(icd10_heads_list, axis = 1) + df = pd.concat([df1, df2]) + df = common.rmcols(df, [c1, c2]) + + bio.save_local_input(df, 'incare_lex') + + + +""" ------------------------------- + + generation of diagnosis codes lexicon for causes + +------------------------------------""" + + +xdir = '/home/jan/bonsai/data/extracted/' +cause_file = xdir + 'diagnoskoder_20190425_orsak.csv' + + +def gen_cause_lex(): + + c1 = 'ICD10_1' + c2 = 'ICD10_2' + cols = ['code', c1, c2] + + f = cause_file + df = pd.read_csv(f, dtype = str, sep=',') + df = df[ ~df[c1].isnull() | ~df[c2].isnull() ] + df.columns = cols + df['ICD10'] = df.apply(icd10_heads_list, axis = 1) + df = common.rmcols(df, [c1, c2]) + + df8 = df[ df['code'].str.contains('#')] + df9 = df[ ~df['code'].str.contains('#')] + + df8.is_copy = False + df8['code'] = df8['code'].str[1:] + + bio.save_local_input(df8, 'cause_icd8_lex') + bio.save_local_input(df9, 'cause_icd9_lex') diff --git a/datapreparation/total.py b/datapreparation/total.py new file mode 100755 index 0000000..62d20ec --- /dev/null +++ b/datapreparation/total.py @@ -0,0 +1,1304 @@ +#! 
/home/jan/anaconda3/bin/python + + + +""" ------------------------------- + + Copyright (C) 2018 RISE + This code was produced by RISE + The 2019-08-29 version + + bonsai/src/total.py + + +------------------------------------""" + +# global imports + +import pandas as pd +import numpy as np +from datetime import datetime as timeclass +from dateutil import parser +import copy + +# local imports + +import global_settings as gs +import bonsai_io as bio +import common +import merge +import diagnose +import bonsai_time as btime + + +# import time +import analyze + + +""" ---------------------------------------- + + 1 dia + +---------------------------------------""" + +def dia_base(): + dia = bio.read_generated_dia() + grp = bio.read_diagroups() + dia = diagnose.add_group(dia, grp) + # dia = common.rmcols(dia, ['ICD10', 'SNOMED']) # real + dia = common.rmcols(dia, ['SNOMED']) # real + dia = diagnose.add_pers(dia) + dia['DIA'] = 1 + return dia + + +""" ---------------------------------------- + + 2 control group and base set + +---------------------------------------""" + + + +def control_base(): + ctr = bio.readcontrol() + ctr['DIA'] = 0 + m = gs.mean_diagnosis_age + f = lambda x: common.add_years_to_full_year(x, m) + ctr['DiagnosDat'] = ctr['FoddAr'].apply(f) + return ctr + + +def base_set(): + dia = dia_base() + ctr = control_base() + + b = gs.places.base_columns # real + c = ctr.columns + d = dia.columns + + """ + print('base 1') + for z in c: print ('\t', z) + print() + print('base 2') + for z in d: print ('\t', z) + print() + print('base 3') + for z in b: print ('\t', z) + print() + """ + + + for col in list(set(b) - set(c)): + ctr[col] = np.nan + for col in list(set(b) - set(d)): + dia[col] = np.nan + + + + + ctr = ctr[b] + dia = dia[b] + + return pd.concat([dia, ctr]) + + + +""" ---------------------------------------- + + 3 cause + +---------------------------------------""" + +def add_cause(df): # 0 for empty list is ok! 
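+    # Editor's sketch of the synthetic control-group DiagnosDat built in
+    # control_base() above: birth year anchored at mid-year plus the mean
+    # diagnosis age of 9.42 years (gs.mean_diagnosis_age), e.g.
+    #     common.add_years_to_full_year('1990', 9.42)
+    #     -> parse('1990-07-02') + 9.42 * 365.24 days -> '1999-12-02'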
+ cols = gs.places.cause_added_cols # 'ICD', 'life_events', 'DODSDAT' + df_icd10 = df[['LopNr', 'ICD10']] # The LopNr and the recidive code + cause = bio.readcause(df_icd10) # don't read the recidive class causes + df = diagnose.add_cols(df, cause, 'LopNr', cols) + + c0 = 'DiagnosDat' + c1 = 'DODSDAT' + # df['Life(s)'] = df.apply(lambda d: date_diff_in_seconds(d, c1, c0), axis = 1) + # df['Life(days)'] = df['Life(s)'] / (60*60*24) + df['life'] = df.apply(lambda d: date_diff_in_days(d, c1, c0), axis = 1) + + return df + + +""" ---------------------------------------- + + 4 care + +---------------------------------------""" + + +def first_list(df1, df2, fcol, lex8, lex9, xcol = 'LopNr', ycol = 'DIAGNOS', zcol = 'INDATUM', split = True, n = 1,): + df2 = btime.first_time_aho(df2, df1, xcol, ycol, zcol, fcol, lex8, lex9, split = split, n = n) + return df2 + + +def all_list(df1, df2, acol, lex8, lex9, xcol = 'LopNr', ycol = 'DIAGNOS', zcol = 'INDATUM', split = True, n = 1,): + df2 = btime.all_times_aho(df2, df1, xcol, ycol, zcol, acol, lex8, lex9, split = split, n = n) + return df2 + + +def add_care(df, care, fcol, acol, n = 1, n2 = 0, n1 = 0): + xcol = 'LopNr' + + if n2 > 0: + care = care[n1:n2] # an initial part of incare + + lex8 = bio.read_care_icd8_lex() + lex9 = bio.read_care_icd9_lex() + + + #print('add ICD10 & DiagnosDat') + #care = diagnose.add_cols_aho(care, df, xcol, ['ICD10','DiagnosDat'], True) + + L = list(df[xcol].values) + care = care[ care[xcol].isin(L) ] # part of care with LopNr in df + + print(' creating first_list') + f = first_list(df, care, fcol, lex8, lex9, n = n) + + print(' adding first_list') + df = common.add_list_cols(df, f, xcol, [fcol]) + + print(' creating all_list') + a = all_list(df, care, acol, lex8, lex9, n = n) + + print(' adding all_list') + df = common.add_list_cols(df, a, xcol, [acol]) + return df + +def add_incare(df, n = 1, n2 = 0, n1 = 0): + fcol = 'first_incare' + acol = 'all_incare' + inc = bio.readincare() + df = add_care(df, inc, fcol, acol, n = n, n2 = n2, n1 = n1) + return df + + +def add_nicare(df, n = 1, n2 = 0, n1 = 0): + fcol = 'first_nicare' + acol = 'all_nicare' + nic = bio.readnicare() + df = add_care(df, nic, fcol, acol, n = n, n2 = n2, n1 = n1) + return df + +""" ---------------------------------------- + + 5 drugs + +---------------------------------------""" + + +def add_drugs(df, n = 1, n2 = 0, n1 = 0): + xcol = 'LopNr' + fcol = 'first_drug' + acol = 'all_drug' + ycol = 'ATC' + zcol = 'EDATUM' + drug = bio.readdrug(n) # read n characters of the ATC code !!! 
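+    # Descriptive note (added): further down, lex8/lex9 are passed as False
+    # because ATC drug codes need no ICD8/ICD9 translation, and n1:n2
+    # (when n2 > 0) restricts the run to a slice of the drug register,
+    # which is useful for quick test runs.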
+
+    print('drug.shape', drug.shape)
+    if n2 > 0:
+        drug = drug[n1:n2]   # an initial part of drugs
+
+    #ilex = []
+    #if n == 1:
+    #    ilex = bio.read_incare_lex()
+
+
+    # Instead of letting both first_list and all_list add the columns
+    # 'ICD10' and 'DiagnosDat', it should be done here, with
+    # df = df1 and drug = df2
+
+    L = list(df[xcol].values)
+    drug = drug[ drug[xcol].isin(L) ]   # part of drug with LopNr in df
+
+    #print('add ICD10 & DiagnosDat')
+    #drug = diagnose.add_cols_aho(drug, df, xcol, ['ICD10','DiagnosDat'], True)
+
+    print(' creating first_list')
+    f = first_list(df, drug, fcol, False, False, xcol=xcol ,ycol=ycol, zcol=zcol, n = n)
+
+    print(' creating all_list')
+    a = all_list(df, drug, acol, False, False, xcol=xcol , ycol=ycol, zcol=zcol, n = n)
+
+    print(' adding first_list')
+    df = common.add_list_cols(df, f, xcol, [fcol])
+
+    print(' adding all_list')
+    df = common.add_list_cols(df, a, xcol, [acol])
+
+    # drug.columns = ['LopNr', 'DIAGNOS', 'INDATUM']
+    # drug = first_list(df, drug, fcol, split = False)
+    # drug = first_list(df, drug, fcol, xcol=xcol , ycol=ycol, zcol=zcol)
+    # df = common.add_list_cols(df, drug, xcol, [fcol])
+
+    return df
+
+
+""" ----------------------------------------
+
+    care and cause merged
+
+    not used in the present version
+
+---------------------------------------"""
+
+
+
+def concat(d, cs):
+    L = []
+    for c in cs:
+        x = d[c]
+        if common.isarray(x):
+            L += list(x)
+    return list(np.unique(L))
+
+
+def merge_care_and_cause(df, diacol):
+    nr = 0
+    sv = 'DIA_SV'
+    ov = 'DIA_OV'
+    cc = 'life_events'
+    cols = [sv, ov, cc]
+    df = diagnose.add_incare(df, nr)
+    df.columns = [sv if x=='DIAGNOS' else x for x in df.columns]
+    df = diagnose.add_nicare(df, nr)
+    df.columns = [ov if x=='DIAGNOS' else x for x in df.columns]
+    df = add_cause(df)
+    # df[diacol] = df.apply(lambda d:concat(d, cols), axis = 1)
+    # df = common.rmcols(df, cols)
+    # print(df.columns)
+    return df
+
+""" ----------------------------------------
+
+    6 outcome
+
+    The outcome relates the input data to the output data.
+    The output is given by care, cause and drug.
+    outcome_01 gives us the time of cause, if it occurred, and the
+    first time of institutional care for each class of cause and
+    cause diagnosis, as given by the first letter of the ICD10 code.
+
+
+    obs!!! the times produced by listdiff_*** are obsolete
+    and not used, so the assignments of base['Inc(s)'] and
+    base['Inc(days)'] in outcome_01 may be removed.
+    They are replaced by the values given to df[ecol] and
+    df[ne] in add_time_differences.
+
+---------------------------------------"""
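+
+# A hedged sketch (values invented, date format as in the registers) of the
+# event-list encoding assumed below: a 'first_*' column holds a stringified
+# list of [icd10_class, event_date] pairs, e.g.
+#
+#   f_str = "[['C', '20010315'], ['J', '20030102']]"
+#   common.str2list(f_str)   # -> [['C', '20010315'], ['J', '20030102']]
+#
+# '[]' and '0' both mark persons without any registered event.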
+
+def date_diff_in_seconds(df, c1, c0):
+    return common.time_diff_in_seconds(df[c1], df[c0])
+
+def date_diff_in_days(df, c1, c0):
+    return common.time_diff_in_days(df[c1], df[c0])
+
+def date_diff_in_years(df, c1, c0):
+    return common.time_diff_in_years(df[c1], df[c0])
+
+def list_diff_in_seconds(df, c1, c0):
+    inc_list = df[c1]
+    if not isinstance(inc_list, list):
+        return inc_list
+    L = []
+    dtime = df[c0]
+    for z in inc_list:
+        if not isinstance(z, list):
+            print (z)
+        else:
+            diff = common.time_diff_in_seconds(z[1], dtime)
+            L = L + [[z[0], diff]]
+    return L
+
+def list_diff_in_days(df, c1, c0):
+    inc_list = df[c1]
+    if not isinstance(inc_list, list):
+        return inc_list
+    L = []
+    dtime = df[c0]
+    for z in inc_list:
+        if not isinstance(z, list):
+            print (z)
+        else:
+            diff = common.time_diff_in_days(z[1], dtime)
+            L = L + [[z[0], diff]]
+    return L
+
+
+
+def outcome(n = 1, save = False, n2 = 0, n1 = 0):
+    # n is the number of considered characters of the icd10 code
+
+    name = gs.names.outcome_file
+    base = base_set()
+    base = add_cause(base)                             # recidiv not added
+    print('\n add incare\n')
+    base = add_incare(base, n = n, n2 = n2, n1 = n1)   # recidiv not added
+    print('\n add nicare\n')
+    base = add_nicare(base, n = n, n2 = n2, n1 = n1)   # recidiv not added
+    base = add_drugs(base, n = n, n2 = n2, n1 = n1)    # just a single letter for each order
+
+    if save:
+        bio.save_generated_file(base, name)
+
+    return base
+
+# Instead of running all of outcome at once, we may divide it into parts,
+# storing and reading each partial computation.
+
+
+def out_p1(n = 3, n2 = 0, n1 = 0):
+
+    out_p1 = gs.names.out_p1
+    base = base_set()
+    base = add_incare(base, n = n, n2 = n2, n1 = n1)
+    base = add_nicare(base, n = n, n2 = n2, n1 = n1)
+    bio.save_generated_file(base, out_p1)
+
+
+# The drugs are not used at present, so we may skip them.
+
+def out_no_drugs(n = 3, n2 = 0, n1 = 0):
+    out_p1 = gs.names.out_p1
+    out_file = gs.names.outcome_file
+    df = bio.read_generated_file(out_p1)
+    df['LopNr'] = df['LopNr'].apply(str)
+    df = add_cause(df)
+    bio.save_generated_file(df, out_file)
+
+# With the drugs we have three parts.
+
+def out_p2(n = 3, n2 = 0, n1 = 0):
+
+    out_p1 = gs.names.out_p1
+    out_p2 = gs.names.out_p2
+    df = bio.read_generated_file(out_p1)
+    df['LopNr'] = df['LopNr'].apply(str)
+    df = add_drugs(df, n = n, n2 = n2, n1 = n1)
+    bio.save_generated_file(df, out_p2)
+
+def out_p3(n = 3, n2 = 0, n1 = 0):
+    out_p2 = gs.names.out_p2
+    out_file = gs.names.outcome_file
+    df = bio.read_generated_file(out_p2)
+    df['LopNr'] = df['LopNr'].apply(str)
+    df = add_cause(df)
+    bio.save_generated_file(df, out_file)
+
+
+""" ----------------------------------------
+
+    7 time differences
+
+---------------------------------------"""
+
+
+
+def time_differences(df, span, fcol):
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+
+    [r1, r2] = span
+    p1 = common.add_days(dia, gap)
+    p2 = df['DODSDAT']
+    p3 = df['last_date']
+    f_str = df[fcol]
+    emigrated = df['VitalStatus'] == 'emigrated'
+
+    start = common.time_max(r1, p1)
+    ne_end = r2
+    if p2 != '0':
+        ne_end = common.time_min(p2, r2)
+    elif emigrated and common.isstr(p3):   # zzz
+        ne_end = common.time_min(p3, r2)
+
+
+    no_event_time = common.time_diff_in_days(ne_end, start)
+    no_event_time = max(0, no_event_time)
+    event_times = []
+
+    if f_str == '[]' or f_str == '0':
+        return [event_times, no_event_time]
+
+    f_list = common.str2list(f_str)
+
+    for event in f_list:
+        [icd10_class, event_date] = event
+        class_time = common.time_diff_in_days(event_date, start)   # cannot be negative !
+        if class_time < 0: print('event_time < 0')
+        event_times += [[icd10_class, class_time]]
+
+    return [event_times, no_event_time]
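+
+# Example (numbers invented): time_differences returns a pair such as
+#
+#   [[['C', 412], ['J', 95]], 1680]
+#
+# i.e. the events as [icd10_class, days from start] and, second, the
+# no-event time in days from start to death, emigration or span end.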
+
+
+def no_event_time_death(df):
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+    [r1, r2] = gs.lifespan   # register start and end
+    p1 = common.add_days(dia, gap)
+    p2 = df['DODSDAT']       # DODSDAT must already have been converted to a str here
+    p3 = df['last_date']
+    emigrated = df['VitalStatus'] == 'emigrated'
+
+    if not common.isstr(p2):
+        print('DODSDAT needs to be converted to a str')
+
+    start = common.time_max(r1, p1)   # with a complete register it's p1
+    end = r2
+    if p2 != '0':
+        end = p2
+
+    elif emigrated and common.isstr(p3):   # zzz
+        end = common.time_min(p3, r2)
+    time = common.time_diff_in_days(end, start)
+    return max(0, time)
+
+
+
+def add_no_event_time_death(df, scol):   # scol = 'no_event_time_death'
+    df[scol] = df.apply(no_event_time_death, axis = 1)
+    return df
+
+def add_time_differences(df, span, ecol, fcol):   # zzz
+    ne = 'no_' + ecol
+    df['tmp'] = df.apply(lambda d: time_differences(d, span, fcol), axis = 1)
+    df[ecol] = df['tmp'].apply(lambda x: x[0])
+    df[ne] = df['tmp'].apply(lambda x: x[1])
+    df = df.drop(['tmp'], axis=1)
+    return df
+
+def all_timedifferences(df, span, acol):
+
+    if isinstance(df[acol], str):
+        a = common.str2list(df[acol])
+    else:
+        a = []
+        print("Bad string: ", df[acol])
+
+    if a == []:
+        return []
+
+    dia = df['DiagnosDat']
+    gap = gs.outcome_timegap
+    gap_date = common.add_days(dia, gap)
+    [start, end] = span
+    z = common.time_max(gap_date, start)
+    data = []
+
+    for x in a:
+        # y = x[0]
+        dates = x[1]
+        dates.sort()
+        diffs = []
+
+        for date in dates:
+            diff = common.time_diff_in_days(date, z)
+            diffs += [diff]
+        data += [[x[0], diffs]]
+
+    return data
+
+
+
+def add_all_timedifferences(df, span, dcol, acol):
+    df[dcol] = df.apply(lambda d: all_timedifferences(d, span, acol), axis = 1)
+    return df
+
+
+def censored_time(df, spanstart):
+    """
+    censored time = max(0, min(r1, p2, p3) - p1)
+    """
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+
+    r1 = spanstart
+    p1 = common.add_days(dia, gap)
+    p2 = df['DODSDAT']
+    p3 = df['last_date']
+    emigrated = df['VitalStatus'] == 'emigrated'
+
+    # start = p1
+    end = r1
+    if p2 != '0':
+        end = common.time_min(p2, r1)
+
+    elif emigrated and common.isstr(p3):   # zzz
+        end = common.time_min(p3, r1)
+
+    time = common.time_diff_in_days(end, p1)
+
+    return max(0, time)
+
+
+
+
+def add_censored_times(df):
+    [incstart, incend] = gs.incspan
+    [nicstart, nicend] = gs.nicspan
+    [drugstart, drugend] = gs.drugspan
+    df['censored_time_inc'] = df.apply(lambda d: censored_time(d, incstart), axis = 1)
+    df['censored_time_nic'] = df.apply(lambda d: censored_time(d, nicstart), axis = 1)
+    df['censored_time_drug'] = df.apply(lambda d: censored_time(d, drugstart), axis = 1)
+    df['censored_time_death'] = 0
+    return df
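+
+# Worked example (dates invented, format as in the registers) of the
+# censored_time formula max(0, min(r1, p2, p3) - p1): with span start
+# r1 = '20051231', p2 = '0' (still alive), p3 irrelevant for a
+# non-emigrant, and p1 = DiagnosDat + gap = '20010510', the censored
+# time is the number of days from '20010510' to '20051231'.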
+
+
+def timediff():
+    name = gs.names.outcome_file
+
+    ispan = gs.incspan
+    nspan = gs.nicspan
+    dspan = gs.drugspan
+
+    ficol = 'first_incare'
+    fncol = 'first_nicare'
+    fdcol = 'first_drug'
+
+    eicol = 'event_time_inc'
+    encol = 'event_time_nic'
+    edcol = 'event_time_drug'
+
+    aicol = 'all_incare'
+    ancol = 'all_nicare'
+    adcol = 'all_drug'
+
+    dicol = 'diff_incare'
+    dncol = 'diff_nicare'
+    ddcol = 'diff_drug'
+
+    scol = 'no_event_time_death'
+    df = bio.read_generated_file(name)
+
+    # 'DODSDAT' is read as an int
+    df['DODSDAT'] = df['DODSDAT'].apply(str)
+
+    # The three emigrated persons without a last date are removed
+    df = df[(df['VitalStatus'] != 'emigrated') | df['last_date'].notnull()]
+
+    df = add_no_event_time_death(df, scol)
+    df = add_time_differences(df, ispan, eicol, ficol)
+    df = add_time_differences(df, nspan, encol, fncol)
+
+    df = add_censored_times(df)
+    # df = add_time_differences(df, dspan, edcol, fdcol)
+
+    # df = add_all_timedifferences(df, ispan, dicol, aicol)
+    # df = add_all_timedifferences(df, nspan, dncol, ancol)
+    # df = add_all_timedifferences(df, dspan, ddcol, adcol)
+
+
+    #df = df[gs.places.out_selection_01]
+    df = add_other_str(df)
+    bio.save_generated_file(df, gs.names.timediff_file)
+
+
+
+""" ----------------------------------------
+
+    8 counts and frequencies
+    (replaced by 9)
+
+---------------------------------------"""
+
+
+def capitals():
+    c = []
+    for x in range(65, 91):
+        c += [chr(x)]
+    return c
+
+def event_types(events):
+    t = []
+    for event in events:
+        t = t + [event[0]]
+    return set(t)
+
+def event_data(df, nr = 0):
+
+    ev_col = 'event_time_inc'
+    nev_col = 'no_event_time_inc'
+    caps = set(capitals())
+    ev = {}
+    nev = {}
+    for c in caps:
+        ev[c] = []
+        nev[c] = []
+    data = df[[ev_col, nev_col]].values
+    if nr > 0:
+        data = data[:nr]
+    for line in data:
+        events = line[0]
+        nev_time = line[1]
+        nev_types = caps - event_types(events)
+        for c in nev_types:
+            nev[c] += [nev_time]
+        for event in events:
+            [c, ev_time] = event
+            if c.isupper():
+                ev[c] += [ev_time]
+    return ev, nev
+
+
+def extract(d):
+    xd = {}
+    for k in d:
+        t = d[k]
+        xd[k] = {}
+        xd[k]['nr'] = len(t)
+        xd[k]['days'] = sum(t)
+    return xd
+
+def xdata(df, nr = 0):
+    ev, nev = event_data(df, nr = nr)
+    xev = extract(ev)
+    xnev = extract(nev)
+
+    total = {}
+    for c in capitals():
+        total[c] = {}
+        nr = xev[c]['nr']
+        days = xev[c]['days'] + xnev[c]['days']
+
+        total[c]['nr'] = nr
+        total[c]['days'] = days
+        # total[c]['freq'] = round((nr / days) * int(1e6), 2)
+
+    return total
+
+
+def print_xd(xd):
+    # print('type\t nr\t 1000 years\t nr / (1000 years)')
+    print('type\t nr\t nr / (1000 years)')
+    print()
+    for c in capitals():
+        nr = xd[c]['nr']
+        y = xd[c]['days'] / (365.24 * 1000)
+        if y > 1e-10:
+            f = nr / y
+        else:
+            f = -1
+        if nr > 0:
+            print(c + '\t', nr, '\t', round(y, 1), ' \t', round(f,3))
+
+
+
+
+""" ----------------------------------------
+
+    9 frequencies and probabilities
+
+    compare the diagnosis group and the control group by the
+    probability that the institutional care frequency is higher
+    for the first category
+
+---------------------------------------"""
+
+
+def compute_prob(xd, xc):
+
+    sf_lim = np.log(1e-7)
+    data = []
+    for c in capitals():
+        nd = xd[c]['nr']
+        nc = xc[c]['nr']
+        sd = xd[c]['days']
+        sc = xc[c]['days']
+        if nd > 0 and nc > 0:
+            fd = round(1e6 * nd/sd, 3)
+            fc = round(1e6 * nc/sc, 3)
+            sf = common.exp2sample_logsf(nd, nc, sd, sc)
+            H1 = int(sf < sf_lim)
+            nsf = round(-sf, 2)
+            data = data + [ [c, nd, nc, sd, sc, fd, fc, nsf, H1] ]
+
+    df = pd.DataFrame(data)
+    df.columns = [
+        'group',
+        'nr_dia',
+        'nr_ctr',
+        'days_dia',
+        'days_ctr',
+        'freq_dia',
+        'freq_ctr',
+        '-log(1-p)',
+        '1-p < 1e-7',
+    ]
+    return df
+
+
+selected_cols = [
+    'group',
+    'nr_dia',
+    'nr_ctr',
+    'freq_dia',
+    'freq_ctr',
+    '-log(1-p)',
+    '1-p < 1e-7',
+    ]
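+
+# Hedged usage sketch: the freq_* values are events per million person-days
+# at risk, and '-log(1-p)' comes from common.exp2sample_logsf, whose name
+# suggests a log survival value for a two-sample comparison of rates.
+#
+#   df = compute_prob(xdata(dia), xdata(ctr))
+#   print(df[selected_cols])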
+
+
+def compare_inc_group(group = 0):
+    group_col = 'Diagnostic_Group'
+    td = bio.readtimediff()
+
+    # td = td[td['oth'] == 0]   # remove all other diseases
+    dia = td[td['DIA'] == 1]
+    if group > 0:
+        dia = dia[dia[group_col] == group]
+
+    ctr = td[td['DIA'] == 0]
+    xd = xdata(dia)
+    xc = xdata(ctr)
+    df = compute_prob(xd, xc)   # add a measure of strength (effect size)?
+    # df = df[selected_cols]
+    df['M days_dia'] = df['days_dia']/1e6
+    df['M days_ctr'] = df['days_ctr']/1e6
+    df = common.rmcols(df, ['days_dia', 'days_ctr'])
+    return df
+
+
+
+def compare_inc():
+    df = compare_inc_group()
+    bio.save_local_result(df, 'sluten_vard_5y_alla')
+    for group in range(1, 13):
+        df = compare_inc_group(group)
+        bio.save_local_result(df, 'sluten_vard_5y_grupp_' + str(group))
+
+
+
+
+""" ----------------------------------------
+
+    10 groups of other diseases
+
+---------------------------------------"""
+
+
+
+def selection(df, Take, Leave = []):
+    annat = 'AnnatSpecDiagnos'
+    a = df[annat]
+    if not common.isstr(a):
+        return np.nan
+    for x in Leave:
+        if (x in a):
+            return False
+    for x in Take:
+        if (x in a):
+            return True
+    return False
+
+
+
+def add_other(df):
+
+    bw = lambda d: selection(d, Take = ['Beck'])
+    ep = lambda d: selection(d, Take = ['pilep'])
+    men = lambda d: selection(d, Take = ['Sip', 'MEN'])
+    men2 = lambda d: selection(d, Take = ['Sip', 'MEN'], Leave = ['1'])
+    nf = lambda d: selection(d, Take = ['eckling', 'NF', 'euro'])
+    nf1 = lambda d: selection(d,
+                              Take = ['eckling', 'NF', 'euro'],
+                              Leave = ['typ 2', 'typ II', 'NF2', 'NF 2'])
+    t21 = lambda d: selection(d, Take = ['21', 'own'])
+    tub = lambda d: selection(d, Take = ['Tub'])
+
+    df['bw'] = df.apply(bw, axis = 1)
+    df['ep'] = df.apply(ep, axis = 1)
+    df['men'] = df.apply(men, axis = 1)
+    df['nf'] = df.apply(nf1, axis = 1)
+    df['t21'] = df.apply(t21, axis = 1)
+    df['tub'] = df.apply(tub, axis = 1)
+    df['spe'] = df['bw'] + df['ep'] + df['men'] + df['nf'] + df['t21']
+    df['oth'] = ~df['AnnatSpecDiagnos'].isnull()
+
+    return df
+
+def add_other_str(df):
+    cols = ['bw', 'ep', 'men', 'nf', 't21', 'tub', 'oth']
+    df = add_other(df)
+    df = common.df_bool2str(df, cols)
+    return df
+
+
+
+""" ----------------------------------------
+
+    11 surgery
+
+---------------------------------------"""
+
+def surg_codes(df):
+    kod = df['Kod']
+    sub = df['Sub']
+
+    if common.isnull(kod):
+        return np.nan
+
+    kod = kod.replace('+', ' ')
+    kod = kod.replace('/', ' ')
+    kod = kod.split()
+
+    if not common.isnull(sub):
+
+        sub = sub.replace('+', ' ')
+        sub = sub.split()
+        for k in kod:
+            if not k[0] == sub[0][0]:
+                sub += [k]   # append the whole code ('sub += k' added it character by character)
+        return sub
+    return kod
+
+
+def generate_surgery():
+    df = bio.readsurgery()
+    df = df.sort_values('LopNr').reset_index(drop=True)
+    df['surgery'] = df.apply(surg_codes, axis = 1)
+    df = common.compression_with_dates(df, 'LopNr', 'surgery', 'Datum')
+    bio.save_generated_file(df, 'surgery')
+
+
+
+def surgery_diff(df):
+    dia = df['DiagnosDat']
+    surgery = df['surgery']
+    surgeries_and_diffs = []
+    for [codes, date] in surgery:
+        diff = common.time_diff_in_days(date, dia)
+        surgeries_and_diffs += [[codes, diff]]
+    return common.code_compress(surgeries_and_diffs)
+
+
+def add_surgery(df):
+    surg = bio.read_generated_surgery()
+    cols = list(surg.columns)
+    cols.remove('LopNr')
+    df = common.add_list_cols(df, surg, 'LopNr', cols)
+    df['surgery_diff'] = df.apply(surgery_diff, axis = 1)
+    return df
+
+
+
+def surgery_ohe(df):
+    c = 'kirurgi'
+    df = diagnose.one_x(df, c)
+    df = common.rmcols(df, [c])
+    return df
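+
+# Illustrative sketch (codes invented): surg_codes turns a 'Kod' string such
+# as 'TA10+TB20' into ['TA10', 'TB20'], and surgery_diff then yields
+# [[codes, days from DiagnosDat]] pairs, compressed by common.code_compress.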
+
+
+""" ----------------------------------------
+
+    12 cytostatica
+
+---------------------------------------"""
+
+
+
+def drug_classes(x, treatment, cyto_code, drug_class):
+
+    tx = treatment[ treatment['LopNr'] == x ]
+    if tx.empty:
+        return []
+
+    tx = tx[['BehNr', 'cyto_date']]
+    tx = tx.sort_values('cyto_date')
+    bs = np.unique(tx['BehNr'].values)   # all the 'BehNr' for x, max 2
+    category_date_list = []
+
+    for b in bs:
+        codes = common.look_up_all_values(b, cyto_code, 'BehNr', 'DrogKod')
+        categories = []
+        for c in np.unique(codes):
+            categories += common.look_up_all_values(c, drug_class, 'Drogkod', 'Kategori')
+        categories = np.unique(categories)
+        txb = tx[tx['BehNr'] == b]
+        dates = np.unique(txb['cyto_date'].values)
+        d0 = dates[0]
+        for c in categories:
+            category_date_list += [[c, d0]]
+        """
+        for d in dates:
+            for c in categories:
+                category_date_list += [[c, d]]
+        """
+
+    return category_date_list
+
+
+
+def drug_diff(df):
+    dia = df['DiagnosDat']
+    drugs = df['cytoclass']
+    drugs_and_diffs = []
+    for [code, date] in drugs:
+        diff = common.time_diff_in_days(date, dia)
+        drugs_and_diffs += [[code, diff]]
+    return drugs_and_diffs
+
+
+
+def add_cytostatica(df, treat):
+
+    treat = treat[['LopNr', 'BehNr', 'cyto_date']]
+    treat = treat.dropna()
+
+    xs = np.unique(treat['LopNr'].values)
+    print(' Nr of LopNr:s in treatment data with cyto', len(xs))
+
+
+    # treat = bio.readtreatment()    # LopNr > BehNr
+    codes = bio.readcytocodes()      # BehNr > DrogKod
+    classes = bio.readdrugclasses()  # Drogkod > Kategori
+
+    f = lambda x: drug_classes(x, treat, codes, classes)
+
+    df['cytoclass'] = df['LopNr'].apply(f)
+    df['cytoclass_diff'] = df.apply(drug_diff, axis = 1)
+
+    return df
+
+
+
+
+""" ----------------------------------------
+
+    13 stemcell
+
+---------------------------------------"""
+
+"""
+def stemcell(d, treat):
+    x = d['LopNr']
+    allo = common.look_up_all_values(x, treat, 'LopNr', 'Allogen')
+    d['allo'] = 0
+    d['auto'] = 0
+    if 'Ja' in allo:
+        d['allo'] = 1
+    if 'Nej' in allo:
+        d['auto'] = 1
+    return d
+"""
+
+def stemcell(x, treat):
+    tx = treat[ treat['LopNr'] == x ]
+    if tx.empty:
+        return []
+    tx = tx[['Allogen', 'stem_date']]
+    allo = tx[tx['Allogen'] == 'Ja']    # 'Ja' = yes, i.e. allogeneic
+    auto = tx[tx['Allogen'] == 'Nej']   # 'Nej' = no, i.e. autologous
+    allo_dates = np.unique(allo['stem_date'].values)
+    auto_dates = np.unique(auto['stem_date'].values)
+
+    stem = []
+    for date in allo_dates:
+        stem += [['allo', date]]
+    for date in auto_dates:
+        stem += [['auto', date]]
+
+    return stem
+
+
+def stem_diff(df):
+    dia = df['DiagnosDat']
+    stem = df['stemcell']
+    stem_and_diffs = []
+    for [code, date] in stem:
+        diff = common.time_diff_in_days(date, dia)
+        stem_and_diffs += [[code, diff]]
+    return stem_and_diffs
+
+
+
+def add_stemcell(df, treat):
+
+    treat = treat[['LopNr', 'Allogen', 'stem_date']]
+    treat = treat.dropna()
+    xs = np.unique(treat['LopNr'].values)
+    print(' Nr of LopNr:s in treatment data with stem', len(xs))
+    df['stemcell'] = df['LopNr'].apply(lambda x: stemcell(x, treat))
+    df['stemcell_diff'] = df.apply(stem_diff, axis = 1)
+    return df
+
+
+
+def analyze_stemcell_data(df):   # not used for production of total
+    cols = gs.places.stemcell_columns
+    cell = bio.readtreatment()
+    cell = cell[['LopNr'] + cols]
+    cell = cell.drop_duplicates()
+    cell = cell[ ~( cell['Allogen'].isnull() & cell['Autolog'].isnull()) ]
+    print(cell.shape)
+    print(cell[0:20])
+
+    analyze.compare_lopnr(df, cell, 'base', 'cell')
+
+    df = diagnose.add_cols(df, cell, 'LopNr', cols)
+    return df
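+
+# Example (dates invented): for a person with one allogeneic and one
+# autologous transplant, stemcell() returns
+# [['allo', '20020110'], ['auto', '20030522']], and stem_diff converts
+# each date into days from DiagnosDat.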
+
+
+""" ----------------------------------------
+
+    14 stralung (radiation)
+
+---------------------------------------"""
+
+# def look_up_
+
+def radio_datum(treat, x, b):
+    pass
+
+def radio_classes(x, treat, codes, classes):
+
+    cx = codes[codes['LopNr'] == x]
+
+    if cx.empty:
+        return []
+
+    cx = cx[['BehNr','Target_Kod']]
+    tx = treat[treat['LopNr'] == x]
+    bs = np.unique(cx['BehNr'].values)   # all the 'BehNr' for x, max 2
+
+    category_date_list = []
+    for b in bs:
+        cxb = cx[cx['BehNr'] == b]
+        txb = tx[tx['BehNr'] == b]
+        dates = np.unique(txb['radio_date'].values)
+        codes = np.unique(cxb['Target_Kod'].values)
+
+        d0 = dates[0]
+
+        for c in codes:
+            for k in classes.columns[1:]:
+                categories = common.look_up_nonnan(c, classes, 'Target_Kod', k)
+                if not categories == []:
+                    t = categories[0]
+                    if [t, d0] not in category_date_list:
+                        category_date_list += [[t, d0]]
+
+        """
+        for d in dates:
+            for c in codes:
+                for k in classes.columns[1:]:
+                    categories = common.look_up_nonnan(c, classes, 'Target_Kod', k)
+                    if not categories == []:
+                        t = categories[0]
+                        if [t, d] not in category_date_list:
+                            category_date_list += [[t, d]]
+        """
+
+    return category_date_list
+
+
+def radio_diff(df):
+    dia = df['DiagnosDat']
+    radio = df['stralung']
+    radio_and_diffs = []
+    for [code, date] in radio:
+        diff = common.time_diff_in_days(date, dia)
+        radio_and_diffs += [[code, diff]]
+    return radio_and_diffs
+
+
+
+def add_radio(df, treat):
+
+    treat = treat[['LopNr', 'BehNr', 'radio_date']]
+    treat = treat.dropna()
+    xs = np.unique(treat['LopNr'].values)
+    print(' Nr of LopNr:s in treatment data with radio', len(xs))
+
+
+    codes = bio.readradiocodes()      # LopNr > Target_Kod (radioterapi)
+    codes = codes.dropna()
+    classes = bio.readradioclasses()  # Target_kod > "Kategorier"
+    f = lambda x: radio_classes(x, treat, codes, classes)
+    df['stralung'] = df['LopNr'].apply(f)
+
+    df['radio_diff'] = df.apply(radio_diff, axis = 1)
+
+    return df
+
+
+
+
+""" ----------------------------------------
+
+    15 total
+
+---------------------------------------"""
+
+
+def stem(df):
+    z = []
+    if df['allo'] == 1: z += ['allo']
+    if df['auto'] == 1: z += ['auto']
+    return z
+
+def other(df):
+    z = []
+    for s in ['bw', 'ep', 'men', 'nf', 't21', 'tub']:
+        if df[s] == '1': z += [s]
+    return z
+
+
+def event_time_death(df):
+    events = df['life_events']
+    if events == '0':
+        return []
+
+    events = common.str2list(events)
+
+    gap = gs.outcome_timegap
+    dia = df['DiagnosDat']
+    start = common.add_days(dia, gap)
+    end = str(df['DODSDAT'])
+    time = common.time_diff_in_days(end, start)
+
+    # t = int(df['life'])   # redo this
+    z = []
+    for e in events:
+        z += [[e, time]]
+
+    return z
+
+def kon(x):   # K_n (Swedish 'Kön', sex): 'pojke' = boy, 'flicka' = girl
+    if x == '1': return 'pojke'
+    if x == '2': return 'flicka'
+    return x
+
+
+def place_in_order(s, y, z):
+    # place z directly after y in the list s
+    t = []
+    for x in s:
+        if not x == z:
+            t += [x]
+        if x == y:
+            t += [z]
+    return t
+
+
+
+cols_to_remove = [
+    'AnnatSpecDiagnos',
+    'Diagnostic_Group',
+    'K_n',
+    'Diagnos_lder',
+#    'DiagnosDat',
+    'SlutBehDat',
+    'FoddAr',
+    'first_incare',
+    'first_nicare',
+    'all_incare',
+    'all_nicare',
+    'first_drug',
+    'all_drug',
+    'surgery',
+    'cytoclass',
+    'stemcell',
+    'radio',
+    'ICD',
+    'DODSDAT',
+    'life_events',
+    'life',
+    'oth',
+    'bw',
+    'ep',
+    'men',
+    'nf',
+    't21',
+    'tub',
+    'spe',
+#    'allo',
+#    'auto',
+]
+
+
+
+def diagnosis_class(x):
+    if isinstance(x, str):
+        return x[:3]
+    return x
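+
+# Examples (illustrative): diagnosis_class('C911') -> 'C91', and
+# place_in_order(['a', 'b', 'c'], 'a', 'c') -> ['a', 'c', 'b'], i.e. z is
+# removed from its old position and re-inserted directly after y.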
+
+def modify_total(df):
+
+    # 1 change column names
+
+
+    df = common.change_col_names(df, 'DIA', 'has_dia')
+    # df = common.change_col_names(df, 'oth', 'has_other_dia')   # other is removed
+    df = common.change_col_names(df, 'stralung', 'radio')
+    df = common.change_col_names(df, 'ICD10', 'diagnosis')
+
+
+    # 2 define other_dia columns
+
+    dfcopy = copy.copy(df)   # alt. = copy.deepcopy(df)
+    # dfcopy['stemcell'] = df.apply(lambda d: stem(d), axis = 1)
+    dfcopy['other_dia'] = df.apply(lambda d: other(d), axis = 1)
+    df = dfcopy
+
+    # 3 define event_time_death
+
+    df['event_time_death'] = df.apply(lambda d: event_time_death(d), axis = 1)
+    df['all_event_time_death'] = df['event_time_death'].apply(lambda lst: [[x[0], [x[1]]] for x in lst])
+
+    # 4 add diagnosis class
+
+    df['diagnosis_class'] = df['diagnosis'].apply(diagnosis_class)
+
+    # 5 in col. K_n, replace 1 with pojke and 2 with flicka
+
+    df['sex'] = df['K_n'].apply(kon)
+
+    # 6 rearrange columns
+
+    c = list(df.columns)
+    c = place_in_order(c, 'Diagnostic_Group', 'sex')
+    c = place_in_order(c, 'DODSDAT', 'event_time_death')
+    c = place_in_order(c, 'diagnosis', 'diagnosis_class')
+    # c = place_after('diff_nic', c, 'no_event_time_nic')
+    df = df[c]
+
+
+    # 7 remove the columns not to be used
+
+    df = common.rmcols(df, cols_to_remove)
+
+    return df
+
+
+def generate_total():
+    df = bio.readtimediff()
+    treat = bio.readtreatment()
+    df = add_surgery(df)
+    df = add_cytostatica(df, treat)
+    df = add_stemcell(df, treat)
+    df = add_radio(df, treat)
+    df = modify_total(df)
+    bio.save_generated_file(df, gs.names.total_file)
+
+    print('total columns')
+    for c in df.columns:
+        print('\t', c)
+    print()
+
+    # NB: several of the columns selected below are listed in cols_to_remove,
+    # so these tmp selections assume common.rmcols leaves them in place.
+    t = df[['DiagnosDat', 'cytoclass', 'cytoclass_diff', 'stemcell', 'stemcell_diff', 'radio', 'radio_diff']]
+    bio.save_tmp_file(t, 'total_dsr_2020_05_06')
+
+    t = df[['surgery_diff', 'cytoclass_diff', 'stemcell_diff', 'radio_diff']]   # 'stemcell_diff' is the column; 'stem_diff' is the function
+    bio.save_tmp_file(t, 'total_treatment_2020_05_06')
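+
+# A minimal end-to-end usage sketch (assumption: global_settings and the
+# bonsai_io register paths are configured as in the project; function names
+# as defined above). Each step reads the file written by the one before it,
+# so the steps can be run in separate sessions:
+#
+#   outcome(n = 3, save = True)   # or, in parts: out_p1(); out_p2(); out_p3()
+#   timediff()                    # event, no-event and censored times
+#   generate_total()              # one line per person, with treatments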