From 5879a6f556314d7562b1413a8033894f098adadb Mon Sep 17 00:00:00 2001 From: Maxim Zaslavsky Date: Thu, 30 Nov 2017 10:12:42 -0500 Subject: [PATCH 1/5] cp model6.4 -> model6.5 --- ..._optim_otherbucket_transcriptvariance.stan | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan diff --git a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan new file mode 100644 index 0000000..7073b75 --- /dev/null +++ b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan @@ -0,0 +1,120 @@ +// neg binom parameterization +// estimate correlation matrix among cell types + +// other bucket +// test first with other proportion of 0. i.e. with synthetic mixtures of known cell types +// then simulate other data: +// either pick other genes that we call tumor-related and give some values for those and spike in +// or spike in cell line or RCC sample (though that may have immune content) + +data { + // dimensions + int N; // N obs + int G; // N genes + int S; // N samples + int C; // N classes (e.g. B-cell, T-cell, B_Naive, CD5, CD45RO, etc) + // note: classes should be mutually exclusive. Each row here should sum to 1 + int M; // number of cell-level predictors + + // data for each gene*sample + int gene[N]; // gene id for each obs + int sample[N]; // sample id for each obs + vector[C] x[N]; // map each obs to each class (0:'- or ?', 1:'+') + int y[N]; // count/tpm for each obs + + // group-level predictors for each class C + matrix[C, M] cell_features; + + // out of sample estimates, with unknown comp + int N2; // number of records in out of sample (UNK) + int S2; // number of samples in UNK set + int gene2[N2]; // gene id for UNK data (corresponding to IDs above) + int sample2[N2]; // sample id for each UNK sample (separate from above) + int y2[N2]; // data for UNK set +} +transformed data { + int sample_y[S, G]; // array (size SxG) of ints + vector[C] sample_x[S]; // array (size S) of vectors[C] + int sample2_y[S2, G]; + int nu; + for (n in 1:N) { + sample_y[sample[n], gene[n]] = y[n]; + sample_x[sample[n]] = x[n,]; + } + for (n in 1:N2) { + sample2_y[sample2[n], gene2[n]] = y2[n]; + } + nu = 1; +} +parameters { + cholesky_factor_corr[C] Omega_L; + vector[C] Omega_sigma; + //corr_matrix[C] Omega; // degree of correlation among loading factors for each cell type + //vector[C] tau; // scale for each cell type - multiplied (on diagonal) with Omega + + matrix[G, C] theta; // loading factors for each gene, for each cell type + vector[C] theta_mu; // mean expression level for each cell type + vector[M] theta_coefs_raw; + vector[M] theta_coefs_per_gene[G]; + + vector[G] log_gene_base; // constant intercept expression level for each gene, irrespective of cell type + vector[G] gene_phi; // overdispersion parameter per transcript (for now) + simplex[C] sample2_x[S2]; // inferred sample2 compositions (simplex type enforces sum-to-one) + + vector[S2] unknown_prop; // proportion of each test sample that is of unknown cell type + vector[G] other_log_contribution_per_gene[S2]; // for each test sample, per-transcript contribution of unknown cell type +} +transformed parameters { + vector[M] theta_coefs[G]; + for (g in 1:G) + theta_coefs[g] = theta_coefs_raw + theta_coefs_per_gene[g]; +} +model { + // estimate theta - gene-level expression per cell type, as a function of cell-surface expression proteins + theta_mu ~ normal(0, 1); + theta_coefs_raw ~ normal(0, 1); + Omega_sigma ~ gamma(0.1, 0.1); + Omega_L ~ lkj_corr_cholesky(nu); + for (g in 1:G) { + vector[C] theta_tmp; // temporary predictor for cell-gene-specific expression level + theta_coefs_per_gene[g] ~ normal(0, 1); + theta_tmp = theta_mu + cell_features*theta_coefs[g]; + theta[g] ~ multi_normal_cholesky(theta_tmp, diag_pre_multiply(Omega_sigma, Omega_L)); + } + + // estimate sample_y: observed expression for a sample (possibly a mixture) + log_gene_base ~ normal(0, 1); + gene_phi ~ normal(0, 1); + for (s in 1:S) { + vector[G] log_expected_rate; + log_expected_rate = log_gene_base + log(theta*sample_x[s]); + sample_y[s] ~ neg_binomial_2_log(log_expected_rate, gene_phi); + } + + // estimate sample2_y: observed expression for a sample of unknown composition + unknown_prop ~ beta(5, 5); // not sure about this, maybe Beta(1,1) uniform? + + + for (s in 1:S2) { + vector[G] log_expected_rate; + other_log_contribution_per_gene[s] ~ normal(0, 1); + // TODO: use logmix() instead after `log_gene_base + ` in next line. + log_expected_rate = log_gene_base + log(theta*sample2_x[s]) * (1 - unknown_prop[s]) + other_log_contribution_per_gene[s] * unknown_prop[s]; + sample2_y[s] ~ neg_binomial_2_log(log_expected_rate, gene_phi); + } +} +generated quantities { + int y_rep[N]; + real log_lik[N]; + matrix[C,C] Omega; + matrix[C,C] tau; + Omega = multiply_lower_tri_self_transpose(Omega_L); + tau = quad_form_diag(Omega_L, Omega_sigma); + + for (n in 1:N) { + real log_expected_rate; + log_expected_rate = log_gene_base[gene[n]] + log(theta[gene[n], ]*x[n]); + y_rep[n] = neg_binomial_2_log_rng(log_expected_rate, gene_phi[gene[n]]); + log_lik[n] = neg_binomial_2_log_lpmf(y[n] | log_expected_rate, gene_phi[gene[n]]); + } +} \ No newline at end of file From f4ed3570f43d669a1f5141a6ac4982948976918a Mon Sep 17 00:00:00 2001 From: Maxim Zaslavsky Date: Thu, 30 Nov 2017 11:16:54 -0500 Subject: [PATCH 2/5] some model changes --- ...s_optim_otherbucket_transcriptvariance.stan | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan index 7073b75..e48d8a6 100644 --- a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan +++ b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan @@ -63,6 +63,9 @@ parameters { vector[S2] unknown_prop; // proportion of each test sample that is of unknown cell type vector[G] other_log_contribution_per_gene[S2]; // for each test sample, per-transcript contribution of unknown cell type + + vector[S] sample_variance; // sample-specific variance scaling + vector[S2] sample2_variance; // sample2-specific variance } transformed parameters { vector[M] theta_coefs[G]; @@ -85,10 +88,17 @@ model { // estimate sample_y: observed expression for a sample (possibly a mixture) log_gene_base ~ normal(0, 1); gene_phi ~ normal(0, 1); + sample_variance ~ normal(0, 1); + sample2_variance ~ normal(0, 1); + for (s in 1:S) { vector[G] log_expected_rate; log_expected_rate = log_gene_base + log(theta*sample_x[s]); - sample_y[s] ~ neg_binomial_2_log(log_expected_rate, gene_phi); + + vector[G] corrected_phis; + corrected_phis = gene_phi * sample_variance[s]; + + sample_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis); } // estimate sample2_y: observed expression for a sample of unknown composition @@ -100,7 +110,11 @@ model { other_log_contribution_per_gene[s] ~ normal(0, 1); // TODO: use logmix() instead after `log_gene_base + ` in next line. log_expected_rate = log_gene_base + log(theta*sample2_x[s]) * (1 - unknown_prop[s]) + other_log_contribution_per_gene[s] * unknown_prop[s]; - sample2_y[s] ~ neg_binomial_2_log(log_expected_rate, gene_phi); + + vector[G] corrected_phis2; + corrected_phis2 = gene_phi * sample2_variance[s]; + + sample2_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis2); } } generated quantities { From 6db815e94b1f41ac6a6cc2f18e59cf6c286615a2 Mon Sep 17 00:00:00 2001 From: Maxim Zaslavsky Date: Thu, 30 Nov 2017 11:17:28 -0500 Subject: [PATCH 3/5] rename --- ...orrelation_features_oos_optim_otherbucket_samplevariance.stan} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename model-single-origin-samples/models/{model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan => model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan} (100%) diff --git a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan similarity index 100% rename from model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_transcriptvariance.stan rename to model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan From 26bb3975192dcd96c0a60dd03a690d480af7b906 Mon Sep 17 00:00:00 2001 From: Maxim Zaslavsky Date: Wed, 13 Dec 2017 17:10:21 -0500 Subject: [PATCH 4/5] implementation of hierarchical dispersion --- ..._oos_optim_otherbucket_samplevariance.stan | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan index e48d8a6..feb706e 100644 --- a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan +++ b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan @@ -58,19 +58,39 @@ parameters { vector[M] theta_coefs_per_gene[G]; vector[G] log_gene_base; // constant intercept expression level for each gene, irrespective of cell type - vector[G] gene_phi; // overdispersion parameter per transcript (for now) + // vector[G] gene_phi; // overdispersion parameter per transcript (for now) simplex[C] sample2_x[S2]; // inferred sample2 compositions (simplex type enforces sum-to-one) vector[S2] unknown_prop; // proportion of each test sample that is of unknown cell type vector[G] other_log_contribution_per_gene[S2]; // for each test sample, per-transcript contribution of unknown cell type - vector[S] sample_variance; // sample-specific variance scaling - vector[S2] sample2_variance; // sample2-specific variance + // hierarchical dispersion + real log_global_phi_scale; // single global-scale value that doesn't vary by sample, gene, or cell type. + simplex[G] log_gene_phi; // constant intercept overdispersion parameter per transcript. simplex solves identifiability problem + vector[S] log_sample_scale; // sample-level variance + vector[S2] log_sample2_scale; // sample2-level variance + matrix[G, C] celltype_scale; // cell type-level variance (varies by transcript) + } transformed parameters { vector[M] theta_coefs[G]; + vector[G] corrected_phis[S]; + vector[G] corrected_phis2[S2]; + vector[G] ones; + for (g in 1:G) theta_coefs[g] = theta_coefs_raw + theta_coefs_per_gene[g]; + + for (s in 1:S) + corrected_phis[s] = log_global_phi_scale + log_gene_phi + log_sample_scale[s] + log(celltype_scale * sample_x[s]) + corrected_phis = exp(corrected_phis) + + for (s in 1:S2) + corrected_phis2[s] = log_global_phi_scale + log_gene_phi + log_sample2_scale[s] + log(celltype_scale * sample_x[s]) + corrected_phis2 = exp(corrected_phis2) + + for(g in 1:G) + ones = 1; // initialize a vector of all ones (for dirichlet prior) } model { // estimate theta - gene-level expression per cell type, as a function of cell-surface expression proteins @@ -87,17 +107,15 @@ model { // estimate sample_y: observed expression for a sample (possibly a mixture) log_gene_base ~ normal(0, 1); - gene_phi ~ normal(0, 1); - sample_variance ~ normal(0, 1); - sample2_variance ~ normal(0, 1); + log_global_phi_scale ~ cauchy(0,1); // fatter tails so that this can move more. + log_gene_phi ~ dirichlet(ones); + log_sample_scale ~ normal(0, 1); + log_sample2_scale ~ normal(0, 1); + celltype_scale ~ normal(0, 1); for (s in 1:S) { vector[G] log_expected_rate; log_expected_rate = log_gene_base + log(theta*sample_x[s]); - - vector[G] corrected_phis; - corrected_phis = gene_phi * sample_variance[s]; - sample_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis); } @@ -110,10 +128,6 @@ model { other_log_contribution_per_gene[s] ~ normal(0, 1); // TODO: use logmix() instead after `log_gene_base + ` in next line. log_expected_rate = log_gene_base + log(theta*sample2_x[s]) * (1 - unknown_prop[s]) + other_log_contribution_per_gene[s] * unknown_prop[s]; - - vector[G] corrected_phis2; - corrected_phis2 = gene_phi * sample2_variance[s]; - sample2_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis2); } } @@ -128,7 +142,7 @@ generated quantities { for (n in 1:N) { real log_expected_rate; log_expected_rate = log_gene_base[gene[n]] + log(theta[gene[n], ]*x[n]); - y_rep[n] = neg_binomial_2_log_rng(log_expected_rate, gene_phi[gene[n]]); - log_lik[n] = neg_binomial_2_log_lpmf(y[n] | log_expected_rate, gene_phi[gene[n]]); + y_rep[n] = neg_binomial_2_log_rng(log_expected_rate, corrected_phis[gene[n]]); + log_lik[n] = neg_binomial_2_log_lpmf(y[n] | log_expected_rate, corrected_phis[gene[n]]); } } \ No newline at end of file From 65b42263b3bf06b3f01c7c05accbf1be375eb685 Mon Sep 17 00:00:00 2001 From: Maxim Zaslavsky Date: Wed, 13 Dec 2017 17:25:10 -0500 Subject: [PATCH 5/5] hierarchical dispersion model now compiles --- ..._oos_optim_otherbucket_samplevariance.stan | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan index feb706e..0f4fe79 100644 --- a/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan +++ b/model-single-origin-samples/models/model6.5_negbinom_matrix_correlation_features_oos_optim_otherbucket_samplevariance.stan @@ -82,15 +82,15 @@ transformed parameters { theta_coefs[g] = theta_coefs_raw + theta_coefs_per_gene[g]; for (s in 1:S) - corrected_phis[s] = log_global_phi_scale + log_gene_phi + log_sample_scale[s] + log(celltype_scale * sample_x[s]) - corrected_phis = exp(corrected_phis) + corrected_phis[s] = log_global_phi_scale + log_gene_phi + log_sample_scale[s] + log(celltype_scale * sample_x[s]); + corrected_phis = exp(corrected_phis); for (s in 1:S2) - corrected_phis2[s] = log_global_phi_scale + log_gene_phi + log_sample2_scale[s] + log(celltype_scale * sample_x[s]) - corrected_phis2 = exp(corrected_phis2) + corrected_phis2[s] = log_global_phi_scale + log_gene_phi + log_sample2_scale[s] + log(celltype_scale * sample_x[s]); + corrected_phis2 = exp(corrected_phis2); for(g in 1:G) - ones = 1; // initialize a vector of all ones (for dirichlet prior) + ones[g] = 1; // initialize a vector of all ones (for dirichlet prior) } model { // estimate theta - gene-level expression per cell type, as a function of cell-surface expression proteins @@ -111,12 +111,13 @@ model { log_gene_phi ~ dirichlet(ones); log_sample_scale ~ normal(0, 1); log_sample2_scale ~ normal(0, 1); - celltype_scale ~ normal(0, 1); + for(g in 1:G) + celltype_scale[g] ~ normal(0, 1); for (s in 1:S) { vector[G] log_expected_rate; log_expected_rate = log_gene_base + log(theta*sample_x[s]); - sample_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis); + sample_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis[s]); } // estimate sample2_y: observed expression for a sample of unknown composition @@ -128,7 +129,7 @@ model { other_log_contribution_per_gene[s] ~ normal(0, 1); // TODO: use logmix() instead after `log_gene_base + ` in next line. log_expected_rate = log_gene_base + log(theta*sample2_x[s]) * (1 - unknown_prop[s]) + other_log_contribution_per_gene[s] * unknown_prop[s]; - sample2_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis2); + sample2_y[s] ~ neg_binomial_2_log(log_expected_rate, corrected_phis2[s]); } } generated quantities { @@ -142,7 +143,7 @@ generated quantities { for (n in 1:N) { real log_expected_rate; log_expected_rate = log_gene_base[gene[n]] + log(theta[gene[n], ]*x[n]); - y_rep[n] = neg_binomial_2_log_rng(log_expected_rate, corrected_phis[gene[n]]); - log_lik[n] = neg_binomial_2_log_lpmf(y[n] | log_expected_rate, corrected_phis[gene[n]]); + y_rep[n] = neg_binomial_2_log_rng(log_expected_rate, corrected_phis[sample[n]][gene[n]]); + log_lik[n] = neg_binomial_2_log_lpmf(y[n] | log_expected_rate, corrected_phis[sample[n]][gene[n]]); } } \ No newline at end of file