Skip to content

Commit

Permalink
Merge pull request #367 from eweitz/refine-tissue-expression
Browse files Browse the repository at this point in the history
Ensure top max-expressing tissue is shown, robustify curve shift
  • Loading branch information
eweitz authored Jan 16, 2024
2 parents a3d74a7 + cd3992e commit 31a721f
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 28 deletions.
Binary file modified dist/data/cache/tissues/homo-sapiens-tissues.tsv.gz
Binary file not shown.
Binary file modified dist/data/cache/tissues/homo-sapiens-tissues.tsv.li.gz
Binary file not shown.
74 changes: 58 additions & 16 deletions scripts/python/cache/tissue_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def get_summary(expressions):
if s > 0:
summary.append(round(s, 2))

if len(summary) >= 4:
if len(summary) >= 4 or (len(summary) > 0 and summary[-1] >= 100):
num_bins = 10
size = (max - min) / num_bins
quantile_counts = [0] * num_bins
Expand Down Expand Up @@ -240,39 +240,81 @@ def summarize_top_tissues_by_gene(input_dir):

trimmed_summary_by_gene_by_tissue = {}
trimmed_summary_by_gene_by_tissue[gene] = {}
medians_by_tissue_index = []
medians_and_maxes_by_tissue_index = []
j = 0
top_max = 0
top_max_tissue = ''
for tissue in summary_by_gene_by_tissue[gene]:
summary = summary_by_gene_by_tissue[gene][tissue]
if len(summary) == 15: # 5 for box plot, 10 for KDE deciles
num_metrics = len(summary)
if num_metrics == 15: # 5 for box plot, 10 for KDE deciles
median = summary[2]
elif len(summary) == 14: # for minimum with a value of 0
max = summary[4]
elif num_metrics == 14: # for minimum with a value of 0
median = summary[1]
max = summary[3]
elif num_metrics == 13:
# Occurs in "Breast - mammary tissue" in LALBA or CSN3
median = 0
max = summary[2]
elif num_metrics == 12:
# Occurs in "Breast - mammary tissue" in LALBA or CSN3
median = 0
max = summary[1]
else:
median = 0
medians_by_tissue_index.append([tissue, median])
sorted_medians_by_tissue_index = sorted(
medians_by_tissue_index,
key=lambda x: x[1],
max = 0
if max > top_max:
top_max = max
top_max_tissue = tissue
medians_and_maxes_by_tissue_index.append([tissue, median, max])
sorted_medians_and_maxes_by_tissue_index = sorted(
medians_and_maxes_by_tissue_index,
key=lambda x: (x[1], x[2]),
reverse=True
)
top_tissues_by_median = [
tm[0] for tm in sorted_medians_by_tissue_index[:10]
]
for tissue in top_tissues_by_median:
top_tissues_by_median_and_max = []
includes_top_max = False
for tm in sorted_medians_and_maxes_by_tissue_index[:10]:
top_tissues_by_median_and_max.append(tm[0])
if tm[0] == top_max_tissue:
includes_top_max = True

if not includes_top_max and top_max != 0:
# Ensure a tissue with a low median but the highest max
# isn't excluded from the top 10, e.g. "Breast - mammary gland"
# for gene XDH
top_tissues_by_median_and_max[-1] = top_max_tissue

for tissue in top_tissues_by_median_and_max:
summary = summary_by_gene_by_tissue[gene][tissue]
if len(summary) < 4:
# Skip summaries where Q1 (or higher percentile) is 0
if len(summary) < 10:
continue
summary = ';'.join([str(s) for s in summary])
elif len(summary) == 13:
max = summary[2]
if max < 100:
# Skip summaries where Q1 (or higher percentile) is 0
# and max is less than 100 TPM (e.g. to not skip
# "Breast - mammary tissue" in LALBA or CSN3)
continue
summary_list = []
for s in summary:
if s == 0:
# Deflate 0-integer to empty string
summary_list.append('')
elif s < 1:
# Truncate e.g. 0.1234 to .1234
summary_list.append(str(s)[1:])
else:
summary_list.append(str(s))
summary = ';'.join(summary_list)
if summary == '':
# Observed for e.g. MEF2AP1, a pseudogene
continue
tissue_index = tissues_by_index_unique.index(tissue)
tissue_and_summary = f'{tissue_index};{summary}'
output_row.append(tissue_and_summary)


if i % 500 == 0:
print(f'Last tissue summary for gene {i} {gene}:')
print(tissue_and_summary)
Expand Down
9 changes: 8 additions & 1 deletion src/js/init/caches/tissue-cache-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,21 @@ async function getTissueExpressions(gene, ideo) {
const tissueExpressions = [];
const rawExpressions = geneDataLine.split('\t').slice(1);
for (let i = 0; i < rawExpressions.length; i++) {
const rawValues = rawExpressions[i].split(';');
const rawValues = rawExpressions[i].split(';').map(
v => v === '' ? 0 : v // inflate empty string to 0-integer
);
const numValues = rawValues.length;
if (numValues === 15) {
rawValues.splice(1, 0, 0); // Insert number 0 at position 1
} else if (numValues === 14) {
// Min. and Q1 are 0
rawValues.splice(1, 0, 0);
rawValues.splice(1, 0, 0);
} else if (numValues === 13) {
// Min., Q1, and median are 0
rawValues.splice(1, 0, 0);
rawValues.splice(1, 0, 0);
rawValues.splice(1, 0, 0);
}
const tissueId = rawValues[0];
const boxMetrics = rawValues.slice(1, 6);
Expand Down
23 changes: 15 additions & 8 deletions src/js/kit/protein-color.js
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,9 @@ export function getColors(domainType) {
domainType.includes('KA1') || // e.g. MARK2
domainType === 'V(D)J recombination-activating protein 1' || // e.g. RAG1
domainType.toLowerCase().includes('opiod') || // e.g. PDYN
domainType === 'Corticotropin-releasing factor' // e.g. CRH
domainType === 'Corticotropin-releasing factor' || // e.g. CRH
domainType.includes('2Fe-2S ferredoxin') || // e.g. XDH
domainType.includes('acidic domain') // e.g. SYNCRIP
) {
return [redderFaintRed, redderFaintRedLine];
} else if (
Expand All @@ -239,7 +241,8 @@ export function getColors(domainType) {
domainType.includes('MutS, clamp') ||
domainType.includes('S5 domain 2-like') || // e.g. MLH1 in ACMG
domainType.endsWith('CC1/2') || // e.g. PRKDC
domainType.includes('MUN') // e.g. UNC13C
domainType.includes('MUN') || // e.g. UNC13C
domainType.includes('second molybdopterin') // e.g. XDH
) {
return [darkGreen, darkGreenLine];
} else if (
Expand Down Expand Up @@ -313,13 +316,15 @@ export function getColors(domainType) {
domainType.includes('MG1') ||
domainType === 'Homocysteine-binding domain' ||
domainType.startsWith('Acyl-CoA') && domainType.endsWith('N-terminal') ||
domainType === 'Clathrin light chain'
domainType === 'Clathrin light chain' ||
domainType === 'Hexokinase, N-terminal' // e.g. HK2
) {
return [lightBlue, lightBlueLine];
} else if (
// Larger binding regions and miscellaneous
domainType.includes('zinc-binding') ||
domainType.includes('DNA-binding') ||
domainType === 'RUNT domain' || // a DNA-binding / PPI domain, e.g. RUNX1
domainType === 'G protein-coupled receptor, rhodopsin-like' ||
domainType.includes('CXC domain') ||
domainType.includes('Homeobox domain') ||
Expand Down Expand Up @@ -446,7 +451,8 @@ export function getColors(domainType) {
domainType.includes('multifunctional domain') ||
domainType.includes('MutS, core') ||
domainType === 'RAP domain' || // RNA-binding, e.g. FASTK
domainType.endsWith('CC5') // e.g. PRKDC
domainType.endsWith('CC5') || // e.g. PRKDC
domainType.includes('first molybdopterin') // e.g. XDH
) {
return [seafoam, seafoamLine];
} else if (
Expand Down Expand Up @@ -639,7 +645,8 @@ export function getColors(domainType) {
domainType === 'Cobalamin (vitamin B12)-binding domain' ||
domainType === 'Laminin domain II' ||
domainType === 'Troponin I residues 1-32' || // e.g. TNNI3 in ACMG
domainType === 'KI67R' // KI67 / Chmadrin repeat
domainType === 'KI67R' || // KI67 / Chmadrin repeat
domainType.includes('FAD-binding')
) {
return [orange, orangeLines];
} else if (
Expand Down Expand Up @@ -777,7 +784,9 @@ export function getColors(domainType) {
domainType === 'GPCR, family 2, secretin-like' ||
domainType === 'GPCR, family 3, nine cysteines domain' ||
domainType === 'G-protein coupled receptor' ||
domainType.toLowerCase().includes('orexin') // e.g. HCRTR2
domainType.toLowerCase().includes('orexin') || // e.g. HCRTR2

domainType === '[2Fe-2S]-binding'
) {
return [darkOrange, darkOrangeLines];
} else if (
Expand All @@ -796,7 +805,6 @@ export function getColors(domainType) {
} else if (
domainType.includes('inhibit') ||
domainType.includes('central') ||
domainType === '[2Fe-2S]-binding' ||
domainType.endsWith('tail') ||
domainType.endsWith('helical domain') ||
domainType.endsWith('helical domain HD2') ||
Expand Down Expand Up @@ -826,7 +834,6 @@ export function getColors(domainType) {
domainType === 'Paired domain' || // found in eukaryotic transcription regulatory proteins involved in embryogenesis
domainType === 'JmjC domain' ||
domainType === 'BRK domain' ||
domainType.includes('FAD-binding') ||
domainType.includes('MG3') ||
domainType.toLowerCase().includes('polycomb') ||
domainType.toLowerCase().includes('metallopeptidase') ||
Expand Down
6 changes: 3 additions & 3 deletions src/js/kit/tissue.js
Original file line number Diff line number Diff line change
Expand Up @@ -665,14 +665,14 @@ function focusMiniCurve(traceDom, ideo, reset=false) {
const refTissue = reset ? null : traceDom.getAttribute('data-tissue');

const numTissues = !ideo.showTissuesMore ? 10 : 3;
let tissueExpressions =
ideo.tissueExpressionsByGene[gene].slice(0, numTissues);
let tissueExpressions = ideo.tissueExpressionsByGene[gene];

const maxPx = MINI_CURVE_WIDTH;
const relative = true;
const leftPx = 0;
tissueExpressions =
setPxOffset(tissueExpressions, maxPx, relative, leftPx, refTissue);
setPxOffset(tissueExpressions, maxPx, relative, leftPx, refTissue)
.slice(0, numTissues);

const height = MINI_CURVE_HEIGHT;
tissueExpressions.forEach((teObject, i) => {
Expand Down

0 comments on commit 31a721f

Please sign in to comment.