-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathImmunarch_TCR_pipeline.R
155 lines (119 loc) · 5.81 KB
/
Immunarch_TCR_pipeline.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Immunarch is an R package for analysis of TCR/BCR repertoire data.
# It allows you to look at shared clonotypes, clonal diversity, gene usage and more
# Supports all popular TCR and BCR analysis and post-analysis formats, including single-cell data:
# ImmunoSEQ, IMGT, MiTCR, MiXCR, MiGEC, MigMap, VDJtools, tcR, AIRR, 10XGenomics, ArcherDX.
install.packages("devtools", dependencies = T)
devtools::install_github("immunomind/immunarch")
library("immunarch")
library("dtplyr")
data <-setwd("D:/Data files/Mixcr_Analysis_Partial_Assemble")
tcr<-repLoad(data)
#View(tcr$data)
names(tcr)
names(tcr$data)
top(tcr$data[1])
#Finding the number of shared clonotypes and visualize it as a correlation plot
ov = repOverlap(tcr$data, .method="public", .col = "aa", .a = 0.5, .b = 0.5, .verbose = T)
vis(ov)
# Using Shiny to make publication ready plots
p=vis(ov)
fixVis(p)
# Visualize shared clonotypes as a heatmap with dendrogram
vis(ov, "heatmap2")
# repOverlap function is designed to analyse the overlap between two or more repertoires
# .method = c("public", "overlap", "jaccard", "tversky", "cosine", "morisita", "inc+public", "inc+morisita")
# 'public' means shared clonotypes, "jaccard" index is percentage of objects two sets have in common out of total
imm_ov1 = repOverlap(tcr$data, .method = "public", .verbose = F)
#Morista is another statistical measure to check for the dispersion of individuals in a population.
imm_ov2 = repOverlap(tcr$data, .method = "morisita", .verbose = F)
gridExtra::grid.arrange(vis(imm_ov1), vis(imm_ov2, .text.size=2), ncol = 2)
# Apply different analysis algorithms to the matrix of public clonotypes:
# "tsne" - t-Stochastic Neighbor Embedding
repOverlapAnalysis(imm_ov1, "tsne") %>% vis()
# Visualize CDR3 length distribution
repExplore(tcr$data, "lens") %>% vis()
# Repertoire Clonal Proportions
# Number of clonotypes occupying the 10% of repertoires
tcr_pr = repClonality(tcr$data, .method = "clonal.prop")
tcr_pr
vis(tcr_pr)
#Top clonal proportion
tcr_top = repClonality(tcr$data, .method = "top", .head = c(10, 100, 1000, 3000, 10000))
tcr_top
vis(tcr_top)
#Rare clonal proportion
tcr_tail = repClonality(tcr$data, .method = "tail")
tcr_tail
vis(tcr_tail)
#Relative Abundance; summary of clonotypes with specific frequencies
tcr_hom = repClonality(tcr$data, .method = "homeo", .clone.types = c(Small = .0001, Medium = .001, Large = .01, Hyperexpanded = 1))
tcr_hom
vis(tcr_hom)
# Compare diversity of repertoires and visualise samples
# Chao1 estimator is a nonparameteric asymptotic estimator of species richness (number of species in a population)
div = repDiversity(tcr$data, .method = "chao1")
vis(div)
#visualize CDR3 length distribution
repExplore(tcr$data, "lens") %>% vis()
# The gene usage function allows to analyse the immune receptor gene usage
# for (IGHD, IGHJ, IDHV, IGIJ, IGKJ, IGKV, IGLJ, IGLV, TRAJ, TRAV, TRBD, etc.) and gives the statistics
# geneUsage (
# .data,
# .gene = c("hs.trbv", "HomoSapiens.TRBJ", "macmul.IGHV"),
# .quant = c(NA, "count"),
# .ambig = c("inc", "exc", "maj"),
# .type = c("segment", "allele", "family"),
# .norm = F
#)
#Compute V gene usage and and highlight gene differences between different Age groups:
geneUsage(tcr$data[[1]]) %>% vis()
#or
gu = geneUsage(tcr$data[[1]])
vis(gu, .by="Age", .meta=tcr$meta)
#TRBV gene usage
trbv_gu = geneUsage(tcr$data, "hs.trbv", .type = "family", .norm = T, .quant = "count", .ambig = "exc")
vis(trbv_gu, .plot = "hist", .grid = T)
#TRBD gene usage
trbd_gu = geneUsage(tcr$data, "hs.trbd", .type = "family", .norm = F, .quant = "count", .ambig = "exc")
vis(trbd_gu, .plot = "hist", .grid = T)
#TRAV gene usage
trav_gu = geneUsage(tcr$data, "hs.trav", .type = "family", .norm = T, .quant = "count", .ambig = "exc")
vis(trav_gu, .plot = "hist", .grid = T)
#TRAJ gene usage
traj_gu = geneUsage(tcr$data, "hs.traj", .type = "family", .norm = F, .quant = "count", .ambig = "exc")
vis(traj_gu, .plot = "hist", .grid = T)
gene_stats(traj_gu)
#cor - correlation coefficient for analysis. Can be "pearson", "kendall" or "spearman".
#Can be "pca", "mds", "js", "kmeans", "hclust", "dbscan" or "cor" if you want to calculate correlation coefficient.
#JS stands for Jensen-Shannon divergence, measuring the similarity between two probability distributions
tcr_gu_js = geneUsageAnalysis(trbv_gu, .method = "js", .verbose = F)
tcr_gu_cor = geneUsageAnalysis(trbv_gu, .method = "cor", .cor="pearson", .verbose = F)
gridExtra::grid.arrange(vis(tcr_gu_js, .title = "Gene usage JS-divergence", .leg.title = "JS", .text.size=2.5, .signif.digits= 1), vis(tcr_gu_cor, .title = "Gene usage correlation", .leg.title = "Cor", .text.size=2.5, .signif.digits= 2), ncol = 2)
# Track clonotypes
# Choose the first 10 amino acid clonotype sequences
# from the first repertoire to track
tc = trackClonotypes(tcr$data, list(1, 10), .col = "aa")
vis(tc, .plot = "smooth")
# Choose the first 10 clonotypes from the first repertoire
# with amino acid sequences and V segments
target = tcr$data[[1]] %>% select(CDR3.aa, V.name) %>% head(10)
tc = trackClonotypes(tcr$data, target)
vis(tc, .plot = "smooth")
vis(tc, .plot = "area")
vis(tc, .plot = "line")
pr.nt = pubRep(tcr$data, "nt", .verbose = F)
write.table(pr.nt, file="sharedclones.txt")
pr.aa = pubRep(tcr$data, "aa", .verbose = F)
write.table(pr.aa, file="Shared_clones.txt")
# Pass "aa+v" as the second parameter to build the public repertoire table using CDR3 aminoacid sequences and V alleles
# In order to use only CDR3 aminoacid sequences, just pass "aa"
pr.aav = pubRep(tcr$data, "aa+v", .verbose = F)
pr.aav
# You can also pass the ".coding" parameter to filter out all noncoding sequences first:
pr.aav.cod = pubRep(tcr$data, "aa+v", .coding=T)
pr.aav.cod
#explore differences in clonotype counts and volume
exp_cnt = repExplore(tcr$data, .method = "count")
vis(exp_cnt)
exp_vol = repExplore(tcr$data, .method = "volume")
vis(exp_vol)