% Functiona_Enrichment.tex
% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
%
\documentclass[
]{book}
\usepackage{amsmath,amssymb}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math} % this also loads fontspec
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
\ifPDFTeX\else
% xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph spacing setup.
% If \KOMAClassName is undefined (the class is not a KOMA-Script class):
%   - load the parskip package when parskip.sty can be found;
%   - otherwise set \parindent to 0pt and \parskip to 6pt plus 2pt minus 1pt by hand.
% If \KOMAClassName is defined, use the KOMA option parskip=half instead.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\usepackage{color}
\usepackage{fancyvrb}
% Verbatim helpers built on fancyvrb.
% \VerbBar expands to a literal vertical bar.
\newcommand{\VerbBar}{|}
% \VERB is \Verb with backslash, { and } declared as command characters,
% so macros can still be used inside the verbatim text.
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
% Highlighting is a Verbatim environment with the same command characters.
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
% Shaded: a thin wrapper around the snugshade environment (loaded here via the
% framed package), with shadecolor set to the light grey RGB 248,248,248.
\usepackage{framed}
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% Syntax-highlighting token macros (one per token class).
% Each macro takes the token text as #1 and typesets it in a fixed rgb colour,
% optionally bold and/or italic. \BuiltInTok, \ExtensionTok, \ImportTok,
% \NormalTok and \RegionMarkerTok pass their argument through unchanged.
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
% Patch \longtable (etoolbox \patchcmd): replace its first \par with a version
% that emits an empty \mbox beforehand whenever \if@noskipsec is true.
% The two trailing empty groups are the success and failure hooks (both no-ops).
% \makeatletter is required because \if@noskipsec contains an @.
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
% \maxwidth  expands to \linewidth when the image's natural width (\Gin@nat@width)
%            exceeds it, and to the natural width otherwise.
% \maxheight does the same for \Gin@nat@height against \textheight.
% \makeatletter is required to reach the graphicx-internal \Gin@... lengths.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
% Redefine \fps@figure to htbp so that figure environments written without an
% optional placement argument use that specifier.
% \makeatletter is required because the macro name contains an @.
\makeatletter
\def\fps@figure{htbp}
\makeatother
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist: used at the start of a list to make it compact by setting both
% \itemsep and \parskip to 0pt. \providecommand defines it only if no
% definition of \tightlist exists yet.
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{5}
\usepackage{booktabs}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage[]{natbib}
\bibliographystyle{plainnat}
\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same}
\hypersetup{
pdftitle={Functional Enrichment Workshop},
pdfauthor={Australian Biocommons, Sydney Informatics Hub (USYD) and Monash Genomics and Bioinformatics Platform (MGBP)},
hidelinks,
pdfcreator={LaTeX via pandoc}}
\title{Functional Enrichment Workshop}
\author{Australian Biocommons, Sydney Informatics Hub (USYD) and Monash Genomics and Bioinformatics Platform (MGBP)}
\date{Compiled: November 19, 2024}
\begin{document}
\maketitle
{
\setcounter{tocdepth}{1}
\tableofcontents
}
\hypertarget{functional-enrichment-workshop}{%
\chapter{Functional Enrichment Workshop}\label{functional-enrichment-workshop}}
\textbf{Welcome to the Functional Enrichment Workshop!} This resource is designed to guide you through the process of performing functional enrichment analysis using a variety of tools and methodologies, both online and via command-line interfaces.
\hypertarget{instructors}{%
\subsection{Instructors}\label{instructors}}
\begin{itemize}
\tightlist
\item
\textbf{Hossein V Kahrood}
\begin{itemize}
\tightlist
\item
Lead Instructor, Monash Genomics and Bioinformatics Platform (MGBP)
\item
\href{mailto:hossein.valipourkahrood@monash.edu}{\nolinkurl{hossein.valipourkahrood@monash.edu}}
\end{itemize}
\item
\textbf{Cali Willet}
\begin{itemize}
\tightlist
\item
Lead Instructor, Sydney Informatics Hub
\item
\href{mailto:cali.willet@sydney.edu.au}{\nolinkurl{cali.willet@sydney.edu.au}}
\end{itemize}
\end{itemize}
\hypertarget{important-links}{%
\subsection{Important Links}\label{important-links}}
\begin{itemize}
\tightlist
\item
\textbf{Workshop Page:} \href{https://monashbioinformaticsplatform.github.io/Functional_Enrichment_BioCommons_2024/}{Functional Enrichment BioCommons 2024}
\item
\textbf{Related Publication:} \href{https://link.springer.com/article/10.1007\%2Fs10571-016-0403-y}{NeuroMolecular Medicine Article}
\item
\textbf{Degust Tool:} \href{https://degust.erc.monash.edu/degust/compare.html?code=5b2c7805ab8f8c5f2dc8c72e61b049b0\#/?plot=mds}{Degust Comparative Analysis}
\end{itemize}
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
\hypertarget{getting-started}{%
\subsection{Getting Started}\label{getting-started}}
To begin, we recommend reviewing the \href{https://monashbioinformaticsplatform.github.io/Functional_Enrichment_BioCommons_2024/}{workshop page} for an overview of the content and tools covered. This workshop is structured to progressively build your skills, starting with the basics of functional enrichment analysis and moving towards more complex applications.
Happy learning!
\hypertarget{part-day-1}{%
\part{Day 1}\label{part-day-1}}
\hypertarget{overview}{%
\chapter{Overview}\label{overview}}
\hypertarget{functional-analysis-of--omics-data}{%
\section{Functional analysis of -Omics data}\label{functional-analysis-of--omics-data}}
Workshop 2024
\hypertarget{general-information}{%
\subsection{General information}\label{general-information}}
The workshop covers the bioinformatics concepts and tools available for interpreting a gene list using gene ontology and pathway information. The workshop focuses on the principles and concepts required for analyzing and conducting functional and pathway analysis on a gene list from any organism, although the focus will be on human and model eukaryotic organisms.
\hypertarget{course-objectives}{%
\subsection{Course Objectives}\label{course-objectives}}
Participants will gain practical experience and skills to be able to:
\begin{itemize}
\tightlist
\item
Understand basic concepts of functional enrichment analysis;
\item
Interpret enrichment analysis results;
\item
Gain a systems-level perspective of gene functions;
\item
Get more information about a gene list;
\item
Discover what pathways are enriched in a gene list (and use it for hypothesis generation);
\item
Predict gene function and extend a gene list;
\item
Follow the workflow after the workshop to conduct their own analyses.
\end{itemize}
\hypertarget{target-audience}{%
\subsection{Target Audience}\label{target-audience}}
This workshop is intended for biologists working with `-Omics data' (e.g.~RNA-Seq, protein expression and other omics data), who are interested in interpreting large gene/protein lists resulting from their experiments.
\hypertarget{setup-requirements}{%
\subsection{Setup Requirements}\label{setup-requirements}}
This workshop will be delivered online over Zoom; you may wish to install the dedicated Zoom application. Otherwise, no special software installation will be required, as we will be using online analysis tools.
\begin{itemize}
\tightlist
\item
Zoom Link:
\end{itemize}
Links and material will be provided on the day. BYO coffee.
\hypertarget{schedule}{%
\subsection{Schedule}\label{schedule}}
\begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.09}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.13}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.62}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.16}}@{}}
\toprule
Day & Instructor & Activity & Time (mins) \\
\midrule
\endhead
\bottomrule
\endlastfoot
Day 1 & & Welcome and housekeeping & 10 \\
 & HK & Introduction & 10 \\
 & HK & Data acquisition & 5 \\
 & HK & Filtering gene list & 15 \\
 & HK & Hands-on with Interactive Calculator (breakout rooms); \url{https://bioinformatics3.erc.monash.edu/rsconnect/content/241/} & 15 \\
 & HK & gProfiler {[}GO + pathways{]} (\url{https://biit.cs.ut.ee/gprofiler/gost}) & 20 \\
 & HK & Hands-on with gProfiler (breakout rooms) & 20 \\
 & HK & Break & 15 \\
 & HK & STRING (\url{https://string-db.org/}) & 20 \\
 & HK & Reactome (\url{https://reactome.org/}) & 20 \\
 & HK & GSEA (GenePattern) (\url{https://cloud.genepattern.org/gp/pages/index.jsf}) & 30 \\
Day 1 & & & 3 hrs \\
\midrule
Day 2 & & Welcome and housekeeping & 5 \\
 & HK & Day 1 recap & 15 \\
 & CW & Using R for functional enrichment analysis; Applications and advantages; Working with confidential data; Customisation, flexibility, reproducibility; Automation and batch processing & 30 \\
 & CW & Available packages in R: clusterProfiler; gprofiler; Any other? & 5 \\
 & CW & Introducing R, R Markdown, RStudio; Getting logged on RStudio environment; Discuss R Markdown; Discuss basic features of RStudio & 30 \\
 & CW & clusterProfiler -- Hands-on; Breakout rooms; Work on and discuss results based on following criterion; Analysis; ORA; GSEA; Ontologies; GO; Pathway (KEGG, Reactome); \ldots; Visualisations & 30 \\
 & CW & gprofiler -- Hands-on; Breakout rooms; Work on and discuss specific features; gost function with standard analysis and plots -- Discuss how the plots from gprofiler are different (than clusterProfiler) and also useful; Send analysis from R to g:Profiler web interface; Sharing the results easily with colleagues; To accompany a publication without the peers having to run the full analysis code in R; Integrating results with external tools for visualisations; Alter results using ggplot2, enrichplot, clusterProfiler; Using custom annotations; Non-model organisms, that are not annotated in the Ensembl database; Enable users to upload custom annotation files & 30 \\
 & CW & Experiment wrap up; Discuss results; Enrichments look different from different tools -- why? & 30 \\
 & CW & Wrap up and feedback & 5 \\
Day 2 & & & 3 hrs \\
\end{longtable}
\begin{itemize}
\tightlist
\item
\textbf{HK}: Hossein V Kahrood
\item
\textbf{CW}: Cali Willet
\end{itemize}
\hypertarget{recap}{%
\chapter{Recap}\label{recap}}
\hypertarget{functional-enrichment-analysis}{%
\section{Functional enrichment analysis}\label{functional-enrichment-analysis}}
Functional enrichment analysis (FEA) refers to a set of computational approaches designed to derive biological meaning from lists of biomolecules, such as genes, proteins, or metabolites.
By focusing on the biological significance of biomolecular changes, functional enrichment analysis enables researchers to make sense of complex high-throughput data from large-scale studies, revealing the key cellular processes and signaling pathways involved in health and disease states.
\hypertarget{why-is-it-important}{%
\subsection{Why Is It Important?}\label{why-is-it-important}}
Large-scale omics studies often yield vast datasets with hundreds or thousands of significantly regulated biomolecules. Manually investigating each feature, such as individual genes or proteins, can be overwhelming and inefficient.
Functional enrichment analysis provides a solution by organising these biomolecules into meaningful categories, allowing for the identification of overarching biological patterns and mechanisms. This helps reduce data complexity and uncovers higher-level biological insights, such as discovering critical pathways involved in disease progression or identifying potential therapeutic targets. Thus, enrichment analysis is a crucial step in the interpretation of high-dimensional omics data, transforming lists of molecular entities into actionable biological knowledge.
\hypertarget{when-to-use-functional-enrichment-analysis}{%
\subsection{When to Use Functional Enrichment Analysis?}\label{when-to-use-functional-enrichment-analysis}}
Functional enrichment analysis is typically applied after conducting a differential expression analysis or other comparative analyses in omics studies. This step is essential when attempting to derive biological insights from large lists of biomolecules that exhibit significant changes in expression, modification, or abundance between experimental groups. Below are common scenarios where functional enrichment analysis is particularly valuable:
\begin{itemize}
\tightlist
\item
Transcriptomics (e.g.~high-fat diet vs.~low-fat diet)
\item
Proteomics (e.g.~tumor tissue vs.~healthy tissue)
\item
Lipidomics (e.g.~disease vs.~healthy state)
\item
Metabolomics (e.g.~diabetic vs.~non-diabetic patients)
\item
Epigenomics (e.g.~smokers vs.~non-smokers)
\item
\ldots{}
\end{itemize}
\hypertarget{what-are-the-input-data}{%
\subsection{What Are the Input Data?}\label{what-are-the-input-data}}
Functional enrichment analysis relies on carefully prepared input data derived from an -omics study. The data inputs typically consist of various components depending on the type of enrichment analysis. These are: a list of features, a ranked list, a background set, gene sets, and pathway topology.
\hypertarget{synonyms}{%
\subsection{Synonyms}\label{synonyms}}
It's important to note that the term ``functional enrichment analysis'' is often used in different ways across the field. The diversity in terminology can sometimes cause confusion, as the same concept is referred to by various synonymous terms. These include:
\begin{itemize}
\tightlist
\item
Enrichment analysis
\item
Pathway analysis
\item
Pathway enrichment analysis
\item
Functional annotation analysis
\item
Annotation enrichment analysis
\item
Functional pathway analysis
\item
Functional enrichment analysis
\item
\ldots{}
\end{itemize}
\hypertarget{concepts}{%
\section{Concepts}\label{concepts}}
Here we will explore key concepts that are essential to perform and interpret enrichment analysis, including gene lists, background sets, p-values, false discovery rates, and the role of annotation databases. These concepts form the foundation for making sense of the biological significance of experimental findings.
\begin{itemize}
\item
{Gene List:}
A gene list is the collection of genes (or proteins) that are of particular interest in a biological experiment. This list typically arises from high-throughput experiments such as transcriptomics, proteomics, or genomics, where genes are differentially expressed, mutated, or otherwise identified as significant. In functional enrichment analysis, the gene list is used to assess whether certain biological pathways, gene ontologies, or functions are statistically overrepresented compared to a reference or background set.
\item
{Background Set:}
The background set, also referred to as the ``reference set,'' is the complete set of genes or proteins against which the gene list is compared. This background typically includes all genes that were analysed in the experiment (e.g., all genes in a microarray or RNA-seq dataset). The choice of background is crucial because it influences the statistical significance of the enrichment. For instance, using a background that includes only expressed genes will result in a different outcome compared to using all known genes in the genome.
\item
{P-value:}
The p-value is the probability of observing a result at least as extreme as the one obtained, assuming that the null hypothesis (no real effect) is true. At the feature level, it indicates whether a particular gene or protein shows significant differences (e.g., in expression or mutation) when compared to a control or baseline. For example, a p-value of 0.01 for a gene means that, if there were truly no change in that gene, a difference at least this large would be seen only 1\% of the time through random variation alone. At the enrichment level, the p-value evaluates whether the overlap between the identified genes and a particular biological term (such as a pathway) is larger than expected by chance. A pathway with a p-value of 0.001, for instance, means that an overlap at least this large would arise in only 0.1\% of cases if the gene list were unrelated to the pathway.
\item
{False Discovery Rate (FDR):}
The FDR corrects for multiple comparisons, as many tests are conducted both at the feature and enrichment levels. When analysing thousands of genes and numerous pathways, the likelihood of false positives increases, so FDR adjusts for this by controlling the proportion of false positives among the significant results. For example, an FDR threshold of 0.05 means that no more than 5\% of the features (e.g., genes) or enriched terms (e.g., pathways) identified as significant are expected to be false positives.
\item
{Regulation:}
In the context of enrichment analysis, regulation refers to the upregulation or downregulation of genes. Many enrichment tools allow users to analyse gene lists with regulation status included. This means pathways or biological functions may be enriched with genes that are specifically upregulated (increased activity) or downregulated (decreased activity). This additional layer of information helps in understanding whether certain pathways or processes are being activated or suppressed in the condition of interest.
\item
{ID Mapping:}
ID mapping refers to the process of converting different types of gene or protein identifiers into a unified format. This is necessary because different databases and platforms may use different types of identifiers (e.g., gene symbols, Entrez IDs, Ensembl IDs, Uniprot IDs). Accurate ID mapping ensures that the gene list aligns with the annotation database being used in the analysis. Tools and databases often provide built-in options for ID conversion to facilitate this step.
\item
{Annotation Databases:}
An annotation database is a curated collection of biological data that links genes or proteins to functional information such as pathways, molecular functions, cellular components, and biological processes. Examples include Gene Ontology (GO), KEGG, Reactome, and MSigDB. These databases provide the functional terms or pathways that are tested for enrichment. The choice of annotation database can significantly influence the results, as different databases may focus on different types of biological information or contain slightly different gene-function relationships.
\end{itemize}
\hypertarget{types-of-enrichment-analysis}{%
\section{Types of Enrichment Analysis}\label{types-of-enrichment-analysis}}
Khatri et al.~(2012) nicely explained different types of enrichment analysis, as shown below.
\begin{figure}
{\centering \includegraphics[width=1\linewidth]{images/fea_types}
}
\caption{Types of functional enrichment analysis}\label{fig:unnamed-chunk-6}
\end{figure}
\emph{Source: Figure adapted from Khatri P, Sirota M, Butte AJ. Ten years of pathway analysis: current approaches and outstanding challenges. PLoS Comput Biol. 2012;8(2):e1002375.}
\hypertarget{over-representation-analysis-ora}{%
\subsection{Over Representation Analysis (ORA)}\label{over-representation-analysis-ora}}
Over Representation Analysis (ORA) is one of the simplest and most widely used methods for functional enrichment analysis. ORA aims to determine whether specific biological categories (e.g., pathways, Gene Ontology terms) are statistically overrepresented in a given list of features (like genes or proteins) compared to a background or reference set. This method focuses on counting the number of features from the list that are associated with a specific category and comparing this count to what would be expected by chance.
\hypertarget{input-data}{%
\subsubsection{Input Data}\label{input-data}}
\begin{itemize}
\item
{List of Features:} This refers to the subset of biomolecules identified as significantly regulated or altered in the study. Features might include genes, proteins, lipids, or other biomolecules, depending on the type of -omics data.
\item
{Background Set:} The background set, or universe, consists of all the features that were measured in the study or a defined subset of the total genome, proteome, or metabolome being studied. The background is critical for enrichment analysis because it provides the context against which the significance of feature enrichment is assessed.
\end{itemize}
\hypertarget{workflow}{%
\subsubsection{Workflow}\label{workflow}}
{How it works}: ORA uses a predefined feature list (e.g., from differentially expressed genes), calculates the number of features in the list that belong to a certain category (e.g., a pathway), and tests whether this number is significantly higher than expected using statistical tests like the hypergeometric test or Fisher's exact test.
{Strengths}: Simple and easy to implement. Works well with a predefined list of significant features.
{Limitations}: ORA does not take into account the full range of feature expression values and can miss subtle changes across a broader set of features. It relies heavily on selecting a predefined cut-off to create the feature list, which can be subjective.
\hypertarget{gene-set-enrichment-analysis-gsea}{%
\subsection{Gene Set Enrichment Analysis (GSEA)}\label{gene-set-enrichment-analysis-gsea}}
Gene Set Enrichment Analysis (GSEA), also known as Functional Class Scoring (FCS), is a more sophisticated method that avoids the need to define a strict cut-off for selecting a list of significant features. Instead of using a discrete list of differentially expressed features, GSEA analyses ranked feature expression data. It evaluates whether predefined gene sets (such as pathways or functional categories) are enriched at the top or bottom of the ranked list, capturing subtle but coordinated changes in gene expression.
\hypertarget{input-data-1}{%
\subsubsection{Input Data}\label{input-data-1}}
\begin{itemize}
\item
{Ranked List:} In some enrichment methods, such as Gene Set Enrichment Analysis (GSEA), a ranked list is used instead of a simple feature list. The ranking is typically based on a continuous metric such as the magnitude of gene expression changes or some sort of statistical test output. This ranked list helps prioritise features that exhibit the strongest biological relevance and facilitates more nuanced enrichment analyses that consider the direction and strength of biomolecular changes.
\item
{Gene Sets:} This refers to predefined groups of genes that share a common biological property, such as involvement in a specific biological pathway, functional category, or regulatory process. The most commonly used format for gene sets in MSigDB is the GMT format. This format is simple, human-readable, and widely supported by various GSEA tools.
\end{itemize}
\hypertarget{workflow-1}{%
\subsubsection{Workflow}\label{workflow-1}}
{How it works}: GSEA first ranks all genes in the dataset according to their differential expression levels (e.g., from a control to a condition). Then, for each predefined gene set, it calculates an enrichment score (ES) that reflects the concentration of the gene set members at the extremes of the ranked list. Statistical significance is assessed through permutation testing, and the False Discovery Rate (FDR) is used to correct for multiple comparisons.
{Strengths}: GSEA avoids arbitrary thresholds for feature selection and can detect coordinated changes across sets of genes, even if individual genes within the set do not show significant differential expression.
{Limitations}: GSEA may miss smaller pathways or functional categories if their features are not highly ranked or uniformly expressed. It is also more computationally intensive than ORA.
\hypertarget{pathway-topology-pt-based-enrichment}{%
\subsection{Pathway Topology (PT)-Based Enrichment}\label{pathway-topology-pt-based-enrichment}}
Pathway Topology (PT)-based enrichment analysis extends beyond merely counting features and instead incorporates the topological structure of biological pathways. This method evaluates not only which features are part of a pathway but also their position and interactions within the pathway. By considering the connectivity and interaction strength between features, PT-based approaches provide a more biologically meaningful interpretation of pathway activation or suppression.
\hypertarget{input-data-2}{%
\subsubsection{Input Data}\label{input-data-2}}
\begin{itemize}
\tightlist
\item
{List of Features or Ranked List:} Already explained.
\item
{Pathway Topology:} This refers to the structure of a biological pathway, which includes detailed information about the interactions and relationships between gene products (such as proteins or RNAs) within a pathway.
\end{itemize}
\hypertarget{workflow-2}{%
\subsubsection{Workflow}\label{workflow-2}}
{How it works}: PT-based methods take into account the direction and magnitude of feature expression changes, as well as the structure of pathways (e.g., signaling cascades, metabolic pathways). They consider how biomolecule products interact with one another and the specific roles of each gene within the pathway. Topological factors like the number of connections a gene has or its centrality in the pathway are considered when assessing the enrichment.
{Strengths}: Provides more biologically relevant insights by considering gene-gene interactions and the position of each gene within a pathway. It is particularly useful for complex pathways where the roles of genes differ based on their interactions with others.
{Limitations}: Requires more detailed pathway annotations and higher computational complexity. Pathway databases may not have complete or accurate topological information for all pathways, limiting the analysis for certain datasets.
PT-based enrichment will not be covered in this workshop:
Given the focus of this workshop on more widely used and accessible enrichment methods, PT-based analysis will not be covered because of its limited practical applicability (primarily due to the insufficient availability of comprehensive and well-annotated pathway topology databases). Instead, we will focus on methods like ORA and GSEA, which are better supported by existing databases and easier to apply in typical omics studies. However, participants are encouraged to explore PT-based enrichment in the future as database resources improve.
\hypertarget{annotation-databses}{%
\section{Annotation Databases}\label{annotation-databses}}
Functional annotation databases are curated collections of biological data that systematically categorise and describe the functions, roles, interactions, and pathways of genes, proteins, or other biological molecules, enabling researchers to link experimental data to biological knowledge.
\hypertarget{go-gene-ontology}{%
\subsection{\texorpdfstring{\href{https://geneontology.org/}{GO: Gene Ontology}}{GO: Gene Ontology}}\label{go-gene-ontology}}
``The goal of the Gene Ontology Consortium is to produce a dynamic, controlled vocabulary that can be applied to all eukaryotes even as knowledge of gene and protein roles in cells is accumulating and changing.'' (Ashburner et al.~2000)
\begin{figure}
{\centering \includegraphics[width=1\linewidth]{images/go_structure}
}
\caption{The structure of gene ontology.}\label{fig:unnamed-chunk-7}
\end{figure}
\emph{Image source \href{https://link.springer.com/book/10.1007/978-1-4939-3743-1}{The Gene Ontology Handbook}}
Gene Ontology (GO) is a structured framework used to describe the roles of genes and their products across all living organisms. It provides a controlled vocabulary that allows for consistent descriptions of gene functions, biological processes, and cellular locations, facilitating computational analysis and integration of biological data across different species. GO's structure comprises three main aspects:
{1. Molecular Function:}
In the Gene Ontology (GO), molecular function refers to the specific biochemical activity that a gene product (such as a protein or RNA) performs. This activity typically involves direct physical interactions with other molecular entities, such as catalysis or binding. These functions are described based on their biochemical roles (e.g., enzyme activity) and their contribution as components within larger biological systems. For instance, protein kinase activity involves the phosphorylation of proteins, which is a specific molecular function. In GO, molecular function is concerned with the direct action of gene products, whether in terms of biochemical interactions or roles in larger biological systems.
{2. Biological Process:}
Biological processes represent the larger objectives that gene products contribute to in an organism, often described by the outcome or result of a series of molecular events. These processes are broader, coordinated sequences of molecular activities that achieve a biological objective, such as cell division or DNA replication. A biological process in GO can encompass anything from simple enzymatic actions to complex, regulated systems like embryonic development or immune response. GO annotations aim to associate gene products not only with the processes they directly contribute to but also with processes they regulate or enable.
{3. Cellular Component:}
This aspect of GO refers to the specific location within a cell where a gene product operates. Cellular components are described relative to structures within the cell, such as the mitochondrion or plasma membrane, and reflect where molecular functions occur as part of broader biological processes. These locations are vital to understanding where molecular activities take place, as cellular compartmentalisation often influences the function and regulation of gene products. Unlike molecular function and biological process, cellular components refer more to cellular anatomy, specifying where gene products perform their roles during biological activities.
In practice, GO terms and annotations allow researchers to describe gene functions in a standardised way, helping in tasks such as gene function prediction, functional profiling, and comparing genes across species. GO's hierarchical organisation of terms provides a rich framework to model the complexity of biological systems and facilitates the computational study of gene functions.
\hypertarget{kegg-kyoto-encyclopedia-of-genes-and-genomes}{%
\subsection{\texorpdfstring{\href{https://www.genome.jp/kegg/}{KEGG: Kyoto Encyclopedia of Genes and Genomes}}{KEGG: Kyoto Encyclopedia of Genes and Genomes}}\label{kegg-kyoto-encyclopedia-of-genes-and-genomes}}
Kyoto Encyclopedia of Genes and Genomes (KEGG) is a curated database that integrates genomic, chemical, and systemic information to represent biological systems and their interactions. It allows users to map molecular data (such as genes, proteins, and small molecules) to biological pathways, enabling a better understanding of how different components interact within an organism.
\begin{figure}
{\centering \includegraphics[width=1\linewidth]{images/NOTCH_signaling_pathway_kegg}
}
\caption{NOTCH Signaling Pathway by KEGG}\label{fig:unnamed-chunk-8}
\end{figure}
\hypertarget{reactome}{%
\subsection{\texorpdfstring{\href{https://reactome.org/}{Reactome}}{Reactome}}\label{reactome}}
Reactome pathway knowledgebase is an open-access, manually curated database that captures molecular details of biological processes such as signal transduction, DNA replication, metabolism, and more, using a consistent data model across different domains of biology. This makes it particularly well-suited for functional enrichment analysis, where understanding the relationships between gene expression data and biological pathways is crucial.
\begin{figure}
{\centering \includegraphics[width=1\linewidth]{images/NOTCH_signaling_pathway_reactome}
}
\caption{NOTCH Signaling Pathway by Reactome}\label{fig:unnamed-chunk-9}
\end{figure}
\hypertarget{msigdb}{%
\subsection{\texorpdfstring{\href{https://www.gsea-msigdb.org/}{MSigDB}}{MSigDB}}\label{msigdb}}
Molecular Signatures Database (MSigDB) is a comprehensive resource for gene set enrichment analysis. It offers a comprehensive collection of gene sets that represent biological processes, molecular pathways, and other biologically relevant information. MSigDB is integrated with the Gene Set Enrichment Analysis (GSEA) tool, which is commonly used to determine if predefined sets of genes show statistically significant differences between two biological states (e.g., diseased vs.~healthy samples).
\begin{figure}
{\centering \includegraphics[width=1\linewidth]{images/GSEA-homegraphic}
}
\caption{GSEA Workflow}\label{fig:unnamed-chunk-10}
\end{figure}
\hypertarget{common-tools-for-doing-fea}{%
\section{Common Tools for Doing FEA}\label{common-tools-for-doing-fea}}
\hypertarget{enrichment-statistics}{%
\chapter{Enrichment Statistics}\label{enrichment-statistics}}
Enrichment statistics are based on a contingency table like so:
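\begin{longtable}[]{@{}lrrr@{}}
\toprule\noalign{}
 & In term & Not in term & Total \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
In gene list & 50 & 100 & 150 \\
Not in gene list (but in background) & 200 & 15900 & 16100 \\
Total & 250 & 16000 & 16250 \\
\end{longtable}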
This is based on the 16250 genes that were measured in your experiment.
Note that there might be extra genes that weren't measured; these are excluded from the calculations entirely. For example, there might have been an extra 5000 genes (some of which might have been annotated with the term of interest), making for 21250 \emph{annotated} genes.
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
\hypertarget{fishers-exact-test}{%
\section{\texorpdfstring{{Fisher's Exact Test}}{Fisher's Exact Test}}\label{fishers-exact-test}}
Fisher's Exact Test is a statistical test used to determine if there are nonrandom associations between the proportions of two categorical variables. It calculates the exact probability of observing the given distribution of counts in a 2x2 contingency table, under the null hypothesis of no association between the variables.
\begin{quote}
\emph{Note:} This is just a toy calculator for this training, it is quite limited. You can also use some online tools like \href{https://www.socscistatistics.com/tests/fisher/default2.aspx}{Social Science Statistics} to play with.
\end{quote}
\emph{Formula}:
\[P = \frac{(a + b)!(c + d)!(a + c)!(b + d)!}{a!b!c!d!N!}\]
Where:
\begin{itemize}
\item
\(a\), \(b\), \(c\), and \(d\) are the observed counts in the 2x2 contingency table.
\item
\(N\) is the total number of observations, \(N = a + b + c + d\).
\end{itemize}
Given this contingency table:
\begin{longtable}[]{@{}llll@{}}
\toprule\noalign{}
& Category 1 & Category 2 & Total \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
\textbf{Group 1} & \(a\) & \(b\) & \(a + b\) \\
\textbf{Group 2} & \(c\) & \(d\) & \(c + d\) \\
\textbf{Total} & \(a + c\) & \(b + d\) & \(a + b + c + d\) \\
\end{longtable}
\emph{R syntax}
\hypertarget{hypergeometric-test}{%
\section{\texorpdfstring{{Hypergeometric Test}}{Hypergeometric Test}}\label{hypergeometric-test}}
Hypergeometric test calculates the probability of observing the given number of genes from a specific category (e.g., a pathway) in the gene list (differentially expressed genes) by chance. It models the situation where you draw a sample (the gene list) from a finite population (the background of all genes), and success is defined as a gene being in the category (e.g., belonging to the pathway).
\begin{quote}
\emph{Note:} Here is a tool by \href{https://stattrek.com/online-calculator/hypergeometric}{Stat Trek} to play around with the hypergeometric test.
\end{quote}
\emph{Formula:}
\[P(X = k) = \frac{\binom{K}{k} \binom{N - K}{n - k}}{\binom{N}{n}}\]
Where:
\begin{itemize}
\item
\(N\) = Total number of items in the population.
\item
\(K\) = Number of success items in the population.
\item
\(n\) = Number of items in the sample.
\item
\(k\) = Number of success items in the sample.
\end{itemize}
The parameters in our example:
\(N = 16250\); \(K = 250\); \(n = 150\); \(k = 50\)
\emph{R syntax}:

\texttt{phyper(k\ -\ 1,\ K,\ N\ -\ K,\ n,\ lower.tail\ =\ FALSE)}

Where:

\texttt{k\ -\ 1} is the number of observed successes minus 1 (for the ``at least'' scenario).

\texttt{lower.tail\ =\ FALSE} gives the probability of getting at least \(k\) successes (right-tail).
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
\hypertarget{section}{%
\subsubsection*{}\label{section}}
\addcontentsline{toc}{subsubsection}{}
\hypertarget{activity}{%
\section{Activity}\label{activity}}
\hypertarget{challenge-interactive-calculator}{%
\subsection*{\texorpdfstring{\textbf{Challenge:} Interactive Calculator}{Challenge: Interactive Calculator}}\label{challenge-interactive-calculator}}
\addcontentsline{toc}{subsection}{\textbf{Challenge:} Interactive Calculator}
\href{https://bioinformatics3.erc.monash.edu/rsconnect/content/241/}{\emph{Link to open toy enrichment calculator}}.
This calculates enrichment for a single hypothetical genelist (e.g.~your RNAseq differentially expressed genelist) against a single hypothetical `term' (some set of interesting genes, e.g.~synaptic signaling genes). It makes a Venn diagram and a wordy description of what is being tested.
You can adjust various factors and see their effect on the enrichment p-values.
\hypertarget{section-1}{%
\subsubsection*{}\label{section-1}}
\addcontentsline{toc}{subsubsection}{}
\hypertarget{questions}{%
\subsection*{\texorpdfstring{\textbf{Questions}}{Questions}}\label{questions}}
\addcontentsline{toc}{subsection}{\textbf{Questions}}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Is it significant at p=0.05?
Show
No, corrected pval=0.087
\item
What about with a smaller background of 5000 genes (e.g.~proteomic datasets)?
Show
Even less so - corrected pval=1
\item
Or, testing against a smaller database of terms; 2000 terms instead of 10000? With the original 16000 gene background.
Show
Yes, now corrected pval=0.017
\item
For a 500-gene term (3.1\% of all genes), 19 out of 200 differentially expressed genes (9.5\%) need to hit for the enrichment to be significant (p=0.048). How many hits would be needed for a more specific 30-gene term?
Show
5 hits - 2.5\% of the differentially expressed genes vs 0.19\% of all genes
\end{enumerate}
\hypertarget{section-2}{%
\subsubsection*{}\label{section-2}}
\addcontentsline{toc}{subsubsection}{}
\hypertarget{example-analysis}{%
\chapter{Example Analysis}\label{example-analysis}}
\hypertarget{sh-sy5y-differentiation}{%
\section{SH-SY5Y Differentiation}\label{sh-sy5y-differentiation}}
SH-SY5Y is a commonly used neuroblastoma cell line.
With appropriate treatment, it can be induced to differentiate into a `more neuronal' form.
Differentiated cells look quite different, growing thin neurites out from the body of the cell.
\begin{figure}
{\centering \includegraphics[width=1\linewidth]{images/shsy5ydiff}
}
\caption{Morphological analysis of differentiated SH-SY5Y cells. At 6-DIV stage, the cells exposed to RA showed an elongated morphology as compared to basal medium (NT). Cells subsequently treated in NBM for 3 days became more polarised, exhibited several neurites and branches and acquired a neuronal-like shape. Image is derived from Figure 4 (Pezzini et al.~2017).}\label{fig:unnamed-chunk-15}
\end{figure}
\hypertarget{the-question-what-pathways-are-involved-in-sh-sy5y-differentiation}{%
\section{The question: What pathways are involved in SH-SY5Y Differentiation?}\label{the-question-what-pathways-are-involved-in-sh-sy5y-differentiation}}
In their paper \href{https://link.springer.com/article/10.1007\%2Fs10571-016-0403-y}{\emph{Transcriptomic Profiling Discloses Molecular and Cellular Events Related to Neuronal Differentiation in SH-SY5Y Neuroblastoma Cells}}, Pezzini et al.~induced neuronal differentiation of the SH-SY5Y neuroblastoma cell line and measured transcriptomic changes using RNA sequencing (Pezzini et al.~2017). During the 9-day differentiation protocol, SH-SY5Y cells were initially pre-differentiated in a retinoic acid (RA) medium for 6 days, followed by a 3-day treatment with a neurobasal medium (NBM) enriched with neurotrophic factors. Control cells, which were not treated (NT), were maintained under basal conditions and served as a comparison group. The authors then performed functional enrichment analysis on the differentially expressed genes.
\hypertarget{the-data-differentially-expressed-genes}{%
\section{The data : Differentially expressed genes}\label{the-data-differentially-expressed-genes}}
The example dataset for today is the RNAseq differential expression results.
They can be accessed via this \href{http://degust.erc.monash.edu/degust/compare.html?code=5b2c7805ab8f8c5f2dc8c72e61b049b0\#?plot=mds}{Degust} link:
This has been reanalysed from the published raw data, via the degust tool.
\begin{quote}
\textbf{Note:} Other tools and approaches may produce different-looking results, but generally, you will end up with a table of genes containing some measure of statistical confidence. The methods for functional enrichment analysis should remain similar.
\end{quote}
\hypertarget{defining-the-genelist}{%
\chapter{Defining the genelist}\label{defining-the-genelist}}
Starting from those differential expression results \href{http://degust.erc.monash.edu/degust/compare.html?code=5b2c7805ab8f8c5f2dc8c72e61b049b0\#?plot=mds}{here}, how do we go about getting a genelist to calculate enrichment on?
\hypertarget{activities}{%
\section{Activities}\label{activities}}
Today's exercise follows the process of getting the differentially expressed gene list using Excel. You could use another spreadsheet program, or some may prefer a programming language like R.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
Download the full table of data from either degust, or the csv file here:
\href{https://monashbioinformaticsplatform.github.io/enrichment_analysis_workshop/data/Pezzini2016_SHSY5Ycelldiff_DE_table.csv}{Pezzini2016\_SHSY5Ycelldiff\_DE\_table.csv}. Import into excel.
\item
How many genes are differentially expressed? In these results the FDR Column contains the corrected p-value, and the `differentiated' column shows the log2 fold-change of differentiated cells vs untreated cells (log2(diff)-log2(undiff)); 0 is unchanged, 1 is doubled, -1 is halved.
\begin{itemize}
\item
Significant at 0.01?
\item
That's a particularly large number of genes - perhaps not unexpected given how much the cells are changed in this experiment. How many significant genes also have 2-fold change in expression?
\item
For this workshop, get the genes with an FDR \(< 1 \times 10^{-4}\) and 2-fold change. Note that this is a ridiculous threshold - most experiments yield far less differential expression, but the difference between these two cell conditions is pretty extreme! Typically you would only filter at \(p < 0.01\) (and occasionally 2-fold change) - you might see 10s to 100s of results. However, this arbitrary threshold gives a more typical number of differentially expressed genes for downstream analysis. An alternative approach could be to take the top 500 genes.
\end{itemize}
\end{enumerate}
Show
There are 4923 differentially expressed genes, 2149 of which have a 2-fold change in expression. With the aggressive filtering, there are 198 genes left.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{2}
\tightlist
\item
How many genes are \emph{tested}? This is your background.
\end{enumerate}
Show
14420 genes tested.
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
\hypertarget{common-gotcha}{%
\section{Common gotcha}\label{common-gotcha}}
Can you find SEPT4? Spreadsheet programs such as Excel silently convert gene names like this into dates, which is why \href{https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1044-7}{\emph{Gene name errors are widespread in the scientific literature}}.
You can't revert the gene names automatically (try converting it to text!). You have to avoid it in the first place by importing gene columns as `text' columns in Excel. See video from HUGO: \url{https://www.genenames.org/help/faq/\#!/\#tocAnchor-1-25-1}
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
\hypertarget{example}{%
\section{Example}\label{example}}
An example excel document showing this filtering process is here: \href{https://monashbioinformaticsplatform.github.io/enrichment_analysis_workshop/data/Pezzini2016_SHSY5Ycelldiff_DE_table_filtering.xlsx}{Pezzini2016\_SHSY5Ycelldiff\_DE\_table\_filtering.xlsx}.
\hypertarget{online-tools}{%
\chapter{Online Tools}\label{online-tools}}
Functional enrichment analysis can be performed using various web-based tools, each of which is designed to meet specific analytical needs. These tools often vary in the databases they use, their statistical approaches, and their capabilities to perform different types of analysis, such as Over-Representation Analysis (ORA) or Gene Set Enrichment Analysis (GSEA).
In this workshop, we will explore several popular tools for functional enrichment analysis, including gProfiler, STRING, Reactome, and MSigDB GSEA. Each tool offers unique features and insights, providing flexibility in selecting the right method for diverse datasets and research questions.
\hypertarget{fea-in-gprofiler}{%
\section{\texorpdfstring{FEA in gProfiler }{FEA in gProfiler }}\label{fea-in-gprofiler}}
gProfiler is known for its integration of numerous species and databases. It supports both ORA and GSEA, enabling users to assess Gene Ontology (GO), biological pathways, regulatory motifs and protein databases. With gProfiler one can perform ORA through the web interface by following the steps below.
\hypertarget{steps-to-perform-ora-in-gprofiler}{%
\subsection{Steps to perform ORA in g:Profiler:}\label{steps-to-perform-ora-in-gprofiler}}
{- Prepare Input List:} Ensure your input is formatted as one gene per line or in a suitable format for g:Profiler.
{- Input Gene List:} Paste your prepared gene list directly into the input box on the g:Profiler web page or upload a file containing your list.
{- Select Organism:} Choose the appropriate organism from the \texttt{Organism} dropdown menu (e.g., \emph{Homo sapiens} for human data).
{- Choose Statistical Domain Scope:} Under \texttt{Advanced\ options}, select your preferred statistical background from the \texttt{Statistical\ domain\ scope} menu. If you choose ``Custom'' background, provide your custom background list by pasting or uploading the relevant file.
{- Set Significance Threshold:} Select the desired significance threshold method, such as \emph{g:SCS}, \emph{Bonferroni}, or \emph{Benjamini-Hochberg}.
- Specify the threshold value (e.g., 0.05, 0.1, etc.).
{- Select Functional Annotation Databases: } Navigate to the \texttt{Data\ sources} tab and choose one or more databases for analysis. Available options include:
\begin{itemize}
\tightlist
\item
\emph{Gene Ontology (GO)}: Biological Process, Molecular Function, and Cellular Component.
\item
\emph{KEGG Pathways}
\item
\emph{Reactome Pathways}
\item
\emph{WikiPathways}
\item
\emph{TRANSFAC}
\item
\emph{miRTarBase}
\item
\emph{Human Protein Atlas}
\item
\emph{CORUM}
\item
\emph{Human Phenotype Ontology (HP)}
\end{itemize}
{- Run Query:} Run the analysis and review the enriched terms, pathways, and visual outputs. Download the results as needed for further exploration.
\hypertarget{browse-the-gprofiler-results}{%
\subsubsection{Browse the gProfiler Results}\label{browse-the-gprofiler-results}}
\begin{itemize}
\item
\textbf{Overview}:
The analysis provided a comprehensive list of enriched terms across selected databases, highlighting significant GO terms. The results give a high-level summary of pathways or terms most relevant to the input data.
\item