-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKP.DIID.rmd
133 lines (88 loc) · 5.66 KB
/
KP.DIID.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
---
title: "DIID"
author: "KonstantinPopadin"
date: "8/19/2024"
output: html_document
---
```{r setup, include=FALSE}
# Set the default for all chunks: show the R source alongside its output.
knitr::opts_chunk$set(echo = TRUE)
```
## 1 Read data, derive DeficitOfFragility
```{r}
# Read the semicolon-separated input table (first line holds column names).
file_path <- "2_Derived/Total_DIID_count.csv"
data <- read.table(file_path, sep = ";", header = TRUE)

# Fail early with a clear message if a column used in this chunk is absent:
# `$` on a data frame partially matches names and returns NULL for a missing
# column, so a renamed column would otherwise not be reported clearly.
required_columns <- c("DIIDFragilityScore", "D.I..IDcount", "DI..I.Dcount")
missing_columns <- setdiff(required_columns, names(data))
if (length(missing_columns) > 0) {
  stop(
    "Missing column(s) in ", file_path, ": ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

# List the available columns and summarise the fragility score read from the
# file.
names(data)
summary(data$DIIDFragilityScore)

# ALTERNATIVE - FRAGILE counts: the higher the value, the larger the deficit
# of fragile elements (potentially due to selection).
data$DIID.DeficitOfFragility <- data$D.I..IDcount - data$DI..I.Dcount
summary(data$DIID.DeficitOfFragility)
```
## 2 DIIDFragilityScore (==number of fragile / number of alternative structures)
### we expect that FragilityScore should decrease (due to selection) in case of:
### (i) long repeats (so we expect negative correlations between FragilityScore and length);
### (ii) short intervals 1 and 3 (so we expect positive correlations between FragilityScore and interval length)
### in a set of univariate and multivariate analyses below we observe exactly this.
```{r}
# Distributions of the two repeat-length predictors.
summary(data$DirectRepeatLength)
summary(data$InvertedRepeatLength)

### UNIVARIATE WITH DIIDFragilityScore
# Spearman rank correlation and scatter plot of DIIDFragilityScore against
# each structural feature in turn.
predictors <- c(
  "DirectRepeatLength", "InvertedRepeatLength",
  "LengthOfInterval1", "LengthOfInterval2", "LengthOfInterval3"
)
for (predictor in predictors) {
  spearman <- cor.test(
    data$DIIDFragilityScore, data[[predictor]],
    method = "spearman"
  )
  # Give the printed test a readable label; inside the loop the default label
  # would be the unevaluated expression `data[[predictor]]`.
  spearman$data.name <- paste("DIIDFragilityScore and", predictor)
  print(spearman) # results are not auto-printed inside a for loop
  plot(
    data$DIIDFragilityScore, data[[predictor]],
    xlab = "DIIDFragilityScore", ylab = predictor
  )
}

### MULTIVARIATE
# Predictors are passed through scale() so their coefficients are on a common
# scale. Columns are resolved through `data = data`, which keeps the
# coefficient names free of the `data$` prefix.
summary(lm(
  DIIDFragilityScore ~ scale(DirectRepeatLength) +
    scale(InvertedRepeatLength) + scale(LengthOfInterval1) +
    scale(LengthOfInterval2) + scale(LengthOfInterval3),
  data = data
))
# Same model without LengthOfInterval2.
summary(lm(
  DIIDFragilityScore ~ scale(DirectRepeatLength) +
    scale(InvertedRepeatLength) + scale(LengthOfInterval1) +
    scale(LengthOfInterval3),
  data = data
))
```
## 3 DIID.DeficitOfFragility (== number of alternative structures - number of fragile structures)
### we expect that DeficitOfFragility (as a result of selection) should be more pronounced (stronger), when:
### (i) repeats are long (so we expect positive correlations between DeficitOfFragility and length);
### (ii) intervals 1 and 3 are short (so we expect negative correlations between DIID.DeficitOfFragility and interval length)
### in the multivariate analyses below, and (though not always) in the univariate ones, we observe exactly this.
```{r}
### UNIVARIATE WITH DIID.DeficitOfFragility
# Spearman rank correlation and scatter plot of DIID.DeficitOfFragility
# against each structural feature in turn.
predictors <- c(
  "DirectRepeatLength", "InvertedRepeatLength",
  "LengthOfInterval1", "LengthOfInterval2", "LengthOfInterval3"
)
for (predictor in predictors) {
  spearman <- cor.test(
    data$DIID.DeficitOfFragility, data[[predictor]],
    method = "spearman"
  )
  # Give the printed test a readable label; inside the loop the default label
  # would be the unevaluated expression `data[[predictor]]`.
  spearman$data.name <- paste("DIID.DeficitOfFragility and", predictor)
  print(spearman) # results are not auto-printed inside a for loop
  plot(
    data$DIID.DeficitOfFragility, data[[predictor]],
    xlab = "DIID.DeficitOfFragility", ylab = predictor
  )
}

### MULTIVARIATE
# Predictors are passed through scale() so their coefficients are on a common
# scale. Columns are resolved through `data = data`, which keeps the
# coefficient names free of the `data$` prefix.
summary(lm(
  DIID.DeficitOfFragility ~ scale(DirectRepeatLength) +
    scale(InvertedRepeatLength) + scale(LengthOfInterval1) +
    scale(LengthOfInterval2) + scale(LengthOfInterval3),
  data = data
))
# Same model without LengthOfInterval2.
summary(lm(
  DIID.DeficitOfFragility ~ scale(DirectRepeatLength) +
    scale(InvertedRepeatLength) + scale(LengthOfInterval1) +
    scale(LengthOfInterval3),
  data = data
))
```
## 4 DG4IIDFragilityScore (== number of fragile structures with GQ / number of alternative structures)
### we expect that FragilityScore should decrease (due to selection) in case of:
### (i) long repeats (so we expect negative correlations between FragilityScore and length);
### (ii) short intervals 1 and 3 (so we expect positive correlations between FragilityScore and interval length)
### in a set of univariate and multivariate analyses below we observe exactly this.
```{r}
# Distributions of the two repeat-length predictors and of the response.
summary(data$DirectRepeatLength)
summary(data$InvertedRepeatLength)
summary(data$DG4IIDFragilityScore)

### UNIVARIATE WITH DG4IIDFragilityScore
# Spearman rank correlation and scatter plot of DG4IIDFragilityScore against
# each structural feature in turn.
predictors <- c(
  "DirectRepeatLength", "InvertedRepeatLength",
  "LengthOfInterval1", "LengthOfInterval2", "LengthOfInterval3"
)
for (predictor in predictors) {
  spearman <- cor.test(
    data$DG4IIDFragilityScore, data[[predictor]],
    method = "spearman"
  )
  # Give the printed test a readable label; inside the loop the default label
  # would be the unevaluated expression `data[[predictor]]`.
  spearman$data.name <- paste("DG4IIDFragilityScore and", predictor)
  print(spearman) # results are not auto-printed inside a for loop
  plot(
    data$DG4IIDFragilityScore, data[[predictor]],
    xlab = "DG4IIDFragilityScore", ylab = predictor
  )
}

### MULTIVARIATE
# Predictors are passed through scale() so their coefficients are on a common
# scale. Columns are resolved through `data = data`, which keeps the
# coefficient names free of the `data$` prefix.
summary(lm(
  DG4IIDFragilityScore ~ scale(DirectRepeatLength) +
    scale(InvertedRepeatLength) + scale(LengthOfInterval1) +
    scale(LengthOfInterval2) + scale(LengthOfInterval3),
  data = data
))
# Same model without LengthOfInterval2.
summary(lm(
  DG4IIDFragilityScore ~ scale(DirectRepeatLength) +
    scale(InvertedRepeatLength) + scale(LengthOfInterval1) +
    scale(LengthOfInterval3),
  data = data
))
```
Note that the R code of every chunk is printed alongside its output, because the setup chunk sets `echo = TRUE`.