-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFigures.R
189 lines (153 loc) · 8.59 KB
/
Figures.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
library(choroplethr)
library(choroplethrMaps)
library(ggplot2)
library(RColorBrewer)
#import and process data
WNV_metadata <- read.csv("data_summary/Summary_13_feb_2024.csv", header = TRUE)
accessions_of_good_genomes <- read.csv("data_summary/AA_polymorphisms_sparse_filtered.csv", header = TRUE)
filtered_metadata <- merge(WNV_metadata, accessions_of_good_genomes, by = "accession")
filtered_metadata <- filtered_metadata[filtered_metadata$county != "", ]
filtered_metadata <- filtered_metadata[filtered_metadata$FIPS != "", ]
#PLOT 1: US distribution of all high quality genomes ----
my.cols <- brewer.pal(6, "OrRd")
# Count the occurrences of each county_FIPS value
counts <- table(filtered_metadata$county_FIPS)
# Convert the result to a dataframe
us_distribution_df <- as.data.frame(counts)
# Rename the columns
names(us_distribution_df) <- c("region", "value")
us_distribution_df$region <- as.numeric(as.character(us_distribution_df$region))
all_genomes_map <- choroplethr::county_choropleth(us_distribution_df, title = "Distribution of WNV genomes", legend = "# of genomes")
all_genomes_map + scale_fill_manual(values = my.cols, na.value = "white")
#PLOT 2: distribution of NW02 ----
my.cols <- brewer.pal(6, "OrRd")
E.V159A_data <- aggregate(E.V159A ~ county_FIPS, data = filtered_metadata, sum)
names(E.V159A_data) <- c("region", "value")
E.V159A_data$region <- as.numeric(as.character(E.V159A_data$region))
E.V159A_map <- choroplethr::county_choropleth(E.V159A_data, title = "Distribution of WN02", legend = "Presence of E-V159A")
E.V159A_map + scale_fill_manual(values = my.cols, na.value = "white")
#PLOT 3: distribution of SW03 ----
my.cols <- brewer.pal(6, "OrRd")
my.cols[1] <- '#ffffff'
NS4A.A85T_data <- aggregate(NS4A.A85T ~ county_FIPS, data = filtered_metadata, sum)
names(NS4A.A85T_data) <- c("region", "value")
NS4A.A85T_data$region <- as.numeric(as.character(NS4A.A85T_data$region))
NS4A.A85T_map <- choroplethr::county_choropleth(NS4A.A85T_data, title = "Distribution of NS4A.A85T", legend = "Presence of NS4A-A85T")
NS4A.A85T_map + scale_fill_manual(values = my.cols, na.value = "white")
#PLOT 4: distribution of NS5-K314R ----
my.cols <- brewer.pal(4, "OrRd")
my.cols[1] <- '#ffffff'
NS5.K314R_data <- aggregate(NS5.K314R ~ county_FIPS, data = filtered_metadata, sum)
names(NS5.K314R_data) <- c("region", "value")
NS5.K314R_data$region <- as.numeric(as.character(NS5.K314R_data$region))
NS5.K314R_map <- choroplethr::county_choropleth(NS5.K314R_data, title = "Distribution of NS5.K314R", legend = "Presence of NS5.K314R")
NS5.K314R_map + scale_fill_manual(values = my.cols, na.value = "white")
#PLOT 5: distribution of NS5-K314R AND NS4A.A85T together ----
# Create a new column that indicates whether both NS5.K314R and NS4A.A85T are present
filtered_metadata$NS5.K314R_NS4A.A85T <- ifelse(filtered_metadata$NS5.K314R == 1 & filtered_metadata$NS4A.A85T == 1, 1, 0)
# Aggregate this new column by county_FIPS
both_present_data <- aggregate(NS5.K314R_NS4A.A85T ~ county_FIPS, data = filtered_metadata, sum)
# Rename the columns and convert the region column to numeric
names(both_present_data) <- c("region", "value")
both_present_data$region <- as.numeric(as.character(both_present_data$region))
NS5.K314R_NS4A.A85T_map <- choroplethr::county_choropleth(both_present_data, title = "Distribution of NS5.K314R and NS4A.A85T", legend = "Presence of both NS5.K314R and NS4A.A85T")
NS5.K314R_NS4A.A85T_map + scale_fill_manual(values = my.cols, na.value = "white")
#PLOT 6: distribution of NS4B-I240M ----
my.cols <- brewer.pal(6, "OrRd")
my.cols[1] <- '#ffffff'
NS4B.I240M_data <- aggregate(NS4B.I240M ~ county_FIPS, data = filtered_metadata, sum)
names(NS4B.I240M_data) <- c("region", "value")
NS4B.I240M_data$region <- as.numeric(as.character(NS4B.I240M_data$region))
NS4B.I240M_map <- choroplethr::county_choropleth(NS4B.I240M_data, title = "Distribution of NS4B.I240M", legend = "Presence of NS4B.I240M")
NS4B.I240M_map + scale_fill_manual(values = my.cols, na.value = "white")
#PLOT 7: distribution of NS5-A860T ----
my.cols <- brewer.pal(6, "OrRd")
my.cols[1] <- '#ffffff'
NS5.A860T_data <- aggregate(NS5.A860T ~ county_FIPS, data = filtered_metadata, sum)
names(NS5.A860T_data) <- c("region", "value")
NS5.A860T_data$region <- as.numeric(as.character(NS5.A860T_data$region))
NS5.A860T_map <- choroplethr::county_choropleth(NS5.A860T_data, title = "Distribution of NS5.A860T", legend = "Presence of NS5.A860T")
NS5.A860T_map + scale_fill_manual(values = my.cols, na.value = "white")
#PLOT 7: distribution of NS2A-R188K ----
NS2A.R188K_data <- aggregate(NS2A.R188K ~ county_FIPS, data = filtered_metadata, sum)
names(NS2A.R188K_data) <- c("region", "value")
NS2A.R188K_data$region <- as.numeric(as.character(NS2A.R188K_data$region))
NS2A.R188K_map <- choroplethr::county_choropleth(NS2A.R188K_data, title = "Distribution of NS2A.R188K", legend = "Presence of NS2A.R188K")
NS2A.R188K_map + scale_fill_manual(values = my.cols, na.value = "white")
##---------------------------- summary plot of mutations -------------------##
library(tidyverse)
library(dplyr)
polymorphisms <- read.csv("data_summary/AA_polymorphisms_dense.csv", header=TRUE)
# Function to replace the most common amino acid with NA, except for E.V159
replace_most_common <- function(x, name) {
if (name == "E.V159") return(x)
most_common <- names(which.max(table(x)))
x[x == most_common] <- NA
return(x)
}
# Apply the function to each column (except the first one)
polymorphisms_filtered <- polymorphisms
polymorphisms_filtered[-1] <- lapply(names(polymorphisms_filtered[-1]), function(name) replace_most_common(polymorphisms_filtered[[name]], name))
polymorphisms_long <- polymorphisms_filtered %>% pivot_longer(cols = -accession, names_to = "residue", values_to = "amino_acid")
# Remove rows with NA in the 'amino_acid' column
polymorphisms_long <- na.omit(polymorphisms_long)
# Remove rows with '-' in the 'amino_acid' column
polymorphisms_long <- polymorphisms_long %>% filter(amino_acid != '-')
# Convert 'residue' to a factor and specify the levels
polymorphisms_long$residue <- factor(polymorphisms_long$residue, levels = names(polymorphisms_filtered)[-1])
ggplot(polymorphisms_long, aes(x = residue, y = ..count../1211, fill = amino_acid)) +
geom_bar(width = 0.7) +
scale_x_discrete(breaks = function(x) x[seq(1, length(x), by = 20)]) +
coord_cartesian(ylim = c(0, 0.25)) + # Adjust these values as needed
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
labs(x = "Residue", y = "Frequency")
##----------------------- time series for common mutations -------------#
library(lubridate)
library(gridExtra)
# Convert 'collection_date' to a date format and extract the year
filtered_metadata$year <- year(dmy(filtered_metadata$collection_date))
# Calculate the sum of each mutation by year
mutations_by_year <- filtered_metadata %>%
group_by(year) %>%
summarise(NS4A.A85T = sum(NS4A.A85T) / n(),
NS2A.R188K = sum(NS2A.R188K) / n(),
NS4B.I240M = sum(NS4B.I240M) / n(),
NS5.K314R = sum(NS5.K314R) / n())
# Reshape the data to a long format
mutations_long <- mutations_by_year %>%
gather(key = "mutation", value = "frequency", -year)
# Calculate the total number of samples per year
total_samples_per_year <- filtered_metadata %>%
group_by(year) %>%
summarise(total = n())
# Calculate the count of samples by region and year
region_by_year <- filtered_metadata %>%
group_by(year, region_name) %>%
summarise(count = n())
# Join the two dataframes and calculate the frequency
region_by_year <- region_by_year %>%
left_join(total_samples_per_year, by = "year") %>%
mutate(frequency = count / total)
# Create the first plot for NS5.K314R and NS4A.A85T
plot1 <- ggplot(mutations_long %>% filter(mutation %in% c("NS5.K314R", "NS4A.A85T")),
aes(x = year, y = frequency, color = mutation)) +
geom_line() +
labs(x = "Year", y = "Frequency of mutations", color = "Mutation") +
theme_minimal()
# Create the second plot for NS2A.R188K and NS4B.I240M
plot2 <- ggplot(mutations_long %>% filter(mutation %in% c("NS2A.R188K", "NS4B.I240M")),
aes(x = year, y = frequency, color = mutation)) +
geom_line() +
labs(x = "Year", y = "Frequency of mutations", color = "Mutation") +
theme_minimal()
# Create the third plot for the sampling of regions over time
plot3 <- ggplot(region_by_year, aes(x = year, y = frequency, color = region_name)) +
geom_line() +
labs(x = "Year", y = "Frequency of samples", color = "Region") +
theme_minimal()
# Combine the plots
grid.arrange(plot1, plot2, plot3, ncol=1)