-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBioinformaticsITS.R
545 lines (414 loc) · 21.7 KB
/
BioinformaticsITS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
#2017 ROOT and Soil fungal samples for Phragmites plots.
#First filter and infer sequence variants for each set: Root.Lane1,Root.Lane2, Soil
#For each set, run through fist part of script, CHANGE DIRECTORY folder
load("/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/Roots_SoilFungi2017EF2.RData")
save.image(file = "/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/Roots_SoilFungi2017EF2.RData")
#BiocManager::install(version = '3.10')
#BiocManager::install(("decontam"))
#if (!requireNamespace("BiocManager", quietly = TRUE))
#install.packages("BiocManager")
#BiocManager::install("dada2", version = "3.10")
packageVersion("dada2") #I'm using an old version b/c i didn't want to update r and everything
library(dada2)
library(ShortRead)
library(Biostrings)
library("phyloseq")
library("plyr")
library("ggplot2")
library(dplyr)
library(tidyr)
library(nlme)
library(vegan)
library(reshape)
library(stringr)
library(plotrix)
library(data.table)
library(decontam)
library(vegan)
#in unix add s to begining of file names
for i in *; do mv "$i" s"$i"; done;ls -l
#change path for each sequence set
path<- "/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/Soil"
path<- "/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/Roots1" #310 files
path<- "/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/Roots2" #148 files
list.files(path)
#separate forward and reverse
fnFs <- sort(list.files(path,pattern= "_L001_R1_001.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path,pattern = "_L001_R2_001.fastq.gz", full.names = TRUE))
#Identify Primers
FWD <- "CTTGGTCATTTAGAGGAAGTAA"
REV <- "GCTGCGTTCTTCATCGATGC"
#verify that we have the rich primers and correct orientation
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
#The presence of ambiguous bases (Ns) in the sequencing reads makes accurate mapping of short primer sequences difficult. Next we are going to "pre-filter" the sequences just to remove those with Ns, but perform no other filtering. I trimmed the ends of forward and reverse reads so that the quality score was roughly 30.
fnFs.filtN <- file.path(path,"filtN", basename(fnFs)) # Put N-filterd files in filtN/ subdirectory
fnRs.filtN <- file.path(path,"filtN", basename(fnRs))
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = TRUE, trimRight = c(50,90))#soil is trimRight = c(50,90); roots1 is trimRight = c(50,95), roots2 is trimRight = c(50,90)
#count the number of times the primers appear in the forward and reverse read, while considering all possible primer orientations. Identifying and counting the primers on one set of paired end FASTQ files is sufficient, assuming all the files were created using the same library preparation, so we'll just process the first sample.
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))
#remove primers
cutadapt <- "/Users/farrer/miniconda3/envs/qiime2-2018.2/bin/cutadapt"
list.files(cutadapt)
system2(cutadapt, args = "--version") # Run shell commands from R
#create output filenames for the cutadapt-ed files, and define the parameters we are going to give the cutadapt command
#critical parameters are the primers, and they need to be in the right orientation,
# i.e. the FWD primer should have been matching the forward-reads in its forward orientation, and the REV primer should have been matching the reverse-reads in its forward orientation.
path.cut <- file.path(path, "cutadapt")
list.files(path.cut)
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC)
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC)
# Run Cutadapt, 30 min
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, "-m", 20,# -n 2 required to remove FWD and REV from reads, -m means it deletes reads of less than 20 length
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs.filtN[i], fnRs.filtN[i])) # input files
}
#sanity check:count presence of primers in cut-adapeted files
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "_L001_R1_001.fastq.gz", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_L001_R2_001.fastq.gz", full.names = TRUE))
# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
plotQualityProfile(cutFs[1:8])+
geom_vline(xintercept=250)+
geom_hline(yintercept=30)
plotQualityProfile(cutRs[1:8])+
geom_vline(xintercept=210)+
geom_hline(yintercept=30)
#filter and trim
filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))
#Standard filtering parameters maxN=0 (already filtered), truncQ=2, rm.phix=TRUE, maxEE=TRUE. soil is out2, roots1 is out3, roots2 is out4
#enforce min length of 50 bp
#start 10pm, end 10:06
out4 <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(2, 2),
truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE, multithread = TRUE) # on windows, set multithread = FALSE
head(out4)
#write.csv(out2, "readsout.csv")
#inspect read quality profiles 1,82, 92
plotQualityProfile(filtFs[70])
plotQualityProfile(filtRs[70])
#learn Error rates, start 10:20, end 10:22
errF <- learnErrors(filtFs, multithread = TRUE)
errR <- learnErrors(filtRs, multithread = TRUE)
plotErrors(errF, nominalQ = TRUE)
#Dereplicate identical reads
derepFs <- derepFastq(filtFs, verbose = TRUE)
derepRs <- derepFastq(filtRs, verbose = TRUE)
# Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names
#sample inference, 4 min
#At this step the core sample inference algorithm is applied to the dereplicated data
dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
dadaRs <- dada(derepRs, err = errR, multithread = TRUE)
#merge paired reads, 2 min
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
#construct sequence table
#seqtab.Root.Lane1, seqtab.Root.Lane2, seqtab.Soil
seqtab.Soil <- makeSequenceTable(mergers)
#Track reads through pipeline
getN <- function(x) sum(getUniques(x))
tracksoil<-cbind(out2, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN))
trackroots1<-cbind(out3, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN))
trackroots2<-cbind(out4, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN))
#MERGE 3 sets################################################################################################
seqtab.all<-mergeSequenceTables(seqtab.Soil,seqtab.Roots1,seqtab.Roots2)
#remove chimeras, start 11:30, end 11:31
seqtab.nochim <- removeBimeraDenovo(seqtab.all, method="consensus", multithread=TRUE, verbose=TRUE)
write.csv(seqtab.nochim, "seqtab.nochim.csv")
rownames(seqtab.nochim)
#inspect distribution of sequence length
plot(table(nchar(getSequences(seqtab.nochim)))) #229 is the most common length
#normally do track here but not if you merged multiple lanes
#track <- cbind(out6, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
#assign taxonomy###############################################################################################
#Assign taxonomy, start 10:20am, end 1:50pm
unite.ref <- "/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAMarsh/Survey/Stats/Gradient/QIIME2/sh_general_release_dynamic_s_04.02.2020.fasta" #
taxa<-assignTaxonomy(seqtab.nochim, unite.ref, multithread = TRUE, minBoot=70, tryRC = TRUE,outputBootstraps=T)
taxaonly<-taxa$tax
write.csv(taxa,"/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/taxa.csv")
#create tax table for Phyloseq####################################################################################
sequences<-as.data.frame(rownames(taxaonly))
rownames(sequences)<-paste0("OTU",1:nrow(sequences))
sequences$OTU<-rownames(sequences)
colnames(sequences)<-c("sequence","OTUID")
OTUID<-as.data.frame(sequences$OTUID)
taxa1<-cbind(OTUID, taxaonly)
write.csv(taxa1,"/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/taxa_OTUID.csv")
OTUID<-sequences$OTUID
rownames(taxa1)<-OTUID
#use this for phyloseq tax table
taxa1<-taxa1[,2:8]
phy.tax<-tax_table(as.matrix(taxa1))
phy.tax
#Create sample data table###########################################################################################
sam<-as.data.frame(rownames(seqtab.nochim))
write.csv(sam, "/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/sam.csv")
#clean in excel and reupload
sam1<-read.csv("/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/sam.csv")
sam1<-sam1[,2:4]
# plot info
#soilchem<-read.csv("Soilchem.csv")
#join sam1 and rpi
#SD<-left_join(sam1,soilchem, by="Plot")
#colnames(SD)<-c("SampleID","Plot","SampleID.y","Site","SiteID" ,"Transect" , "Calcium" , "Copper" , "Magnesium" , "Phosphorus" ,"Potassium" ,"Sodium" , "Sulfur" , "Zinc" , "Carbon" , "Nitrogen" , "pH" , "plant.rich" ,"sal" )
#root sample species
#species<-read.csv("Species_Samples.csv");colnames(species)<-c("SampleID","Species")
#SD<-left_join(SD,species,by="SampleID")
#add column in excel fpr decntam and clean
#write.csv(SD,"SampleData.csv")
#SD<-read.csv("SampleData.csv")
SD<-sam1
rownames(SD)<-SD$SampleID
SD$Sample_or_Control<-ifelse(SD$SampleType=="control","Control","Sample")
SD$is.neg<-ifelse(SD$Sample_or_Control=="Control",TRUE,FALSE)
#load in dat17, no i'll add it below on just the soil data
phy.SD<-sample_data(SD)
#Create official ASV Table####################################################################################
OTU.Table<-as.data.frame(seqtab.nochim)
colnames(OTU.Table)<-OTUID
#sam<-read.csv("sam.csv")
SampleID<-sam1$SampleID
#check
#cbind(rownames(OTU.Table),as.character(SampleID))
rownames(OTU.Table)<-SampleID
#OTU.Table$SampleID_Original<-rownames(OTU.Table)
#ASV.Table<-left_join(OTU.Table,sam,by="SampleID_Original")
#rownames(ASV.Table)<-ASV.Table$SampleID
#ASV.Table<-ASV.Table[,1:16456]
phy.ASV<-otu_table(OTU.Table,taxa_are_rows = F)
#create phyloseq object
phy<-merge_phyloseq(phy.ASV,phy.SD,phy.tax)
#### Decontam ####
datITS<-phy
print(as.data.frame(unique(tax_table(datITS)[,"Kingdom"])),row.names=F)
####Filter and remove contaminant sequences with decontam()####
#note!!!! if you filter with subset_taxa and a !=, it will NOT return any rows that are NA, so you always have to do an "or NA" in the statement
#Filter samples with really low numbers of reads (<1000), not sure if this is necessary for the prevelance method but it is kind of weird that some of my actual samples had lower number of reads than the negative controls. This could be because the pcr failed and susannah put them in anyway "just in case" but if the pcr failed and all we got was contamination, then that skews the ability to see that they are contaminated.
sort(sample_sums(datITS))
datITSS<-datITS%>%
subset_samples(sample_sums(datITS)>1000)
datITSS
sample_data(datITSS)
dfITS <- as.data.frame(sample_data(datITSS)) # Put sample_data into a ggplot-friendly data.frame
dfITS$LibrarySize <- sample_sums(datITSS)
dfITS <- dfITS[order(dfITS$LibrarySize),]
dfITS$Index <- seq(nrow(dfITS))
ggplot(data=dfITS, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + geom_point()
contamdf.prevITS <- isContaminant(datITSS, method="prevalence", neg="is.neg")
table(contamdf.prevITS$contaminant)
contamdf.prevITS5 <- isContaminant(datITSS, method="prevalence", neg="is.neg",threshold = .2)
table(contamdf.prevITS5$contaminant)
#threshold .1: 51/53 contaminants, threshold .2: 65/ , threshold .5: 87/88 (taking out low # read samples/not taking them out) - this is an old comment from using carolyn's phy object
datITSS.pa <- transform_sample_counts(datITSS, function(abund) 1*(abund>0))
datITSS.pa.neg <- prune_samples(sample_data(datITSS.pa)$is.neg == TRUE, datITSS.pa)
datITSS.pa.pos <- prune_samples(sample_data(datITSS.pa)$is.neg == FALSE, datITSS.pa)
# Make data.frame of prevalence in positive and negative samples
#threshold .1
datITSSdf.pa <- data.frame(pa.pos=taxa_sums(datITSS.pa.pos), pa.neg=taxa_sums(datITSS.pa.neg),contaminant=contamdf.prevITS$contaminant)
ggplot(data=datITSSdf.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() +
xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")
#threshold .2
datITSSdf.pa <- data.frame(pa.pos=taxa_sums(datITSS.pa.pos), pa.neg=taxa_sums(datITSS.pa.neg),contaminant=contamdf.prevITS5$contaminant)
ggplot(data=datITSSdf.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() +
xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")
#I think I will use a threshold = 0.2. It captures the "contaminants" that were in all three negative controls but doesn't overdo it.
#take out contaminants and filter negative controls, there were 78 contaminants
datITSS2 <- prune_taxa(!contamdf.prevITS5$contaminant, datITSS)
datITSS3 <-datITSS2 %>%
subset_samples(is.neg==FALSE)%>%
filter_taxa(function(x) sum(x) > (0), prune=T) #odd 84 taxa additionally are removed after removing the 78 contaminants. these must be taxa only found in the negative controls (which for some reason are not called contaminants) (??)
min(sample_sums(datITSS4))
sort(sample_sums(datITSS4))
#Filter out root samples and samples with low sampling depth
datITSS4 <-datITSS3 %>%
subset_samples(SampleType=="soil")%>%
subset_samples(sample_sums(datITSS3)>5000) %>%
filter_taxa(function(x) sum(x) > (0), prune=T)
####### Rarefy to 5984 ######
#This takes out samples: 155, 118, 134, 135, 88, 99
datITSS4
datITSS5<-datITSS4%>%
rarefy_even_depth(sample.size=min(sample_sums(datITSS4)),rngseed=10,replace=F)%>%
transform_sample_counts(function(x) x/sum(x) )
datITSS5c<-datITSS4%>%
rarefy_even_depth(sample.size=min(sample_sums(datITSS4)),rngseed=10,replace=F)
#2941 OTUs were removed because they are no longer present in any sample after random subsampling
#Adding to the sample data with dat from dat17
#need to open and run LoadingPlantSoilData.R script
tempf<-sample_data(datITSS5c)
tempf$Plot<-as.numeric(as.character(tempf$Plot))
tempd<-dat17[,c(1:4,52:77)]
tempb<-left_join(tempf,tempd,by="Plot")#warning is ok
tempb2<-cbind(Plottest=tempb$Plot,tempb[,6:34])
sample_data(datITSS5c)[,6:35]<-tempb2
sample_data(datITSS5c)[1:5,1:35]
#checks
cbind(as.character(sample_data(datITSS5c)$SampleID),as.numeric(as.character(sample_data(datITSS5c)$Plot)),sample_data(datITSS5c)$Plottest)
sample_data(datITSS5c)$Plottest<-NULL
sample_data(datITSS5)
sample_data(datITSS5)[,6:35]<-tempb2
cbind(as.numeric(as.character(sample_data(datITSS5)$Plot)),sample_data(datITSS5)$Plottest)
sample_data(datITSS5)$Plottest<-NULL
#Make OTU tables, this takes a while, start 10:47-10:52
datITSS5otu<-cbind(sample_data(datITSS5),otu_table(datITSS5))
datITSS5cotu<-cbind(sample_data(datITSS5c),otu_table(datITSS5c))
#Richness
richITS<-estimate_richness(datITSS5c, split = TRUE, measures = c("Observed", "Shannon","Chao1","Simpson","InvSimpson"))
colnames(richITS)<-c("RichnessITS","Chao1ITS","se.chao1ITS","ShannonITS","SimpsonITS","InvSimpsonITS")
richITS$SampleID<-rownames(richITS)
richITS2<-separate(richITS,SampleID,c(NA,"Plot"),"s")
richITS2$Plot<-as.numeric(richITS2$Plot)
#richITS2<-richITS%>%
# left_join(data.frame(sample_data(datITS2rcsoil)))
#sample_data(datITSS5c)[,]
dat17b<-dat17%>%
full_join(richITS2)
dat17<-dat17b
###### FUNguild ######
#https://github.com/UMNFuN/FUNGuild
temp<-data.frame(cbind(tax_table(datITSS5c),1))
temp$V8<-NULL
temp2<-temp%>%
unite("Kingdom_Species",sep=";",remove=T)
head(temp2)
colnames(temp2)<-"taxonomy"
cbind(row.names(temp2),row.names(d)
datITSS5cfunguild<-cbind(t(otu_table(datITSS5c)),temp2)
datITSS5cfunguild[1:5,161:163]
write.csv(datITSS5cfunguild,"/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/datITSS5cfunguild.csv")
#then open the file and write "OTU ID" in the first cell
#then open terminal and navigate to folder and run
python Guilds_v1.1.py -otu datITSS5cfunguild.csv -m -u
#then open datITSS5cfunguild.guilds.txt and delete OTU ID in the first cell
funguild<-read.delim2("/Users/farrer/Dropbox/EmilyComputerBackup/Documents/LAmarsh/Survey/Stats/Gradient/QIIME2/Fungi/analysisSept2020/datITSS5cfunguild.guilds.txt",header=T,row.names = 1)
head(funguild)
funguild[1:5,170:172]
#rarefied to 5984
colSums(funguild[,2:5])
funguild2<-funguild%>%
filter(Trophic.Mode%in%c("Symbiotroph","Pathotroph","Saprotroph"))%>%
arrange(Trophic.Mode)%>%
dplyr::select(s1:s98,Trophic.Mode)%>%
group_by(Trophic.Mode)%>%
summarise_all(list(sum=sum))
funguild3<-data.frame(funguild2)
row.names(funguild3)<-funguild3$Trophic.Mode;funguild3$Trophic.Mode<-NULL
funguild4<-data.frame(t(funguild3))
head(funguild4)
funguild4$Plottemp<-row.names(funguild4)
funguild5<-funguild4%>%
separate(Plottemp,into=c("Plot",NA))
funguild5$Plot<-as.numeric(sub("s","",funguild5$Plot))
funguild5[,1:3]<-funguild5[,1:3]/5984*100
dat17b<-dat17%>%
full_join(funguild5)
dat17<-dat17b
#To get "Plant Pathogen" or "Endophyte" or "Arbuscular Mycorrhizal"
funguild2<-funguild%>%
filter(Guild%in%c("Plant Pathogen","Endophyte","Arbuscular Mycorrhizal"))%>%
arrange(Guild)%>%
dplyr::select(s1:s98,Guild)%>%
group_by(Guild)%>%
summarise_all(list(sum=sum))
funguild3<-data.frame(funguild2)
row.names(funguild3)<-funguild3$Guild;funguild3$Guild<-NULL
funguild4<-data.frame(t(funguild3))
head(funguild4)
funguild4$Plottemp<-row.names(funguild4)
funguild5<-funguild4%>%
separate(Plottemp,into=c("Plot",NA))
funguild5$Plot<-as.numeric(sub("s","",funguild5$Plot))
funguild5[,1:3]<-funguild5[,1:3]/5984*100
dat17b<-dat17%>%
full_join(funguild5)
dat17<-dat17b
#Note: any taxon classified as one thing (i.e. plant pathogen with nothing else) always gets at least a probable, no possibles.
#Trying taking out the "probables" for plant pathogens so it only includes highly probable. if i do this, there are only 7 non zero plots for plant pathogens, so that's not reasonble
funguild2<-funguild%>%
filter(Guild%in%c("Plant Pathogen","Endophyte","Arbuscular Mycorrhizal"))%>%
filter(Confidence.Ranking=="Highly Probable")%>%
arrange(Guild)%>%
dplyr::select(s1:s98,Guild)%>%
group_by(Guild)%>%
summarise_all(list(sum=sum))
funguild3<-data.frame(funguild2)
row.names(funguild3)<-funguild3$Guild;funguild3$Guild<-NULL
funguild4<-data.frame(t(funguild3))
head(funguild4)
funguild4$Plottemp<-row.names(funguild4)
funguild5<-funguild4%>%
separate(Plottemp,into=c("Plot",NA))
funguild5$Plot<-as.numeric(sub("s","",funguild5$Plot))
funguild5[,1:3]<-funguild5[,1:3]/5984*100
colnames(funguild5)[1:3]<-c("Arbuscular.Mycorrhizalhighlyprobable","Endophytehighlyprobable","Plant.Pathogenhighlyprobable")
dat17b<-dat17%>%
full_join(funguild5)
dat17<-dat17b
#Getting data for ectomycorrhizal
funguild2<-funguild%>%
filter(Guild%in%c("Ectomycorrhizal"))%>%
arrange(Guild)%>%
dplyr::select(s1:s98,Guild)%>%
group_by(Guild)%>%
summarise_all(list(sum=sum))
funguild3<-data.frame(funguild2)
row.names(funguild3)<-funguild3$Guild;funguild3$Guild<-NULL
funguild4<-data.frame(t(funguild3))
head(funguild4)
funguild4$Plottemp<-row.names(funguild4)
funguild5<-funguild4%>%
separate(Plottemp,into=c("Plot",NA))
funguild5$Plot<-as.numeric(sub("s","",funguild5$Plot))
funguild5[,1]<-funguild5[,1]/5984*100
dat17b<-dat17%>%
full_join(funguild5)
dat17<-dat17b
dat17$PlantMutualist<-dat17$Ectomycorrhizal+dat17$Arbuscular.Mycorrhizal+dat17$Endophyte
#Try any description that includes "plant pathogen"
ind<-grep("Plant Pathogen",funguild$Guild)
funguild2<-funguild[ind,]
funguild3<-colSums(funguild2[,1:162])
funguild4<-data.frame(t(data.frame(t(funguild3))))
head(funguild4)
funguild4$Plottemp<-row.names(funguild4)
colnames(funguild4)[1]<-"Plant.Pathogenanywhere"
funguild5<-funguild4%>%
separate(Plottemp,into=c("Plot",NA))
funguild5$Plot<-as.numeric(sub("s","",funguild5$Plot))
funguild5[,1]<-funguild5[,1]/5984*100
dat17b<-dat17%>%
full_join(funguild5)
dat17<-dat17b