-
Notifications
You must be signed in to change notification settings - Fork 124
VSEARCH pipeline
Torbjørn Rognes edited this page Feb 15, 2021
·
9 revisions
This is an example of how you may use VSEARCH to process a 16S rRNA dataset and obtain OTUs.
First you'll find the main shell script to perform the processing. Below that you will find a perl script to perform extraction of filtered fasta sequences used by the main script.
#!/bin/sh
# This is an example of a pipeline using vsearch to process data in the
# Mothur 16S rRNA MiSeq SOP tutorial dataset to perform initial paired-end
# read merging, quality filtering, chimera removal and OTU clustering.
THREADS=1
REF=../gold.fasta
PERL=$(which perl)
VSEARCH=$(which vsearch)
MISEQSOPDATA="https://mothur.s3.us-east-2.amazonaws.com/wiki/miseqsopdata.zip"
GOLD="https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.gold.bacteria.zip"
date
echo Obtaining Mothur MiSeq SOP tutorial dataset
if [ ! -e miseqsopdata.zip ]; then
wget $MISEQSOPDATA
fi
echo Decompressing...
unzip -u -o miseqsopdata.zip
echo
echo Obtaining Gold reference database for chimera detection
if [ ! -e gold.fasta ]; then
if [ ! -e silva.gold.bacteria.zip ]; then
wget $GOLD
fi
echo Decompressing and reformatting...
unzip -p silva.gold.bacteria.zip silva.gold.align | \
sed -e "s/[.-]//g" > gold.fasta
fi
# Enter subdirectory
cd MiSeq_SOP
echo
echo Checking FASTQ format version for one file
$VSEARCH --fastq_chars $(ls -1 *.fastq | head -1)
# Process samples
for f in *_R1_*.fastq; do
r=$(sed -e "s/_R1_/_R2_/" <<< "$f")
s=$(cut -d_ -f1 <<< "$f")
echo
echo ====================================
echo Processing sample $s
echo ====================================
$VSEARCH --fastq_mergepairs $f \
--threads $THREADS \
--reverse $r \
--fastq_minovlen 200 \
--fastq_maxdiffs 15 \
--fastqout $s.merged.fastq \
--fastq_eeout
# Commands to demultiplex and remove tags and primers
# using e.g. cutadapt may be added here.
echo
echo Calculate quality statistics
$VSEARCH --fastq_eestats $s.merged.fastq \
--output $s.stats
echo
echo Quality filtering
$VSEARCH --fastq_filter $s.merged.fastq \
--fastq_maxee 0.5 \
--fastq_minlen 225 \
--fastq_maxlen 275 \
--fastq_maxns 0 \
--fastaout $s.filtered.fasta \
--fasta_width 0
echo
echo Dereplicate at sample level and relabel with sample_n
$VSEARCH --derep_fulllength $s.filtered.fasta \
--strand plus \
--output $s.derep.fasta \
--sizeout \
--uc $s.derep.uc \
--relabel $s. \
--fasta_width 0
done
echo Sum of unique sequences in each sample: $(cat *.derep.fasta | grep -c "^>")
# At this point there should be one fasta file for each sample
# It should be quality filtered and dereplicated.
echo
echo ====================================
echo Processing all samples together
echo ====================================
echo
echo Merge all samples
rm -f all.derep.fasta all.nonchimeras.derep.fasta
cat *.derep.fasta > all.fasta
echo
echo Dereplicate across samples and remove singletons
$VSEARCH --derep_fulllength all.fasta \
--minuniquesize 2 \
--sizein \
--sizeout \
--fasta_width 0 \
--uc all.derep.uc \
--output all.derep.fasta
echo Unique non-singleton sequences: $(grep -c "^>" all.derep.fasta)
echo
echo Precluster at 98% before chimera detection
$VSEARCH --cluster_size all.derep.fasta \
--threads $THREADS \
--id 0.98 \
--strand plus \
--sizein \
--sizeout \
--fasta_width 0 \
--uc all.preclustered.uc \
--centroids all.preclustered.fasta
echo Unique sequences after preclustering: $(grep -c "^>" all.preclustered.fasta)
echo
echo De novo chimera detection
$VSEARCH --uchime_denovo all.preclustered.fasta \
--sizein \
--sizeout \
--fasta_width 0 \
--nonchimeras all.denovo.nonchimeras.fasta \
echo Unique sequences after de novo chimera detection: $(grep -c "^>" all.denovo.nonchimeras.fasta)
echo
echo Reference chimera detection
$VSEARCH --uchime_ref all.denovo.nonchimeras.fasta \
--threads $THREADS \
--db $REF \
--sizein \
--sizeout \
--fasta_width 0 \
--nonchimeras all.ref.nonchimeras.fasta
echo Unique sequences after reference-based chimera detection: $(grep -c "^>" all.ref.nonchimeras.fasta)
echo
echo Extract all non-chimeric, non-singleton sequences, dereplicated
$PERL ../map.pl all.derep.fasta all.preclustered.uc all.ref.nonchimeras.fasta > all.nonchimeras.derep.fasta
echo Unique non-chimeric, non-singleton sequences: $(grep -c "^>" all.nonchimeras.derep.fasta)
echo
echo Extract all non-chimeric, non-singleton sequences in each sample
$PERL ../map.pl all.fasta all.derep.uc all.nonchimeras.derep.fasta > all.nonchimeras.fasta
echo Sum of unique non-chimeric, non-singleton sequences in each sample: $(grep -c "^>" all.nonchimeras.fasta)
echo
echo Cluster at 97% and relabel with OTU_n, generate OTU table
$VSEARCH --cluster_size all.nonchimeras.fasta \
--threads $THREADS \
--id 0.97 \
--strand plus \
--sizein \
--sizeout \
--fasta_width 0 \
--uc all.clustered.uc \
--relabel OTU_ \
--centroids all.otus.fasta \
--otutabout all.otutab.txt
echo
echo Number of OTUs: $(grep -c "^>" all.otus.fasta)
echo
echo Done
date
The perl script map.pl
can be found below.
#!/usr/bin/perl -w
use warnings;
use strict;
die "Three arguments needed: fasta1, uc, and fasta2\n" unless scalar @ARGV > 2;
my ($fasta1, $uc, $fasta2) = @ARGV;
# read fasta2 file with accepted sequences
my %accepted = ();
open(F2, $fasta2);
while (<F2>)
{
if (/^>([^ ;]+)/)
{
$accepted{$1} = 1;
}
}
close F2;
# read uc file with mapping
open(UC, $uc);
while (<UC>)
{
chomp;
my @col = split /\t/;
my $a;
if ($col[8] =~ /^([^ ;*]+)/)
{
$a = $1;
}
my $b;
if ($col[9] =~ /^([^ ;*]+)/)
{
$b = $1;
}
if ((defined $b) && ($accepted{$b}) && (defined $a))
{
$accepted{$a} = 1;
}
}
close UC;
# read original fasta1 file
my $ok = 0;
open(F1, $fasta1);
while (<F1>)
{
if (/^>([^ ;]+)/)
{
$ok = $accepted{$1};
}
print if $ok;
}
close F1;