Skip to content

Commit

Permalink
STAR support enabled.
Browse files Browse the repository at this point in the history
  • Loading branch information
roryk committed Dec 21, 2013
1 parent 092fe6c commit b9630e2
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 9 deletions.
5 changes: 4 additions & 1 deletion bcbio/bam/fastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,9 @@ def estimate_read_length(fastq_file, quality_format="fastq-sanger", nreads=1000)
read = in_handle.next()
average = len(read.seq)
for _ in range(nreads):
average = (average + len(in_handle.next().seq)) / 2
try:
average = (average + len(in_handle.next().seq)) / 2
except StopIteration:
break
in_handle.close()
return average
49 changes: 41 additions & 8 deletions bcbio/ngsalign/star.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
from os import path
import subprocess
import tempfile

from bcbio.pipeline import config_utils
from bcbio.utils import safe_makedir, file_exists
from bcbio.utils import safe_makedir, file_exists, get_in
from bcbio.provenance import do
from bcbio.bam import fastq

CLEANUP_FILES = ["Aligned.out.sam", "Log.out", "Log.progress.out"]

fastq_file = "/Users/rory/tmp/data/test_fastq_1.fastq"
pair_file = "/Users/rory/tmp/data/test_fastq_2.fastq"
align_dir = "/Users/rory/tmp/star_test"
ref_file = "/Users/rory/tmp/GRCm38/STAR"
out_base = "/Users/rory/tmp/star_test/test"


def align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
names=None):
genome_dir = create_genome(fastq_file, data)

config = data["config"]
out_prefix = path.join(align_dir, out_base)
out_file = out_prefix + "Aligned.out.sam"
Expand All @@ -25,15 +23,50 @@ def align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
fastq = " ".join([fastq_file, pair_file])
num_cores = config["algorithm"].get("num_cores", 1)
safe_makedir(align_dir)
cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
cmd = ("{star_path} --genomeDir {genome_dir} --readFilesIn {fastq} "
"--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
"--outReadsUnmapped Fastx --outFilterMultimapNmax 10")
run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
do.run(cmd.format(**locals()), run_message, None)
return out_file

def _get_quality_format(config):
qual_format = config["algorithm"].get("quality_format", None)
if qual_format.lower() == "illumina":
return "fastq-illumina"
elif qual_format.lower() == "solexa":
return "fastq-solexa"
else:
return "fastq-sanger"

def remap_index_fn(ref_file):
"""Map sequence references to equivalent star indexes
"""
return path.join(path.dirname(path.dirname(ref_file)), "star")

def create_genome(fastq_file, data):
"""
create a genome file with splicing information. this depends on read length
so must be made custom for each mapping
"""
config = data["config"]
star_path = config_utils.get_program("STAR", config)
tempdir = tempfile.mkdtemp("_star")
quality_format = _get_quality_format(data["config"])
overhang = fastq.estimate_read_length(fastq_file, quality_format, 100000) - 1
fasta_file = data["sam_ref"]
gtf_file = get_in(data, ("genome_resources", "rnaseq", "transcripts"), None)
num_cores = config["algorithm"].get("num_cores", 1)

mem_limit = 1000000000

cmd = ("{star_path} --genomeDir {tempdir} --genomeFastaFiles {fasta_file} "
" --runThreadN {num_cores} --runMode genomeGenerate "
"--sjdbOverhang {overhang} --limitGenomeGenerateRAM {mem_limit}")
if gtf_file:
cmd + " --sjdbGTFfile {gtf_file}"
run_message = ("Generating a STAR index for %s using %s as transcripts with an "
"overhang of %d") % (fasta_file, gtf_file, overhang)

do.run(cmd.format(**locals()), run_message, None)
return tempdir

0 comments on commit b9630e2

Please # to comment.