diff --git a/bcbio/bam/fastq.py b/bcbio/bam/fastq.py index ce7545d5c..b6261113f 100644 --- a/bcbio/bam/fastq.py +++ b/bcbio/bam/fastq.py @@ -220,6 +220,9 @@ def estimate_read_length(fastq_file, quality_format="fastq-sanger", nreads=1000) read = in_handle.next() average = len(read.seq) for _ in range(nreads): - average = (average + len(in_handle.next().seq)) / 2 + try: + average = (average + len(in_handle.next().seq)) / 2 + except StopIteration: + break in_handle.close() return average diff --git a/bcbio/ngsalign/star.py b/bcbio/ngsalign/star.py index d813ee84b..6c847a3ba 100644 --- a/bcbio/ngsalign/star.py +++ b/bcbio/ngsalign/star.py @@ -1,21 +1,19 @@ from os import path -import subprocess +import tempfile from bcbio.pipeline import config_utils -from bcbio.utils import safe_makedir, file_exists +from bcbio.utils import safe_makedir, file_exists, get_in from bcbio.provenance import do +from bcbio.bam import fastq CLEANUP_FILES = ["Aligned.out.sam", "Log.out", "Log.progress.out"] -fastq_file = "/Users/rory/tmp/data/test_fastq_1.fastq" -pair_file = "/Users/rory/tmp/data/test_fastq_2.fastq" -align_dir = "/Users/rory/tmp/star_test" -ref_file = "/Users/rory/tmp/GRCm38/STAR" -out_base = "/Users/rory/tmp/star_test/test" def align(fastq_file, pair_file, ref_file, out_base, align_dir, data, names=None): + genome_dir = create_genome(fastq_file, data) + config = data["config"] out_prefix = path.join(align_dir, out_base) out_file = out_prefix + "Aligned.out.sam" @@ -25,15 +23,50 @@ def align(fastq_file, pair_file, ref_file, out_base, align_dir, data, fastq = " ".join([fastq_file, pair_file]) num_cores = config["algorithm"].get("num_cores", 1) safe_makedir(align_dir) - cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} " + cmd = ("{star_path} --genomeDir {genome_dir} --readFilesIn {fastq} " "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax 10") run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file) do.run(cmd.format(**locals()), run_message, None) return out_file +def _get_quality_format(config): + qual_format = config["algorithm"].get("quality_format", None) + if qual_format.lower() == "illumina": + return "fastq-illumina" + elif qual_format.lower() == "solexa": + return "fastq-solexa" + else: + return "fastq-sanger" def remap_index_fn(ref_file): """Map sequence references to equivalent star indexes """ return path.join(path.dirname(path.dirname(ref_file)), "star") + +def create_genome(fastq_file, data): + """ + create a genome file with splicing information. this depends on read length + so must be made custom for each mapping + """ + config = data["config"] + star_path = config_utils.get_program("STAR", config) + tempdir = tempfile.mkdtemp("_star") + quality_format = _get_quality_format(data["config"]) + overhang = fastq.estimate_read_length(fastq_file, quality_format, 100000) - 1 + fasta_file = data["sam_ref"] + gtf_file = get_in(data, ("genome_resources", "rnaseq", "transcripts"), None) + num_cores = config["algorithm"].get("num_cores", 1) + + mem_limit = 1000000000 + + cmd = ("{star_path} --genomeDir {tempdir} --genomeFastaFiles {fasta_file} " + " --runThreadN {num_cores} --runMode genomeGenerate " + "--sjdbOverhang {overhang} --limitGenomeGenerateRAM {mem_limit}") + if gtf_file: + cmd + " --sjdbGTFfile {gtf_file}" + run_message = ("Generating a STAR index for %s using %s as transcripts with an " + "overhang of %d") % (fasta_file, gtf_file, overhang) + + do.run(cmd.format(**locals()), run_message, None) + return tempdir