From afc022f54bcbeff3a8bfa5582b18824253bb08d5 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 22 Apr 2014 08:34:46 -0700 Subject: [PATCH 1/7] change sample temporary directory from names["sample"] to names["lane"] --- bcbio/rnaseq/oncofuse.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bcbio/rnaseq/oncofuse.py b/bcbio/rnaseq/oncofuse.py index 8375a1f51..24e499857 100644 --- a/bcbio/rnaseq/oncofuse.py +++ b/bcbio/rnaseq/oncofuse.py @@ -31,10 +31,10 @@ def run(data): cl = ["java"] cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"]) cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file] - with file_transaction(out_file) as tx_out_file: - with open(tx_out_file, "w") as out_handle: - cmd = " ".join(cl) - do.run(cmd, "oncofuse fusion detection", data) + #with file_transaction(out_file) as tx_out_file: + with open(out_file, "w") as out_handle: + cmd = " ".join(cl) + do.run(cmd, "oncofuse fusion detection", data) return out_file @@ -48,13 +48,13 @@ def _get_input_para(data): if aligner == 'tophat2': aligner = 'tophat' names = data["rgnames"] - align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["sample"], names["sample"]+"_%s" % aligner) + align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) if aligner in ['tophat', 'tophat2']: - align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["sample"], names["sample"]+"_%s" % aligner) + align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) return 'tophat', align_dir_parts, os.path.join(align_dir_parts, TOPHAT_FUSION_OUTFILE) if aligner in ['star']: - align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["sample"]) - return 'rnastar', align_dir_parts, os.path.join(align_dir_parts,names["sample"]+STAR_FUSION_OUTFILE) + align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"]) + return 'rnastar', 
align_dir_parts, os.path.join(align_dir_parts,names["lane"]+STAR_FUSION_OUTFILE) return None From daf983030b5505b4211de8fc364405c42ccaba99 Mon Sep 17 00:00:00 2001 From: Paul Date: Tue, 22 Apr 2014 09:45:30 -0700 Subject: [PATCH 2/7] fixed STAR junction input for oncofuse with GRCh37 disable tophat input for now only works for GRCh37/hg19 genome --- bcbio/rnaseq/oncofuse.py | 46 +++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/bcbio/rnaseq/oncofuse.py b/bcbio/rnaseq/oncofuse.py index 24e499857..3b1abb0ce 100644 --- a/bcbio/rnaseq/oncofuse.py +++ b/bcbio/rnaseq/oncofuse.py @@ -19,7 +19,11 @@ def run(data): #cmd line: java -Xmx1G -jar Oncofuse.jar input_file input_type tissue_type output_file config = data["config"] + genome_build = data.get("genome_build", "") input_type, input_dir, input_file = _get_input_para(data) + if genome_build == 'GRCh37': + input_file = _fix_junction_output(input_file) + print input_file out_file = os.path.join(input_dir, 'oncofuse_out.txt') oncofuse_jar = config_utils.get_jar("Oncofuse", config_utils.get_program("oncofuse", @@ -31,32 +35,54 @@ def run(data): cl = ["java"] cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"]) cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file] - #with file_transaction(out_file) as tx_out_file: with open(out_file, "w") as out_handle: cmd = " ".join(cl) do.run(cmd, "oncofuse fusion detection", data) return out_file + def _get_input_para(data): - TOPHAT_FUSION_OUTFILE = "fusions.out" + #TOPHAT_FUSION_OUTFILE = "fusions.out" STAR_FUSION_OUTFILE = 'Chimeric.out.junction' - + + config = data["config"] aligner = config["algorithm"].get("aligner") - if aligner == 'tophat2': - aligner = 'tophat' + #if aligner == 'tophat2': + # aligner = 'tophat' names = data["rgnames"] align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) - if aligner in ['tophat', 'tophat2']: - align_dir_parts = 
os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) - return 'tophat', align_dir_parts, os.path.join(align_dir_parts, TOPHAT_FUSION_OUTFILE) + #if aligner in ['tophat', 'tophat2']: + # align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) + # return 'tophat', align_dir_parts, os.path.join(align_dir_parts, TOPHAT_FUSION_OUTFILE) if aligner in ['star']: align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"]) return 'rnastar', align_dir_parts, os.path.join(align_dir_parts,names["lane"]+STAR_FUSION_OUTFILE) return None +def _fix_junction_output(chimeric_out_junction_file): + #for Chimeric.out.junction + out_file = chimeric_out_junction_file + '.hg19' + with open(out_file, "w") as out_handle: + with open(chimeric_out_junction_file, "r") as in_handle: + for line in in_handle: + parts = line.split("\t") + parts[0] = _h37tohg19(parts[0]) + parts[3] = _h37tohg19(parts[3]) + out_handle.write("\t".join(parts)) + return out_file + +def _h37tohg19(chromosome): + MAX_CHROMOSOMES = 23 + if chromosome in [str(x) for x in range(1, MAX_CHROMOSOMES)] + ["X", "Y"]: + new_chrom = "chr%s" % chromosome + elif chromosome == "MT": + new_chrom = "chrM" + else: + raise NotImplementedError(chromosome) + return new_chrom def _oncofuse_tissue_arg_from_config(data): @@ -70,14 +96,10 @@ def _oncofuse_tissue_arg_from_config(data): MES (mesenchymal origin) and AVG (average expression, if tissue source is unknown). 
""" - #potential check for tumor only analysis - #if data.get("metadata", {}).get("tissue") in ["tumor", "normal"]: - # pass SUPPORTED_TIISUE_TYPE = ["EPI", "HEM", "MES", "AVG"] if data.get("metadata", {}).get("tissue") in SUPPORTED_TIISUE_TYPE: return data.get("metadata", {}).get("tissue") else: - #may handle exception later return 'AVG' From e9b88668962fb4829b9dbbe6f459cca2df3b9e78 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 23 Apr 2014 07:18:15 -0700 Subject: [PATCH 3/7] resume support for tophat-oncofuse add checks to make sure the junction file exists before running oncofuse --- bcbio/rnaseq/oncofuse.py | 49 ++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/bcbio/rnaseq/oncofuse.py b/bcbio/rnaseq/oncofuse.py index 3b1abb0ce..00aae2a06 100644 --- a/bcbio/rnaseq/oncofuse.py +++ b/bcbio/rnaseq/oncofuse.py @@ -21,9 +21,18 @@ def run(data): config = data["config"] genome_build = data.get("genome_build", "") input_type, input_dir, input_file = _get_input_para(data) - if genome_build == 'GRCh37': - input_file = _fix_junction_output(input_file) - print input_file + if genome_build == 'GRCh37': #assume genome_build is hg19 otherwise + if config["algorithm"].get("aligner") in ['star']: + input_file = _fix_star_junction_output(input_file) + if config["algorithm"].get("aligner") in ['tophat', 'tophat2']: + input_file = _fix_tophat_junction_output(input_file) + + #handle cases when fusion file doesn't exist + if not os.path.exists(input_file): + return None + if os.stat(input_file).st_size == 0: + return None + out_file = os.path.join(input_dir, 'oncofuse_out.txt') oncofuse_jar = config_utils.get_jar("Oncofuse", config_utils.get_program("oncofuse", @@ -40,29 +49,42 @@ def run(data): do.run(cmd, "oncofuse fusion detection", data) return out_file - +def is_non_zero_file(fpath): + return True if os.path.isfile(fpath) and os.path.getsize(fpath) > 0 else False def _get_input_para(data): - #TOPHAT_FUSION_OUTFILE = 
"fusions.out" + TOPHAT_FUSION_OUTFILE = "fusions.out" STAR_FUSION_OUTFILE = 'Chimeric.out.junction' config = data["config"] aligner = config["algorithm"].get("aligner") - #if aligner == 'tophat2': - # aligner = 'tophat' + if aligner == 'tophat2': + aligner = 'tophat' names = data["rgnames"] align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) - #if aligner in ['tophat', 'tophat2']: - # align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) - # return 'tophat', align_dir_parts, os.path.join(align_dir_parts, TOPHAT_FUSION_OUTFILE) + if aligner in ['tophat', 'tophat2']: + align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"], names["sample"]+"_%s" % aligner) + return 'tophat', align_dir_parts, os.path.join(align_dir_parts, TOPHAT_FUSION_OUTFILE) if aligner in ['star']: align_dir_parts = os.path.join(data["dirs"]["work"], "align", names["lane"]) return 'rnastar', align_dir_parts, os.path.join(align_dir_parts,names["lane"]+STAR_FUSION_OUTFILE) return None -def _fix_junction_output(chimeric_out_junction_file): +def _fix_tophat_junction_output(chimeric_out_junction_file): + #for fusion.out + out_file = chimeric_out_junction_file + '.hg19' + with open(out_file, "w") as out_handle: + with open(chimeric_out_junction_file, "r") as in_handle: + for line in in_handle: + parts = line.split("\t") + left, right = parts[0].split("-") + parts[0] = "%s-%s" % (_h37tohg19(left), _h37tohg19(right)) + out_handle.write("\t".join(parts)) + return out_file + +def _fix_star_junction_output(chimeric_out_junction_file): #for Chimeric.out.junction out_file = chimeric_out_junction_file + '.hg19' with open(out_file, "w") as out_handle: @@ -100,7 +122,4 @@ def _oncofuse_tissue_arg_from_config(data): if data.get("metadata", {}).get("tissue") in SUPPORTED_TIISUE_TYPE: return data.get("metadata", {}).get("tissue") else: - return 'AVG' - - - + return 'AVG' \ No newline 
at end of file From cc6dcf313dd1bd2d5c9f4447591c0fd3c1b27184 Mon Sep 17 00:00:00 2001 From: Paul Date: Wed, 23 Apr 2014 08:27:32 -0700 Subject: [PATCH 4/7] upload oncofuse output to final directory --- bcbio/pipeline/rnaseq.py | 4 +++- bcbio/rnaseq/oncofuse.py | 8 +++++--- bcbio/upload/__init__.py | 8 ++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bcbio/pipeline/rnaseq.py b/bcbio/pipeline/rnaseq.py index 059792cd6..3c7779cfe 100644 --- a/bcbio/pipeline/rnaseq.py +++ b/bcbio/pipeline/rnaseq.py @@ -14,7 +14,9 @@ def generate_transcript_counts(data): """Generate counts per transcript from an alignment""" data["count_file"] = featureCounts.count(data) if get_in(data, ("config", "algorithm", "fusion_mode"), False): - data["oncofuse_file"] = oncofuse.run(data) + oncofuse_file = oncofuse.run(data) + if oncofuse_file: + data["oncofuse_file"] = oncofuse.run(data) return [[data]] diff --git a/bcbio/rnaseq/oncofuse.py b/bcbio/rnaseq/oncofuse.py index 00aae2a06..9bde98be2 100644 --- a/bcbio/rnaseq/oncofuse.py +++ b/bcbio/rnaseq/oncofuse.py @@ -28,12 +28,14 @@ def run(data): input_file = _fix_tophat_junction_output(input_file) #handle cases when fusion file doesn't exist - if not os.path.exists(input_file): - return None - if os.stat(input_file).st_size == 0: + if not file_exists(input_file): return None out_file = os.path.join(input_dir, 'oncofuse_out.txt') + + if file_exists(out_file): + return out_file + oncofuse_jar = config_utils.get_jar("Oncofuse", config_utils.get_program("oncofuse", config, "dir")) diff --git a/bcbio/upload/__init__.py b/bcbio/upload/__init__.py index c3342ca0f..0ee831396 100644 --- a/bcbio/upload/__init__.py +++ b/bcbio/upload/__init__.py @@ -47,6 +47,7 @@ def _get_files_rnaseq(sample): out = _maybe_add_alignment(algorithm, sample, out) out = _maybe_add_counts(algorithm, sample, out) out = _maybe_add_cufflinks(algorithm, sample, out) + out = _maybe_add_oncofuse(algorithm, sample, out) return _add_meta(out, sample) def 
_get_files_chipseq(sample): @@ -160,6 +161,13 @@ def _maybe_add_counts(algorithm, sample, out): "ext": "ready"}) return out +def _maybe_add_oncofuse(algorithm, sample, out): + if sample["oncofuse_file"] is not None: + out.append({"path": sample["oncofuse_file"], + "type": "oncofuse_outfile", + "ext": "ready"}) + return out + def _maybe_add_cufflinks(algorithm, sample, out): if "cufflinks_dir" in sample: out.append({"path": sample["cufflinks_dir"], From 548fe71cec001c13b9c5670829116b93e58ca0a6 Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 25 Apr 2014 14:12:06 -0700 Subject: [PATCH 5/7] catch case where no fusion transcripts are loaded --- bcbio/rnaseq/oncofuse.py | 5 ++++- tests/data/automated/run_info-fusion.yaml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bcbio/rnaseq/oncofuse.py b/bcbio/rnaseq/oncofuse.py index 9bde98be2..10d4f9320 100644 --- a/bcbio/rnaseq/oncofuse.py +++ b/bcbio/rnaseq/oncofuse.py @@ -48,7 +48,10 @@ def run(data): cl += ["-jar", oncofuse_jar, input_file, input_type, tissue_type, out_file] with open(out_file, "w") as out_handle: cmd = " ".join(cl) - do.run(cmd, "oncofuse fusion detection", data) + try: + do.run(cmd, "oncofuse fusion detection", data) + except: + return None return out_file def is_non_zero_file(fpath): diff --git a/tests/data/automated/run_info-fusion.yaml b/tests/data/automated/run_info-fusion.yaml index 5665367bb..21395d35a 100644 --- a/tests/data/automated/run_info-fusion.yaml +++ b/tests/data/automated/run_info-fusion.yaml @@ -3,7 +3,7 @@ upload: details: - analysis: RNA-seq algorithm: - aligner: tophat2 + aligner: star quality_format: illumina trim_reads: read_through adapters: [truseq, polya] From f028b6424191bba34493966be982e9514a04339b Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 25 Apr 2014 14:20:08 -0700 Subject: [PATCH 6/7] put tophat2 as the default aligner --- tests/data/automated/run_info-fusion.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/data/automated/run_info-fusion.yaml b/tests/data/automated/run_info-fusion.yaml index 21395d35a..5665367bb 100644 --- a/tests/data/automated/run_info-fusion.yaml +++ b/tests/data/automated/run_info-fusion.yaml @@ -3,7 +3,7 @@ upload: details: - analysis: RNA-seq algorithm: - aligner: star + aligner: tophat2 quality_format: illumina trim_reads: read_through adapters: [truseq, polya] From 3ae269ec09a86e57934c6b1f196f4ad27821362b Mon Sep 17 00:00:00 2001 From: Paul Date: Fri, 25 Apr 2014 20:07:30 -0700 Subject: [PATCH 7/7] temporary solution --- bcbio/rnaseq/oncofuse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bcbio/rnaseq/oncofuse.py b/bcbio/rnaseq/oncofuse.py index 10d4f9320..06ef6efa6 100644 --- a/bcbio/rnaseq/oncofuse.py +++ b/bcbio/rnaseq/oncofuse.py @@ -51,7 +51,7 @@ def run(data): try: do.run(cmd, "oncofuse fusion detection", data) except: - return None + return out_file return out_file def is_non_zero_file(fpath):