Skip to content

Commit

Permalink
resolves #24
Browse files Browse the repository at this point in the history
- added support for user supplied bridge adapter sequence in dnase simulations
- version bump
  • Loading branch information
cerebis committed Feb 6, 2024
1 parent 0062590 commit d4475da
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 21 deletions.
2 changes: 1 addition & 1 deletion sim3C/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.7'
__version__ = '0.7.1'

__copyright__ = """Copyright (C) 2019 Matthew Z DeMaere
This is free software. You may redistribute copies of it under the terms of
Expand Down
14 changes: 11 additions & 3 deletions sim3C/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def main():
help='Library preparation method [hic]')
parser.add_argument('-e', '--enzyme', metavar='NEB_NAME', action='append',
help='Case-sensitive NEB enzyme name. Use multiple times for multiple enzymes')

parser.add_argument('-n', '--num-pairs', metavar='INT', type=int, required=True,
help='Number of read-pairs generate')
parser.add_argument('-l', '--read-length', metavar='INT', type=int, required=True,
Expand All @@ -111,10 +110,13 @@ def main():
parser.add_argument('--insert-max', metavar='INT', type=int, default=None,
help='Maximum allowed insert size [None]')

parser.add_argument('--bridge-adapter',
help='Bridge adapter sequence (for dnase mode)')

parser.add_argument('--linear', default=False, action='store_true',
help='Treat all replicons as linear molecules')
parser.add_argument('--efficiency', metavar='FLOAT', type=float,
help='HiC/Meta3C efficiency factor [hic: 0.5 or meta3c: 0.02]')
help='HiC/Meta3C efficiency factor [hic, dnase: 0.5 or meta3c: 0.02]')
parser.add_argument('--anti-rate', metavar='FLOAT', type=float, default=0.2,
help='Rate of anti-diagonal fragments [0.2]')
parser.add_argument('--trans-rate', metavar='FLOAT', type=float, default=0.1,
Expand Down Expand Up @@ -167,6 +169,12 @@ def main():
elif args.method != 'dnase':
raise Sim3CException('At least one enzyme must be specified')

if not args.bridge_adapter:
if args.method == 'dnase':
logger.warning('No bridge adapter sequence was specified for the dnase method')
elif args.method != 'dnase':
raise Sim3CException('Bridge adapter sequence is only valid for the dnase method')

#
# Prepare community abundance profile, either procedurally or from a file
#
Expand Down Expand Up @@ -231,7 +239,7 @@ def main():
'anti_rate', 'spurious_rate', 'trans_rate',
'efficiency',
'ins_rate', 'del_rate',
'simple_reads', 'linear', 'convert_symbols', 'profile_format']
'simple_reads', 'linear', 'convert_symbols', 'profile_format', 'bridge_adapter']

# extract these parameters from the parsed arguments
kw_args = {k: v for k, v in vars(args).items() if k in kw_names}
Expand Down
59 changes: 42 additions & 17 deletions sim3C/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import logging
import re

import tqdm

from Bio import SeqIO
Expand Down Expand Up @@ -45,7 +47,7 @@ class ReadGenerator(object):
"""
def __init__(self, method, enzymes, prefix='SIM3C', simple=False, machine_profile='EmpMiSeq250',
read_length=250, ins_rate=9.e-5, del_rate=1.1e-4,
insert_mean=500, insert_sd=100, insert_min=100, insert_max=None):
insert_mean=500, insert_sd=100, insert_min=100, insert_max=None, bridge=None):
"""
Initialise a read generator.
:param method: The two library preparation methods are: 'meta3c' or 'hic'.
Expand All @@ -60,6 +62,7 @@ def __init__(self, method, enzymes, prefix='SIM3C', simple=False, machine_profil
:param insert_sd: standard deviation of insert length (must be < mean)
:param insert_min: minimum allowable insert length (must be > 50)
:param insert_max: maximum allowable insert length (must be > mean)
:param bridge: an arbitrary sequence bridging a and b
"""

self.method = method
Expand Down Expand Up @@ -88,6 +91,8 @@ def __init__(self, method, enzymes, prefix='SIM3C', simple=False, machine_profil
self.too_short = 0
self.too_long = 0

self.bridge = bridge

# initialise ART read simulator
self.art = Art(read_length, EmpDist.create(machine_profile), ins_rate, del_rate)

Expand All @@ -101,7 +106,7 @@ def __init__(self, method, enzymes, prefix='SIM3C', simple=False, machine_profil
method_switcher = {
'hic': self._part_joiner_sitedup,
'meta3c': self._part_joiner_simple,
'dnase': self._part_joiner_simple
'dnase': self._part_joiner_bridged
}
self._part_joiner = method_switcher[self.method.lower()]
except Exception:
Expand All @@ -121,32 +126,44 @@ def get_report(self):
return msg

@staticmethod
def _part_joiner_simple(a, b, *args):
def _part_joiner_simple(frag_a, frag_b, *args):
"""
Join two fragments end to end without any site duplication. The new
fragment will begin at a_0 and end at b_max. Used when no end fills are
applied to the library prep (Eg. meta3C)
Altogether [a_0..a_max] + [b_0..b_max]
:param a: fragment a
:param b: fragment b
:param frag_a: fragment a
:param frag_b: fragment b
:param args: unused
:return: a + b
:return: frag_a + frag_b
"""
return a + b
return frag_a + frag_b

def _part_joiner_sitedup(self, a, b, enz1, enz2):
def _part_joiner_bridged(self, frag_a, frag_b, *args):
"""
Join two fragments end to end with an intervening bridge sequence. The new
fragment will begin at a_0 and end at b_max. Used optionally in dnase
library preps.
Altogether [a_0..a_max] + [bridge] + [b_0..b_max]
:param frag_a: fragment a
:param frag_b: fragment b
:param args: unused
:return: frag_a + bridge + frag_b
"""
return frag_a + self.bridge + frag_b

def _part_joiner_sitedup(self, frag_a, frag_b, *enzymes):
"""
Join two fragments end to end where the cut-site is duplicated. The new
fragment will begin at a_0 end at b_max. Used when end-fill is applied
before further steps in library preparation (Eg. HiC)
Altogether [a_0..a_max] + [cs_0..cs_max] + [b_0..b_max]
:param a: fragment a
:param b: fragment b
:param enz1: the 5p enzyme
:param enz2: the 3p enzyme
:return: a + b
:param frag_a: fragment a
:param frag_b: fragment b
:param enzymes: tuple of (enzyme_a, enzyme_b) used in digestion
:return: frag_a + junc_ab + frag_b
"""
return a + self.ligation_info[(enz1, enz2)].junction + b
return frag_a + self.ligation_info[enzymes].junction + frag_b

def draw_insert(self):
"""
Expand Down Expand Up @@ -259,7 +276,7 @@ def __init__(self, profile_filename, seq_filename, enzyme_names, number_pairs,
efficiency=0.02,
ins_rate=9.e-5, del_rate=1.1e-4,
create_cids=True, simple_reads=True, linear=False, convert_symbols=False,
profile_format='table'):
profile_format='table', bridge_adapter=None):
"""
Initialise a SequencingStrategy.
Expand All @@ -286,6 +303,7 @@ def __init__(self, profile_filename, seq_filename, enzyme_names, number_pairs,
:param linear: treat replicons as linear
:param convert_symbols: if true, unsupported (by Art) symbols in the input sequences are converted to N
:param profile_format: the format of the abundance profile (Either: table or toml)
:param bridge_adapter: the sequence of the bridge adapter used in dnase library prep
"""
self.profile_filename = profile_filename
self.profile_format = profile_format
Expand All @@ -298,6 +316,13 @@ def __init__(self, profile_filename, seq_filename, enzyme_names, number_pairs,
self.insert_max = insert_max
self.efficiency = efficiency

# validate and standardise user-input bridge adapter sequence
if bridge_adapter is not None:
bridge_adapter = bridge_adapter.upper()
if re.fullmatch(r'[ACGT]+', bridge_adapter) is None:
raise Sim3CException('Bridge adapter sequence must be composed of A, C, G, or T')
self.bridge_adapter = bridge_adapter

self.enzymes = None
if enzyme_names is not None:
self.enzymes = [get_enzyme_instance(enz) for enz in enzyme_names if enz is not None]
Expand Down Expand Up @@ -327,12 +352,12 @@ def __init__(self, profile_filename, seq_filename, enzyme_names, number_pairs,
else:
raise Sim3CException(f'unknown profile format ({profile_format}) Either: "table" or "toml"]')

# preparate the read simulator for output
# prepare the read simulator for output
self.read_generator = ReadGenerator(method, self.enzymes,
prefix=prefix, simple=simple_reads, machine_profile=machine_profile,
read_length=read_length, insert_mean=insert_mean,
insert_sd=insert_sd, insert_min=insert_min, insert_max=insert_max,
del_rate=del_rate, ins_rate=ins_rate)
del_rate=del_rate, ins_rate=ins_rate, bridge=self.bridge_adapter)

# the method determines the strategy governing the creation of
# ligation products and WGS reads.
Expand Down

0 comments on commit d4475da

Please # to comment.