Skip to content

Commit

Permalink
Fixes minor regression in kdb view and graph.open/KDBGReader.__init__
Browse files Browse the repository at this point in the history
  • Loading branch information
MatthewRalston committed Mar 28, 2024
1 parent a109718 commit fce3a3d
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 75 deletions.
195 changes: 127 additions & 68 deletions kmerdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,26 +906,40 @@ def header(arguments):
Another end-user function that takes an argparse Namespace object.
This function just reads the metadata header, can print in json.
"""
from kmerdb import fileutil, config, util
from kmerdb import fileutil, config, util, graph

if os.path.splitext(arguments.kdb)[-1] != ".kdb":
raise IOError("Viewable .kdb filepath does not end in '.kdb'")
sfx = os.path.splitext(arguments.kdb)[-1]
metadata = None

if sfx != ".kdb" and sfx != ".kdbg": # A filepath with invalid suffix
raise IOError("Viewable .kdb(g) filepath does not end in '.kdb' or '.kdbg'")
elif not os.path.exists(arguments.kdb):
raise IOError("Viewable .kdb(g) filepath '{0}' does not exist on the filesystem".format(arguments.kdb_in))

with fileutil.open(arguments.kdb, mode='r') as kdb_in:
if kdb_in.metadata["version"] != config.VERSION:
if sfx == ".kdb":
kdb = fileutil.open(arguments.kdb, mode='r', sort=arguments.re_sort, slurp=True)
metadata = kdb.metadata
kmer_ids_dtype = metadata["kmer_ids_dtype"]
N = 4**metadata["k"]
if metadata["version"] != config.VERSION:
logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
assert kdb.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata"
assert kdb.counts.size == N, "view | read counts size did not match N from the header metadata"
assert kdb.frequencies.size == N, "view | read frequencies size did not match N from the header metadata"
metadata = kdb.metadata
elif sfx == ".kdbg":
kdb = graph.open(arguments.kdb, mode='r')
if kdb.metadata["version"] != config.VERSION:
logger.warning("KDB file version is out of date, may be incompatible with current fileutil.KDBReader class")
N = 4**kdb_in.metadata["k"]
N = 4**kdb.metadata["k"]
metadata = kdb.metadata

assert kdb_in.kmer_ids.size == N, "view | read kmer_ids size did not match N from the header metadata"
assert kdb_in.counts.size == N, "view | read counts size did not match N from the header metadata"
assert kdb_in.frequencies.size == N, "view | read frequencies size did not match N from the header metadata"

if arguments.json:
print(dict(kdb_in.metadata))
else:
yaml.add_representer(OrderedDict, util.represent_ordereddict)
print(yaml.dump(kdb_in.metadata))
print(config.header_delimiter)
if arguments.json:
print(dict(kdb.metadata))
else:
yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict)
print(yaml.dump(metadata))
print(config.header_delimiter)

def view(arguments):
"""
Expand Down Expand Up @@ -1065,12 +1079,91 @@ def get_header(line, header):
#kdb_out._handle.close()
sys.stderr.write(config.DONE)
elif sfx == ".kdbg":
with graph.open(arguments.kdb_in, mode='r', slurp=True) as kdbg_in:
metadata = kdbg_in.metadata
kdbg_in = graph.open(arguments.kdb_in, mode='r', slurp=True)
metadata = kdbg_in.metadata

n1_dtype = metadata["n1_dtype"]
n2_dtype = metadata["n2_dtype"]
weights_dtype = metadata["weights_dtype"]

print(metadata)
sys.exit(1)

if metadata["version"] != config.VERSION:
logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
if arguments.kdb_out is None or (arguments.kdb_out == "/dev/stdout" or arguments.kdb_out == "STDOUT"): # Write to stdout, uncompressed
if arguments.header:
yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict)
print(yaml.dump(metadata, sort_keys=False))
print(config.header_delimiter)
logger.info("Reading from file...")
logger.debug("I cut off the json-formatted unstructured column for the main view.")

for i in range(len(kdbg_in.n1)):
n1 = kdbg_in.n1[i]
n2 = kdbg_in.n2[i]
w = kdbg_in.weights[i]
logger.debug("The row in the file should follow this order:")
logger.debug("The first is an implicit row-index. The second and third are k-mer ids, then edge weight")
logger.debug("{0}\t{1}\t{2}\t{3}".format(i, n1, n2, w))
logger.debug("{0} line:".format(i))
logger.debug("=== = = = ======= = = = = = = |")
print("{0}\t{1}\t{2}\t{3}".format(i, n1, n2, w))
# I don't think anyone cares about the graph representation.
# I don't think this actually matters because I can't figure out what the next data structure is.
# Is it a Cypher query and creation node set?
# I need to demonstrate a capacity for graph based learning.
# (:-|X) The dread pirate roberts got me.
# :)
if arguments.kdb_out is not None and arguments.compress: # Can't yet write compressed to stdout
logger.error("Can't write kdb to stdout! We need to use a Bio.bgzf filehandle.")
sys.exit(1)
elif arguments.kdb_out is not None and type(arguments.kdb_out) is not str:
raise ValueError("Cannot write a file to an argument that isn't a string")
elif arguments.kdb_out is not None and os.path.exists(arguments.kdb_out):
logger.warning("Overwriting '{0}'...".format(arguments.kdb_out))
elif arguments.kdb_out is not None and not os.path.exists(arguments.kdb_out):
logger.debug("Creating '{0}'...".format(arguments.kdb_out))
if arguments.kdb_out is not None:
with graph.open(arguments.kdb_out, metadata=metadata, mode='w') as kdb_out:
try:
for i in range(len(kdbg.n1)):
kdb_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, kdbg_in.n1[i], kdbg_in.n2[i], kdbg_in.w[i]))

except StopIteration as e:
logger.error(e)
raise e
finally:
#kdb_out._write_block(kdb_out._buffer)
#kdb_out._handle.flush()
#kdb_out._handle.close()
sys.stderr.write(config.DONE)


def assembly(arguments):
from kmerdb import graph








assert type(arguments.kdbg) is str, "kdbg must be a str"
sfx = os.path.splitext(arguments.kdbg)[-1]





if sfx != ".kdb" and sfx != ".kdbg": # A filepath with invalid suffix
raise IOError("Viewable .kdb(g) filepath does not end in '.kdb' or '.kdbg'")
elif not os.path.exists(arguments.kdbg):
raise IOError("Viewable .kdb(g) filepath '{0}' does not exist on the filesystem".format(arguments.kdbg))
elif sfx == ".kdbg":
with graph.open(arguments.kdbg, mode='r', slurp=True) as kdbg_in:
metadata = kdbg_in.metadata

N = len(kdbg_in.n1)
n1_dtype = metadata["n1_dtype"]
n2_dtype = metadata["n2_dtype"]
weights_dtype = metadata["weights_dtype"]
Expand All @@ -1082,68 +1175,26 @@ def get_header(line, header):
logger.warning("KDB version is out of date, may be incompatible with current KDBReader class")
if arguments.kdb_out is None or (arguments.kdb_out == "/dev/stdout" or arguments.kdb_out == "STDOUT"): # Write to stdout, uncompressed
if arguments.header:
yaml.add_representer(OrderedDict, util.represent_ordereddict)
yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict)
print(yaml.dump(metadata, sort_keys=False))
print(config.header_delimiter)
logger.info("Reading from file...")
logger.debug("I cut off the json-formatted unstructured column for the main view.")
try:


"""
FIXME!

nodes = list(set(kdbg_in.n1 + kdbg_in.n2))
edges = list(zip(kdbg_in.n1, kdbg_in.n2, list(map(lambda w: {'weight': w} , kdbg_in.weights))))

"""
graph.create_graph(nodes, edges)



for i in range(len(kdbg_in.n1)):
n1 = kdbg_in.n1[i]
n2 = kdbg_in.n2[i]
w = kdbg_in.weights[i]
logger.debug("The row in the file should follow this order:")
logger.debug("The first is an implicit row-index. The second and third are k-mer ids, then edge weight")
logger.debug("{0}\t{1}\t{2}\t{3}".format(i, n1, n2, w))
logger.debug("{0} line:".format(i))
logger.debug("=== = = = ======= = = = = = = |")
print("{0}\t{1}\t{2}\t{3}".format(i, n1, n2, w))
# I don't think anyone cares about the graph representation.
# I don't think this actually matters because I can't figure out what the next data structure is.
# Is it a Cypher query and creation node set?
# I need to demonstrate a capacity for graph based learning.
# (:-|X) The dread pirate roberts got me.
# :)
except BrokenPipeError as e:
logger.error(e)
raise e
if arguments.kdb_out is not None and arguments.compress: # Can't yet write compressed to stdout
logger.error("Can't write kdb to stdout! We need to use a Bio.bgzf filehandle.")
sys.exit(1)
elif arguments.kdb_out is not None and type(arguments.kdb_out) is not str:
raise ValueError("Cannot write a file to an argument that isn't a string")
elif arguments.kdb_out is not None and os.path.exists(arguments.kdb_out):
logger.warning("Overwriting '{0}'...".format(arguments.kdb_out))
elif arguments.kdb_out is not None and not os.path.exists(arguments.kdb_out):
logger.debug("Creating '{0}'...".format(arguments.kdb_out))
if arguments.kdb_out is not None:
with fileutil.graph(arguments.kdb_in, 'r', dtype=suggested_dtype, sort=arguments.sorted, slurp=True) as kdbg:
with graph.open(arguments.kdb_out, metadata=metadata, mode='w') as kdb_out:
try:
for i in range(len(kdbg.n1)):
kdb_out.write("{0}\t{1}\t{2}\t{3}\n".format(i, kdbg.n1[i], kdbg.n2[i], kdbg.w[i]))

except StopIteration as e:
logger.error(e)
raise e
finally:
#kdb_out._write_block(kdb_out._buffer)
#kdb_out._handle.flush()
#kdb_out._handle.close()
sys.stderr.write(config.DONE)





def make_kdbg(arguments):
"""
Expand Down Expand Up @@ -1330,6 +1381,7 @@ def make_kdbg(arguments):
Write the dataset (weighted edge list) to a file with '.kdbg' as its suffix.
"""
kdbg_out = graph.open(arguments.kdbg, mode='wb', metadata=metadata)

try:
sys.stderr.write("\n\n\nWriting edge list to {0}...\n\n\n".format(arguments.kdbg))
for i, node1 in enumerate(n1):
Expand Down Expand Up @@ -1769,6 +1821,13 @@ def cli():
graph_parser.add_argument("seqfile", nargs="+", type=str, metavar="<.fasta|.fastq>", help="Fasta or fastq files")
graph_parser.add_argument("kdbg", type=str, help=".kdbg file")
graph_parser.set_defaults(func=make_kdbg)

# assembly_parser = subparsers.add_parser("assemble", help="Use NetworkX (and/or cugraph) to perform deBruijn graphs")
# assembly_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count")
# assembly_parser.add_argument("-g", "--gpu", action="store_true", default=False, help="Utilize GPU resources (requires CUDA library cugraph)")
# assembly_parser.add_argument("kdbg", type=str, help=".kdbg file")
# assembly_parser.set_defaults(func=assembly)


view_parser = subparsers.add_parser("view", help="View the contents of the .kdb file")
view_parser.add_argument("-v", "--verbose", help="Prints warnings to the console by default", default=0, action="count")
Expand Down
77 changes: 70 additions & 7 deletions kmerdb/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,12 @@

from builtins import open as _open

#import jsonschema
import jsonschema
from Bio import SeqIO, Seq, bgzf


import numpy as np
import networkx as nx

from kmerdb import fileutil, parse, kmer, config, util

Expand Down Expand Up @@ -725,6 +726,35 @@ def make_graph(kmer_ids:list, k:int=None, quiet:bool=True):
"""
return all_edges_in_kspace



def create_graph(nodes:list, edge_tuples:list, gpu:bool=False):
    """
    Build an undirected NetworkX graph from a node list and an edge list.

    :param nodes: The node ids to add to the graph
    :type nodes: list
    :param edge_tuples: Edges given as (n1, n2) or (n1, n2, attribute_dict) tuples, e.g. (0, 1, {'weight': 3})
    :type edge_tuples: list
    :param gpu: Type-checked only; this function does not otherwise use it yet
    :type gpu: bool
    :raise TypeError: if nodes is not a list of ints
    :raise TypeError: if edge_tuples is not a list of 2- or 3-tuples whose first two items are ints
    :raise TypeError: if gpu is not a bool
    :returns: The populated graph
    :rtype: networkx.Graph
    """
    edge_error = "kmerdb.graph.create_graph expects the second argument to be a list of tuples of length 2 or 3: (int, int) or (int, int, dict)"

    # Reject the input unless *every* node is an int. The previous check was
    # inverted and raised precisely when the node list was valid.
    if type(nodes) is not list or not all(type(n) is int for n in nodes):
        raise TypeError("kmerdb.graph.create_graph expects the first argument to be a list of ints")
    elif type(edge_tuples) is not list:
        raise TypeError(edge_error)
    elif gpu is None or type(gpu) is not bool:
        raise TypeError("kmerdb.graph.create_graph expects the keyword argument gpu to be a bool")

    for e in edge_tuples:
        # add_edges_from accepts (u, v) or (u, v, attribute_dict) entries only.
        if not isinstance(e, tuple) or len(e) not in (2, 3):
            raise TypeError(edge_error)
        elif type(e[0]) is not int or type(e[1]) is not int:
            raise TypeError(edge_error)
        elif len(e) == 3 and not isinstance(e[2], dict):
            raise TypeError(edge_error)

    """
    Now we make the networkx graph
    """
    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edge_tuples)

    # Hand the graph back to the caller; previously it was built and discarded.
    return G






def w_lexer():
    """Placeholder with no implementation yet; calling it does nothing and returns None."""
    return None

Expand Down Expand Up @@ -873,8 +903,8 @@ def __init__(self, filename:str, fileobj:io.IOBase=None, mode:str="r", n1_dtype:
raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'fileobj' to be a file object")
if filename is not None and type(filename) is not str:
raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'filename' to be a str")
elif mode is not None and type(mode) is not bool:
raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'mode' to be a bool")
elif mode is not None and type(mode) is not str:
raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'mode' to be a str")
elif n1_dtype is not None and type(n1_dtype) is not str:
raise TypeError("kmerdb.graph.KDBGReader expects the keyword argument 'n1_dtype' to be a str")
elif n2_dtype is not None and type(n2_dtype) is not str:
Expand Down Expand Up @@ -936,7 +966,8 @@ def __init__(self, filename:str, fileobj:io.IOBase=None, mode:str="r", n1_dtype:

self.read_and_validate_kdbg_header()

if slurp is True:
if slurp is True:
logger.info("Reading .kdbg contents into memory")
self.slurp(n1_dtype=n1_dtype, n2_dtype=n2_dtype, weights_dtype=weights_dtype)

self.is_int = True
Expand Down Expand Up @@ -1142,8 +1173,8 @@ def slurp(self, n1_dtype:str="uint64", n2_dtype:str="uint64", weights_dtype:str=
reading = False


self.node1 = np.array(node1, dtype=n1_dtype)
self.node2 = np.array(node2, dtype=n2_dtype)
self.n1 = np.array(node1, dtype=n1_dtype)
self.n2 = np.array(node2, dtype=n2_dtype)
self.weights = np.array(weights, dtype=weights_dtype)
return

Expand Down Expand Up @@ -1229,9 +1260,41 @@ def __init__(self, filename:str=None, mode:str="w", metadata:OrderedDict=None, b
# 3-04-2024
yaml.add_representer(OrderedDict, util.represent_yaml_from_collections_dot_OrderedDict)


self.metadata = metadata

#self._write_block(metadata_slice)
if "b" in mode.lower():
metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8')
metadata_plus_delimiter_in_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8')
self.metadata["metadata_blocks"] = math.ceil( sys.getsizeof(metadata_plus_delimiter_in_bytes) / ( 2**16 ) ) # First estimate
metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8')
metadata_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8')
self.metadata["metadata_blocks"] = math.ceil( sys.getsizeof(metadata_bytes) / ( 2**16 ) ) # Second estimate
metadata_bytes = bytes(yaml.dump(self.metadata, sort_keys=False), 'utf-8')
metadata_bytes = metadata_bytes + bytes(config.header_delimiter, 'utf-8')
logger.info("Writing the {0} metadata blocks to the new file".format(self.metadata["metadata_blocks"]))
logger.debug(self.metadata)
logger.debug("Header is being written as follows:\n{0}".format(yaml.dump(self.metadata, sort_keys=False)))

# 01-01-2022 This is still not a completely functional method to write data to bgzf through the Bio.bgzf.BgzfWriter class included in BioPython
# I've needed to implement a basic block_writer, maintaining compatibility with the Biopython bgzf submodule.
#self.write(bytes(yaml.dump(metadata, sort_keys=False), 'utf-8'))

for i in range(self.metadata["metadata_blocks"]):
metadata_slice = metadata_bytes[:65536]
metadata_bytes = metadata_bytes[65536:]
self._write_block(metadata_slice)

#self._write_block
self._buffer = b""
self._handle.flush()
elif "w" == mode.lower() or "x" == mode.lower():
self.write(yaml.dump(metadata, sort_keys=False))
self._buffer = ""
self._handle.flush()
else:
logger.error("Mode: {}".format(mode.lower()))
raise RuntimeError("Could not determine proper encoding for write operations to .kdb file")



Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ numpy==1.26.3
pandas>=2.2.0
scipy>=1.12.0
scikit-learn==1.4.0
networkx==3.2.1


#######################
# Statistical language
Expand Down

0 comments on commit fce3a3d

Please sign in to comment.