Skip to content

Commit

Permalink
Improve chromsizes file validation to catch formatting errors early (o…
Browse files Browse the repository at this point in the history
  • Loading branch information
ShigrafS committed Feb 26, 2025
1 parent ed34981 commit f093a77
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
2 changes: 1 addition & 1 deletion docs/releasenotes.md
12 changes: 11 additions & 1 deletion src/cooler/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,6 @@ def read_chromsizes(
----------
* `UCSC assembly terminology <http://genome.ucsc.edu/FAQ/FAQdownloads.html#download9>`_
* `GRC assembly terminology <https://www.ncbi.nlm.nih.gov/grc/help/definitions>`_
"""
if isinstance(filepath_or, str) and filepath_or.endswith(".gz"):
kwargs.setdefault("compression", "gzip")
Expand All @@ -247,6 +246,17 @@ def read_chromsizes(
dtype={"name": str},
**kwargs,
)

# Convert the "length" column to numeric values.
chromtable["length"] = pd.to_numeric(chromtable["length"], errors="coerce")
if chromtable["length"].isnull().any():
raise ValueError(
f"Chromsizes file '{filepath_or}' contains missing or invalid "
"length values. Please ensure that the file is properly formatted "
"as tab-delimited with two columns: sequence name and integer "
"length. Check for extraneous spaces or hidden characters."
)

if not all_names:
parts = []
for pattern in name_patterns:
Expand Down

0 comments on commit f093a77

Please sign in to comment.