diff --git a/server/src/scimodom/services/assembly.py b/server/src/scimodom/services/assembly.py index 3255bb45..6042661f 100644 --- a/server/src/scimodom/services/assembly.py +++ b/server/src/scimodom/services/assembly.py @@ -203,7 +203,7 @@ def liftover( unmapped_lines = self._file_service.count_lines(unmapped_file) if unmapped_lines / raw_lines > threshold: raise LiftOverError( - f"Liftover failed: {unmapped_lines} records of {raw_lines} could not be mapped." + f"Liftover failed: {unmapped_lines} records out of {raw_lines} could not be mapped." ) if unmapped_lines > 0: logger.warning( @@ -234,7 +234,7 @@ def add_assembly(self, taxa_id: int, assembly_name: str) -> int: if self._file_service.check_if_assembly_exists(taxa_id, assembly_name): raise FileExistsError( - f"Directory exists, but assembly '{assembly_name}' does not exists!" + f"Directory exists, but assembly '{assembly_name}' does not exist!" ) chain_file_name = self._get_chain_file_name( @@ -277,8 +277,8 @@ def prepare_assembly_for_version(self, assembly_id: int) -> None: assembly = self.get_assembly_by_id(assembly_id) if not self.is_latest_assembly(assembly): raise AssemblyVersionError( - f"Mismatch between assembly version {assembly.version} and " - f"database version {self._version}." + f"Mismatch between assembly version '{assembly.version}' and " + f"database version '{self._version}'." 
) logger.info(f"Setting up assembly {assembly.name} for current version...") @@ -303,11 +303,11 @@ def _get_ensembl_chain_file_url(self, taxa_id: int, chain_file_name): return urljoin( ENSEMBL_FTP, ENSEMBL_ASM_MAPPING, - self._get_organism_for_ensemble_url(taxa_id), + self._get_organism_for_ensembl_url(taxa_id), chain_file_name, ) - def _get_organism_for_ensemble_url(self, taxa_id: int): + def _get_organism_for_ensembl_url(self, taxa_id: int): organism = self._get_organism(taxa_id) return ("_".join(organism.split())).lower() @@ -315,7 +315,7 @@ def _get_ensembl_gene_build_url(self, taxa_id: int): return urljoin( ENSEMBL_SERVER, ENSEMBL_ASM, - self._get_organism_for_ensemble_url(taxa_id), + self._get_organism_for_ensembl_url(taxa_id), ) def _get_organism(self, taxa_id: int) -> str: diff --git a/server/src/scimodom/services/bedtools.py b/server/src/scimodom/services/bedtools.py index 7ae8cdec..097c83d3 100644 --- a/server/src/scimodom/services/bedtools.py +++ b/server/src/scimodom/services/bedtools.py @@ -95,92 +95,91 @@ def __init__(self, tmp_path): makedirs(tmp_path, exist_ok=True) pybedtools.helpers.set_tempdir(tmp_path) - def annotate_data_using_ensembl( - self, - annotation_path: Path, - features: dict[str, str], - records: Iterable[Data], - ) -> Iterable[DataAnnotationRecord]: - """Annotate data records, i.e. create - records for DataAnnotation. Columns - order is (gene_id, data_id, feature). - There is no type coercion. + @staticmethod + def create_temp_file_from_records( + records: Iterable[Sequence[Any]], sort: bool = True + ) -> str: + """Create a bedtool object from records. - :param annotation_path: Path to annotation - :type annotation_path: Path - :param records: Data records as BED6+1-like, - where the additional field is the "data_id". - :type records: Iterable[Data] - :param features: Genomic features for which - annotation must be created. 
- :type features: dict of {str: str} - :returns: Records for DataAnnotation - :rtype: Iterable[ModificationRecord] + :param records: A iterable over records which can be processed by bedtools + :type records: Iterable[Sequence[Any]] + :param sort: sort the result + :type sort: bool + :returns: Path to temporary file + :rtype: str """ + bedtool = BedTool(records) + if sort: + bedtool = bedtool.sort() + return bedtool.fn - bedtool_records = self._get_data_as_bedtool_for_annotation(records) - if "intergenic" not in features: - raise AnnotationFormatError( - "Missing feature intergenic from specs. This is due to a change " - "in definition. Aborting transaction!" - ) - intergenic_feature = features.pop("intergenic") - prefix = None - for feature, pretty_feature in features.items(): - file_name = Path(annotation_path, f"{feature}.bed").as_posix() - feature_bedtool = pybedtools.BedTool(file_name) - if prefix is None: - # any feature_bedtool, exc. intergenic has a gene_id at fields 6... - prefix = utils.get_ensembl_prefix( - feature_bedtool[0].fields[6].split(",")[0] - ) - for item in self._intersect_for_annotation( - bedtool_records, feature_bedtool, pretty_feature - ): - yield item + @staticmethod + def get_ensembl_annotation_records( + annotation_file: Path, annotation_id: int, intergenic_feature: str + ) -> Iterable[GenomicAnnotationRecord]: + """Create records for GenomicAnnotation from + annotation file. Columns order is + (gene_id, annotation_id, gene_name, gene_biotype). - gene_id = f"{prefix}{intergenic_feature}" - file_name = Path(annotation_path, "intergenic.bed").as_posix() - feature_bedtool = pybedtools.BedTool(file_name) - stream = bedtool_records.intersect( - b=feature_bedtool, wa=True, wb=True, s=False, sorted=True - ) + :param annotation_file: Path to annotation file. + The format is implicitely assumed to be GTF. 
+ :type annotation_file: Path + :param annotation_id: Annotation ID + :type annotation_id: int + :param intergenic_feature: Name for intergenic + feature. This name must come from Annotation + specifications, as it must match that used + when creating annotation for data records. + :type intergenic_feature: str + :returns: Annotation records as tuple of columns + :rtype: Iterable[GenomicAnnotationRecord] + """ + logger.info(f"Creating annotation for {annotation_file}...") + + bedtool = pybedtools.BedTool(annotation_file.as_posix()).sort() + stream = bedtool.filter(lambda f: f.fields[2] == "gene").each(_get_gtf_attrs) + prefix = None for s in stream: - yield DataAnnotationRecord( - gene_id=gene_id, data_id=s[6], feature=intergenic_feature + yield GenomicAnnotationRecord( + id=s[6], annotation_id=annotation_id, name=s[3], biotype=s[7] ) + if prefix is None: + # get a "dummy" record for intergenic annotation + prefix = utils.get_ensembl_prefix(s[6]) + yield GenomicAnnotationRecord( + id=f"{prefix}{intergenic_feature}", annotation_id=annotation_id + ) @staticmethod - def _get_data_as_bedtool_for_annotation( - records: Iterable[Data], - ) -> BedTool: - def generator(): - for record in records: - yield create_interval_from_list( - [ - record.chrom, - record.start, - record.end, - record.name, - record.score, - record.strand.value, - record.id, - ] - ) + def get_gtrnadb_annotation_records( + annotation_file: Path, + annotation_id: int, + organism: str, + ) -> Iterable[GenomicAnnotationRecord]: + """Create records for GenomicAnnotation from + annotation file. Columns order is + (gene_id, annotation_id, gene_name, gene_biotype). - return BedTool(generator()).sort() + :param annotation_file: Path to annotation file. + The format is implicitely assumed to be BED12. 
+ :type annotation_file: Path + :param annotation_id: Annotation ID + :type annotation_id: int + :param organism: Organism name + :type organism: str + :returns: Annotation records as tuple of columns + :rtype: Iterable[GenomicAnnotationRecord] + """ + logger.info(f"Creating annotation for {annotation_file}...") - @staticmethod - def _intersect_for_annotation(bedtool_records, feature_bedtool, feature): - # delim (collapse) Default: "," - stream = bedtool_records.intersect( - b=feature_bedtool, wa=True, wb=True, s=True, sorted=True - ) - for s in stream: - for gene_id in s[13].split(","): - yield DataAnnotationRecord( - gene_id=gene_id, data_id=s[6], feature=feature - ) + bedtool = pybedtools.BedTool(annotation_file.as_posix()).sort() + for interval in bedtool: + yield GenomicAnnotationRecord( + id=f"{organism}_{interval.name}", + annotation_id=annotation_id, + name=interval.name, + biotype="tRNA", + ) @staticmethod def gtrnadb_to_bed_features(annotation_file: Path, features: list[str]) -> None: @@ -241,6 +240,90 @@ def gtrnadb_to_bed_features(annotation_file: Path, features: list[str]) -> None: except IndexError: pass + def create_temp_euf_file(self, records: Iterable[EufRecord]) -> str: + """Create a bedtool object from EUF records. + + :param records: A iterable over EUF records which can be processed by bedtools + :type records: Iterable[EufRecord] + :returns: Path to temporary file + :rtype: str + """ + + def generator(): + for record in records: + yield create_interval_from_list( + [ + record.chrom, + record.start, + record.end, + record.name, + record.score, + record.strand.value, + record.thick_start, + record.thick_end, + record.item_rgb, + record.coverage, + record.frequency, + ] + ) + + return self.create_temp_file_from_records(generator()) + + def annotate_data_using_ensembl( + self, + annotation_path: Path, + features: dict[str, str], + records: Iterable[Data], + ) -> Iterable[DataAnnotationRecord]: + """Annotate data records, i.e. 
create + records for DataAnnotation. Columns + order is (gene_id, data_id, feature). + There is no type coercion. + + :param annotation_path: Path to annotation + :type annotation_path: Path + :param records: Data records as BED6+1-like, + where the additional field is the "data_id". + :type records: Iterable[Data] + :param features: Genomic features for which + annotation must be created. + :type features: dict of {str: str} + :returns: Records for DataAnnotation + :rtype: Iterable[ModificationRecord] + """ + + bedtool_records = self._get_data_as_bedtool_for_annotation(records) + if "intergenic" not in features: + raise AnnotationFormatError( + "Missing feature intergenic from specs. This is due to a change " + "in definition. Aborting transaction!" + ) + intergenic_feature = features.pop("intergenic") + prefix = None + for feature, pretty_feature in features.items(): + file_name = Path(annotation_path, f"{feature}.bed").as_posix() + feature_bedtool = pybedtools.BedTool(file_name) + if prefix is None: + # any feature_bedtool, exc. intergenic has a gene_id at fields 6... 
+ prefix = utils.get_ensembl_prefix( + feature_bedtool[0].fields[6].split(",")[0] + ) + for item in self._intersect_for_annotation( + bedtool_records, feature_bedtool, pretty_feature + ): + yield item + + gene_id = f"{prefix}{intergenic_feature}" + file_name = Path(annotation_path, "intergenic.bed").as_posix() + feature_bedtool = pybedtools.BedTool(file_name) + stream = bedtool_records.intersect( + b=feature_bedtool, wa=True, wb=True, s=False, sorted=True + ) + for s in stream: + yield DataAnnotationRecord( + gene_id=gene_id, data_id=s[6], feature=intergenic_feature + ) + def ensembl_to_bed_features( self, annotation_file: Path, chrom_file: Path, features: dict[str, list[str]] ) -> None: @@ -295,130 +378,6 @@ def ensembl_to_bed_features( .moveto(file_name) ) - @staticmethod - def _annotation_to_stream(annotation_bedtool, feature): - return annotation_bedtool.filter(lambda a: a.fields[2] == feature).each( - _get_gtf_attrs - ) - - @staticmethod - def _check_feature(feature, features, parent): - if feature not in features["extended"]: - raise AnnotationFormatError( - f"Missing feature {feature} from specs. This is due to a change in definition." - ) - logger.debug(f"Writing {feature}...") - return Path(parent, f"{feature}.bed").as_posix() - - @staticmethod - def get_ensembl_annotation_records( - annotation_file: Path, annotation_id: int, intergenic_feature: str - ) -> Iterable[GenomicAnnotationRecord]: - """Create records for GenomicAnnotation from - annotation file. Columns order is - (gene_id, annotation_id, gene_name, gene_biotype). - - :param annotation_file: Path to annotation file. - The format is implicitely assumed to be GTF. - :type annotation_file: Path - :param annotation_id: Annotation ID - :type annotation_id: int - :param intergenic_feature: Name for intergenic - feature. This name must come from Annotation - specifications, as it must match that used - when creating annotation for data records. 
- :type intergenic_feature: str - :returns: Annotation records as tuple of columns - :rtype: Iterable[GenomicAnnotationRecord] - """ - logger.info(f"Creating annotation for {annotation_file}...") - - bedtool = pybedtools.BedTool(annotation_file.as_posix()).sort() - stream = bedtool.filter(lambda f: f.fields[2] == "gene").each(_get_gtf_attrs) - prefix = None - for s in stream: - yield GenomicAnnotationRecord( - id=s[6], annotation_id=annotation_id, name=s[3], biotype=s[7] - ) - if prefix is None: - # get a "dummy" record for intergenic annotation - prefix = utils.get_ensembl_prefix(s[6]) - yield GenomicAnnotationRecord( - id=f"{prefix}{intergenic_feature}", annotation_id=annotation_id - ) - - @staticmethod - def get_gtrnadb_annotation_records( - annotation_file: Path, - annotation_id: int, - organism: str, - ) -> Iterable[GenomicAnnotationRecord]: - """Create records for GenomicAnnotation from - annotation file. Columns order is - (gene_id, annotation_id, gene_name, gene_biotype). - - :param annotation_file: Path to annotation file. - The format is implicitely assumed to be BED12. - :type annotation_file: Path - :param annotation_id: Annotation ID - :type annotation_id: int - :param organism: Organism name - :type organism: str - :returns: Annotation records as tuple of columns - :rtype: Iterable[GenomicAnnotationRecord] - """ - logger.info(f"Creating annotation for {annotation_file}...") - - bedtool = pybedtools.BedTool(annotation_file.as_posix()).sort() - for interval in bedtool: - yield GenomicAnnotationRecord( - id=f"{organism}_{interval.name}", - annotation_id=annotation_id, - name=interval.name, - biotype="tRNA", - ) - - @staticmethod - def create_temp_file_from_records( - records: Iterable[Sequence[Any]], sort: bool = True - ) -> str: - """Liftover records. Handles conversion to BedTool, but not from, - of the liftedOver features. A file is returned pointing - to the liftedOver features. The unmapped ones are saved as - "unmapped", or discarded. 
- - :param records: A iterable over records which can be processed by bedtools - :type records: Iterable[Sequence[Any]] - :param sort: sort the result - :returns: Path to temporary file - :rtype: str - """ - bedtool = BedTool(records) - if sort: - bedtool = bedtool.sort() - return bedtool.fn - - def create_temp_euf_file(self, records: Iterable[EufRecord]) -> str: - def generator(): - for record in records: - yield create_interval_from_list( - [ - record.chrom, - record.start, - record.end, - record.name, - record.score, - record.strand.value, - record.thick_start, - record.thick_end, - record.item_rgb, - record.coverage, - record.frequency, - ] - ) - - return self.create_temp_file_from_records(generator()) - def intersect( self, a_records: Iterable[ComparisonRecord], @@ -460,42 +419,6 @@ def intersect( r = IntersectRecord(a=a, b=b) yield r - @staticmethod - def _get_modifications_as_bedtool( - records: Iterable[ComparisonRecord], - ) -> BedTool: - def generator(): - for record in records: - yield create_interval_from_list( - [ - record.chrom, - record.start, - record.end, - record.name, - record.score, - record.strand.value, - record.eufid, - record.coverage, - record.frequency, - ] - ) - - return BedTool(generator()).sort() - - @staticmethod - def _get_modification_from_bedtools_data(s: Sequence[str]): - return ComparisonRecord( - chrom=s[0], - start=s[1], - end=s[2], - name=s[3], - score=s[4], - strand=Strand(s[5]), - eufid=s[6], - coverage=s[7], - frequency=s[8], - ) - def closest( self, a_records: Iterable[ComparisonRecord], @@ -579,6 +502,89 @@ def b_generator(): for s in bedtool: yield SubtractRecord(**self._get_modification_from_bedtools_data(s).dict()) + @staticmethod + def _annotation_to_stream(annotation_bedtool, feature): + return annotation_bedtool.filter(lambda a: a.fields[2] == feature).each( + _get_gtf_attrs + ) + + @staticmethod + def _check_feature(feature, features, parent): + if feature not in features["extended"]: + raise AnnotationFormatError( 
+ f"Missing feature {feature} from specs. This is due to a change in definition." + ) + logger.debug(f"Writing {feature}...") + return Path(parent, f"{feature}.bed").as_posix() + + @staticmethod + def _get_data_as_bedtool_for_annotation( + records: Iterable[Data], + ) -> BedTool: + def generator(): + for record in records: + yield create_interval_from_list( + [ + record.chrom, + record.start, + record.end, + record.name, + record.score, + record.strand.value, + record.id, + ] + ) + + return BedTool(generator()).sort() + + @staticmethod + def _intersect_for_annotation(bedtool_records, feature_bedtool, feature): + # delim (collapse) Default: "," + stream = bedtool_records.intersect( + b=feature_bedtool, wa=True, wb=True, s=True, sorted=True + ) + for s in stream: + for gene_id in s[13].split(","): + yield DataAnnotationRecord( + gene_id=gene_id, data_id=s[6], feature=feature + ) + + @staticmethod + def _get_modifications_as_bedtool( + records: Iterable[ComparisonRecord], + ) -> BedTool: + def generator(): + for record in records: + yield create_interval_from_list( + [ + record.chrom, + record.start, + record.end, + record.name, + record.score, + record.strand.value, + record.eufid, + record.coverage, + record.frequency, + ] + ) + + return BedTool(generator()).sort() + + @staticmethod + def _get_modification_from_bedtools_data(s: Sequence[str]): + return ComparisonRecord( + chrom=s[0], + start=s[1], + end=s[2], + name=s[3], + score=s[4], + strand=Strand(s[5]), + eufid=s[6], + coverage=s[7], + frequency=s[8], + ) + @cache def get_bedtools_service(): diff --git a/server/src/scimodom/services/dataset.py b/server/src/scimodom/services/dataset.py index 06108d67..12bcd897 100644 --- a/server/src/scimodom/services/dataset.py +++ b/server/src/scimodom/services/dataset.py @@ -12,7 +12,6 @@ from scimodom.database.buffer import InsertBuffer from scimodom.database.database import get_session from scimodom.database.models import ( - Assembly, Dataset, 
DatasetModificationAssociation, DetectionTechnology, @@ -25,7 +24,6 @@ User, UserProjectAssociation, Selection, - AssemblyVersion, Data, ) from scimodom.services.annotation import ( @@ -36,7 +34,6 @@ from scimodom.services.assembly import ( get_assembly_service, AssemblyService, - AssemblyVersionError, ) from scimodom.services.bedtools import get_bedtools_service, BedToolsService from scimodom.utils import utils @@ -96,13 +93,6 @@ class SpecsError(Exception): pass -def _none_if_empty(x): - if x == "": - return None - else: - return x - - class DatasetService: FILE_FORMAT_VERSION_REGEXP = re.compile(r".*?([0-9.]+)\Z") @@ -264,6 +254,37 @@ def import_dataset( ) return context.eufid + @staticmethod + def _check_euf_record(record, importer, context): + if record.chrom not in context.seqids: + importer.report_error( + f"Unrecognized chrom: {record.chrom}. Ignore this warning " + "for scaffolds and contigs, otherwise this could be due to misformatting!" + ) + return False + if record.name not in context.modification_names: + importer.report_error(f"Unrecognized name: {record.name}.") + return False + return True + + @staticmethod + def _get_data_record(record: EufRecord, context): + return Data( + dataset_id=context.eufid, + modification_id=context.modification_names.get(record.name), + chrom=record.chrom, + start=record.start, + end=record.end, + name=record.name, + score=record.score, + strand=record.strand, + thick_start=record.thick_start, + thick_end=record.thick_end, + item_rgb=record.item_rgb, + coverage=record.coverage, + frequency=record.frequency, + ) + def _sanitize_import_context(self, context): is_found = self._session.query( exists().where(Project.id == context.smid) @@ -430,37 +451,6 @@ def _do_direct_import(self, importer, context): data = self._get_data_record(record, context) buffer.queue(data) - @staticmethod - def _check_euf_record(record, importer, context): - if record.chrom not in context.seqids: - importer.report_error( - f"Unrecognized chrom: 
{record.chrom}. Ignore this warning " - "for scaffolds and contigs, otherwise this could be due to misformatting!" - ) - return False - if record.name not in context.modification_names: - importer.report_error(f"Unrecognized name: {record.name}.") - return False - return True - - @staticmethod - def _get_data_record(record: EufRecord, context): - return Data( - dataset_id=context.eufid, - modification_id=context.modification_names.get(record.name), - chrom=record.chrom, - start=record.start, - end=record.end, - name=record.name, - score=record.score, - strand=record.strand, - thick_start=record.thick_start, - thick_end=record.thick_end, - item_rgb=record.item_rgb, - coverage=record.coverage, - frequency=record.frequency, - ) - def _do_lift_over(self, importer, context): def generator(): for record in importer.parse(): @@ -489,6 +479,13 @@ def _add_association(self, context) -> None: self._session.flush() +def _none_if_empty(x): + if x == "": + return None + else: + return x + + @cache def get_dataset_service() -> DatasetService: """Helper function to set up a DatasetService object by injecting its dependencies. diff --git a/server/src/scimodom/services/file.py b/server/src/scimodom/services/file.py index d56832e3..cd054c52 100644 --- a/server/src/scimodom/services/file.py +++ b/server/src/scimodom/services/file.py @@ -131,7 +131,7 @@ def _get_gene_cache_dir(self) -> Path: # Project related - def get_project_metadata_dir(self): + def get_project_metadata_dir(self) -> Path: """Construct parent path to metadata. 
:returns: Path to metadata diff --git a/server/tests/integration/services/test_file_service.py b/server/tests/integration/services/test_file_service.py index a6a8d546..cb111acb 100644 --- a/server/tests/integration/services/test_file_service.py +++ b/server/tests/integration/services/test_file_service.py @@ -2,13 +2,11 @@ from pathlib import Path import pytest -from sqlalchemy import select -from scimodom.database.models import AssemblyVersion, Assembly, Taxa, Taxonomy, Organism from scimodom.services.file import FileService, AssemblyFileType -def get_service(Session, tmp_path): +def _get_file_service(Session, tmp_path): return FileService( session=Session(), data_path=join(tmp_path, "t_data"), @@ -18,41 +16,45 @@ def get_service(Session, tmp_path): ) -@pytest.fixture -def assembly(Session): - with Session() as db: - db.add_all( - [ - AssemblyVersion(version_num="v23"), - Taxonomy(id="tax", domain="domain", kingdom="kingdom", phylum="phylum"), - Taxa( - id=15, name="The Wrong One", short_name="wrong", taxonomy_id="tax" - ), - Taxa( - id=16, name="The Right Taxa", short_name="right", taxonomy_id="tax" - ), - Assembly(id=1, name="asWrongTaxa", taxa_id=15, version="v23"), - Assembly(id=2, name="asWrongVersion", taxa_id=16, version="v22"), - Assembly(id=3, name="asRight", taxa_id=16, version="v23"), - ] - ) - db.commit() +def test_get_assembly_file_path_for_chrom(Session, tmp_path, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_file_service(Session, tmp_path) + assert service.get_assembly_file_path(9606, AssemblyFileType.CHROM) == Path( + tmp_path, "t_data", "assembly", "Homo_sapiens", "GRCh38", "chrom.sizes" + ) -def test_assembly_path(Session, tmp_path, assembly): - service = get_service(Session, tmp_path) - assert service.get_assembly_file_path(16, AssemblyFileType.CHROM) == Path( - tmp_path, "t_data", "assembly", "The_right_taxa", "asRight", "chrom.sizes" - ) +def test_get_assembly_file_path_for_chain(Session, 
tmp_path, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_file_service(Session, tmp_path) + assert service.get_assembly_file_path( + 9606, + AssemblyFileType.CHAIN, + chain_file_name="chain", + chain_assembly_name="GRCh37", + ) == Path(tmp_path, "t_data", "assembly", "Homo_sapiens", "GRCh37", "chain") + + +def test_get_assembly_file_path_for_chain_fail(Session, tmp_path, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_file_service(Session, tmp_path) + with pytest.raises(ValueError) as exc: + assert service.get_assembly_file_path( + 9606, AssemblyFileType.CHAIN, chain_file_name="chain" + ) + assert (str(exc.value)) == "Missing chain_file_name and/or assembly_name!" -def test_create_chain_file(Session, tmp_path, assembly): - service = get_service(Session, tmp_path) - with service.create_chain_file(16, "x_to_y.chain") as fh: +def test_create_chain_file(Session, tmp_path, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_file_service(Session, tmp_path) + with service.create_chain_file(9606, "chain", "GRCh37") as fh: fh.write(b"bla") - path = Path( - tmp_path, "t_data", "assembly", "The_right_taxa", "asRight", "x_to_y.chain" - ) + path = Path(tmp_path, "t_data", "assembly", "Homo_sapiens", "GRCh37", "chain") with open(path, "rb") as fh: assert fh.read() == b"bla" @@ -60,20 +62,20 @@ def test_create_chain_file(Session, tmp_path, assembly): def test_count_lines(Session, tmp_path): test_file = Path(tmp_path, "example.txt") test_file.write_text("line1\nline2\nline3\n") - service = get_service(Session, tmp_path) + service = _get_file_service(Session, tmp_path) assert service.count_lines(test_file) == 3 def test_get_annotation_path(Session, tmp_path): - service = get_service(Session, tmp_path) + service = _get_file_service(Session, tmp_path) assert service.get_annotation_dir() == Path(tmp_path, "t_data", "annotation") -def 
test_gene_cache(Session, tmp_path): - service = get_service(Session, tmp_path) +def test_update_gene_cache(Session, tmp_path): + service = _get_file_service(Session, tmp_path) service.update_gene_cache(122, ["1", "2", "Y"]) service.update_gene_cache(123, ["1", "3"]) service.update_gene_cache(124, ["1", "3", "X"]) - assert service.get_gene_cache([122, 123]) == {"1", "2", "3", "Y"} + assert set(service.get_gene_cache([122, 123])) == set(["1", "2", "3", "Y"]) service.update_gene_cache(123, ["1", "15"]) - assert service.get_gene_cache([122, 123]) == {"1", "2", "15", "Y"} + assert set(service.get_gene_cache([122, 123])) == set(["1", "2", "15", "Y"]) diff --git a/server/tests/unit/services/test_assembly.py b/server/tests/unit/services/test_assembly.py index 358cbf70..794356ce 100644 --- a/server/tests/unit/services/test_assembly.py +++ b/server/tests/unit/services/test_assembly.py @@ -1,15 +1,13 @@ -from abc import ABC from io import StringIO, BytesIO from pathlib import Path from typing import TextIO, BinaryIO import pytest -import requests # type: ignore from sqlalchemy import exists from mocks.io_mocks import MockStringIO, MockBytesIO from mocks.web_service import MockWebService, MockHTTPError -from scimodom.database.models import Assembly, AssemblyVersion, Taxonomy, Taxa +from scimodom.database.models import Assembly, AssemblyVersion from scimodom.services.assembly import ( AssemblyService, AssemblyNotFoundError, @@ -28,15 +26,20 @@ class MockFileService: def __init__(self): self.files_by_name: dict[str, MockStringIO | MockBytesIO] = {} self.lines_by_name: dict[str, int] = {} - self.existing_assemblies: list[int] = [] - self.deleted_assemblies: list[int] = [] + self.existing_assemblies: list[tuple[int, str]] = [] + self.deleted_assemblies: list[tuple[int, str]] = [] @staticmethod def get_assembly_file_path( - taxa_id: int, file_type: AssemblyFileType, chain_file_name: str | None = None + taxa_id: int, + file_type: AssemblyFileType, + chain_file_name: str | None = 
None, + chain_assembly_name: str | None = None, ) -> Path: if file_type == AssemblyFileType.CHAIN: - return Path(f"/data/assembly/{taxa_id}/{chain_file_name}") + return Path( + f"/data/assembly/{taxa_id}/{chain_assembly_name}/{chain_file_name}" + ) else: return Path(f"/data/assembly/{taxa_id}/{file_type.value}") @@ -56,20 +59,24 @@ def create_assembly_file(self, taxa_id: int, file_type: AssemblyFileType) -> Tex self.files_by_name[name] = new_file return new_file - def create_chain_file(self, taxa_id: int, name: str) -> BinaryIO: + def create_chain_file( + self, taxa_id: int, file_name: str, assembly_name: str + ) -> BinaryIO: name = self.get_assembly_file_path( - taxa_id, AssemblyFileType.CHAIN, chain_file_name=name + taxa_id, + AssemblyFileType.CHAIN, + chain_file_name=file_name, + chain_assembly_name=assembly_name, ).as_posix() new_file = MockBytesIO() self.files_by_name[name] = new_file return new_file - # TODO def check_if_assembly_exists(self, taxa_id: int, assembly_name: str) -> bool: - return taxa_id in self.existing_assemblies + return (taxa_id, assembly_name) in self.existing_assemblies - def delete_assembly(self, taxa_id: int): - self.deleted_assemblies.append(taxa_id) + def delete_assembly(self, taxa_id: int, assembly_name: str): + self.deleted_assemblies.append((taxa_id, assembly_name)) def count_lines(self, path): return self.lines_by_name[path] @@ -80,11 +87,6 @@ def file_service(): yield MockFileService() -@pytest.fixture -def chain_file(file_service): - file_service.files_by_name["/data/assembly/9606/GRCh37_to_GRCh38.chain"] = BytesIO() - - def _get_assembly_service(Session, file_service, url_to_result=None, url_to_data=None): return AssemblyService( session=Session(), @@ -99,23 +101,14 @@ def _get_assembly_service(Session, file_service, url_to_result=None, url_to_data # tests -def test_init(Session, data_path, file_service): +def test_init(Session, file_service): with Session() as session, session.begin(): 
session.add(AssemblyVersion(version_num="GcatSmFcytpU")) service = _get_assembly_service(Session, file_service) assert service._version == "GcatSmFcytpU" -def test_get_organism(Session, file_service, data_path, setup): - with Session() as session, session.begin(): - session.add_all(setup) - service = _get_assembly_service(Session, file_service) - assert ( - service._get_organism(10090) == "Mus musculus" - ) # Converting space to '_' is now done in file service. - - -def test_get_assembly(Session, data_path, setup): +def test_get_assembly_by_id(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) service = _get_assembly_service(Session, file_service) @@ -127,7 +120,7 @@ def test_get_assembly(Session, data_path, setup): assert service._version == "GcatSmFcytpU" -def test_get_assembly_fail(Session, data_path): +def test_get_assembly_by_id_fail(Session): with Session() as session, session.begin(): session.add(AssemblyVersion(version_num="GcatSmFcytpU")) service = _get_assembly_service(Session, file_service) @@ -137,10 +130,23 @@ def test_get_assembly_fail(Session, data_path): assert exc.type == AssemblyNotFoundError +def test_get_assemblies_by_taxa(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_assembly_service(Session, file_service) + assemblies = service.get_assemblies_by_taxa(9606) + + expected_assemblies = [ + ("GRCh38", "GcatSmFcytpU"), + ("GRCh37", "J9dit7Tfc6Sb"), + ] + for assembly, expected_assembly in zip(assemblies, expected_assemblies): + assert assembly.name == expected_assembly[0] + assert assembly.version == expected_assembly[1] + + @pytest.mark.parametrize("assembly_id,is_latest", [(1, True), (3, False)]) -def test_is_latest_assembly( - assembly_id, is_latest, Session, file_service, data_path, setup -): +def test_is_latest_assembly(assembly_id, is_latest, Session, file_service, setup): with Session() as session, session.begin(): 
session.add_all(setup) service = _get_assembly_service(Session, file_service) @@ -148,6 +154,14 @@ def test_is_latest_assembly( assert service.is_latest_assembly(assembly) == is_latest +def test_get_name_for_version(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_assembly_service(Session, file_service) + assembly_name = service.get_name_for_version(9606) + assert assembly_name == "GRCh38" + + def test_get_seqids(Session, file_service, setup): file_service.files_by_name["/data/assembly/9606/chrom.sizes"] = StringIO( "1\t12345\n2\t123456" @@ -159,7 +173,23 @@ def test_get_seqids(Session, file_service, setup): assert set(seqids) == {"1", "2"} -def test_liftover(Session, file_service, setup, chain_file): +def test_get_chroms(Session, file_service, setup): + file_service.files_by_name["/data/assembly/9606/chrom.sizes"] = StringIO( + "1\t12345\n2\t123456" + ) + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_assembly_service(Session, file_service) + chroms = service.get_chroms(9606) + expected_chroms = [{"chrom": "1", "size": 12345}, {"chrom": "2", "size": 123456}] + for chrom, expected_chrom in zip(chroms, expected_chroms): + assert chrom == expected_chrom + + +def test_liftover(Session, file_service, setup): + file_service.files_by_name[ + "/data/assembly/9606/GRCh37/GRCh37_to_GRCh38.chain.gz" + ] = BytesIO() with Session() as session, session.begin(): session.add_all(setup) file_service.lines_by_name["unmapped.bed"] = 0 @@ -170,9 +200,10 @@ def test_liftover(Session, file_service, setup, chain_file): service.liftover(assembly, "to_be_lifted.bed", unmapped_file="unmapped.bed") -# with 3 raw_file records, liftover succeeds with the following warning: -# 1 records could not be mapped and were discarded... Contact the system administrator if you have questions. 
-def test_liftover_fail_count(Session, file_service, setup, chain_file): +def test_liftover_fail_count(Session, file_service, setup): + file_service.files_by_name[ + "/data/assembly/9606/GRCh37/GRCh37_to_GRCh38.chain.gz" + ] = BytesIO() with Session() as session, session.begin(): session.add_all(setup) file_service.lines_by_name["unmapped.bed"] = 1 @@ -182,11 +213,30 @@ def test_liftover_fail_count(Session, file_service, setup, chain_file): assembly = service.get_assembly_by_id(3) with pytest.raises(LiftOverError) as exc: service.liftover(assembly, "to_be_lifted.bed", unmapped_file="unmapped.bed") - assert (str(exc.value)) == "Liftover failed: 1 records of 3 could not be mapped." + assert ( + str(exc.value) + ) == "Liftover failed: 1 records out of 3 could not be mapped." assert exc.type == LiftOverError -def test_liftover_fail_version(Session, file_service, data_path, setup): +def test_liftover_warning(Session, file_service, setup, caplog): + file_service.files_by_name[ + "/data/assembly/9606/GRCh37/GRCh37_to_GRCh38.chain.gz" + ] = BytesIO() + with Session() as session, session.begin(): + session.add_all(setup) + file_service.lines_by_name["unmapped.bed"] = 1 + file_service.lines_by_name["to_be_lifted.bed"] = 4 + file_service.lines_by_name["lifted.bed"] = 3 + service = _get_assembly_service(Session, file_service) + assembly = service.get_assembly_by_id(3) + service.liftover(assembly, "to_be_lifted.bed", unmapped_file="unmapped.bed") + assert caplog.messages == [ + "1 records could not be mapped... Contact the system administrator if you have questions." + ] + + +def test_liftover_fail_version(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) service = _get_assembly_service(Session, file_service) @@ -197,8 +247,12 @@ def test_liftover_fail_version(Session, file_service, data_path, setup): assert exc.type == AssemblyVersionError -# downloads chain file... 
-def test_add_assembly(Session, file_service, data_path, setup): +# NOTE: test_add_assembly*, and test_prepare_assembly_for_version* all implicitly +# test protected methods e.g. _handle_gene_build and _handle_release, that rely +# on scimodom.utils.specifications e.g. _get_ensembl_gene_build_url and _get_ensembl_chain_file_url. + + +def test_add_assembly(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) service = _get_assembly_service( @@ -208,53 +262,34 @@ "https://ftp.ensembl.org/pub/current_assembly_chain/homo_sapiens/NCBI36_to_GRCh38.chain.gz": b"foo" }, ) - service.add_assembly(9606, "NCBI36") - # asseemb;y_id is 4 + assembly_id = service.add_assembly(9606, "NCBI36") with Session() as session: assert session.query( exists().where(Assembly.taxa_id == 9606, Assembly.name == "NCBI36") ).scalar() - file = file_service.files_by_name["/data/assembly/9606/NCBI36_to_GRCh38.chain.gz"] + assert assembly_id == 4 + file = file_service.files_by_name[ + "/data/assembly/9606/NCBI36/NCBI36_to_GRCh38.chain.gz" + ] assert file.final_content == b"foo" -def test_add_assembly_exists(Session, file_service, data_path, setup): +def test_add_assembly_exists(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) service = _get_assembly_service(Session, file_service) assert service.add_assembly(9606, "GRCh37") == 3 - # normally directory exists for an existing assembly, but - # this function does not check that, it simple returns if - # the assembly exists - # if the tests were fully isolated, this should be False... - # chain_file = service.get_chain_file(9606, "GRCh37") - # assert chain_file.parent.exists() is False - # - # HW: Check with Etienne - that is a true uni test now! 
- # - - -def test_add_assembly_file_exists(Session, file_service): + + +def test_add_assembly_directory_exists(Session, file_service): with Session() as session, session.begin(): version = AssemblyVersion(version_num="GcatSmFcytpU") - taxonomy = Taxonomy( - id="a1b240af", domain="Eukarya", kingdom="Animalia", phylum="Chordata" - ) - taxa = Taxa( - id=9606, - name="Homo sapiens", - short_name="H. sapiens", - taxonomy_id="a1b240af", - ) - assembly = Assembly( - name="GRCh38", alt_name="hg38", taxa_id=9606, version="GcatSmFcytpU" - ) - session.add_all([version, taxonomy, taxa, assembly]) - file_service.existing_assemblies = [9606] + session.add(version) + file_service.existing_assemblies = [(9606, "GRCh37")] service = _get_assembly_service(Session, file_service) with pytest.raises(FileExistsError) as exc: service.add_assembly(9606, "GRCh37") - assert (str(exc.value)) == "Assembly 'GRCh37' already exists (Taxa ID 9606)." + assert (str(exc.value)) == "Directory exists, but assembly 'GRCh37' does not exist!" def test_add_assembly_wrong_url(Session, file_service, setup): @@ -263,6 +298,9 @@ def test_add_assembly_wrong_url(Session, file_service, setup): service = _get_assembly_service(Session, file_service) with pytest.raises(MockHTTPError): service.add_assembly(9606, "GRCH37") + assert file_service.deleted_assemblies == [(9606, "GRCH37")] + with Session() as session: + assert session.query(Assembly).count() == 3 EXAMPLE_GENE_BUILD_DATA = { @@ -283,6 +321,11 @@ def test_add_assembly_wrong_url(Session, file_service, setup): ], } + +NEWEST_EXAMPLE_GENE_BUILD_DATA = EXAMPLE_GENE_BUILD_DATA.copy() +NEWEST_EXAMPLE_GENE_BUILD_DATA["default_coord_system_version"] = "GRCh39" + + EXPECTED_CHROM_SIZES = """1\t248956422 2\t242193529 X\t156040895 @@ -301,7 +344,6 @@ def test_add_assembly_wrong_url(Session, file_service, setup): }""" -# this now fails because Assembly directory exists: /tmp/pytest-of-scimodom/data0/assembly/Homo_sapiens/GRCh38. 
def test_prepare_assembly_for_version(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) @@ -328,5 +370,68 @@ def test_prepare_assembly_for_version(Session, file_service, setup): ) -# the above with 2, adn 10090 -# scimodom.services.assembly.AssemblyVersionError: Mismatch between assembly GRCm38 and coord system version GRCm39. Upgrade your database! +def test_prepare_assembly_for_version_wrong_version(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_assembly_service( + Session, + file_service, + ) + with pytest.raises(AssemblyVersionError) as exc: + service.prepare_assembly_for_version(3) + assert ( + (str(exc.value)) + == "Mismatch between assembly version 'J9dit7Tfc6Sb' and database version 'GcatSmFcytpU'." + ) + assert exc.type == AssemblyVersionError + + +def test_prepare_assembly_for_version_directory_exists(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + file_service.existing_assemblies = [(9606, "GRCh38")] + service = _get_assembly_service(Session, file_service) + with pytest.raises(FileExistsError) as exc: + service.prepare_assembly_for_version(1) + assert (str(exc.value)) == "Assembly 'GRCh38' already exists (Taxa ID 9606)." + + +def test_prepare_assembly_for_version_build_error(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_assembly_service( + Session, + file_service, + url_to_result={ + "http://rest.ensembl.org/info/assembly/homo_sapiens": NEWEST_EXAMPLE_GENE_BUILD_DATA, + "http://rest.ensembl.org/info/data": {"releases": [112]}, + }, + ) + with pytest.raises(AssemblyVersionError) as exc: + service.prepare_assembly_for_version(1) + assert ( + (str(exc.value)) + == "Mismatch between assembly GRCh38 and coord system version GRCh39. Upgrade your database!" 
+ ) + assert exc.type == AssemblyVersionError + + +def test_get_chain_file_name(): + assert ( + AssemblyService._get_chain_file_name("GRCh37", "GRCh38") + == "GRCh37_to_GRCh38.chain.gz" + ) + + +def test_get_organism(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_assembly_service(Session, file_service) + assert service._get_organism(10090) == "Mus musculus" + + +def test_get_organism_for_ensembl_url(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + service = _get_assembly_service(Session, file_service) + assert service._get_organism_for_ensembl_url(10090) == "mus_musculus" diff --git a/server/tests/unit/services/test_bedtools.py b/server/tests/unit/services/test_bedtools.py index 9db9c4e3..92f80606 100644 --- a/server/tests/unit/services/test_bedtools.py +++ b/server/tests/unit/services/test_bedtools.py @@ -1,7 +1,10 @@ import pytest +from pybedtools import BedTool + from scimodom.services.bedtools import BedToolsService from scimodom.utils.bedtools_dto import ( + EufRecord, IntersectRecord, ClosestRecord, SubtractRecord, @@ -15,32 +18,6 @@ def bedtools_service(tmp_path): yield BedToolsService(tmp_path=tmp_path) -def test_get_modification_from_bedtools_data(): - record = BedToolsService._get_modification_from_bedtools_data( - [ - "1", - "1043431", - "1043432", - "Y", - "190", - Strand.FORWARD, - "iMuwPsi24Yka", - "576", - "19", - ] - ) - assert isinstance(record, ComparisonRecord) - assert record.chrom == "1" - assert record.start == 1043431 - assert record.end == 1043432 - assert record.name == "Y" - assert record.score == 190 - assert record.strand == Strand.FORWARD - assert record.eufid == "iMuwPsi24Yka" - assert record.coverage == 576 - assert record.frequency == 19 - - DATASET_A = [ ComparisonRecord( chrom="1", @@ -665,6 +642,71 @@ def test_subtract_simple(bedtools_service): assert result == EXPECTED_RESULT_SUBTRACT_A_WITH_B +def 
test_get_modification_from_bedtools_data(): + record = BedToolsService._get_modification_from_bedtools_data( + [ + "1", + "1043431", + "1043432", + "Y", + "190", + Strand.FORWARD, + "iMuwPsi24Yka", + "576", + "19", + ] + ) + assert isinstance(record, ComparisonRecord) + assert record.chrom == "1" + assert record.start == 1043431 + assert record.end == 1043432 + assert record.name == "Y" + assert record.score == 190 + assert record.strand == Strand.FORWARD + assert record.eufid == "iMuwPsi24Yka" + assert record.coverage == 576 + assert record.frequency == 19 + + +def test_get_modifications_as_bedtool(): + bedtool = BedToolsService._get_modifications_as_bedtool( + [ + ComparisonRecord( + chrom="1", + start=1043431, + end=1043432, + name="Y", + score=190, + strand=Strand.FORWARD, + eufid="iMuwPsi24Yka", + coverage=576, + frequency=19, + ), + ComparisonRecord( + chrom="1", + start=1031, + end=1032, + name="Y", + score=0, + strand=Strand.FORWARD, + eufid="iMuwPsi24Yka", + coverage=57, + frequency=1, + ), + ] + ) + assert isinstance(bedtool, BedTool) + # All features have chrom, start, stop, name, score, and strand attributes. + # Note that start and stop are integers, while everything else (including score) is a string. 
+ # https://daler.github.io/pybedtools/intervals.html + expected_records = [(1031, "0", "1"), (1043431, "190", "19")] + for record, expected_record in zip(bedtool, expected_records): + assert record.chrom == "1" + assert record.start == expected_record[0] + assert record.score == expected_record[1] + assert record.fields[8] == expected_record[2] + + EXPECTED_BED_FILE = """1\t2\t3\t4\t5\t6 7\t8\t9\t10\t11\t12 """ @@ -679,3 +721,29 @@ def test_create_temp_file_from_records(bedtools_service): ) with open(path) as fp: assert fp.read() == EXPECTED_BED_FILE + + +EXPECTED_EUF_FILE = """1\t0\t1\tname\t0\t.\t0\t1\t0,0,0\t10\t5 +""" + + +def test_create_temp_euf_file(bedtools_service): + path = bedtools_service.create_temp_euf_file( + [ + EufRecord( + chrom="1", + start=0, + end=1, + name="name", + score=0, + strand=".", + thick_start=0, + thick_end=1, + item_rgb="0,0,0", + coverage=10, + frequency=5, + ), + ] + ) + with open(path) as fp: + assert fp.read() == EXPECTED_EUF_FILE diff --git a/server/tests/unit/services/test_dataset.py b/server/tests/unit/services/test_dataset.py index e8accd61..71933961 100644 --- a/server/tests/unit/services/test_dataset.py +++ b/server/tests/unit/services/test_dataset.py @@ -1,11 +1,10 @@ -from datetime import datetime, timezone +from datetime import datetime from io import StringIO from os import makedirs import pytest from sqlalchemy import select -import scimodom.utils.utils as utils from scimodom.database.models import ( Selection, Dataset, @@ -13,7 +12,6 @@ DetectionTechnology, Organism, Project, - ProjectSource, ProjectContact, Assembly, Data, @@ -24,7 +22,6 @@ SpecsError, DatasetHeaderError, ) -from scimodom.services.project import ProjectService from scimodom.utils.bed_importer import BedImportEmptyFile, BedImportTooManyErrors from scimodom.utils.common_dto import Strand @@ -80,7 +77,7 @@ def annotate_data( pass -def get_dataset_service( +def _get_dataset_service( session, is_latest_asembly=True, assemblies_by_id=None, 
check_source_result=True ): if assemblies_by_id is None: @@ -107,15 +104,6 @@ def get_dataset_service( ) -def _get_dataset_service(dependencies): - return DatasetService( - dependencies.Session(), - bedtools_service=dependencies.bedtools_service, - assembly_service=dependencies.assembly_service, - annotation_service=dependencies.annotation_service, - ) - - def _add_setup(session, setup): session.add_all(setup) session.flush() @@ -161,7 +149,7 @@ def _add_project(session): @pytest.fixture -def project(Session, setup, freezer): # noqa +def project(Session, setup): # noqa session = Session() _add_setup(session, setup) _add_selection(session) @@ -169,14 +157,6 @@ def project(Session, setup, freezer): # noqa yield "12345678" -@pytest.fixture -def chrome_file(data_path): - path = f"{data_path.ASSEMBLY_PATH}/Homo_sapiens/GRCh38/" - makedirs(path, exist_ok=True) - with open(f"{path}/chrom.sizes", "w") as chrom_file: - chrom_file.write("1\t248956422\n") - - GOOD_EUF_FILE = """#fileformat=bedRModv1.7 #organism=9606 #modification_type=RNA @@ -202,11 +182,9 @@ def chrome_file(data_path): (r"\Z", "# Extra comment\n# In the end"), ], ) -def test_import_simple( - regexp, replacement, Session, project, freezer, chrome_file -): # noqa +def test_import_simple(regexp, replacement, Session, project, freezer): # noqa euf_file = GOOD_EUF_FILE.replace(regexp, replacement) - service = get_dataset_service(Session()) + service = _get_dataset_service(Session()) file = StringIO(euf_file) freezer.move_to("2017-05-20 11:00:23") eufid = service.import_dataset( @@ -248,76 +226,133 @@ def test_import_simple( @pytest.mark.parametrize( - "regexp,replacement,exception,message", + "regexp,replacement,exception,message,record_tuples", [ ( r"#fileformat=bedRModv1.7", "", SpecsError, "Failed to parse version from header (1).", + [], ), ( r"#fileformat=bedRModv1.7", r"#fileformat=bedRModvXX", SpecsError, "Failed to parse version from header (2).", + [], ), ( r"#fileformat=bedRModv1.7", 
"#fileformat=bedRModv2.1", SpecsError, "Unknown or outdated version 2.1.", + [], + ), + ( + r"#assembly=GRCh38", + "", + SpecsError, + "Required header 'assembly' is missing.", + [], ), - (r"#assembly=GRCh38", "", SpecsError, "Required header 'assembly' is missing."), ( r"#assembly=GRCh38", "#assembly=", SpecsError, "Required header 'assembly' is empty.", + [], ), ( r"#organism=9606", "#organism=10090", DatasetHeaderError, "Expected 9606 for organism; got 10090 from file header.", + [], ), ( r"#assembly=GRCh38", "#assembly=GRCm38", DatasetHeaderError, "Expected GRCh38 for assembly; got GRCm38 from file header.", + [], ), ( "1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1", "1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10", BedImportTooManyErrors, "Found too many errors ins 'test' (valid: 0, errors: 1)", + [ + ( + "scimodom.utils.bed_importer", + 30, + "test, line 13: Expected 11 fields, but got 10", + ), + ( + "scimodom.utils.bed_importer", + 40, + "Found too many errors ins 'test' (valid: 0, errors: 1)", + ), + ], ), ( "1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1", "", BedImportEmptyFile, "Did not find any records in 'test'", + [("scimodom.utils.bed_importer", 40, "Did not find any records in 'test'")], ), ( "1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1", "1\t0\t10\tm6\t1000\t+\t0\t10\t0,0,0\t10\t1", BedImportTooManyErrors, "Found too many errors ins 'test' (valid: 0, errors: 1)", + [ + ( + "scimodom.utils.bed_importer", + 30, + "test, line 13: Unrecognized name: m6.", + ), + ( + "scimodom.utils.bed_importer", + 40, + "Found too many errors ins 'test' (valid: 0, errors: 1)", + ), + ], ), ( "1\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1", "2\t0\t10\tm6A\t1000\t+\t0\t10\t0,0,0\t10\t1", BedImportTooManyErrors, "Found too many errors ins 'test' (valid: 0, errors: 1)", + [ + ( + "scimodom.utils.bed_importer", + 30, + "test, line 13: Unrecognized chrom: 2. 
Ignore this warning for scaffolds and contigs, otherwise this could be due to misformatting!", + ), + ( + "scimodom.utils.bed_importer", + 40, + "Found too many errors ins 'test' (valid: 0, errors: 1)", + ), + ], ), ], ) def test_bad_import( - regexp, replacement, exception, message, Session, project, freezer, chrome_file + regexp, + replacement, + exception, + message, + record_tuples, + Session, + project, + freezer, + caplog, ): # noqa euf_file = GOOD_EUF_FILE.replace(regexp, replacement) - service = get_dataset_service(Session()) + service = _get_dataset_service(Session()) file = StringIO(euf_file) freezer.move_to("2017-05-20 11:00:23") with pytest.raises(exception) as exc: @@ -333,164 +368,4 @@ def test_bad_import( annotation_source=AnnotationSource.ENSEMBL, ) assert str(exc.value) == message - - -# -# -# def test_validate_args_no_smid(Session): -# with pytest.raises(InstantiationError) as exc: -# DataService( -# session=Session(), -# smid="12345678", -# title="title", -# filen="filen", -# assembly_id=1, -# modification_ids=1, -# technology_id=1, -# organism_id=1, -# ) -# assert ( -# str(exc.value) == "Unrecognised SMID 12345678. Cannot instantiate DataService!" -# ) -# assert exc.type == InstantiationError -# -# -# def test_validate_args_repeated(Session, setup, project_template): -# with Session() as session, session.begin(): -# session.add_all(setup) -# smid = _mock_project_service(session, project_template) -# session.commit() -# with pytest.raises(InstantiationError) as exc: -# DataService( -# session=Session(), -# smid=smid, -# title="title", -# filen="filen", -# assembly_id=1, -# modification_ids=[1, 1], -# technology_id=1, -# organism_id=1, -# ) -# assert ( -# str(exc.value) == "Repeated modification IDs. Cannot instantiate DataService!" 
-# ) -# assert exc.type == InstantiationError -# -# -# @pytest.mark.parametrize( -# "modid,techid,orgid,name", -# [ -# (99, 1, 1, "Modification"), -# (1, 99, 1, "Technology"), -# (1, 1, 99, "Organism"), -# ], -# ) -# def test_validate_args_fail( -# modid, techid, orgid, name, Session, setup, project_template -# ): -# with Session() as session, session.begin(): -# session.add_all(setup) -# smid = _mock_project_service(session, project_template) -# session.commit() -# with pytest.raises(InstantiationError) as exc: -# DataService( -# session=Session(), -# smid=smid, -# title="title", -# filen="filen", -# assembly_id=1, -# modification_ids=modid, -# technology_id=techid, -# organism_id=orgid, -# ) -# assert ( -# str(exc.value) == f"{name} ID = 99 not found! Cannot instantiate DataService!" -# ) -# assert exc.type == InstantiationError -# -# -# def test_validate_selection_ids_fail(Session, setup, project_template): -# with Session() as session, session.begin(): -# session.add_all(setup) -# smid = _mock_project_service(session, project_template) -# session.commit() -# with pytest.raises(SelectionNotFoundError) as exc: -# DataService( -# session=Session(), -# smid=smid, -# title="title", -# filen="filen", -# assembly_id=1, -# modification_ids=2, -# technology_id=2, -# organism_id=1, -# ) -# assert ( -# str(exc.value) -# == "Selection (mod=m5C, tech=Technology 2, organism=(Homo sapiens, Cell Type 1)) does not exists. Aborting transaction!" 
-# ) -# assert exc.type == SelectionNotFoundError -# -# -# -# -# def test_validate_existing_entry(Session, setup, project_template): -# with Session() as session, session.begin(): -# session.add_all(setup) -# smid = _mock_project_service(session, project_template) -# # force/add dataset manually -# stamp = datetime.now(timezone.utc).replace(microsecond=0) -# dataset = Dataset( -# id="123456789ABC", -# project_id=smid, -# organism_id=1, -# technology_id=1, -# title="title", -# modification_type="RNA", -# date_added=stamp, -# ) -# association = DatasetModificationAssociation( -# dataset_id="123456789ABC", modification_id=1 -# ) -# session.add_all([dataset, association]) -# session.commit() -# -# service = DataService( -# session=Session(), -# smid=smid, -# title="title", -# filen="filen", -# assembly_id=1, -# modification_ids=1, -# technology_id=1, -# organism_id=1, -# ) -# with pytest.raises(DatasetExistsError) as exc: -# service._validate_entry() -# assert str(exc.value) == ( -# "Suspected duplicate record with EUFID = 123456789ABC " -# f"(SMID = {smid}), and title = title. Aborting transaction!" 
-# ) -# assert exc.type == DatasetExistsError -# -# -# def test_add_association(Session, setup, project_template): -# with Session() as session, session.begin(): -# session.add_all(setup) -# smid = _mock_project_service(session, project_template) -# -# service = DataService( -# session=Session(), -# smid=smid, -# title="title", -# filen="filen", -# assembly_id=1, -# modification_ids=[1, 2], -# technology_id=1, -# organism_id=1, -# ) -# service._create_eufid() -# service._add_association() -# assert service._selection_ids == [1, 4] -# assert service._modification_names == {"m6A": 1, "m5C": 2} -# assert service.get_eufid() == service._eufid + assert caplog.record_tuples == record_tuples diff --git a/server/tests/unit/services/test_ensembl_annotation.py b/server/tests/unit/services/test_ensembl_annotation.py index ea57e23f..b46d3270 100644 --- a/server/tests/unit/services/test_ensembl_annotation.py +++ b/server/tests/unit/services/test_ensembl_annotation.py @@ -30,9 +30,6 @@ def get_name_for_version(taxa_id: int) -> str: else: return "GRCm38" - def get_chrom_file(self, taxa_id): - pass - def get_seqids(self, taxa_id): pass @@ -60,26 +57,24 @@ def get_ensembl_annotation_records( ): pass - def gtrnadb_to_bed_features(self, annotation_path, features): - pass - - def get_gtrnadb_annotation_records(self, annotation_path, annotation_id, organism): - pass - class MockExternalService: - @staticmethod - def get_sprinzl_mapping(model_file, fasta_file, sprinzl_file): # noqa - return Path(Path(fasta_file).parent, "seq_to_sprinzl.tab").as_posix() + def __init__(self): + pass class MockFileService: @staticmethod def get_assembly_file_path( - taxa_id: int, file_type: AssemblyFileType, chain_file_name: str | None = None + taxa_id: int, + file_type: AssemblyFileType, + chain_file_name: str | None = None, + chain_assembly_name: str | None = None, ) -> Path: if file_type == AssemblyFileType.CHAIN: - return Path(f"/data/assembly/{taxa_id}/{chain_file_name}") + return Path( + 
f"/data/assembly/{taxa_id}/{chain_assembly_name}/{chain_file_name}" + ) else: return Path(f"/data/assembly/{taxa_id}/{file_type.value}") @@ -146,7 +141,7 @@ def test_get_annotation_fail(Session): assert exc.type == AnnotationNotFoundError -def test_get_release_path(Session, data_path): +def test_get_release_path(Session): with Session() as session, session.begin(): version = AnnotationVersion(version_num="EyRBnPeVwbzW") taxonomy = Taxonomy( @@ -165,10 +160,8 @@ def test_get_release_path(Session, data_path): service = _get_ensembl_annotation_service(Session) annotation = service.get_annotation(9606) release_path = service.get_release_path(annotation) - expected_release_path = Path( - data_path.ANNOTATION_PATH, "Homo_sapiens", "GRCh38", "110" - ) - assert release_path == Path("/data", "annotation", "Homo_sapiens", "GRCh38", "110") + expected_release_path = Path("/data", "annotation", "Homo_sapiens", "GRCh38", "110") + assert release_path == expected_release_path def test_release_exists(Session): @@ -195,26 +188,7 @@ def test_release_exists(Session): assert service._release_exists(annotation.id) -# def test_download(data_path): -# test_release_path = Path(data_path.ANNOTATION_PATH, "Homo_sapiens", "GRCh38", "110") -# test_release_path.mkdir(parents=True, exist_ok=False) -# test_file_name = "README.md" -# test_url = urljoin( -# "https://github.com/dieterich-lab", -# "scimodom", -# "blob", -# "0f977c262e173d4ce5668fda6b6b73308d275ae5", -# test_file_name, -# ) -# test_file = Path(test_release_path, test_file_name) -# AnnotationService.download(test_url, test_file) -# assert test_file.is_file() - - -# Ensembl - - -def test_ensembl_annotation_paths(data_path, Session): +def test_ensembl_annotation_paths(Session): with Session() as session, session.begin(): version = AnnotationVersion(version_num="EyRBnPeVwbzW") taxonomy = Taxonomy( @@ -238,7 +212,8 @@ def test_ensembl_annotation_paths(data_path, Session): organism="Homo_sapiens", assembly="GRCh38", release="110", fmt="gtf" 
) expected_annotation_path = Path( - "/data/annotation", + "/data", + "annotation", "Homo_sapiens", "GRCh38", "110", @@ -253,272 +228,3 @@ def test_ensembl_annotation_paths(data_path, Session): ) assert annotation_file == expected_annotation_path assert url == expected_url - - -# def test_ensembl_update_database(setup, data_path, dependencies): -# with dependencies.Session() as session, session.begin(): -# session.add_all(setup) -# service = _get_ensembl_annotation_service(dependencies) -# annotation_path, _ = service._get_annotation_paths() -# service._release_path.mkdir(parents=True, exist_ok=True) -# gtf = _mock_gtf() -# with gzip.open(annotation_path, "wt") as gtf_file: -# gtf_file.write(gtf) -# service._update_database(annotation_path) - -# expected_records = [ -# ("ENSG00000000001", 1, "A", "protein_coding"), -# ("ENSG00000000002", 1, "B", "processed_pseudogene"), -# ("ENSIntergenic", 1, None, None), -# ] -# with _annotation_setup.Session() as session, session.begin(): -# records = session.execute(select(GenomicAnnotation)).scalars().all() -# for row, expected_row in zip(records, expected_records): -# assert row.id == expected_row[0] -# assert row.annotation_id == expected_row[1] -# assert row.name == expected_row[2] -# assert row.biotype == expected_row[3] - - -# def test_ensembl_annotate_data(setup, data_path, _annotation_setup): -# stamp = datetime.now(timezone.utc).replace(microsecond=0) -# with _annotation_setup.Session() as session, session.begin(): -# session.add_all(setup) -# selection = Selection( -# modification_id=1, -# technology_id=1, -# organism_id=1, -# ) -# session.add(selection) -# session.flush() -# selection_id = selection.id -# contact = ProjectContact( -# contact_name="contact_name", -# contact_institution="contact_institution", -# contact_email="contact@email", -# ) -# session.add(contact) -# session.flush() -# contact_id = contact.id -# project = Project( -# id="12345678", -# title="title", -# summary="summary", -# 
contact_id=contact_id, -# date_published=datetime.fromisoformat("2024-01-01"), -# date_added=stamp, -# ) -# session.add(project) -# session.flush() -# smid = project.id -# dataset = Dataset( -# id="KEyK5s3pcKjE", -# project_id=smid, -# organism_id=1, -# technology_id=1, -# title="title", -# modification_type="RNA", -# date_added=stamp, -# ) -# session.add(dataset) -# session.flush() -# eufid = dataset.id -# rows = _get_records() -# data = [ -# Data( -# dataset_id=eufid, -# modification_id=1, -# chrom=chrom, -# start=start, -# end=end, -# name=name, -# score=score, -# strand=strand, -# thick_start=thick_start, -# thick_end=thick_end, -# item_rgb=item_rgb, -# coverage=coverage, -# frequency=frequency, -# ) -# for chrom, start, end, name, score, strand, thick_start, thick_end, item_rgb, coverage, frequency in rows -# ] -# session.add_all(data) -# session.commit() - -# service = _get_ensembl_annotation_service(_annotation_setup, annotation_id=1) -# service._release_path.mkdir(parents=True, exist_ok=True) -# annotation_path, _ = service._get_annotation_paths() -# gtf = _mock_gtf() -# with gzip.open(annotation_path, "wt") as gtf_file: -# gtf_file.write(gtf) -# parent = service._chrom_file.parent -# parent.mkdir(parents=True, exist_ok=True) -# string = "1\t248956422\n" -# with open(service._chrom_file, "w") as chrom_file: -# chrom_file.write(string) -# features = {k: list(v.keys()) for k, v in service.FEATURES.items()} -# _annotation_setup.bedtools_service.ensembl_to_bed_features( -# annotation_path, service._chrom_file, features -# ) -# service._update_database(annotation_path) -# service.annotate_data(eufid) - -# expected_records = [ -# (1, 5, "ENSG00000000001", "Exonic"), -# (2, 1, "ENSG00000000001", "Exonic"), -# (3, 2, "ENSG00000000001", "Exonic"), -# (4, 3, "ENSG00000000001", "Exonic"), -# (5, 4, "ENSG00000000002", "Exonic"), -# (6, 5, "ENSG00000000001", "5'UTR"), -# (7, 3, "ENSG00000000001", "3'UTR"), -# (8, 1, "ENSG00000000001", "CDS"), -# (9, 2, 
"ENSG00000000001", "CDS"), -# (10, 6, "ENSG00000000002", "Intronic"), -# (11, 8, "ENSIntergenic", "Intergenic"), -# ] -# with _annotation_setup.Session() as session, session.begin(): -# records = session.execute(select(DataAnnotation)).scalars().all() -# for row, expected_row in zip(records, expected_records): -# assert row.id == expected_row[0] -# assert row.data_id == expected_row[1] -# assert row.gene_id == expected_row[2] -# assert row.feature == expected_row[3] - - -# error handled eslewhere -# def test_ensembl_annotate_no_data(setup, data_path, _annotation_setup): -# with _annotation_setup.Session() as session, session.begin(): -# session.add_all(setup) -# service = _get_ensembl_annotation_service(_annotation_setup, annotation_id=1) -# with pytest.raises(MissingDataError) as exc: -# service.annotate_data("123456789abc") -# assert str(exc.value) == "No records found for 123456789abc" -# assert exc.type == MissingDataError - - -# # tested using ensembl -# def test_update_gene_cache(setup, data_path, _annotation_setup): -# stamp = datetime.now(timezone.utc).replace(microsecond=0) -# with _annotation_setup.Session() as session, session.begin(): -# session.add_all(setup) -# selection = Selection( -# modification_id=1, -# technology_id=1, -# organism_id=1, -# ) -# session.add(selection) -# session.flush() -# selection_id = selection.id -# contact = ProjectContact( -# contact_name="contact_name", -# contact_institution="contact_institution", -# contact_email="contact@email", -# ) -# session.add(contact) -# session.flush() -# contact_id = contact.id -# project = Project( -# id="12345678", -# title="title", -# summary="summary", -# contact_id=contact_id, -# date_published=datetime.fromisoformat("2024-01-01"), -# date_added=stamp, -# ) -# session.add(project) -# session.flush() -# smid = project.id -# dataset = Dataset( -# id="KEyK5s3pcKjE", -# project_id=smid, -# organism_id=1, -# technology_id=1, -# title="title", -# modification_type="RNA", -# date_added=stamp, -# ) 
-# session.add(dataset) -# session.flush() -# eufid = dataset.id -# rows = _get_records() -# data = [ -# Data( -# dataset_id=eufid, -# modification_id=1, -# chrom=chrom, -# start=start, -# end=end, -# name=name, -# score=score, -# strand=strand, -# thick_start=thick_start, -# thick_end=thick_end, -# item_rgb=item_rgb, -# coverage=coverage, -# frequency=frequency, -# ) -# for chrom, start, end, name, score, strand, thick_start, thick_end, item_rgb, coverage, frequency in rows -# ] -# session.add_all(data) -# session.commit() - -# service = _get_ensembl_annotation_service(_annotation_setup, annotation_id=1) -# annotation_path, _ = service._get_annotation_paths() -# service._release_path.mkdir(parents=True, exist_ok=True) -# gtf = _mock_gtf() -# with gzip.open(annotation_path, "wt") as gtf_file: -# gtf_file.write(gtf) -# service._chrom_file.parent.mkdir(parents=True, exist_ok=True) -# string = "1\t248956422\n" -# with open(service._chrom_file, "w") as chrom_file: -# chrom_file.write(string) -# features = {k: list(v.keys()) for k, v in service.FEATURES.items()} -# _annotation_setup.bedtools_service.ensembl_to_bed_features( -# annotation_path, service._chrom_file, features -# ) -# service._update_database(annotation_path) -# service.annotate_data(eufid) -# service.update_gene_cache(eufid, {1: 1}) -# parent = service.get_cache_path() -# with open(Path(parent, "1"), "r") as f: -# genes = f.read().splitlines() -# assert set(genes) == {"A", "B"} - - -def _mock_gtf(): - string = """#!genome-build GRCh38 -#!genome-version GRCh38 -#!genome-date -#!genome-build-accession -#!genebuild-last-updated -1 ensembl_havana gene 65419 71585 . + . gene_id "ENSG00000000001"; gene_version "7"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; -1 havana transcript 65419 71585 . + . 
gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00000"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana exon 65419 65433 . + . gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "1"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00001"; exon_id "ENSE00000000001"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana exon 65520 65573 . + . gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "2"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00002"; exon_id "ENSE00000000002"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana CDS 65565 65573 . + 0 gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "2"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00003"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana start_codon 65565 65567 . 
+ 0 gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "2"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00004"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana exon 69037 71585 . + . gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "3"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00005"; exon_id "ENSE00000000003"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana CDS 69037 70005 . + 0 gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "3"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00006"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana stop_codon 70006 70008 . + 0 gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; exon_number "3"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00007"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana five_prime_utr 65419 65433 . + . 
gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00008"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana five_prime_utr 65520 65564 . + . gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00009"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana three_prime_utr 70009 71585 . + . gene_id "ENSG00000000001"; gene_version "7"; transcript_id "ENST00000000001"; transcript_version "2"; gene_name "A"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "A-1"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS00010"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; -1 havana gene 487101 489906 . + . gene_id "ENSG00000000002"; gene_version "3"; gene_name "B"; gene_source "havana"; gene_biotype "processed_pseudogene"; -1 havana transcript 487101 489906 . + . gene_id "ENSG00000000002"; gene_version "3"; transcript_id "ENST00000000002"; transcript_version "3"; gene_name "B"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "B-1"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; tag "basic"; tag "Ensembl_canonical"; transcript_support_level "NA"; -1 havana exon 487101 489387 . + . 
gene_id "ENSG00000000002"; gene_version "3"; transcript_id "ENST00000000002"; transcript_version "3"; exon_number "1"; gene_name "B"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "B-1"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; exon_id "ENSE00000000001"; exon_version "3"; tag "basic"; tag "Ensembl_canonical"; transcript_support_level "NA"; -1 havana exon 489717 489906 . + . gene_id "ENSG00000000002"; gene_version "3"; transcript_id "ENST00000000002"; transcript_version "3"; exon_number "2"; gene_name "B"; gene_source "havana"; gene_biotype "processed_pseudogene"; transcript_name "B-1"; transcript_source "havana"; transcript_biotype "processed_pseudogene"; exon_id "ENSE00000000002"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; transcript_support_level "NA";""" - return string - - -def _get_records(): - records = [ - ("1", 65570, 65571, "m6A", 1, "+", 65570, 65571, "0,0,0", 10, 20), - ("1", 70000, 70001, "m6A", 2, "+", 70000, 70001, "0,0,0", 30, 40), - ("1", 71000, 71001, "m6A", 3, "+", 71000, 71001, "0,0,0", 50, 60), - ("1", 487100, 487101, "m6A", 4, "+", 487100, 487101, "0,0,0", 70, 80), - ("1", 65550, 65551, "m6A", 5, "+", 65550, 65551, "0,0,0", 90, 100), - ("1", 489400, 489401, "m6A", 6, "+", 489400, 489401, "0,0,0", 100, 100), - ("1", 489700, 489701, "m6A", 7, "-", 489700, 489701, "0,0,0", 100, 100), - ("1", 487050, 487051, "m6A", 8, "+", 487050, 487051, "0,0,0", 100, 100), - ] - return records diff --git a/server/tests/unit/services/test_project.py b/server/tests/unit/services/test_project.py index 74cb818e..e65596b1 100644 --- a/server/tests/unit/services/test_project.py +++ b/server/tests/unit/services/test_project.py @@ -1,11 +1,13 @@ from datetime import datetime +from pathlib import Path +from typing import TextIO import pytest from sqlalchemy import select +from mocks.io_mocks import MockStringIO, MockBytesIO from scimodom.database.models import ( Modification, - Organism, 
DetectionTechnology, Project, ProjectSource, @@ -24,6 +26,48 @@ ) +class MockFileService: + def __init__(self): + self.files_by_name: dict[str, MockStringIO | MockBytesIO] = {} + self.deleted_requests: list[str] = [] + + def get_project_metadata_dir(self) -> Path: + return Path("/data", "metadata") + + def create_project_metadata_file(self, smid: str) -> TextIO: + metadata_file = Path(self.get_project_metadata_dir(), f"{smid}.json").as_posix() + new_file = MockStringIO() + self.files_by_name[metadata_file] = new_file + return new_file + + def create_project_request_file(self, request_uuid) -> TextIO: + request_file = Path( + self._get_project_request_file_path(request_uuid) + ).as_posix() + new_file = MockStringIO() + self.files_by_name[request_file] = new_file + return new_file + + def delete_project_request_file(self, request_uuid) -> None: + name = self._get_project_request_file_path(request_uuid).as_posix() + self.deleted_requests.append(name) + + def _get_project_request_file_path(self, request_uuid): + return Path(self._get_project_request_dir(), f"{request_uuid}.json") + + def _get_project_request_dir(self): + return Path(self.get_project_metadata_dir(), "project_requests") + + +@pytest.fixture +def file_service(): + yield MockFileService() + + +def _get_project_service(Session, file_service): + return ProjectService(Session(), file_service=file_service) # noqa + + PROJECT = ProjectTemplate( title="Title", summary="Summary", @@ -55,20 +99,70 @@ ) -class MockFileService: - pass - - -def get_service(Session): - return ProjectService(Session(), file_service=MockFileService()) # noqa +EXPECTED_PROJECT_TEMPLATE = """{ + "title": "Title", + "summary": "Summary", + "contact_name": "Contact Name", + "contact_institution": "Contact Institution", + "contact_email": "email@example.com", + "date_published": "2024-01-01T00:00:00", + "external_sources": [ + { + "doi": "DOI", + "pmid": 12345678 + } + ], + "metadata": [ + { + "rna": "WTS", + "modomics_id": "2000000006A", + 
"tech": "Tech-seq", + "method_id": "0ee048bc", + "organism": { + "taxa_id": 9606, + "cto": "Cell", + "assembly_name": "GRCh38", + "assembly_id": null + }, + "note": null + }, + { + "rna": "WTS", + "modomics_id": "2000000005C", + "tech": "Tech-seq", + "method_id": "0ee048bc", + "organism": { + "taxa_id": 10090, + "cto": "Organ", + "assembly_name": "GRCm38", + "assembly_id": null + }, + "note": null + } + ] +}""" + + +# tests + + +def test_create_project_request(Session, file_service): + service = _get_project_service(Session, file_service) + uuid = service.create_project_request(PROJECT) + assert ( + file_service.files_by_name[ + f"/data/metadata/project_requests/{uuid}.json" + ].final_content + == EXPECTED_PROJECT_TEMPLATE + ) -def test_project_validate_entry(Session): - service = get_service(Session) +def test_project_validate_entry(Session, file_service): + service = _get_project_service(Session, file_service) assert service._validate_entry(PROJECT) is None -def test_project_validate_existing_entry(Session, setup): +def test_project_validate_existing_entry(Session, file_service, setup): smid = "12345678" with Session() as session, session.begin(): session.add_all(setup) @@ -91,7 +185,7 @@ def test_project_validate_existing_entry(Session, setup): session.add_all([project, source]) session.commit() - service = get_service(Session) + service = _get_project_service(Session, file_service) with pytest.raises(DuplicateProjectError) as exc: service._validate_entry(PROJECT) assert ( @@ -100,11 +194,11 @@ def test_project_validate_existing_entry(Session, setup): ) -def test_project_add_selection(Session, setup): +def test_project_add_selection(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) - service = get_service(Session) + service = _get_project_service(Session, file_service) service._add_selection_if_none(PROJECT) expected_records = [(1, 1, 1, 1), (2, 2, 2, 1)] @@ -116,7 +210,7 @@ def test_project_add_selection(Session, 
setup): assert records == expected_records -def test_project_add_selection_exists(Session, setup): +def test_project_add_selection_exists(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) technology = DetectionTechnology(method_id="0ee048bc", tech="Tech-seq") @@ -128,7 +222,7 @@ def test_project_add_selection_exists(Session, setup): session.add(selection) session.commit() - service = get_service(Session) + service = _get_project_service(Session, file_service) service._add_selection_if_none(PROJECT) expected_records = [(1, 1, 3, 1), (2, 1, 1, 1), (3, 2, 2, 1)] @@ -140,31 +234,31 @@ def test_project_add_selection_exists(Session, setup): assert records == expected_records -def test_project_add_modification(Session, setup): +def test_project_add_modification(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) modification = Modification(modomics_id="2000000006A", rna="WTS") session.add(modification) session.commit() - service = get_service(Session) + service = _get_project_service(Session, file_service) for metadata, expected_modification_id in zip(PROJECT.metadata, [1, 2]): assert service._add_modification_if_none(metadata) == expected_modification_id -def test_project_add_technology(Session, setup): - service = get_service(Session) +def test_project_add_technology(Session, file_service, setup): + service = _get_project_service(Session, file_service) for metadata in PROJECT.metadata: assert service._add_technology_if_none(metadata) == 1 -def test_project_add_organism(Session, setup): - service = get_service(Session) +def test_project_add_organism(Session, file_service, setup): + service = _get_project_service(Session, file_service) for metadata, expected_organism_id in zip(PROJECT.metadata, [1, 2]): assert service._add_organism_if_none(metadata) == expected_organism_id -def test_project_add_contact(Session): +def test_project_add_contact(Session, file_service): with Session() 
as session, session.begin(): contact = ProjectContact( contact_name="Another contact Name", @@ -174,16 +268,16 @@ def test_project_add_contact(Session): session.add(contact) session.commit() - service = get_service(Session) + service = _get_project_service(Session, file_service) assert service._add_contact_if_none(PROJECT) == 2 -def test_project_add_project(Session, setup, freezer): +def test_project_add_project(Session, file_service, setup, freezer): with Session() as session, session.begin(): session.add_all(setup) freezer.move_to("2024-06-20 12:00:00") - service = get_service(Session) + service = _get_project_service(Session, file_service) smid = service._add_project(PROJECT) with Session() as session, session.begin(): @@ -203,11 +297,26 @@ def test_project_add_project(Session, setup, freezer): assert source[0].pmid == 12345678 -def test_project_get_by_id(Session, setup): +def test_create_project(Session, file_service, setup): + with Session() as session, session.begin(): + session.add_all(setup) + + service = _get_project_service(Session, file_service) + smid = service.create_project(PROJECT, "abcdef123456") + assert ( + file_service.files_by_name[f"/data/metadata/{smid}.json"].final_content + == EXPECTED_PROJECT_TEMPLATE + ) + assert file_service.deleted_requests == [ + "/data/metadata/project_requests/abcdef123456.json" + ] + + +def test_project_get_by_id(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) - service = get_service(Session) + service = _get_project_service(Session, file_service) smid = service._add_project(PROJECT) project = service.get_by_id(smid) assert project.id == smid @@ -215,7 +324,7 @@ def test_project_get_by_id(Session, setup): assert project.summary == "Summary" -def test_query_projects(Session, setup): +def test_query_projects(Session, file_service, setup): with Session() as session, session.begin(): session.add_all(setup) for name, email, smid, title, summary in zip( @@ -249,7 +358,7 @@ 
def test_query_projects(Session, setup): session.add(user_permission) session.commit() - service = get_service(Session) + service = _get_project_service(Session, file_service) with Session() as session, session.begin(): user = session.get_one(User, 1) assert len(service.get_projects(user=user)) == 1