diff --git a/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/genome/JannovarSmallVariantAnnotator.java b/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/genome/JannovarSmallVariantAnnotator.java index 7c469b437..ab36e4b5a 100644 --- a/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/genome/JannovarSmallVariantAnnotator.java +++ b/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/genome/JannovarSmallVariantAnnotator.java @@ -31,14 +31,10 @@ import org.monarchinitiative.exomiser.core.model.TranscriptAnnotation; import org.monarchinitiative.exomiser.core.model.VariantAnnotation; import org.monarchinitiative.svart.GenomicVariant; -import org.monarchinitiative.svart.VariantType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.stream.Stream; import static java.util.stream.Collectors.groupingBy; @@ -127,16 +123,15 @@ private Stream splitAnnotationsByGene(VariantAnnotations var return annotations.stream() .collect(groupingBy(Annotation::getGeneSymbol)) .values().stream() - //.peek(annotationList -> annotationList.forEach(annotation -> logger.info("{}", toAnnotationString(annotation)))) + //.peek(annotationList -> annotationList.forEach(annotation -> logger.info("{}", toAnnotationString(genomeVariant, annotation)))) .map(annos -> new VariantAnnotations(genomeVariant, annos)); } - private String toAnnotationString(VariantType variantType, SVAnnotation annotation) { - return variantType + ", " + - annotation.getVariant() + ", " + + private String toAnnotationString(GenomeVariant genomeVariant, Annotation annotation) { + return genomeVariant + ", " + annotation.getTranscript().getGeneSymbol() + ", " + annotation.getTranscript().getGeneID() + ", " + - annotation.getMostPathogenicVariantEffect() + ", " + + annotation.getMostPathogenicVarType() + ", " + 
annotation.getPutativeImpact() + ", " + annotation.getTranscript(); } diff --git a/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/AlleleProtoAdaptor.java b/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/AlleleProtoAdaptor.java index f2bc73266..3c2f82c0b 100644 --- a/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/AlleleProtoAdaptor.java +++ b/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/AlleleProtoAdaptor.java @@ -21,6 +21,7 @@ package org.monarchinitiative.exomiser.core.model; import com.google.common.collect.ImmutableMap; +import de.charite.compbio.jannovar.annotation.VariantEffect; import org.monarchinitiative.exomiser.core.model.frequency.Frequency; import org.monarchinitiative.exomiser.core.model.frequency.FrequencyData; import org.monarchinitiative.exomiser.core.model.frequency.FrequencySource; @@ -28,6 +29,7 @@ import org.monarchinitiative.exomiser.core.model.pathogenicity.PathogenicityData; import org.monarchinitiative.exomiser.core.model.pathogenicity.PathogenicityScore; import org.monarchinitiative.exomiser.core.model.pathogenicity.PathogenicitySource; +import org.monarchinitiative.exomiser.core.proto.AlleleProto; import org.monarchinitiative.exomiser.core.proto.AlleleProto.AlleleKey; import org.monarchinitiative.exomiser.core.proto.AlleleProto.AlleleProperties; import org.monarchinitiative.exomiser.core.proto.AlleleProto.ClinVar; @@ -160,10 +162,13 @@ public static ClinVarData toClinVarData(ClinVar clinVar) { } ClinVarData.Builder builder = ClinVarData.builder(); builder.alleleId(clinVar.getAlleleId()); + builder.variationId(clinVar.getVariationId()); builder.primaryInterpretation(toClinSig(clinVar.getPrimaryInterpretation())); builder.secondaryInterpretations(toClinSigSet(clinVar.getSecondaryInterpretationsList())); builder.includedAlleles(getToIncludedAlleles(clinVar.getIncludedAllelesMap())); builder.reviewStatus(clinVar.getReviewStatus()); + 
builder.geneSymbol(clinVar.getGeneSymbol()); + builder.variantEffect(toVariantEffect(clinVar.getVariantEffect())); return builder.build(); } @@ -192,39 +197,94 @@ private static Set toClinSigSet(List proto } private static ClinVarData.ClinSig toClinSig(ClinVar.ClinSig protoClinSig) { - switch (protoClinSig) { - case BENIGN: - return ClinVarData.ClinSig.BENIGN; - case BENIGN_OR_LIKELY_BENIGN: - return ClinVarData.ClinSig.BENIGN_OR_LIKELY_BENIGN; - case LIKELY_BENIGN: - return ClinVarData.ClinSig.LIKELY_BENIGN; - case UNCERTAIN_SIGNIFICANCE: - return ClinVarData.ClinSig.UNCERTAIN_SIGNIFICANCE; - case LIKELY_PATHOGENIC: - return ClinVarData.ClinSig.LIKELY_PATHOGENIC; - case PATHOGENIC_OR_LIKELY_PATHOGENIC: - return ClinVarData.ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC; - case PATHOGENIC: - return ClinVarData.ClinSig.PATHOGENIC; - case CONFLICTING_PATHOGENICITY_INTERPRETATIONS: - return ClinVarData.ClinSig.CONFLICTING_PATHOGENICITY_INTERPRETATIONS; - case AFFECTS: - return ClinVarData.ClinSig.AFFECTS; - case ASSOCIATION: - return ClinVarData.ClinSig.ASSOCIATION; - case DRUG_RESPONSE: - return ClinVarData.ClinSig.DRUG_RESPONSE; - case OTHER: - return ClinVarData.ClinSig.OTHER; - case PROTECTIVE: - return ClinVarData.ClinSig.PROTECTIVE; - case RISK_FACTOR: - return ClinVarData.ClinSig.RISK_FACTOR; - case NOT_PROVIDED: - case UNRECOGNIZED: - default: - return ClinVarData.ClinSig.NOT_PROVIDED; - } + return switch (protoClinSig) { + case BENIGN -> ClinVarData.ClinSig.BENIGN; + case BENIGN_OR_LIKELY_BENIGN -> ClinVarData.ClinSig.BENIGN_OR_LIKELY_BENIGN; + case LIKELY_BENIGN -> ClinVarData.ClinSig.LIKELY_BENIGN; + case UNCERTAIN_SIGNIFICANCE -> ClinVarData.ClinSig.UNCERTAIN_SIGNIFICANCE; + case LIKELY_PATHOGENIC -> ClinVarData.ClinSig.LIKELY_PATHOGENIC; + case PATHOGENIC_OR_LIKELY_PATHOGENIC -> ClinVarData.ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC; + case PATHOGENIC -> ClinVarData.ClinSig.PATHOGENIC; + case CONFLICTING_PATHOGENICITY_INTERPRETATIONS -> + 
ClinVarData.ClinSig.CONFLICTING_PATHOGENICITY_INTERPRETATIONS; + case AFFECTS -> ClinVarData.ClinSig.AFFECTS; + case ASSOCIATION -> ClinVarData.ClinSig.ASSOCIATION; + case DRUG_RESPONSE -> ClinVarData.ClinSig.DRUG_RESPONSE; + case OTHER -> ClinVarData.ClinSig.OTHER; + case PROTECTIVE -> ClinVarData.ClinSig.PROTECTIVE; + case RISK_FACTOR -> ClinVarData.ClinSig.RISK_FACTOR; + default -> ClinVarData.ClinSig.NOT_PROVIDED; + }; + } + + private static VariantEffect toVariantEffect(AlleleProto.VariantEffect clinVarVariantEffect) { + return switch (clinVarVariantEffect) { + case SEQUENCE_VARIANT -> VariantEffect.SEQUENCE_VARIANT; + case CHROMOSOME_NUMBER_VARIATION -> VariantEffect.CHROMOSOME_NUMBER_VARIATION; + case TRANSCRIPT_ABLATION -> VariantEffect.TRANSCRIPT_ABLATION; + case EXON_LOSS_VARIANT -> VariantEffect.EXON_LOSS_VARIANT; + case INVERSION -> VariantEffect.INVERSION; + case INSERTION -> VariantEffect.INSERTION; + case TRANSLOCATION -> VariantEffect.TRANSLOCATION; + case FRAMESHIFT_ELONGATION -> VariantEffect.FRAMESHIFT_ELONGATION; + case FRAMESHIFT_TRUNCATION -> VariantEffect.FRAMESHIFT_TRUNCATION; + case FRAMESHIFT_VARIANT -> VariantEffect.FRAMESHIFT_VARIANT; + case INTERNAL_FEATURE_ELONGATION -> VariantEffect.INTERNAL_FEATURE_ELONGATION; + case FEATURE_TRUNCATION -> VariantEffect.FEATURE_TRUNCATION; + case TRANSCRIPT_AMPLIFICATION -> VariantEffect.TRANSCRIPT_AMPLIFICATION; + case COPY_NUMBER_CHANGE -> VariantEffect.COPY_NUMBER_CHANGE; + case MNV -> VariantEffect.MNV; + case COMPLEX_SUBSTITUTION -> VariantEffect.COMPLEX_SUBSTITUTION; + case STOP_GAINED -> VariantEffect.STOP_GAINED; + case STOP_LOST -> VariantEffect.STOP_LOST; + case START_LOST -> VariantEffect.START_LOST; + case SPLICE_ACCEPTOR_VARIANT -> VariantEffect.SPLICE_ACCEPTOR_VARIANT; + case SPLICE_DONOR_VARIANT -> VariantEffect.SPLICE_DONOR_VARIANT; + case RARE_AMINO_ACID_VARIANT -> VariantEffect.RARE_AMINO_ACID_VARIANT; + case MISSENSE_VARIANT -> VariantEffect.MISSENSE_VARIANT; + case 
INFRAME_INSERTION -> VariantEffect.INFRAME_INSERTION; + case DISRUPTIVE_INFRAME_INSERTION -> VariantEffect.DISRUPTIVE_INFRAME_INSERTION; + case INFRAME_DELETION -> VariantEffect.INFRAME_DELETION; + case DISRUPTIVE_INFRAME_DELETION -> VariantEffect.DISRUPTIVE_INFRAME_DELETION; + case FIVE_PRIME_UTR_TRUNCATION -> VariantEffect.FIVE_PRIME_UTR_TRUNCATION; + case THREE_PRIME_UTR_TRUNCATION -> VariantEffect.THREE_PRIME_UTR_TRUNCATION; + case SPLICE_REGION_VARIANT -> VariantEffect.SPLICE_REGION_VARIANT; + case STOP_RETAINED_VARIANT -> VariantEffect.STOP_RETAINED_VARIANT; + case INITIATOR_CODON_VARIANT -> VariantEffect.INITIATOR_CODON_VARIANT; + case SYNONYMOUS_VARIANT -> VariantEffect.SYNONYMOUS_VARIANT; + case CODING_TRANSCRIPT_INTRON_VARIANT -> VariantEffect.CODING_TRANSCRIPT_INTRON_VARIANT; + case FIVE_PRIME_UTR_PREMATURE_START_CODON_GAIN_VARIANT -> + VariantEffect.FIVE_PRIME_UTR_PREMATURE_START_CODON_GAIN_VARIANT; + case FIVE_PRIME_UTR_EXON_VARIANT -> VariantEffect.FIVE_PRIME_UTR_EXON_VARIANT; + case THREE_PRIME_UTR_EXON_VARIANT -> VariantEffect.THREE_PRIME_UTR_EXON_VARIANT; + case FIVE_PRIME_UTR_INTRON_VARIANT -> VariantEffect.FIVE_PRIME_UTR_INTRON_VARIANT; + case THREE_PRIME_UTR_INTRON_VARIANT -> VariantEffect.THREE_PRIME_UTR_INTRON_VARIANT; + case NON_CODING_TRANSCRIPT_EXON_VARIANT -> VariantEffect.NON_CODING_TRANSCRIPT_EXON_VARIANT; + case NON_CODING_TRANSCRIPT_INTRON_VARIANT -> VariantEffect.NON_CODING_TRANSCRIPT_INTRON_VARIANT; + case DIRECT_TANDEM_DUPLICATION -> VariantEffect.DIRECT_TANDEM_DUPLICATION; + case MOBILE_ELEMENT_DELETION -> VariantEffect.MOBILE_ELEMENT_DELETION; + case MOBILE_ELEMENT_INSERTION -> VariantEffect.MOBILE_ELEMENT_INSERTION; + case UPSTREAM_GENE_VARIANT -> VariantEffect.UPSTREAM_GENE_VARIANT; + case DOWNSTREAM_GENE_VARIANT -> VariantEffect.DOWNSTREAM_GENE_VARIANT; + case INTERGENIC_VARIANT -> VariantEffect.INTERGENIC_VARIANT; + case TFBS_ABLATION -> VariantEffect.TFBS_ABLATION; + case TFBS_AMPLIFICATION -> 
VariantEffect.TFBS_AMPLIFICATION; + case TF_BINDING_SITE_VARIANT -> VariantEffect.TF_BINDING_SITE_VARIANT; + case REGULATORY_REGION_VARIANT -> VariantEffect.REGULATORY_REGION_VARIANT; + case REGULATORY_REGION_ABLATION -> VariantEffect.REGULATORY_REGION_ABLATION; + case REGULATORY_REGION_AMPLIFICATION -> VariantEffect.REGULATORY_REGION_AMPLIFICATION; + case CONSERVED_INTRON_VARIANT -> VariantEffect.CONSERVED_INTRON_VARIANT; + case INTRAGENIC_VARIANT -> VariantEffect.INTRAGENIC_VARIANT; + case CONSERVED_INTERGENIC_VARIANT -> VariantEffect.CONSERVED_INTERGENIC_VARIANT; + case STRUCTURAL_VARIANT -> VariantEffect.STRUCTURAL_VARIANT; + case CODING_SEQUENCE_VARIANT -> VariantEffect.CODING_SEQUENCE_VARIANT; + case INTRON_VARIANT -> VariantEffect.INTRON_VARIANT; + case EXON_VARIANT -> VariantEffect.EXON_VARIANT; + case SPLICING_VARIANT -> VariantEffect.SPLICING_VARIANT; + case MIRNA -> VariantEffect.MIRNA; + case CODING_TRANSCRIPT_VARIANT -> VariantEffect.CODING_TRANSCRIPT_VARIANT; + case NON_CODING_TRANSCRIPT_VARIANT -> VariantEffect.NON_CODING_TRANSCRIPT_VARIANT; + case UNRECOGNIZED -> VariantEffect.SEQUENCE_VARIANT; + }; } } diff --git a/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarData.java b/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarData.java index 8d3350dc4..07a5ecfc2 100644 --- a/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarData.java +++ b/exomiser-core/src/main/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarData.java @@ -21,8 +21,7 @@ package org.monarchinitiative.exomiser.core.model.pathogenicity; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Sets; +import de.charite.compbio.jannovar.annotation.VariantEffect; import java.util.*; @@ -65,6 +64,8 @@ public enum ClinSig { private final String reviewStatus; private final Map 
includedAlleles; + private final String geneSymbol; + private final VariantEffect variantEffect; // https://www.medschool.umaryland.edu/Genetic_Variant_Interpretation_Tool1.html/ // BP1, Missense variant in a gene for which primarily truncating variants are known to cause disease @@ -115,9 +116,11 @@ private ClinVarData(Builder builder) { this.alleleId = builder.alleleId; this.variationId = builder.variationId; this.primaryInterpretation = builder.primaryInterpretation; - this.secondaryInterpretations = Sets.immutableEnumSet(builder.secondaryInterpretations); + this.secondaryInterpretations = Collections.unmodifiableSet(builder.secondaryInterpretations); this.reviewStatus = builder.reviewStatus.replace("_", " "); - this.includedAlleles = ImmutableMap.copyOf(builder.includedAlleles); + this.includedAlleles = Collections.unmodifiableMap(builder.includedAlleles); + this.geneSymbol = builder.geneSymbol; + this.variantEffect = builder.variantEffect; } public static ClinVarData empty() { @@ -153,6 +156,14 @@ public Map getIncludedAlleles() { return includedAlleles; } + public String getGeneSymbol() { + return geneSymbol; + } + + public VariantEffect getVariantEffect() { + return variantEffect; + } + /** * @return true if the secondary CLNSIG contains one of 'affects', 'other', 'association', 'risk factor' or * 'protective'. These are considered unimportant from the mendelian disease perspective. 
The category 'drug response' @@ -163,49 +174,36 @@ public Map getIncludedAlleles() { @JsonIgnore public boolean isSecondaryAssociationRiskFactorOrOther() { for (ClinVarData.ClinSig secondaryClinSig : secondaryInterpretations) { - switch (secondaryClinSig) { - case AFFECTS: - case OTHER: - case ASSOCIATION: - case RISK_FACTOR: - case PROTECTIVE: - return true; - default: - return false; + if (Objects.requireNonNull(secondaryClinSig) == ClinSig.AFFECTS || secondaryClinSig == ClinSig.OTHER || secondaryClinSig == ClinSig.ASSOCIATION || secondaryClinSig == ClinSig.RISK_FACTOR || secondaryClinSig == ClinSig.PROTECTIVE) { + return true; } } return false; } /** - * Returns the ClinVar star rating according to the criteria provided at - * https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/#revstat_def - *

- * In the VCF CLNREVSTAT the start ratings are mapped as follows: + * Returns the ClinVar star rating. + * In the VCF CLNREVSTAT the star ratings are mapped as follows: *

+ *

      * 1* criteria_provided,_conflicting_interpretations
      * 1* criteria_provided,_single_submitter
      * 2* criteria_provided,_multiple_submitters,_no_conflicts
      * 3* reviewed_by_expert_panel
      * 4* practice_guideline
+     * 
* * @return an integer value between 0 (worst) and 4 (best) * @since 13.0.0 */ public int starRating() { - switch (reviewStatus) { - case "criteria provided, single submitter": - case "criteria provided, conflicting interpretations": - return 1; - case "criteria provided, multiple submitters, no conflicts": - return 2; - case "reviewed by expert panel": - return 3; - case "practice guideline": - return 4; - default: - return 0; - } + return switch (reviewStatus) { + case "criteria provided, single submitter", "criteria provided, conflicting interpretations" -> 1; + case "criteria provided, multiple submitters, no conflicts" -> 2; + case "reviewed by expert panel" -> 3; + case "practice guideline" -> 4; + default -> 0; + }; } @Override @@ -214,10 +212,10 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; ClinVarData that = (ClinVarData) o; return Objects.equals(alleleId, that.alleleId) && - primaryInterpretation == that.primaryInterpretation && - Objects.equals(secondaryInterpretations, that.secondaryInterpretations) && - Objects.equals(reviewStatus, that.reviewStatus) && - Objects.equals(includedAlleles, that.includedAlleles); + primaryInterpretation == that.primaryInterpretation && + Objects.equals(secondaryInterpretations, that.secondaryInterpretations) && + Objects.equals(reviewStatus, that.reviewStatus) && + Objects.equals(includedAlleles, that.includedAlleles); } @Override @@ -228,12 +226,27 @@ public int hashCode() { @Override public String toString() { return "ClinVarData{" + - "alleleId='" + alleleId + '\'' + - ", primaryInterpretation=" + primaryInterpretation + - ", secondaryInterpretations=" + secondaryInterpretations + - ", reviewStatus='" + reviewStatus + '\'' + - ", includedAlleles=" + includedAlleles + - '}'; + "variationId='" + variationId + '\'' + + ", alleleId='" + alleleId + '\'' + + ", geneSymbol='" + geneSymbol + '\'' + + ", variantEffect='" + variantEffect + '\'' + + ", primaryInterpretation=" + 
primaryInterpretation + + ", secondaryInterpretations=" + secondaryInterpretations + + ", reviewStatus='" + reviewStatus + '\'' + + ", includedAlleles=" + includedAlleles + + '}'; + } + + public Builder toBuilder() { + return new Builder() + .variationId(variationId) + .alleleId(alleleId) + .primaryInterpretation(primaryInterpretation) + .secondaryInterpretations(secondaryInterpretations) + .reviewStatus(reviewStatus) + .includedAlleles(includedAlleles) + .geneSymbol(geneSymbol) + .variantEffect(variantEffect); } public static Builder builder() { @@ -249,6 +262,9 @@ public static class Builder { private String reviewStatus = ""; private Map includedAlleles = Collections.emptyMap(); + private String geneSymbol = ""; + private VariantEffect variantEffect = VariantEffect.SEQUENCE_VARIANT; + public Builder alleleId(String alleleId) { Objects.requireNonNull(alleleId); this.alleleId = alleleId; @@ -269,7 +285,7 @@ public Builder primaryInterpretation(ClinSig primaryInterpretation) { public Builder secondaryInterpretations(Set secondaryInterpretations) { Objects.requireNonNull(secondaryInterpretations); - this.secondaryInterpretations = secondaryInterpretations; + this.secondaryInterpretations = secondaryInterpretations.isEmpty() ? 
Set.of() : EnumSet.copyOf(secondaryInterpretations); return this; } @@ -285,6 +301,18 @@ public Builder includedAlleles(Map includedAlleles) { return this; } + public Builder geneSymbol(String geneSymbol) { + Objects.requireNonNull(geneSymbol); + this.geneSymbol = geneSymbol; + return this; + } + + public Builder variantEffect(VariantEffect variantEffect) { + Objects.requireNonNull(variantEffect); + this.variantEffect = variantEffect; + return this; + } + public ClinVarData build() { return new ClinVarData(this); } diff --git a/exomiser-core/src/main/proto/allele.proto b/exomiser-core/src/main/proto/allele.proto index fd95d30ef..61e8146d1 100644 --- a/exomiser-core/src/main/proto/allele.proto +++ b/exomiser-core/src/main/proto/allele.proto @@ -48,7 +48,74 @@ message ClinVar { string reviewStatus = 4; map includedAlleles = 5; string variationId = 6; -// string gene_symbol = 7; -// string gene_id = 8; -// VariantEffect variant_effect = 9; + string gene_symbol = 7; + string gene_id = 8; + VariantEffect variant_effect = 9; +} + +enum VariantEffect { + SEQUENCE_VARIANT = 0; // n.b. 
this is the LOWEST value in the jannovar enum + CHROMOSOME_NUMBER_VARIATION = 1; + TRANSCRIPT_ABLATION = 2; + EXON_LOSS_VARIANT = 3; + INVERSION = 4; + INSERTION = 5; + TRANSLOCATION = 6; + FRAMESHIFT_ELONGATION = 7; + FRAMESHIFT_TRUNCATION = 8; + FRAMESHIFT_VARIANT = 9; + INTERNAL_FEATURE_ELONGATION = 10; + FEATURE_TRUNCATION = 11; + TRANSCRIPT_AMPLIFICATION = 12; + COPY_NUMBER_CHANGE = 13; + MNV = 14; + COMPLEX_SUBSTITUTION = 15; + STOP_GAINED = 16; + STOP_LOST = 17; + START_LOST = 18; + SPLICE_ACCEPTOR_VARIANT = 19; + SPLICE_DONOR_VARIANT = 20; + RARE_AMINO_ACID_VARIANT = 21; + MISSENSE_VARIANT = 22; + INFRAME_INSERTION = 23; + DISRUPTIVE_INFRAME_INSERTION = 24; + INFRAME_DELETION = 25; + DISRUPTIVE_INFRAME_DELETION = 26; + FIVE_PRIME_UTR_TRUNCATION = 27; + THREE_PRIME_UTR_TRUNCATION = 28; + SPLICE_REGION_VARIANT = 30; + STOP_RETAINED_VARIANT = 31; + INITIATOR_CODON_VARIANT = 32; + SYNONYMOUS_VARIANT = 33; + CODING_TRANSCRIPT_INTRON_VARIANT = 34; + FIVE_PRIME_UTR_PREMATURE_START_CODON_GAIN_VARIANT = 35; + FIVE_PRIME_UTR_EXON_VARIANT = 36; + THREE_PRIME_UTR_EXON_VARIANT = 37; + FIVE_PRIME_UTR_INTRON_VARIANT = 38; + THREE_PRIME_UTR_INTRON_VARIANT = 39; + NON_CODING_TRANSCRIPT_EXON_VARIANT = 40; + NON_CODING_TRANSCRIPT_INTRON_VARIANT = 41; + DIRECT_TANDEM_DUPLICATION = 42; + MOBILE_ELEMENT_DELETION = 43; + MOBILE_ELEMENT_INSERTION = 44; + UPSTREAM_GENE_VARIANT = 45; + DOWNSTREAM_GENE_VARIANT = 46; + INTERGENIC_VARIANT = 47; + TFBS_ABLATION = 48; + TFBS_AMPLIFICATION = 49; + TF_BINDING_SITE_VARIANT = 50; + REGULATORY_REGION_VARIANT = 51; + REGULATORY_REGION_ABLATION = 52; + REGULATORY_REGION_AMPLIFICATION = 53; + CONSERVED_INTRON_VARIANT = 54; + INTRAGENIC_VARIANT = 55; + CONSERVED_INTERGENIC_VARIANT = 56; + STRUCTURAL_VARIANT = 57; + CODING_SEQUENCE_VARIANT = 58; + INTRON_VARIANT = 59; + EXON_VARIANT = 60; + SPLICING_VARIANT = 61; + MIRNA = 62; + CODING_TRANSCRIPT_VARIANT = 63; + NON_CODING_TRANSCRIPT_VARIANT = 64; } \ No newline at end of file diff --git 
a/exomiser-core/src/test/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarDataTest.java b/exomiser-core/src/test/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarDataTest.java index 2d44ca2d4..2bc235eaf 100644 --- a/exomiser-core/src/test/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarDataTest.java +++ b/exomiser-core/src/test/java/org/monarchinitiative/exomiser/core/model/pathogenicity/ClinVarDataTest.java @@ -20,7 +20,7 @@ package org.monarchinitiative.exomiser.core.model.pathogenicity; -import com.google.common.collect.ImmutableMap; +import de.charite.compbio.jannovar.annotation.VariantEffect; import org.junit.jupiter.api.Test; import org.monarchinitiative.exomiser.core.model.pathogenicity.ClinVarData.ClinSig; @@ -52,12 +52,18 @@ public void testEmptyBuilder() { @Test public void testBuilderWithValues() { String alleleId = "12345"; + String variationId = "23456"; + String geneSymbol = "GENE1"; + VariantEffect variantEffect = VariantEffect.MISSENSE_VARIANT; ClinSig clinSig = ClinSig.PATHOGENIC; Set secondaryInterpretations = EnumSet.of(ClinSig.RISK_FACTOR, ClinSig.ASSOCIATION); String reviewStatus = "multiple_submitters,_no_conflict"; - Map included = ImmutableMap.of("54321", ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC); + Map included = Map.of("54321", ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC); ClinVarData instance = ClinVarData.builder() + .variationId(variationId) .alleleId(alleleId) + .geneSymbol(geneSymbol) + .variantEffect(variantEffect) .primaryInterpretation(clinSig) .secondaryInterpretations(secondaryInterpretations) .reviewStatus(reviewStatus) @@ -65,10 +71,14 @@ public void testBuilderWithValues() { .build(); assertThat(instance.getAlleleId(), equalTo(alleleId)); + assertThat(instance.getVariationId(), equalTo(variationId)); + assertThat(instance.getGeneSymbol(), equalTo(geneSymbol)); + assertThat(instance.getVariantEffect(), equalTo(variantEffect)); assertThat(instance.getPrimaryInterpretation(), 
equalTo(clinSig)); assertThat(instance.getSecondaryInterpretations(), equalTo(secondaryInterpretations)); assertThat(instance.getReviewStatus(), equalTo("multiple submitters, no conflict")); assertThat(instance.getIncludedAlleles(), equalTo(included)); + System.out.println(instance); } @Test @@ -77,7 +87,7 @@ public void testStringValue() { ClinSig clinSig = ClinSig.PATHOGENIC; Set secondaryInterpretations = EnumSet.of(ClinSig.RISK_FACTOR, ClinSig.ASSOCIATION); String reviewStatus = "multiple_submitters,_no_conflict"; - Map included = ImmutableMap.of("54321", ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC); + Map included = Map.of("54321", ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC); ClinVarData instance = ClinVarData.builder() .alleleId(alleleId) .primaryInterpretation(clinSig) diff --git a/exomiser-data-genome/README.md b/exomiser-data-genome/README.md new file mode 100644 index 000000000..8016ca0da --- /dev/null +++ b/exomiser-data-genome/README.md @@ -0,0 +1,49 @@ +Exomiser - Genome DB Build += + +This is a Spring Boot CLI application and as such has one idiosyncrasy which will prevent a build from launching, if not +set in the `application.properties`. + +The absolute requirement for anything, even `--help` to work is for the `--build-dir` variable to be set and this _must_ +be set using an equals sign i.e. + +```shell +$ java -jar exomiser-data-genome-${project.version}.jar --build-dir=. --help +``` + +By default, this is set in the `application.properties` to `.` i.e. the current working directory but can be overridden. + +Also, note that the `--assembly` and `--version` **must come before any other arguments** in order that they are correctly +set for use with other optional arguments. + +Build transcript databases and build/annotate ClinVar data. The ClinVar data build now requires a transcript database so +that the variants can be annotated for gene symbol and variant effect to be in line with the output from Exomiser.
Whilst +these are available in the `MC` field of the ClinVar VCF file, the effects are not sorted according to reference +transcript order, so in cases where more than one transcript overlaps a variant, the most damaging effect is reported +first, even if the MANE/MANE_Clinical or canonical transcript has a less damaging effect. + +```shell +$ java -jar exomiser-data-genome-${project.version}.jar --assembly hg38 --version 2311 --transcripts ensembl --clinvar +``` +Will create an output directory `2311_hg38` containing the files: + +```shell +2311_hg38/ +├── 2311_hg38_clinvar.mv.db +└── 2311_hg38_transcripts_ensembl.ser +``` + +Create a new ClinVar database from the latest ClinVar release using an existing Exomiser release, in this case 2309_hg38: + +```shell +$ java -jar exomiser-data-genome-${project.version}.jar --assembly hg38 --version 231112 --clinvar /data/exomiser/2309_hg38/2309_hg38_transcripts_ensembl.ser +``` + +Will just create the ClinVar database, annotated using the specified transcript data: + +```shell +231112_hg38/ +└── 231112_hg38_clinvar.mv.db +``` +_n.b_ here the ClinVar data has been created for the 20231112 release, so it is possible to update the clinvar data for +Exomiser on a weekly basis to keep up with ClinVar \ No newline at end of file diff --git a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/BuildCommand.java b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/BuildCommand.java index bf61a247b..2292d0e01 100644 --- a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/BuildCommand.java +++ b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/BuildCommand.java @@ -20,8 +20,10 @@ package org.monarchinitiative.exomiser.data.genome; +import de.charite.compbio.jannovar.data.JannovarData; import org.monarchinitiative.exomiser.core.genome.GenomeAssembly; import 
org.monarchinitiative.exomiser.core.genome.jannovar.JannovarDataFactory; +import org.monarchinitiative.exomiser.core.genome.jannovar.JannovarDataSourceLoader; import org.monarchinitiative.exomiser.core.genome.jannovar.TranscriptSource; import org.monarchinitiative.exomiser.data.genome.config.AssemblyResources; import org.monarchinitiative.exomiser.data.genome.model.AlleleResource; @@ -36,6 +38,8 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -51,24 +55,23 @@ @Component @Command(name = "build", description = "Command to build the Exomiser genome data bundle.") public class BuildCommand implements Callable { - private static final Logger logger = LoggerFactory.getLogger(BuildCommand.class); @Option(names = {"-h", "--help"}, usageHelp = true, description = "display this help message") boolean usageHelpRequested; - @Option(names = "--build-dir", required = true) - private Path buildDir; + @Option(names = "--build-dir", description = "The directory in which to build the data (default: ${DEFAULT-VALUE}).") + private Path buildDir = Path.of(System.getProperty("user.dir")); private final AssemblyResources hg19Resources; private final AssemblyResources hg38Resources; private final Path jannovarIniFile; - @Option(names = "--assembly", required = true, converter = AssemblyConverter.class, description = "Genome assembly to build the data for - one of hg19 or hg38.") + @Option(names = "--assembly", required = true, converter = AssemblyConverter.class, description = "Genome assembly for the build. Either hg19 or hg38.", order = 0) private GenomeAssembly assembly; - @Option(names = "--version", required = true, description = "Data version for this build. Typically this would be of the form YYMM i.e. 
2308 indicates the data was built in August 2023.") - private String version; - @Option(names = "--clinvar", description = "Flag to trigger building of ClinVar data.") - private boolean buildClinVar; + @Option(names = "--version", description = "Data version for this build. Typically this would be of the form yyMM i.e. 2308 indicates the data was built in August 2023 (default: ${DEFAULT-VALUE}).") + private String version = DateTimeFormatter.ofPattern("yyMM").format(LocalDate.now()); + @Option(names = "--clinvar", arity = "0..1", converter = ClinVarOptionConverter.class, description = "Flag to trigger building of ClinVar data using the specified transcript data file. If not specified, the transcript_ensembl.ser for the current build will be used.") + private Path buildClinVar; @Option(names = "--transcripts", converter = TranscriptSourceConverter.class, split = ",", arity = "0..1", fallbackValue = "ensembl,refseq,ucsc", description = "List of transcript databases to build. If specified without parameter, will build all sources: ${FALLBACK-VALUE}") private List transcriptSources; @Option(names = "--variants", split = ",", arity = "0..1", fallbackValue = "esp,exac,uk10k,topmed,dbsnp,gnomad-exome,gnomad-genome,dbnsfp", description = "List of variant data sources to build. 
If specified without parameter, will build all sources: ${FALLBACK-VALUE}") @@ -102,8 +105,9 @@ public Integer call() throws IOException { BuildInfo buildInfo = BuildInfo.of(assembly, version); String buildString = buildInfo.getBuildString(); logger.info("Building version {}", buildString); - Path outPath = buildDir.resolve(buildString); - logger.info("Build directory set to {}", outPath); + Path outPath = getOutPath(buildDir, buildInfo); + logger.info("Build directory set to {}", buildDir); + logger.info("Build artefacts will be written to {}", outPath); if (!outPath.toFile().exists()) { Files.createDirectories(outPath); } @@ -113,8 +117,8 @@ public Integer call() throws IOException { if (shouldBuildAllData()) { logger.info("BUILDING ALLL THIe THINGS!"); - buildClinVarData(buildInfo, outPath, assemblyResources.getClinVarResource()); buildTranscriptData(buildInfo, outPath, List.of(TranscriptSource.values())); + buildClinVarData(buildInfo, outPath, assemblyResources.getClinVarResource()); buildVariantData(buildInfo, outPath, new ArrayList<>(alleleResources.values())); buildGenomeData(buildInfo, outPath, assemblyResources); } @@ -123,7 +127,7 @@ public Integer call() throws IOException { buildTranscriptData(buildInfo, outPath, transcriptSources); } - if (buildClinVar) { + if (buildClinVar != null) { ClinVarAlleleResource clinVarResource = assemblyResources.getClinVarResource(); buildClinVarData(buildInfo, outPath, clinVarResource); } @@ -141,8 +145,12 @@ public Integer call() throws IOException { return 0; } + private Path getOutPath(Path buildDir, BuildInfo buildInfo) { + return buildDir.resolve(buildInfo.getBuildString()); + } + private boolean shouldBuildAllData() { - return !buildGenome && !buildClinVar && transcriptSources == null && variantSources == null; + return !buildGenome && buildClinVar == null && transcriptSources == null && variantSources == null; } private void buildTranscriptData(BuildInfo buildInfo, Path outPath, List transcriptSources) { @@ -160,11 
+168,21 @@ private void buildTranscriptData(BuildInfo buildInfo, Path outPath, List userDefinedAlleleResources) { logger.info("Downloading variant resources - {}", userDefinedAlleleResources.stream() .map(AlleleResource::getName) @@ -202,6 +220,7 @@ public GenomeAssembly convert(String value) throws Exception { return GenomeAssembly.parseAssembly(value); } } + static class TranscriptSourceConverter implements CommandLine.ITypeConverter { @Override @@ -209,4 +228,20 @@ public TranscriptSource convert(String value) throws Exception { return TranscriptSource.parseValue(value.trim()); } } + + private class ClinVarOptionConverter implements ITypeConverter { + @Override + public Path convert(String value) throws Exception { + if (value == null) { + return null; + } + return value.isEmpty() ? fallbackPath() : Path.of(value); + } + + public Path fallbackPath() { + BuildInfo buildInfo = BuildInfo.of(assembly, version); + logger.info("Transcript file path for ClinVar annotation not specified for build {}. 
Using fallback path...", buildInfo.getBuildString()); + return getOutPath(buildDir, buildInfo).resolve(TranscriptDataBuildRunner.transcriptFileName(buildInfo, TranscriptSource.ENSEMBL)); + } + } } diff --git a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunner.java b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunner.java index 912f71659..28441cdf9 100644 --- a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunner.java +++ b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunner.java @@ -1,20 +1,33 @@ package org.monarchinitiative.exomiser.data.genome; +import de.charite.compbio.jannovar.data.JannovarData; import org.h2.mvstore.MVMap; import org.h2.mvstore.MVStore; import org.h2.mvstore.MVStoreTool; +import org.monarchinitiative.exomiser.core.genome.GenomeAssembly; +import org.monarchinitiative.exomiser.core.genome.JannovarVariantAnnotator; +import org.monarchinitiative.exomiser.core.genome.VariantAnnotator; import org.monarchinitiative.exomiser.core.genome.dao.serialisers.MvStoreUtil; +import org.monarchinitiative.exomiser.core.model.ChromosomalRegionIndex; +import org.monarchinitiative.exomiser.core.model.VariantAnnotation; +import org.monarchinitiative.exomiser.core.model.pathogenicity.ClinVarData; import org.monarchinitiative.exomiser.core.proto.AlleleProto; import org.monarchinitiative.exomiser.data.genome.indexers.AlleleConverter; import org.monarchinitiative.exomiser.data.genome.model.Allele; import org.monarchinitiative.exomiser.data.genome.model.BuildInfo; import org.monarchinitiative.exomiser.data.genome.model.resource.ClinVarAlleleResource; +import org.monarchinitiative.svart.CoordinateSystem; +import org.monarchinitiative.svart.Coordinates; +import org.monarchinitiative.svart.GenomicVariant; +import org.monarchinitiative.svart.Strand; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.List; +import java.util.Map; import java.util.stream.Stream; @@ -26,12 +39,16 @@ public class ClinVarBuildRunner { private final BuildInfo buildInfo; private final ClinVarAlleleResource clinVarAlleleResource; private final Path outFile; + private final VariantAnnotator variantAnnotator; + private final GenomeAssembly genomeAssembly; - public ClinVarBuildRunner(BuildInfo buildInfo, Path outDir, ClinVarAlleleResource clinVarAlleleResource) { + public ClinVarBuildRunner(BuildInfo buildInfo, Path outDir, ClinVarAlleleResource clinVarAlleleResource, JannovarData jannovarData) { this.outDir = outDir.toAbsolutePath(); this.buildInfo = buildInfo; this.clinVarAlleleResource = clinVarAlleleResource; - outFile = outDir.toAbsolutePath().resolve(buildInfo.getBuildString() + "_clinvar.mv.db"); + this.outFile = outDir.toAbsolutePath().resolve(buildInfo.getBuildString() + "_clinvar.mv.db"); + genomeAssembly = buildInfo.getAssembly(); + variantAnnotator = new JannovarVariantAnnotator(genomeAssembly, jannovarData, ChromosomalRegionIndex.empty()); } public Path getOutFile() { @@ -52,8 +69,11 @@ public void run() { try (Stream alleleStream = clinVarAlleleResource.parseResource()) { alleleStream .forEach(allele -> { + ClinVarData clinVarData = annotateClinvar(allele); + allele.setClinVarData(clinVarData); + logger.debug("{}-{}-{}-{} {}", allele.getChr(), allele.getPos(), allele.getRef(), allele.getAlt(), clinVarData); var alleleKey = AlleleConverter.toAlleleKey(allele); - var clinvarProto = AlleleConverter.toProtoClinVar(allele.getClinVarData()); + var clinvarProto = AlleleConverter.toProtoClinVar(clinVarData); clinVarMap.put(alleleKey, clinvarProto); }); } @@ -64,4 +84,18 @@ public void run() { logger.info("Compacting MVStore"); MVStoreTool.compact(outFileName, true); } + + private ClinVarData annotateClinvar(Allele allele) { + GenomicVariant genomicVariant = 
GenomicVariant.of(genomeAssembly.getContigById(allele.getChr()), Strand.POSITIVE, Coordinates.ofAllele(CoordinateSystem.ONE_BASED, allele.getPos(), allele.getRef()), allele.getRef(), allele.getAlt()); + List variantAnnotations = variantAnnotator.annotate(genomicVariant); + if (!variantAnnotations.isEmpty()) { + VariantAnnotation variantAnnotation = variantAnnotations.get(0); + return allele.getClinVarData() + .toBuilder() + .geneSymbol(variantAnnotation.getGeneSymbol()) + .variantEffect(variantAnnotation.getVariantEffect()) + .build(); + } + return allele.getClinVarData(); + } } diff --git a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/TranscriptDataBuildRunner.java b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/TranscriptDataBuildRunner.java index b9ecdbacc..6095d57da 100644 --- a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/TranscriptDataBuildRunner.java +++ b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/TranscriptDataBuildRunner.java @@ -48,9 +48,13 @@ public TranscriptDataBuildRunner(BuildInfo buildInfo, JannovarDataFactory jannov this.transcriptSources = transcriptSources; } + public static String transcriptFileName(BuildInfo buildInfo, TranscriptSource transcriptSource) { + return buildInfo.getBuildString() + "_transcripts_" + transcriptSource + ".ser"; + } + public void run() { transcriptSources.forEach(transcriptSource -> { - String outputName = String.format("%s_transcripts_%s.ser", buildInfo.getBuildString(), transcriptSource); + String outputName = transcriptFileName(buildInfo, transcriptSource); logger.info("Building {}", outputName); jannovarDataFactory.buildAndWrite(buildInfo.getAssembly(), transcriptSource, outPath.resolve(outputName)); }); diff --git a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/indexers/AlleleConverter.java 
b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/indexers/AlleleConverter.java index c90b4e556..422b2e096 100644 --- a/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/indexers/AlleleConverter.java +++ b/exomiser-data-genome/src/main/java/org/monarchinitiative/exomiser/data/genome/indexers/AlleleConverter.java @@ -20,7 +20,9 @@ package org.monarchinitiative.exomiser.data.genome.indexers; +import de.charite.compbio.jannovar.annotation.VariantEffect; import org.monarchinitiative.exomiser.core.model.pathogenicity.ClinVarData; +import org.monarchinitiative.exomiser.core.proto.AlleleProto; import org.monarchinitiative.exomiser.core.proto.AlleleProto.AlleleKey; import org.monarchinitiative.exomiser.core.proto.AlleleProto.AlleleProperties; import org.monarchinitiative.exomiser.core.proto.AlleleProto.ClinVar; @@ -91,43 +93,114 @@ public static ClinVar toProtoClinVar(ClinVarData clinVarData) { for (Map.Entry entry : clinVarData.getIncludedAlleles().entrySet()) { builder.putIncludedAlleles(entry.getKey(), toProtoClinSig(entry.getValue())); } + builder.setGeneSymbol(clinVarData.getGeneSymbol()); + builder.setVariantEffect(toProtoVariantEffect(clinVarData.getVariantEffect())); return builder.build(); } private static ClinVar.ClinSig toProtoClinSig(ClinVarData.ClinSig clinSig) { - switch (clinSig){ - case BENIGN: - return ClinVar.ClinSig.BENIGN; - case BENIGN_OR_LIKELY_BENIGN: - return ClinVar.ClinSig.BENIGN_OR_LIKELY_BENIGN; - case LIKELY_BENIGN: - return ClinVar.ClinSig.LIKELY_BENIGN; - case UNCERTAIN_SIGNIFICANCE: - return ClinVar.ClinSig.UNCERTAIN_SIGNIFICANCE; - case LIKELY_PATHOGENIC: - return ClinVar.ClinSig.LIKELY_PATHOGENIC; - case PATHOGENIC_OR_LIKELY_PATHOGENIC: - return ClinVar.ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC; - case PATHOGENIC: - return ClinVar.ClinSig.PATHOGENIC; - case CONFLICTING_PATHOGENICITY_INTERPRETATIONS: - return ClinVar.ClinSig.CONFLICTING_PATHOGENICITY_INTERPRETATIONS; - case AFFECTS: - 
return ClinVar.ClinSig.AFFECTS; - case ASSOCIATION: - return ClinVar.ClinSig.ASSOCIATION; - case DRUG_RESPONSE: - return ClinVar.ClinSig.DRUG_RESPONSE; - case NOT_PROVIDED: - return ClinVar.ClinSig.NOT_PROVIDED; - case OTHER: - return ClinVar.ClinSig.OTHER; - case PROTECTIVE: - return ClinVar.ClinSig.PROTECTIVE; - case RISK_FACTOR: - return ClinVar.ClinSig.RISK_FACTOR; - } - throw new IllegalArgumentException(clinSig + " not a recognised value"); + return switch (clinSig) { + case BENIGN -> ClinVar.ClinSig.BENIGN; + case BENIGN_OR_LIKELY_BENIGN -> ClinVar.ClinSig.BENIGN_OR_LIKELY_BENIGN; + case LIKELY_BENIGN -> ClinVar.ClinSig.LIKELY_BENIGN; + case UNCERTAIN_SIGNIFICANCE -> ClinVar.ClinSig.UNCERTAIN_SIGNIFICANCE; + case LIKELY_PATHOGENIC -> ClinVar.ClinSig.LIKELY_PATHOGENIC; + case PATHOGENIC_OR_LIKELY_PATHOGENIC -> ClinVar.ClinSig.PATHOGENIC_OR_LIKELY_PATHOGENIC; + case PATHOGENIC -> ClinVar.ClinSig.PATHOGENIC; + case CONFLICTING_PATHOGENICITY_INTERPRETATIONS -> ClinVar.ClinSig.CONFLICTING_PATHOGENICITY_INTERPRETATIONS; + case AFFECTS -> ClinVar.ClinSig.AFFECTS; + case ASSOCIATION -> ClinVar.ClinSig.ASSOCIATION; + case DRUG_RESPONSE -> ClinVar.ClinSig.DRUG_RESPONSE; + case NOT_PROVIDED -> ClinVar.ClinSig.NOT_PROVIDED; + case OTHER -> ClinVar.ClinSig.OTHER; + case PROTECTIVE -> ClinVar.ClinSig.PROTECTIVE; + case RISK_FACTOR -> ClinVar.ClinSig.RISK_FACTOR; + }; + } + + public static AlleleProto.VariantEffect toProtoVariantEffect(VariantEffect variantEffect) { + return switch (variantEffect) { + case CHROMOSOME_NUMBER_VARIATION -> AlleleProto.VariantEffect.CHROMOSOME_NUMBER_VARIATION; + case TRANSCRIPT_ABLATION -> AlleleProto.VariantEffect.TRANSCRIPT_ABLATION; + case EXON_LOSS_VARIANT -> AlleleProto.VariantEffect.EXON_LOSS_VARIANT; + case INVERSION -> AlleleProto.VariantEffect.INVERSION; + case INSERTION -> AlleleProto.VariantEffect.INSERTION; + case TRANSLOCATION -> AlleleProto.VariantEffect.TRANSLOCATION; + case FRAMESHIFT_ELONGATION -> 
AlleleProto.VariantEffect.FRAMESHIFT_ELONGATION; + case FRAMESHIFT_TRUNCATION -> AlleleProto.VariantEffect.FRAMESHIFT_TRUNCATION; + case FRAMESHIFT_VARIANT -> AlleleProto.VariantEffect.FRAMESHIFT_VARIANT; + case INTERNAL_FEATURE_ELONGATION -> AlleleProto.VariantEffect.INTERNAL_FEATURE_ELONGATION; + case FEATURE_TRUNCATION -> AlleleProto.VariantEffect.FEATURE_TRUNCATION; + case TRANSCRIPT_AMPLIFICATION -> AlleleProto.VariantEffect.TRANSCRIPT_AMPLIFICATION; + case COPY_NUMBER_CHANGE -> AlleleProto.VariantEffect.COPY_NUMBER_CHANGE; + case MNV -> AlleleProto.VariantEffect.MNV; + case COMPLEX_SUBSTITUTION -> AlleleProto.VariantEffect.COMPLEX_SUBSTITUTION; + case STOP_GAINED -> AlleleProto.VariantEffect.STOP_GAINED; + case STOP_LOST -> AlleleProto.VariantEffect.STOP_LOST; + case START_LOST -> AlleleProto.VariantEffect.START_LOST; + case SPLICE_ACCEPTOR_VARIANT -> AlleleProto.VariantEffect.SPLICE_ACCEPTOR_VARIANT; + case SPLICE_DONOR_VARIANT -> AlleleProto.VariantEffect.SPLICE_DONOR_VARIANT; + case RARE_AMINO_ACID_VARIANT -> AlleleProto.VariantEffect.RARE_AMINO_ACID_VARIANT; + // unused marker + case _SMALLEST_HIGH_IMPACT -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + case MISSENSE_VARIANT -> AlleleProto.VariantEffect.MISSENSE_VARIANT; + case INFRAME_INSERTION -> AlleleProto.VariantEffect.INFRAME_INSERTION; + case DISRUPTIVE_INFRAME_INSERTION -> AlleleProto.VariantEffect.DISRUPTIVE_INFRAME_INSERTION; + case INFRAME_DELETION -> AlleleProto.VariantEffect.INFRAME_DELETION; + case DISRUPTIVE_INFRAME_DELETION -> AlleleProto.VariantEffect.DISRUPTIVE_INFRAME_DELETION; + case FIVE_PRIME_UTR_TRUNCATION -> AlleleProto.VariantEffect.FIVE_PRIME_UTR_TRUNCATION; + case THREE_PRIME_UTR_TRUNCATION -> AlleleProto.VariantEffect.THREE_PRIME_UTR_TRUNCATION; + // unused marker + case _SMALLEST_MODERATE_IMPACT -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + case SPLICE_REGION_VARIANT -> AlleleProto.VariantEffect.SPLICE_REGION_VARIANT; + case STOP_RETAINED_VARIANT -> 
AlleleProto.VariantEffect.STOP_RETAINED_VARIANT; + case INITIATOR_CODON_VARIANT -> AlleleProto.VariantEffect.INITIATOR_CODON_VARIANT; + case SYNONYMOUS_VARIANT -> AlleleProto.VariantEffect.SYNONYMOUS_VARIANT; + case CODING_TRANSCRIPT_INTRON_VARIANT -> AlleleProto.VariantEffect.CODING_TRANSCRIPT_INTRON_VARIANT; + case FIVE_PRIME_UTR_PREMATURE_START_CODON_GAIN_VARIANT -> AlleleProto.VariantEffect.FIVE_PRIME_UTR_PREMATURE_START_CODON_GAIN_VARIANT; + case FIVE_PRIME_UTR_EXON_VARIANT -> AlleleProto.VariantEffect.FIVE_PRIME_UTR_EXON_VARIANT; + case THREE_PRIME_UTR_EXON_VARIANT -> AlleleProto.VariantEffect.THREE_PRIME_UTR_EXON_VARIANT; + case FIVE_PRIME_UTR_INTRON_VARIANT -> AlleleProto.VariantEffect.FIVE_PRIME_UTR_INTRON_VARIANT; + case THREE_PRIME_UTR_INTRON_VARIANT -> AlleleProto.VariantEffect.THREE_PRIME_UTR_INTRON_VARIANT; + case NON_CODING_TRANSCRIPT_EXON_VARIANT -> AlleleProto.VariantEffect.NON_CODING_TRANSCRIPT_EXON_VARIANT; + case NON_CODING_TRANSCRIPT_INTRON_VARIANT -> AlleleProto.VariantEffect.NON_CODING_TRANSCRIPT_INTRON_VARIANT; + // unused marker + case _SMALLEST_LOW_IMPACT -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + case DIRECT_TANDEM_DUPLICATION -> AlleleProto.VariantEffect.DIRECT_TANDEM_DUPLICATION; + case MOBILE_ELEMENT_DELETION -> AlleleProto.VariantEffect.MOBILE_ELEMENT_DELETION; + case MOBILE_ELEMENT_INSERTION -> AlleleProto.VariantEffect.MOBILE_ELEMENT_INSERTION; + // unused + case CUSTOM -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + case UPSTREAM_GENE_VARIANT -> AlleleProto.VariantEffect.UPSTREAM_GENE_VARIANT; + case DOWNSTREAM_GENE_VARIANT -> AlleleProto.VariantEffect.DOWNSTREAM_GENE_VARIANT; + case INTERGENIC_VARIANT -> AlleleProto.VariantEffect.INTERGENIC_VARIANT; + case TFBS_ABLATION -> AlleleProto.VariantEffect.TFBS_ABLATION; + case TFBS_AMPLIFICATION -> AlleleProto.VariantEffect.TFBS_AMPLIFICATION; + case TF_BINDING_SITE_VARIANT -> AlleleProto.VariantEffect.TF_BINDING_SITE_VARIANT; + case REGULATORY_REGION_VARIANT -> 
AlleleProto.VariantEffect.REGULATORY_REGION_VARIANT; + case REGULATORY_REGION_ABLATION -> AlleleProto.VariantEffect.REGULATORY_REGION_ABLATION; + case REGULATORY_REGION_AMPLIFICATION -> AlleleProto.VariantEffect.REGULATORY_REGION_AMPLIFICATION; + case CONSERVED_INTRON_VARIANT -> AlleleProto.VariantEffect.CONSERVED_INTRON_VARIANT; + case INTRAGENIC_VARIANT -> AlleleProto.VariantEffect.INTRAGENIC_VARIANT; + case CONSERVED_INTERGENIC_VARIANT -> AlleleProto.VariantEffect.CONSERVED_INTERGENIC_VARIANT; + case STRUCTURAL_VARIANT -> AlleleProto.VariantEffect.STRUCTURAL_VARIANT; + case CODING_SEQUENCE_VARIANT -> AlleleProto.VariantEffect.CODING_SEQUENCE_VARIANT; + case INTRON_VARIANT -> AlleleProto.VariantEffect.INTRON_VARIANT; + case EXON_VARIANT -> AlleleProto.VariantEffect.EXON_VARIANT; + case SPLICING_VARIANT -> AlleleProto.VariantEffect.SPLICING_VARIANT; + case MIRNA -> AlleleProto.VariantEffect.MIRNA; + // unused + case GENE_VARIANT -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + case CODING_TRANSCRIPT_VARIANT -> AlleleProto.VariantEffect.CODING_TRANSCRIPT_VARIANT; + case NON_CODING_TRANSCRIPT_VARIANT -> AlleleProto.VariantEffect.NON_CODING_TRANSCRIPT_VARIANT; + // unused + case TRANSCRIPT_VARIANT -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + // unused + case INTERGENIC_REGION -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + // unused + case CHROMOSOME -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + case SEQUENCE_VARIANT -> AlleleProto.VariantEffect.SEQUENCE_VARIANT; + }; } } diff --git a/exomiser-data-genome/src/main/resources/application.properties b/exomiser-data-genome/src/main/resources/application.properties index 778d1f79c..e0b8cef13 100644 --- a/exomiser-data-genome/src/main/resources/application.properties +++ b/exomiser-data-genome/src/main/resources/application.properties @@ -20,7 +20,7 @@ spring.flyway.enabled=false spring.h2.console.enabled=true # build-dir defines the main directory which will be used to build the databases. -build-dir= +build-dir=. 
build-version=1711 jannovar.ini-file=${build-dir}/default_sources.ini dbsnp-ftp-url=ftp://ftp.ncbi.nlm.nih.gov/snp/latest_release/VCF diff --git a/exomiser-data-genome/src/main/resources/logback-spring.xml b/exomiser-data-genome/src/main/resources/logback-spring.xml new file mode 100644 index 000000000..f69a24573 --- /dev/null +++ b/exomiser-data-genome/src/main/resources/logback-spring.xml @@ -0,0 +1,26 @@ + + + + + + + + \ No newline at end of file diff --git a/exomiser-data-genome/src/test/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunnerTest.java b/exomiser-data-genome/src/test/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunnerTest.java index 7e326a2f7..551d33363 100644 --- a/exomiser-data-genome/src/test/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunnerTest.java +++ b/exomiser-data-genome/src/test/java/org/monarchinitiative/exomiser/data/genome/ClinVarBuildRunnerTest.java @@ -1,5 +1,6 @@ package org.monarchinitiative.exomiser.data.genome; +import de.charite.compbio.jannovar.data.JannovarData; import org.h2.mvstore.MVMap; import org.h2.mvstore.MVStore; import org.junit.jupiter.api.Test; @@ -7,6 +8,7 @@ import org.monarchinitiative.exomiser.core.genome.GenomeAssembly; import org.monarchinitiative.exomiser.core.genome.dao.ClinVarWhiteListReader; import org.monarchinitiative.exomiser.core.genome.dao.serialisers.MvStoreUtil; +import org.monarchinitiative.exomiser.core.genome.jannovar.JannovarDataSourceLoader; import org.monarchinitiative.exomiser.core.proto.AlleleProto; import org.monarchinitiative.exomiser.data.genome.model.BuildInfo; import org.monarchinitiative.exomiser.data.genome.model.resource.ClinVarAlleleResource; @@ -30,7 +32,10 @@ void run(@TempDir Path tempDir) throws Exception { ClinVarAlleleResource clinVarAlleleResource = new ClinVarAlleleResource("clinvar", new URL("https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz"), testResourcePath); 
ResourceDownloader.download(clinVarAlleleResource); - ClinVarBuildRunner instance = new ClinVarBuildRunner(buildInfo, tempDir, clinVarAlleleResource); + Path testJannovarFilePath = Path.of("src/test/resources/clinvar-test-transcript-data.ser"); + JannovarData jannovarData = JannovarDataSourceLoader.loadJannovarData(testJannovarFilePath); + + ClinVarBuildRunner instance = new ClinVarBuildRunner(buildInfo, tempDir, clinVarAlleleResource, jannovarData); instance.run(); Path outputFile = instance.getOutFile(); @@ -40,7 +45,7 @@ void run(@TempDir Path tempDir) throws Exception { // The ClinVar data is used for the ClinVarDao and for building the WhiteList (along with optional user data) MVMap clinvar = MvStoreUtil.openClinVarMVMap(clinvarStore); assertThat(clinvar.size(), equalTo(2000)); - + clinvar.values().forEach(clinvarProto -> assertThat(clinvarProto.getVariantEffect() != AlleleProto.VariantEffect.SEQUENCE_VARIANT, is(true))); Set whiteListAlleleKeys = ClinVarWhiteListReader.readVariantWhiteList(clinvarStore); assertThat(whiteListAlleleKeys.size(), equalTo(23)); } diff --git a/exomiser-data-genome/src/test/resources/clinvar-test-transcript-data.ser b/exomiser-data-genome/src/test/resources/clinvar-test-transcript-data.ser new file mode 100644 index 000000000..d68da28cc Binary files /dev/null and b/exomiser-data-genome/src/test/resources/clinvar-test-transcript-data.ser differ