Skip to content

Commit

Permalink
Added a util method to auto-fix invalid URIs; use that in Normalizer.
Browse files Browse the repository at this point in the history
  • Loading branch information
IgorRodchenkov committed Jun 17, 2024
1 parent 44adb1b commit 3d3ee51
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ final void testSomePc14DemoPathway() throws IOException {

// convert owl test file in resource directory to jsonld format
InputStream in = getClass().getResourceAsStream("/demo-pathway.owl");
//- there is no rdf:datatype=... anymore; should be fine as the datatypes are defined in the biopax-level3.owl spec!
//todo: for some reason, Jena libs v4 or v5 fail at e.g. rdf:about="TEST_CHEBI:cs_26d67131a0608673ae6a683d1dad18f7",
//there is no rdf:datatype=... anymore; should be fine as the datatypes are defined in the biopax-level3.owl spec;
//Jena libs v4, v5 fail at e.g. rdf:about="TEST_CHEBI:cs_26d67131a0608673ae6a683d1dad18f7" (not a valid URI due to '_' in the prefix),
//but jena v3 just prints warnings, e.g.: org.apache.jena.riot - [line: 155, col: 82] {W107} Bad URI: <TEST_CHEBI:cs_26d67131a0608673ae6a683d1dad18f7> Code: 0/ILLEGAL_CHARACTER in SCHEME: The character violates the grammar rules for URIs/IRIs.
//howver, removing the underscore from TEST_CHEBI - makes those warning/errors go away...
//however, removing the underscore from TEST_CHEBI (or replacing it with '.') makes those warnings/errors go away, so that is what was done in demo-pathway.owl

ByteArrayOutputStream baos = new ByteArrayOutputStream();
converter.convertToJsonld(in, baos);
Expand Down
8 changes: 4 additions & 4 deletions json-converter/src/test/resources/demo-pathway.owl
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@
<bp:db>hgnc.symbol</bp:db>
</bp:RelationshipXref>

<bp:ChemicalStructure rdf:about="TEST_CHEBI:cs_26d67131a0608673ae6a683d1dad18f7">
<bp:ChemicalStructure rdf:about="TESTCHEBI:cs_26d67131a0608673ae6a683d1dad18f7">
<bp:structureFormat>InChI</bp:structureFormat>
<bp:structureData>InChI=1S/C10H18O/c1-5-10(4,11)8-6-7-9(2)3/h5,7,11H,1,6,8H2,2-4H3/t10-/m0/s1</bp:structureData>
</bp:ChemicalStructure>
Expand All @@ -158,7 +158,7 @@
<bp:db>ncbigene</bp:db>
</bp:RelationshipXref>

<bp:ChemicalStructure rdf:about="TESTCHEBI:cs_d763940a9c24185a3d1806945a0beb4a">
<bp:ChemicalStructure rdf:about="TEST.CHEBI:cs_d763940a9c24185a3d1806945a0beb4a">
<bp:structureFormat>InChI</bp:structureFormat>
<bp:structureData>InChI=1S/C10H16/c1-7-8-4-5-9(6-8)10(7,2)3/h8-9H,1,4-6H2,2-3H3/t8-,9+/m0/s1</bp:structureData>
</bp:ChemicalStructure>
Expand Down Expand Up @@ -452,7 +452,7 @@
<bp:comment>A linalool that has formula C10H18O.</bp:comment>
<bp:comment>is_conjugate_acid_of 422</bp:comment>
<bp:comment>is_enantiomer_of 98</bp:comment>
<bp:structure rdf:resource="TEST_CHEBI:cs_26d67131a0608673ae6a683d1dad18f7" />
<bp:structure rdf:resource="TESTCHEBI:cs_26d67131a0608673ae6a683d1dad18f7" />
</bp:SmallMoleculeReference>

<bp:RelationshipXref rdf:ID="RX_chebi_CHEBI_17580_multiple_parent_reference">
Expand Down Expand Up @@ -490,7 +490,7 @@
<bp:name>CC1(C)[C@@H]2CC[C@@H](C2)C1=C</bp:name>
<bp:comment>is_enantiomer_of 89</bp:comment>
<bp:comment>A camphene that has formula C10H16.</bp:comment>
<bp:structure rdf:resource="TESTCHEBI:cs_d763940a9c24185a3d1806945a0beb4a" />
<bp:structure rdf:resource="TEST.CHEBI:cs_d763940a9c24185a3d1806945a0beb4a" />
</bp:SmallMoleculeReference>

<bp:UnificationXref rdf:ID="UX_protein_modification_ontology_MOD_00046">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -484,13 +484,15 @@ public void normalize(Model model) {
*/
public void normalize(Model model, boolean usePrefixAsDbName) {

if(model.getLevel() != BioPAXLevel.L3)
if(model.getLevel() != BioPAXLevel.L3) {
throw new IllegalArgumentException("Not Level3 model. " +
"Consider converting it first (e.g., with the PaxTools).");
"Consider converting it first (e.g., with the PaxTools).");
}

//if set, update the xml:base
if(xmlBase != null && !xmlBase.isEmpty())
if(xmlBase != null && !xmlBase.isEmpty()) {
model.setXmlBase(xmlBase);
}

// Normalize/merge xrefs first and then - CVs
// (xrefs could have URIs that should be instead used for CV, PR, SMR or BS biopax types)
Expand Down Expand Up @@ -518,15 +520,15 @@ public void normalize(Model model, boolean usePrefixAsDbName) {

log.info("Normalizing entity references..." + description);
normalizeERs(model);

log.info("Fixing invalid URIs if any...");
ModelUtils.fixInvalidUris(model);

// find/add lost (in replace) children
log.info("Repairing..." + description);
model.repair(); // it does not remove dangling utility class objects (can be done separately, later, if needed)

log.info("Optional tasks (reasoning)..." + description);
}


private void normalizeCVs(Model model) {

NormalizerMap map = new NormalizerMap(model);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.biopax.paxtools.normalizer;


import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.io.SimpleIOHandler;
import org.biopax.paxtools.model.BioPAXElement;
Expand Down Expand Up @@ -366,23 +365,4 @@ void normalizeInoh() {
e = model.getByID(model.getXmlBase() + "IMR_0100366_G_alpha_s_Canonical");
assertTrue(e instanceof ProteinReference);
}

/**
 * Debugging aid: writes a description of an xref-bearing element to standard output.
 *
 * <p>After a blank line, one line is printed for {@code xr} itself (whether the model
 * contains it, its URI and its model interface name), then one line per xref of
 * {@code xr}, each followed by one line per element returned by that xref's
 * {@code getXrefOf()}.
 *
 * @param xr the element whose xrefs are listed
 * @param m  the model used for the {@code contains} checks
 */
private void print(XReferrable xr, Model m) {
	System.out.println();

	// Header line for the element itself.
	StringBuilder header = new StringBuilder();
	header.append("model=").append(m.contains(xr)).append(":\t")
		.append(xr.getUri())
		.append(" is ").append(xr.getModelInterface().getSimpleName())
		.append(" and has xrefs: ");
	System.out.println(header.toString());

	for (Xref xref : xr.getXref()) {
		// One line per xref: membership, string form, type, URI and db/id/idVersion.
		StringBuilder xrefLine = new StringBuilder();
		xrefLine.append("model=").append(m.contains(xref)).append(":\t")
			.append(" ").append(xref).append(" is ")
			.append(xref.getModelInterface().getSimpleName())
			.append(" - ").append(xref.getUri()).append(", db=").append(xref.getDb())
			.append(", id=").append(xref.getId()).append(", idVer=").append(xref.getIdVersion());
		System.out.println(xrefLine.toString());

		// One line per owner of this xref.
		for (XReferrable owner : xref.getXrefOf()) {
			StringBuilder ownerLine = new StringBuilder();
			ownerLine.append("model=").append(m.contains(owner)).append(":\t")
				.append(" xrefOf: ").append(owner);
			System.out.println(ownerLine.toString());
		}
	}
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.biopax.paxtools.controller;

import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.impl.BioPAXElementImpl;
import org.biopax.paxtools.io.BioPAXIOHandler;
import org.biopax.paxtools.io.SimpleIOHandler;
Expand All @@ -15,6 +16,7 @@

import java.io.*;
import java.lang.reflect.Method;
import java.net.URI;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
Expand Down Expand Up @@ -1290,7 +1292,6 @@ public static void updateUri(Model model, BioPAXElement el, String newUri) {
m.setAccessible(true);
m.invoke(el, newUri);
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException(e);
}

Expand Down Expand Up @@ -1342,4 +1343,17 @@ public static boolean isGeneric(BioPAXElement e) {
);
//false when e==null
}

/**
 * Replaces every object URI in the model that {@link URI#create(String)} rejects.
 *
 * <p>A URI counts as invalid when {@code URI.create} throws an
 * {@link IllegalArgumentException} for it. The replacement is the model's xml:base
 * (or an empty string when the xml:base is blank) followed by the {@code md5hex}
 * value of the original URI; it is applied via
 * {@link #updateUri(Model, BioPAXElement, String)} and reported at INFO level.
 *
 * @param model the model whose objects are checked and, where needed, given a new URI
 */
public static void fixInvalidUris(Model model) {
	final String xmlBase = model.getXmlBase();
	final String base = StringUtils.isBlank(xmlBase) ? "" : xmlBase;

	// Iterate over a copy of the object set; presumably updateUri modifies the
	// model's own collection - TODO confirm against updateUri.
	for (BioPAXElement element : new HashSet<>(model.getObjects())) {
		final String oldUri = element.getUri();

		boolean valid = true;
		try {
			URI.create(oldUri);
		} catch (IllegalArgumentException notAUri) {
			// The only validity signal URI.create offers is this exception.
			valid = false;
		}

		if (!valid) {
			final String generated = base + md5hex(oldUri);
			LOG.info("Replaced invalid URI: '{}' with generated: '{}'", oldUri, generated);
			updateUri(model, element, generated);
		}
	}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.*;

Expand Down Expand Up @@ -363,6 +364,11 @@ private String processIndividual(Model model) throws XMLStreamException
throw new BioPaxIOException(
String.format("Error processing %s%s (rdf:ID/rdf:about not found)", r.getNamespaceURI(), getXmlStreamInfo()));
}
try {
URI.create(id);
} catch (IllegalArgumentException e) {
log.error("Invalid URI '{}' at {}{}", id, r.getNamespaceURI(), getXmlStreamInfo());
}

Class<? extends BioPAXElement> type;
try {
Expand All @@ -387,7 +393,7 @@ private String processIndividual(Model model) throws XMLStreamException
} else
{
//abstract BioPAX types, e.g. Entity, UtilityClass, cannot be used directly in RDF+XML model/file!
log.error(String.format("Ignoring abstract %s, id: %s", (r.hasText()?r.getText():getXmlStreamInfo()), id));
log.error("Ignoring abstract {}, id: {}", (r.hasText()?r.getText():getXmlStreamInfo()), id);
//id = null; //todo: uncomment/test (currently, ignored object's uri can become parent's property value, e.g. CV term)
//skip(); //was a bug - throws a misleading exception at the next element in some cases
//todo: shall we instead throw an exception when e.g. <term><Entity rdf:ID="Gene"></Entity></term>?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ public final void mergeAndReplace() {
assertEquals(8, m.getObjects().size()); // + pr3
assertTrue(m.contains(pr3)); // added!

assertTrue(m.contains(pr2)); // not deleted (may be dangling now)!
assertTrue(m.contains(x2)); // not deleted (may be dangling now)!
assertTrue(m.contains(pr1)); // not deleted (may be dangling now)!
assertTrue(m.contains(x1)); // not deleted (may be dangling now)!
assertTrue(m.contains(pr2)); // not deleted (maybe dangling now)!
assertTrue(m.contains(x2)); // not deleted (maybe dangling now)!
assertTrue(m.contains(pr1)); // not deleted (maybe dangling now)!
assertTrue(m.contains(x1)); // not deleted (maybe dangling now)!

// delete dangling
ModelUtils.removeObjectsIfDangling(m, ProteinReference.class);
Expand Down

0 comments on commit 3d3ee51

Please sign in to comment.