Skip to content

Commit

Permalink
Updated HumanCyc and Drugbank cleaner classes; minor polish in the in…
Browse files Browse the repository at this point in the history
…dex and tests.
  • Loading branch information
IgorRodchenkov committed Apr 24, 2024
1 parent 74bf3a6 commit f2dfee4
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 45 deletions.
64 changes: 51 additions & 13 deletions src/main/java/cpath/cleaner/DrugbankCleaner.java
Original file line number Diff line number Diff line change
@@ -1,27 +1,65 @@
package cpath.cleaner;

import cpath.service.api.Cleaner;
import org.biopax.paxtools.io.SimpleIOHandler;
import org.biopax.paxtools.model.BioPAXLevel;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.Xref;

import java.io.InputStream;
import java.io.OutputStream;

public class DrugbankCleaner implements Cleaner {

/*
* drugbank biopax data uses the following values in xref.db properties,
* which biopax validator reports as "Unknown";
* so we need to replace with the corresponding standard prefix/name from bioregistry.io (or identifiers.org):
GenBank Gene Database -> "genbank"
GenBank Protein Database -> "genbank" (numeric IDs, not like it's in ncbiprotein)
Therapeutic Targets Database ->
Guide to Pharmacology ->
HUGO Gene Nomenclature Committee (HGNC) -> "hgnc.symbol" (or "HGNC Symbol")
IUPHAR ->
Drugs Product Database (DPD) ->
*/

* drugbank biopax data uses the following weird values in xref.db properties
* (which biopax validator reports as "Unknown"; so we need to map to standard names from bioregistry.io)
* - GenBank Gene Database -> "genbank"
* - GenBank Protein Database -> "genbank" (numeric IDs, unlike the identifiers used in ncbiprotein)
* - Therapeutic Targets Database -> ttd.drug
* - HUGO Gene Nomenclature Committee (HGNC) -> "hgnc" (e.g. HGNC:1111)
* - Drugs Product Database (DPD) -> cdpd
* - IUPHAR -> iuphar.family or iuphar.ligand or iuphar.receptor?
* - Guide to Pharmacology -> likely same as IUPHAR (but which of the three collections?)
*/
/**
 * Cleans a DrugBank BioPAX export: parses the input as a BioPAX Level 3 OWL model,
 * renames the non-standard xref.db values (see {@link #cleanXrefDBName(Model)}),
 * and serializes the modified model to the output stream.
 *
 * @param data source of the original BioPAX L3 OWL data
 * @param cleanedData destination for the cleaned BioPAX L3 OWL data
 * @throws RuntimeException wrapping any exception raised while reading, cleaning or writing the model
 */
@Override
public void clean(InputStream data, OutputStream cleanedData) {
  try {
    SimpleIOHandler simpleReader = new SimpleIOHandler(BioPAXLevel.L3);
    Model model = simpleReader.convertFromOWL(data);
    cleanXrefDBName(model);
    simpleReader.convertToOWL(model, cleanedData);
  } catch (Exception e) {
    // name this cleaner (not HumanCycCleaner) so that failures are attributed correctly in logs
    throw new RuntimeException("DrugbankCleaner failed", e);
  }
}

/**
 * Replaces the non-standard database names found in the xref.db property
 * with the corresponding registry prefixes. The name match is case-insensitive;
 * xrefs with a null or unlisted db value are left untouched.
 * For "Therapeutic Targets Database" and "Drugs Product Database (DPD)",
 * the original name is also added to the xref as a comment.
 *
 * @param model the BioPAX model whose Xref objects are updated in place
 */
protected void cleanXrefDBName(Model model) {
  for (Xref xref : model.getObjects(Xref.class)) {
    final String db = xref.getDb();
    if (db == null) {
      continue; // no db name - nothing to rename
    }

    if (db.equalsIgnoreCase("GenBank Gene Database")
        || db.equalsIgnoreCase("GenBank Protein Database")) {
      // both GenBank collections map to the same prefix
      xref.setDb("genbank");
    } else if (db.equalsIgnoreCase("Therapeutic Targets Database")) {
      xref.setDb("ttd.drug");
      xref.addComment("Therapeutic Targets Database");
    } else if (db.equalsIgnoreCase("HUGO Gene Nomenclature Committee (HGNC)")) {
      // ids look like HGNC:1234
      xref.setDb("hgnc");
    } else if (db.equalsIgnoreCase("Drugs Product Database (DPD)")) {
      xref.setDb("cdpd");
      xref.addComment("Drugs Product Database (DPD)");
    }
  }
}

}
8 changes: 7 additions & 1 deletion src/main/java/cpath/cleaner/HumanCycCleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,20 @@ protected void cleanXrefDBName(Model model)
LOG.warn(xr.getModelInterface().getSimpleName() + ".db is NULL; " + xr.getUri());
}
else if(xr.getDb().startsWith("Entrez")) {
xr.setDb("genpept"); //Protein GenBank Identifier
xr.setDb("genpept"); //aka Protein GenBank Identifier
}
else if(xr.getDb().equalsIgnoreCase("NCBI Taxonomy")) {
xr.setDb("ncbitaxon");
}
else if(xr.getDb().equalsIgnoreCase("Ensembl Human")) {
xr.setDb("ensembl");
}
else if(xr.getDb().equalsIgnoreCase("kegg ligand")) {
xr.setDb("kegg.compound");
}
else if(xr.getDb().equalsIgnoreCase("PubChem (CID)")) {
xr.setDb("pubchem.compound");
}
}
}

Expand Down
17 changes: 7 additions & 10 deletions src/main/java/cpath/service/IndexImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -526,10 +526,10 @@ private void addDatasources(Set<Provenance> set, Document doc) {
//store but do not index/tokenize the URI
doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));

//index the last/local (collection prefix) part of the normalized Provenance uri
//index the last/local (collection prefix) part of the Provenance uri
String u = p.getUri();
if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
u = u.replaceAll(".*[/#]", "");
u = u.replaceAll(".*[/#:]", "");
doc.add(new TextField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));

//index names (including the datasource identifier from metadata json config; see premerge/merge)
Expand Down Expand Up @@ -572,15 +572,13 @@ private void addOrganisms(Set<BioSource> set, Document doc) {

private void addPathways(Set<Pathway> set, Document doc) {
for(Pathway pw : set) {
final String uri = pw.getUri();
//URI, index=yes, analyze=no, store=yes (this is to find child objects by pathway URI)
// we want searching by URI or its ending part (id) be case-sensitive
final String uri = pw.getUri();
//URI, index=yes, analyze=no, store=yes (this is to find child objects, participants or processes, by pathway URI/name/id)
// we want searching by URI or its ending part (id) be case-sensitive
doc.add(new StringField(FIELD_PATHWAY, uri, Field.Store.YES));
//also, extract and index the last part of the uri (e.g., 'hsa00010' or 'r-hsa-201451')
if(uri.startsWith("http://")) {
String id = uri.replaceAll(".*[/#]", "").trim();
doc.add(new StringField(FIELD_PATHWAY, id, Field.Store.NO));
}
String id = uri.replaceAll(".*[/#:]", "").trim();
doc.add(new StringField(FIELD_PATHWAY, id, Field.Store.NO));
// add names to the 'pathway' (don't store); will be case-insensitive (if using StandardAnalyser)
// (this allows to find a biopax element, e.g., protein, by a parent pathway name: pathway:<query_str>)
for (String s : pw.getName()) {
Expand All @@ -595,7 +593,6 @@ private void addPathways(Set<Pathway> set, Document doc) {
}
}


private String getTaxonId(BioSource bioSource) {
String id = null;
if(!bioSource.getXref().isEmpty()) {
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/cpath/cleaner/HumanCycCleanerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

public class HumanCycCleanerTest {
@Test
public void deleteHtmlFromNames() throws Exception
public void deleteHtmlFromNames()
{
HumanCycCleaner humanCycCleaner = new HumanCycCleaner();

Expand Down
2 changes: 1 addition & 1 deletion src/test/java/cpath/service/ConsoleApplicationIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ public void premergeAndMerge() throws IOException {
assertMerge(mainModel);

//pid, reactome,humancyc,.. were there in the test
assertEquals(4, mainModel.getObjects(Provenance.class).size());
assertEquals(5, mainModel.getObjects(Provenance.class).size());

//additional 'test' metadata entry
Datasource md = new Datasource("test", Collections.singletonList("Reactome"),
Expand Down
10 changes: 4 additions & 6 deletions src/test/java/cpath/service/IndexIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,16 +95,14 @@ public final void search() throws IOException {
assertEquals(2, response.getSearchHit().size());
response = index.search("*", 0, Provenance.class, new String[] {"kegg"}, null);
assertEquals(1, response.getSearchHit().size());

//datasource filter using Provenance absolute URI - not needed anymore - still stored but not indexed anymore
response = index.search("*", 0, Pathway.class, new String[] {"http://identifiers.org/kegg.pathway/"}, null);
assertTrue(response.isEmpty());

assertTrue(index.search("*", 0, Pathway.class, new String[] {"http://identifiers.org/reactome/"}, null).isEmpty());
assertTrue(index.search("*", 0, Pathway.class, new String[] {"test:kegg_test"}, null).isEmpty());
//using the local/last part of the URI (standard bio collection prefix/name)
response = index.search("*", 0, Pathway.class, new String[] {"kegg.pathway"}, null);
response = index.search("*", 0, Pathway.class, new String[] {"kegg_test"}, null);
assertFalse(response.isEmpty());
assertEquals(1, response.getSearchHit().size());
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("http://identifiers.org/kegg.pathway/")));
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("test:kegg_test")));

//find by partial name of a datasource - "pathway" of "KEGG Pathway"...
response = index.search("*", 0, Pathway.class, new String[] {"pathway"}, null);
Expand Down
24 changes: 12 additions & 12 deletions src/test/resources/merge/pathwaydata1.owl
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
</bp:comment>
</bp:ChemicalStructure>

<bp:Provenance rdf:about="http://identifiers.org/kegg.pathway/">
<bp:Provenance rdf:about="test:kegg_test">
<bp:standardName rdf:datatype="xsd:string">KEGG Pathway</bp:standardName>
<bp:displayName rdf:datatype="xsd:string">KEGG</bp:displayName>
</bp:Provenance>
Expand Down Expand Up @@ -101,7 +101,7 @@
<bp:displayName rdf:datatype="xsd:string">beta-D-glu + ATP =&gt;
beta-D-glu-6-p + ADP</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:BiochemicalReaction>

<bp:SmallMoleculeReference rdf:about="http://identifiers.org/chebi/CHEBI:28">
Expand Down Expand Up @@ -161,7 +161,7 @@
</bp:name>
<bp:displayName rdf:datatype="xsd:string">beta-D-fructose-6-phosphate</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:SmallMolecule>

<bp:SmallMoleculeReference rdf:about="http://identifiers.org/pubchem.substance/14438">
Expand Down Expand Up @@ -207,7 +207,7 @@
<bp:displayName rdf:datatype="xsd:string">catalysis of (alpha-D-glu
&lt;=&gt; alpha-D-glu-6-p)</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:Catalysis>

<bp:BiochemicalReaction rdf:ID="phosphoglucoisomerase">
Expand All @@ -227,7 +227,7 @@
<bp:displayName rdf:datatype="xsd:string">beta-D-glu-6-p &lt;=&gt;
beta-D-fru-6-p</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:BiochemicalReaction>

<bp:UnificationXref rdf:about="UnificationXref:KEGG_R01786">
Expand All @@ -244,7 +244,7 @@
<bp:displayName rdf:datatype="xsd:string">Adenosine
5&apos;-diphosphate</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:SmallMolecule>

<bp:Protein rdf:ID="Protein_54">
Expand Down Expand Up @@ -272,7 +272,7 @@
<bp:displayName rdf:datatype="xsd:string">phosphoglucose isomerase
</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
<bp:feature rdf:resource="#O_phospho_L_serine_at_5" />
</bp:Protein>

Expand All @@ -297,7 +297,7 @@
<bp:displayName rdf:datatype="xsd:string">Glycolysis Pathway</bp:displayName>
<bp:standardName rdf:datatype="xsd:string">glycolysis</bp:standardName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
<bp:pathwayComponent
rdf:resource="#glucokinase_converts_alpha-D-glu_to_alpha-D-glu-6-p" />
</bp:Pathway>
Expand Down Expand Up @@ -356,7 +356,7 @@
<bp:displayName rdf:datatype="xsd:string">Adenosine
5&apos;-triphosphate</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:SmallMolecule>

<bp:UnificationXref rdf:about="UnificationXref:PUBCHEM_14438">
Expand All @@ -383,7 +383,7 @@
<bp:displayName rdf:datatype="xsd:string">catalysis of
(beta-D-glu-6-p &lt;=&gt; beta-D-fruc-6-p)</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:Catalysis>

<bp:UnificationXref rdf:about="UnificationXref:PUBCHEM_14439">
Expand All @@ -404,7 +404,7 @@
<bp:displayName rdf:datatype="xsd:string">beta-D-glucose 6-phosphate
</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:SmallMolecule>

<bp:BiochemicalPathwayStep rdf:ID="BiochemicalPathwayStep_3">
Expand All @@ -425,7 +425,7 @@
<bp:displayName rdf:datatype="xsd:string">beta-D-glucose
</bp:displayName>
<bp:dataSource rdf:resource="http://identifiers.org/reactome/" />
<bp:dataSource rdf:resource="http://identifiers.org/kegg.pathway/" />
<bp:dataSource rdf:resource="test:kegg_test" />
</bp:SmallMolecule>

<bp:UnificationXref
Expand Down
3 changes: 2 additions & 1 deletion work/metadata.v14.json
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@
"iconUrl": "https://pathwaycommons.github.io/cpath2/logos/drugbanklogo.png",
"availability": "academic",
"type": "BIOPAX",
"pubmedId": "29126136"
"pubmedId": "29126136",
"cleanerClass": "cpath.cleaner.DrugbankCleaner"
},
{
"dataUrl": "http://www.ebi.ac.uk/biomodels-main/MODEL1109130000",
Expand Down

0 comments on commit f2dfee4

Please sign in to comment.