Skip to content

Commit

Permalink
Merge pull request #9 from cmungall/program-learning-claude
Browse files Browse the repository at this point in the history
Program learning claude
  • Loading branch information
cmungall authored Feb 18, 2025
2 parents df25d76 + 031c205 commit b4ba7f0
Show file tree
Hide file tree
Showing 343 changed files with 20,870 additions and 0 deletions.
55 changes: 55 additions & 0 deletions learned/claude-3-sonnet/11_12_saturated_fatty_acyl_CoA_4__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
Classifies: CHEBI:84948 11,12-saturated fatty acyl-CoA(4-)
"""
"""
Classifies: CHEBI:84948 11,12-saturated fatty acyl-CoA(4-)
"""
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors

def _is_carbonyl_carbon(atom):
    """Return True if *atom* carries a double bond to an oxygen atom."""
    return any(
        bond.GetBondType() == Chem.BondType.DOUBLE
        and bond.GetOtherAtom(atom).GetAtomicNum() == 8
        for bond in atom.GetBonds()
    )


def _longest_carbon_paths(mol, start_idx):
    """Return all longest simple carbon-only paths (tuples of atom indices) starting at *start_idx*."""
    longest = []
    stack = [(start_idx,)]
    while stack:
        path = stack.pop()
        tail = mol.GetAtomWithIdx(path[-1])
        extensions = [
            nbr.GetIdx()
            for nbr in tail.GetNeighbors()
            if nbr.GetAtomicNum() == 6 and nbr.GetIdx() not in path
        ]
        if extensions:
            stack.extend(path + (idx,) for idx in extensions)
            continue
        # Dead end: this path cannot be extended, so it is a candidate.
        if not longest or len(path) > len(longest[0]):
            longest = [path]
        elif len(path) == len(longest[0]):
            longest.append(path)
    return longest


def is_11_12_saturated_fatty_acyl_CoA_4__(smiles: str):
    """
    Determines if a molecule is a 11,12-saturated fatty acyl-CoA(4-) based on its SMILES string.
    A 11,12-saturated fatty acyl-CoA(4-) is a fatty acyl-CoA(4-) where the 11-12 bond of the
    fatty acyl group is saturated.

    The acyl chain is numbered from the thioester carbonyl carbon (C1) along the
    longest carbon-only path leading away from it; chains with fewer than 12
    carbons are rejected because they have no 11-12 bond.

    Args:
        smiles (str): SMILES string of the molecule
    Returns:
        bool: True if molecule is a 11,12-saturated fatty acyl-CoA(4-), False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # CoA(4-) written from the thiol sulfur outwards so that match[0] is the
    # sulfur. Stereo descriptors are deliberately omitted (they are ignored by
    # default substructure matching anyway). The pantoate CH(OH) carbon between
    # the amide carbonyl and the gem-dimethyl carbon is included; without it
    # the pattern cannot match coenzyme A.
    coa_pattern = Chem.MolFromSmarts(
        "[SX2]CCNC(=O)CCNC(=O)C(O)C(C)(C)COP(=O)([O-])OP(=O)([O-])"
        "OCC1OC(n2cnc3c(N)ncnc23)C(O)C1OP(=O)([O-])[O-]"
    )
    coa_matches = mol.GetSubstructMatches(coa_pattern)
    if not coa_matches:
        return False, "No CoA substructure found"

    # The acyl C1 is the carbonyl carbon bonded to the CoA sulfur but not
    # itself part of the CoA match.
    acyl_starts = set()
    for match in coa_matches:
        coa_atoms = set(match)
        sulfur = mol.GetAtomWithIdx(match[0])
        for nbr in sulfur.GetNeighbors():
            if nbr.GetIdx() in coa_atoms or nbr.GetAtomicNum() != 6:
                continue
            if _is_carbonyl_carbon(nbr):
                acyl_starts.add(nbr.GetIdx())
    if not acyl_starts:
        return False, "No fatty acyl chain found"

    # Check for saturation at 11-12 position
    for start_idx in sorted(acyl_starts):
        # The walk is restricted to carbon atoms, so it cannot cross the
        # sulfur into the CoA part of the molecule.
        for path in _longest_carbon_paths(mol, start_idx):
            if len(path) < 12:
                continue  # Acyl chain too short

            atom11 = mol.GetAtomWithIdx(path[10])
            atom12 = mol.GetAtomWithIdx(path[11])
            if atom11.GetIsAromatic() or atom12.GetIsAromatic():
                continue  # Aromatic atoms

            bond = mol.GetBondBetweenAtoms(path[10], path[11])
            if bond.GetBondType() == Chem.BondType.SINGLE:
                return True, "Fatty acyl chain has a saturated bond at position 11-12"

    return False, "Fatty acyl chain does not have a saturated bond at position 11-12"
56 changes: 56 additions & 0 deletions learned/claude-3-sonnet/11_oxo_steroid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""
Classifies: CHEBI:47787 11-oxo steroid
"""
"""
Classifies: 11-oxo steroid
Definition: Any oxo steroid that has an oxo substituent at position 11
"""
from rdkit import Chem
from rdkit.Chem import AllChem

def is_11_oxo_steroid(smiles: str):
    """
    Determines if a molecule is an 11-oxo steroid based on its SMILES string.

    The molecule must contain the 6-6-6-5 steroid nucleus (gonane topology, any
    bond orders) with a C=O on the carbon at position 11, and must pass the
    carbon-count, molecular-weight and ring-count sanity checks below.

    Args:
        smiles (str): SMILES string of the molecule
    Returns:
        tuple: (bool, str) - (True if molecule is an 11-oxo steroid, reason for classification)
    """
    # `Chem.Descriptors` is only reachable as an attribute once the submodule
    # has been imported somewhere; import it explicitly instead of relying on that.
    from rdkit.Chem import Descriptors

    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Steroid nucleus written in steroid numbering order:
    # C1..C8, C9(C10), C11, C12, C13, C17, C16, C15, C14.
    rings_ab = "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
    ring_d = "~[#6]~[#6]~4~[#6]~[#6]~[#6]~[#6]~4~3"  # C12, C13, C17, C16, C15, C14

    # Check for basic steroid core (four fused rings, 6-6-6-5)
    steroid_core = Chem.MolFromSmarts(rings_ab + "~[#6]" + ring_d)
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # Same skeleton with an oxo group on C11 (the ring C atom bonded to C9)
    oxo_11_pattern = Chem.MolFromSmarts(rings_ab + "~[#6](=[OX1])" + ring_d)
    if not mol.HasSubstructMatch(oxo_11_pattern):
        return False, "No ketone group at position 11"

    # Count carbons to verify it's in the typical steroid range
    carbon_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
    if carbon_count < 19 or carbon_count > 30:
        return False, f"Carbon count ({carbon_count}) outside typical steroid range (19-30)"

    # Additional check for reasonable molecular weight
    mol_wt = Descriptors.ExactMolWt(mol)
    if mol_wt < 250 or mol_wt > 500:
        return False, f"Molecular weight ({mol_wt:.1f}) outside typical steroid range (250-500)"

    # Count rings to ensure we have the right ring system
    ring_info = mol.GetRingInfo()
    if ring_info.NumRings() < 4:
        return False, "Insufficient number of rings for steroid structure"

    return True, "Molecule contains steroid core with ketone group at position 11"
60 changes: 60 additions & 0 deletions learned/claude-3-sonnet/11beta_hydroxy_steroid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
Classifies: CHEBI:35346 11beta-hydroxy steroid
"""
"""
Classifies: 11beta-hydroxy steroids
"""
from rdkit import Chem
from rdkit.Chem import AllChem

def is_11beta_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is an 11beta-hydroxy steroid based on its SMILES string.

    The molecule must contain the 6-6-6-5 steroid nucleus with a hydroxy group
    on C11 whose stereochemistry is explicitly specified as beta.

    Args:
        smiles (str): SMILES string of the molecule
    Returns:
        bool: True if molecule is an 11beta-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Steroid nucleus written in steroid numbering order:
    # C1..C8, C9(C10), C11, C12, C13, C17, C16, C15, C14.
    rings_ab = "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
    ring_d = "~[#6]~[#6]~4~[#6]~[#6]~[#6]~[#6]~4~3"  # C12, C13, C17, C16, C15, C14

    # Check for basic steroid core (4 fused rings, 6-6-6-5)
    steroid_core = Chem.MolFromSmarts(rings_ab + "~[#6]" + ring_d)
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # Check for 11-beta hydroxy group.
    # C11 is given four explicit neighbours in the order (C9, O, other
    # substituent, C12); with that order '@' places the hydroxy group on the
    # beta face of the conventionally drawn nucleus.
    # NOTE(review): the '@' assignment was derived by hand from the standard
    # steroid drawing - confirm against a reference compound such as cortisol.
    hydroxy_11beta = Chem.MolFromSmarts(rings_ab + "~[C@]([OX2H])(*)" + ring_d)

    # Explicit hydrogens let the '*' neighbour match the C11 hydrogen, and
    # useChirality=True is required for the '@' tag to be enforced at all.
    mol_h = Chem.AddHs(mol)
    if not mol_h.HasSubstructMatch(hydroxy_11beta, useChirality=True):
        return False, "No 11-beta hydroxy group found"

    # Count carbons to ensure reasonable size for a steroid
    carbon_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
    if carbon_count < 19 or carbon_count > 30:  # Most steroids have 19-30 carbons
        return False, f"Carbon count ({carbon_count}) outside typical steroid range"

    # Check for reasonable number of rings
    ring_info = mol.GetRingInfo()
    if ring_info.NumRings() < 4:
        return False, "Too few rings for steroid structure"

    return True, "Contains steroid core with 11-beta hydroxy group and appropriate substituents"
61 changes: 61 additions & 0 deletions learned/claude-3-sonnet/16beta_hydroxy_steroid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""
Classifies: CHEBI:17354 16beta-hydroxy steroid
"""
"""
Classifies: CHEBI:17354 16beta-hydroxy steroid
A 16-hydroxy steroid in which the hydroxy group at position 16 has a beta-configuration.
"""
from rdkit import Chem
from rdkit.Chem import AllChem

def is_16beta_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is a 16beta-hydroxy steroid based on its SMILES string.

    The molecule must contain the 6-6-6-5 steroid nucleus with a hydroxy group
    on C16 whose stereochemistry is explicitly specified as beta.

    Args:
        smiles (str): SMILES string of the molecule
    Returns:
        bool: True if molecule is a 16beta-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # `Chem.Descriptors` is only reachable as an attribute once the submodule
    # has been imported somewhere; import it explicitly instead of relying on that.
    from rdkit.Chem import Descriptors

    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Add explicit hydrogens so that every neighbour of C16 is a real atom
    # when the stereo pattern below is matched.
    mol = Chem.AddHs(mol)

    # Steroid nucleus written in steroid numbering order:
    # C1..C8, C9(C10), C11, C12, C13, C17, C16, C15, C14.
    rings_abc = (
        "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
        "~[#6]~[#6]~[#6]~4"  # C11, C12, C13
    )

    # Basic steroid core pattern (four fused rings, 6-6-6-5)
    steroid_core = Chem.MolFromSmarts(rings_abc + "~[#6]~[#6]~[#6]~[#6]~4~3")
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # Pattern for 16-beta-hydroxy group.
    # C16 has the explicit neighbour order (C17, O, other substituent, C15);
    # with that order '@' places the hydroxy group on the beta face.
    # NOTE(review): the '@' assignment was derived by hand from the standard
    # steroid drawing - confirm against a known 16beta-hydroxy steroid.
    beta_oh_pattern = Chem.MolFromSmarts(
        rings_abc + "~[#6]~[C@]([OX2H])(*)~[#6]~[#6]~4~3"
    )

    # useChirality=True is required for the '@' tag to be enforced.
    if not mol.HasSubstructMatch(beta_oh_pattern, useChirality=True):
        return False, "No 16-beta-hydroxy group found"

    # Additional checks for reasonable molecular weight and atom counts
    # Steroids typically have molecular weights between 250-1000
    mol_weight = Descriptors.ExactMolWt(mol)
    if mol_weight < 250 or mol_weight > 1000:
        return False, f"Molecular weight {mol_weight} outside typical steroid range (250-1000)"

    # Count carbons (steroids typically have 17+ carbons)
    carbon_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
    if carbon_count < 17:
        return False, f"Too few carbons ({carbon_count}) for a steroid structure"

    return True, "Contains steroid core with 16-beta-hydroxy group"
57 changes: 57 additions & 0 deletions learned/claude-3-sonnet/17alpha_hydroxy_steroid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
Classifies: CHEBI:35342 17alpha-hydroxy steroid
"""
"""
Classifies: 17alpha-hydroxy steroid
"""
from rdkit import Chem
from rdkit.Chem import AllChem

def is_17alpha_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is a 17alpha-hydroxy steroid based on its SMILES string.

    The molecule must contain the 6-6-6-5 steroid nucleus with a hydroxy group
    on C17 whose stereochemistry is explicitly specified as alpha.

    Args:
        smiles (str): SMILES string of the molecule
    Returns:
        bool: True if molecule is a 17alpha-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Steroid nucleus written in steroid numbering order:
    # C1..C8, C9(C10), C11, C12, C13, C17, C16, C15, C14.
    rings_abc = (
        "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
        "~[#6]~[#6]~[#6]~4"  # C11, C12, C13
    )
    ring_d_tail = "~[#6]~[#6]~[#6]~4~3"  # C16, C15, C14

    # Check for basic steroid core (4 fused rings, 6-6-6-5)
    steroid_core = Chem.MolFromSmarts(rings_abc + "~[#6]" + ring_d_tail)
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # Check for 17-OH with alpha stereochemistry.
    # C17 has the explicit neighbour order (C13, O, other substituent, C16);
    # with that order '@@' places the hydroxy group on the alpha face.
    # NOTE(review): the '@@' assignment was derived by hand from the standard
    # steroid drawing - confirm against e.g. 17alpha-hydroxyprogesterone.
    oh_17_alpha_pattern = Chem.MolFromSmarts(
        rings_abc + "~[C@@]([OX2H])(*)" + ring_d_tail
    )

    # Explicit hydrogens let the '*' neighbour match a hydrogen on C17.
    mol_h = Chem.AddHs(mol)

    # Without useChirality the stereo tag is ignored: this only tests position.
    if not mol_h.HasSubstructMatch(oh_17_alpha_pattern):
        return False, "No hydroxyl group at C17 position found"
    if not mol_h.HasSubstructMatch(oh_17_alpha_pattern, useChirality=True):
        return False, "Hydroxyl group at C17 is not in the alpha configuration"

    # Additional checks to verify it's a steroid structure:
    # Count rings
    ri = mol.GetRingInfo()
    if ri.NumRings() < 4:
        return False, "Insufficient number of rings for steroid structure"

    # Basic size check - the steroid nucleus alone has 17 carbons. (A minimum
    # of 19 would reject C18 steroids such as 17alpha-estradiol.)
    c_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6)
    if c_count < 17:
        return False, "Too few carbons for steroid structure"

    # Check for sp3 hybridized carbons typical in steroid core
    sp3_carbons = sum(
        1
        for atom in mol.GetAtoms()
        if atom.GetAtomicNum() == 6
        and atom.GetHybridization() == Chem.HybridizationType.SP3
    )
    if sp3_carbons < 10:
        return False, "Insufficient sp3 carbons for steroid structure"

    return True, "Contains steroid core with 17-alpha hydroxyl group"
70 changes: 70 additions & 0 deletions learned/claude-3-sonnet/17beta_hydroxy_steroid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
Classifies: CHEBI:35343 17beta-hydroxy steroid
"""
"""
Classifies: 17beta-hydroxy steroid
A 17-hydroxy steroid in which the hydroxy group at position 17 has a beta-configuration.
"""
from rdkit import Chem
from rdkit.Chem import AllChem

def is_17beta_hydroxy_steroid(smiles: str):
    """
    Determines if a molecule is a 17beta-hydroxy steroid based on its SMILES string.

    The molecule must contain the 6-6-6-5 steroid nucleus with a hydroxy group
    on C17 whose stereochemistry is explicitly specified as beta.

    Args:
        smiles (str): SMILES string of the molecule
    Returns:
        bool: True if molecule is a 17beta-hydroxy steroid, False otherwise
        str: Reason for classification
    """
    # Parse SMILES
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False, "Invalid SMILES string"

    # Steroid nucleus written in steroid numbering order:
    # C1..C8, C9(C10), C11, C12, C13, C17, C16, C15, C14.
    rings_abc = (
        "[#6]~1~[#6]~[#6]~[#6]~[#6]~2~[#6]~[#6]~[#6]~3~[#6](~[#6]~2~1)"
        "~[#6]~[#6]~[#6]~4"  # C11, C12, C13
    )
    ring_d_tail = "~[#6]~[#6]~[#6]~4~3"  # C16, C15, C14

    # Check for basic steroid core (four fused rings, 6-6-6-5)
    steroid_core = Chem.MolFromSmarts(rings_abc + "~[#6]" + ring_d_tail)
    if not mol.HasSubstructMatch(steroid_core):
        return False, "No steroid core structure found"

    # Check for 17-OH group in beta configuration.
    # C17 has the explicit neighbour order (C13, O, other substituent, C16);
    # with that order '@' places the hydroxy group on the beta face. The raw
    # chiral tag of the atom is not inspected because it depends on the atom
    # ordering of the input SMILES.
    # NOTE(review): the '@' assignment was derived by hand from the standard
    # steroid drawing - confirm against e.g. testosterone.
    oh_17_beta = Chem.MolFromSmarts(rings_abc + "~[C@]([OX2H])(*)" + ring_d_tail)

    # Explicit hydrogens let the '*' neighbour match a hydrogen on C17.
    mol_h = Chem.AddHs(mol)

    # Without useChirality the stereo tag is ignored: this only tests position.
    if not mol_h.HasSubstructMatch(oh_17_beta):
        return False, "No hydroxyl group with correct connectivity found"
    if not mol_h.HasSubstructMatch(oh_17_beta, useChirality=True):
        return False, "No 17-beta hydroxyl group found"

    # Additional validation: molecule should have reasonable size for a steroid
    # (heavy atoms only - counted on the molecule without explicit hydrogens)
    num_atoms = mol.GetNumAtoms()
    if num_atoms < 20 or num_atoms > 100:
        return False, "Molecule size not consistent with steroid structure"

    # Count rings
    ring_info = mol.GetRingInfo()
    if ring_info.NumRings() < 4:
        return False, "Insufficient number of rings for steroid structure"

    return True, "Contains steroid core with 17-beta hydroxyl group"
Loading

0 comments on commit b4ba7f0

Please sign in to comment.