Skip to content

Commit ad206e3

Browse files
authored
Add support for downloading plant species with pyEnsembl (#305)
This commit adds the ability to download plant species data using pyEnsembl. It adds an 'is_plant' parameter to the Species class and registers two new species: Arabidopsis thaliana and Oryza sativa (rice). It also adds the ENSEMBL_PLANTS_FTP_SERVER URL, together with PLANTS_GTF_SUBDIR_TEMPLATE and PLANTS_FASTA_SUBDIR_TEMPLATE for building the download links. The code checks whether the species is a plant to decide which server and templates to use.
1 parent 2208f87 commit ad206e3

File tree

5 files changed

+52
-12
lines changed

5 files changed

+52
-12
lines changed

pyensembl/ensembl_release.py

+3
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,14 @@ def __init__(
7777
species=self.species.latin_name,
7878
sequence_type="cdna",
7979
server=server,
80+
is_plant = self.species.is_plant,
8081
),
8182
make_fasta_url(
8283
ensembl_release=self.release,
8384
species=self.species.latin_name,
8485
sequence_type="ncrna",
8586
server=server,
87+
is_plant = self.species.is_plant,
8688
),
8789
]
8890

@@ -92,6 +94,7 @@ def __init__(
9294
species=self.species.latin_name,
9395
sequence_type="pep",
9496
server=self.server,
97+
is_plant = self.species.is_plant,
9598
)
9699
]
97100

pyensembl/ensembl_url_templates.py

+24-7
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,19 @@
2424
from .ensembl_versions import check_release_number
2525

2626
ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
27+
ENSEMBL_PLANTS_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk/"
2728

2829
# Example directories
2930
# FASTA files: /pub/release-78/fasta/homo_sapiens/
3031
# GTF annotation files: /pub/release-78/gtf/homo_sapiens/
3132
FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/"
33+
PLANTS_FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/fasta/%(species)s/%(type)s/"
3234
GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/"
35+
PLANTS_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/gtf/%(species)s/"
3336

37+
# List of plants
38+
# Let's make a tuple with all the plant species that we added, to build the custom URL
39+
lPlants = ("arabidopsis_thaliana","arabidopsis")
3440

3541
def normalize_release_properties(ensembl_release, species):
3642
"""
@@ -63,12 +69,18 @@ def make_gtf_filename(ensembl_release, species):
6369
}
6470

6571

66-
def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
72+
def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER, gtf_subdir=GTF_SUBDIR_TEMPLATE):
6773
"""
6874
Returns a URL and a filename, which can be joined together.
6975
"""
76+
if species.is_plant:
77+
server = ENSEMBL_PLANTS_FTP_SERVER
78+
gtf_subdir = PLANTS_GTF_SUBDIR_TEMPLATE
79+
#else:
80+
#print(f"[+] {species.latin_name} it is not a plant", flush=True)
81+
7082
ensembl_release, species, _ = normalize_release_properties(ensembl_release, species)
71-
subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species}
83+
subdir = gtf_subdir % {"release": ensembl_release, "species": species}
7284
filename = make_gtf_filename(ensembl_release=ensembl_release, species=species)
7385
return server + subdir + filename
7486

@@ -93,11 +105,11 @@ def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
93105
NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"
94106

95107

96-
def make_fasta_filename(ensembl_release, species, sequence_type):
108+
def make_fasta_filename(ensembl_release, species, sequence_type, is_plant):
97109
ensembl_release, species, reference_name = normalize_release_properties(
98110
ensembl_release, species
99111
)
100-
if ensembl_release <= 75:
112+
if ensembl_release <= 75 and not is_plant:
101113
if sequence_type == "ncrna":
102114
return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % {
103115
"Species": species.capitalize(),
@@ -125,7 +137,7 @@ def make_fasta_filename(ensembl_release, species, sequence_type):
125137
}
126138

127139

128-
def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER):
140+
def make_fasta_url(ensembl_release, species, sequence_type, is_plant, server=ENSEMBL_FTP_SERVER, fasta_subdir=FASTA_SUBDIR_TEMPLATE):
129141
"""Construct URL to FASTA file with cDNA transcript or protein sequences
130142
131143
Parameter examples:
@@ -136,12 +148,17 @@ def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_S
136148
ensembl_release, species, reference_name = normalize_release_properties(
137149
ensembl_release, species
138150
)
139-
subdir = FASTA_SUBDIR_TEMPLATE % {
151+
152+
if is_plant:
153+
server = ENSEMBL_PLANTS_FTP_SERVER
154+
fasta_subdir = PLANTS_FASTA_SUBDIR_TEMPLATE
155+
156+
subdir = fasta_subdir % {
140157
"release": ensembl_release,
141158
"species": species,
142159
"type": sequence_type,
143160
}
144161
filename = make_fasta_filename(
145-
ensembl_release=ensembl_release, species=species, sequence_type=sequence_type
162+
ensembl_release=ensembl_release, species=species, sequence_type=sequence_type, is_plant = is_plant
146163
)
147164
return server + subdir + filename

pyensembl/ensembl_versions.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
MIN_ENSEMBL_RELEASE = 47
1414
MAX_ENSEMBL_RELEASE = 111
15-
15+
MAX_PLANTS_ENSEMBL_RELEASE = 58
1616

1717
def check_release_number(release):
1818
"""

pyensembl/species.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from serializable import Serializable
1414

15-
from .ensembl_versions import MAX_ENSEMBL_RELEASE
15+
from .ensembl_versions import MAX_ENSEMBL_RELEASE, MAX_PLANTS_ENSEMBL_RELEASE
1616

1717
# TODO: replace Serializable with data class
1818

@@ -30,7 +30,7 @@ class Species(Serializable):
3030
_reference_names_to_species = {}
3131

3232
@classmethod
33-
def register(cls, latin_name, synonyms, reference_assemblies):
33+
def register(cls, latin_name, synonyms, reference_assemblies, is_plant=False):
3434
"""
3535
Create a Species object from the given arguments and enter into
3636
all the dicts used to look the species up by its fields.
@@ -39,6 +39,7 @@ def register(cls, latin_name, synonyms, reference_assemblies):
3939
latin_name=latin_name,
4040
synonyms=synonyms,
4141
reference_assemblies=reference_assemblies,
42+
is_plant=is_plant,
4243
)
4344
cls._latin_names_to_species[species.latin_name] = species
4445
for synonym in synonyms:
@@ -80,7 +81,7 @@ def all_species_release_pairs(cls):
8081
for release in range(release_range[0], release_range[1] + 1):
8182
yield species_name, release
8283

83-
def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
84+
def __init__(self, latin_name, synonyms=[], reference_assemblies={}, is_plant=False):
8485
"""
8586
Parameters
8687
----------
@@ -96,6 +97,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
9697
self.synonyms = synonyms
9798
self.reference_assemblies = reference_assemblies
9899
self._release_to_genome = {}
100+
self.is_plant = is_plant
99101
for genome_name, (start, end) in self.reference_assemblies.items():
100102
for i in range(start, end + 1):
101103
if i in self._release_to_genome:
@@ -350,3 +352,21 @@ def check_species_object(species_name_or_object):
350352
"R64-1-1": (76, MAX_ENSEMBL_RELEASE),
351353
},
352354
)
355+
356+
arabidopsis_thaliana = Species.register(
357+
latin_name="arabidopsis_thaliana",
358+
synonyms=["arabidopsis"],
359+
reference_assemblies={
360+
"TAIR10": (40, MAX_PLANTS_ENSEMBL_RELEASE),
361+
},
362+
is_plant=True
363+
)
364+
365+
rice = Species.register(
366+
latin_name="oryza_sativa",
367+
synonyms=["rice"],
368+
reference_assemblies={
369+
"IRGSP-1.0": (40, MAX_PLANTS_ENSEMBL_RELEASE),
370+
},
371+
is_plant=True
372+
)

pyensembl/version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "2.3.11"
1+
__version__ = "2.3.12"
22

33
def print_version():
44
print(f"v{__version__}")

0 commit comments

Comments
 (0)