Skip to content

Commit 6fbf8ad

Browse files
authored
Merge pull request #163 from hammerlab/use-cached-ensembl-release-objects-after-pickling
Deserialization should construct cached EnsemblRelease objects
2 parents 423af3c + 0fa961d commit 6fbf8ad

File tree

3 files changed

+70
-26
lines changed

3 files changed

+70
-26
lines changed

pyensembl/__init__.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from .memory_cache import MemoryCache
1818
from .download_cache import DownloadCache
1919
from .ensembl_release import EnsemblRelease
20-
from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
20+
from .ensembl_release_versions import MAX_ENSEMBL_RELEASE
2121
from .exon import Exon
2222
from .genome import Genome
2323
from .gene import Gene
@@ -35,28 +35,23 @@
3535
)
3636
from .transcript import Transcript
3737

38-
__version__ = '0.9.6'
38+
__version__ = '0.9.7'
3939

40-
_cache = {}
40+
def cached_release(release, species="human"):
41+
"""
42+
Create an EnsemblRelease instance only if it hasn't already been made,
43+
otherwise return the old instance.
4144
42-
def cached_release(version, species="human"):
43-
"""Cached construction of EnsemblRelease objects. It's desirable to reuse
44-
the same EnsemblRelease object since each one will store a lot of cached
45-
annotation data in-memory.
45+
Keeping this function for backwards compatibility but this functionality
46+
has been moved into the cached method of EnsemblRelease.
4647
"""
47-
version = check_release_number(version)
48-
species = check_species_object(species)
49-
key = (version, species)
50-
if key not in _cache:
51-
ensembl = EnsemblRelease(version, species=species)
52-
_cache[key] = ensembl
53-
return _cache[key]
48+
return EnsemblRelease.cached(release=release, species=species)
5449

5550
def genome_for_reference_name(reference_name):
5651
reference_name = normalize_reference_name(reference_name)
5752
species = find_species_by_reference(reference_name)
5853
(_, max_ensembl_release) = species.reference_assemblies[reference_name]
59-
return cached_release(max_ensembl_release, species=species)
54+
return cached_release(release=max_ensembl_release, species=species)
6055

6156
ensembl_grch36 = ensembl54 = cached_release(54) # last release for GRCh36/hg18
6257
ensembl_grch37 = ensembl75 = cached_release(75) # last release for GRCh37/hg19

pyensembl/ensembl_release.py

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
Contains the EnsemblRelease class, which extends the Genome class
1717
to be specific to (a particular release of) Ensembl.
1818
"""
19+
from weakref import WeakValueDictionary
1920

2021
from .genome import Genome
2122
from .ensembl_release_versions import check_release_number, MAX_ENSEMBL_RELEASE
@@ -32,18 +33,53 @@ class EnsemblRelease(Genome):
3233
Bundles together the genomic annotation and sequence data associated with
3334
a particular release of the Ensembl database.
3435
"""
35-
def __init__(self,
36-
release=MAX_ENSEMBL_RELEASE,
37-
species=human,
38-
server=ENSEMBL_FTP_SERVER):
39-
self.release = check_release_number(release)
40-
self.species = check_species_object(species)
41-
self.server = server
36+
37+
@classmethod
38+
def normalize_init_values(cls, release, species, server):
39+
"""
40+
Normalizes the arguments which uniquely specify an EnsemblRelease
41+
genome.
42+
"""
43+
release = check_release_number(release)
44+
species = check_species_object(species)
45+
return (release, species, server)
46+
47+
# Using a WeakValueDictionary instead of an ordinary dict to prevent a
48+
# memory leak in cases where we test many different releases in sequence.
49+
# When all the references to a particular EnsemblRelease die then that
50+
# genome should also be removed from this cache.
51+
_genome_cache = WeakValueDictionary()
52+
53+
@classmethod
54+
def cached(
55+
cls,
56+
release=MAX_ENSEMBL_RELEASE,
57+
species=human,
58+
server=ENSEMBL_FTP_SERVER):
59+
"""
60+
Construct EnsemblRelease if it's never been made before, otherwise
61+
return an old instance.
62+
"""
63+
init_args_tuple = cls.normalize_init_values(release, species, server)
64+
if init_args_tuple in cls._genome_cache:
65+
genome = cls._genome_cache[init_args_tuple]
66+
else:
67+
genome = cls._genome_cache[init_args_tuple] = cls(*init_args_tuple)
68+
return genome
69+
70+
def __init__(
71+
self,
72+
release=MAX_ENSEMBL_RELEASE,
73+
species=human,
74+
server=ENSEMBL_FTP_SERVER):
75+
self.release, self.species, self.server = self.normalize_init_values(
76+
release=release, species=species, server=server)
4277

4378
self.gtf_url = make_gtf_url(
4479
ensembl_release=self.release,
45-
species=species,
46-
server=server)
80+
species=self.species,
81+
server=self.server)
82+
4783
self.transcript_fasta_url = make_fasta_url(
4884
ensembl_release=self.release,
4985
species=self.species.latin_name,
@@ -53,7 +89,7 @@ def __init__(self,
5389
ensembl_release=self.release,
5490
species=self.species.latin_name,
5591
sequence_type="pep",
56-
server=server)
92+
server=self.server)
5793

5894
self.reference_name = self.species.which_reference(self.release)
5995

@@ -92,3 +128,10 @@ def to_dict(self):
92128
"species": self.species,
93129
"server": self.server
94130
}
131+
132+
@classmethod
133+
def from_dict(cls, state_dict):
134+
"""
135+
Deserialize EnsemblRelease without creating duplicate instances.
136+
"""
137+
return cls.cached(**state_dict)

test/test_serialization.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
setup_init_custom_mouse_genome
2626
)
2727

28-
2928
@test_ensembl_releases()
3029
def test_pickle_ensembl_gene(ensembl_genome):
3130
gene = ensembl_genome.gene_by_id(TP53_gene_id)
@@ -112,3 +111,10 @@ def test_species_to_json():
112111

113112
def test_species_to_pickle():
114113
eq_(human, pickle.loads(pickle.dumps(human)))
114+
115+
116+
@test_ensembl_releases()
117+
def test_unique_memory_address_of_unpickled_genomes(ensembl_genome):
118+
unpickled = pickle.loads(pickle.dumps(ensembl_genome))
119+
assert ensembl_genome is unpickled, \
120+
"Expected same object for %s but got two different instances" % (unpickled,)

0 commit comments

Comments
 (0)