- # -*- coding: utf-8 -*-
-
- """Provide OptGP sampler."""
-
- from __future__ import absolute_import, division
+ """Provide the OptGP sampler class and helper functions."""

from multiprocessing import Pool
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple

import numpy as np
- import pandas
+ import pandas as pd
+
+ from ..core.configuration import Configuration
+ from .core import step
+ from .hr_sampler import HRSampler, shared_np_array
+

- from cobra.core.configuration import Configuration
- from cobra.sampling.core import step
- from cobra.sampling.hr_sampler import HRSampler, shared_np_array
+ if TYPE_CHECKING:
+     from cobra import Model
+     from cobra.sampling import OptGPSampler


__all__ = ("OptGPSampler",)

CONFIGURATION = Configuration()


- def mp_init(obj):
+ def mp_init(obj: "OptGPSampler") -> None:
    """Initialize the multiprocessing pool."""
    global sampler
    sampler = obj


# Unfortunately this has to be outside the class to be usable with
# multiprocessing :()
- def _sample_chain(args):
+ def _sample_chain(args: Tuple[int, int]) -> Tuple[int, "OptGPSampler"]:
    """Sample a single chain for OptGPSampler.

-     center and n_samples are updated locally and forgotten afterwards.
+     `center` and `n_samples` are updated locally and forgotten afterwards.

    """
    n, idx = args  # has to be this way to work in Python 2.7
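The module-level pair `mp_init` / `_sample_chain` exists so that a
`multiprocessing.Pool` can hand the sampler to every worker exactly once,
through the pool initializer, instead of pickling it with each submitted task.
A minimal sketch of that wiring, assuming `model` is an already-built
cobra.Model; the pool setup and the chain sizes shown here are illustrative
and not part of this diff:

    from multiprocessing import Pool

    from cobra.sampling.optgp import OptGPSampler, _sample_chain, mp_init

    sampler = OptGPSampler(model, processes=4)  # assumes `model` exists

    # Each worker receives the sampler once via mp_init(sampler), then draws
    # one chain; _sample_chain unpacks its single argument as (n, idx).
    pool = Pool(4, initializer=mp_init, initargs=(sampler,))
    chains = pool.map(_sample_chain, [(100, i) for i in range(4)])
    pool.close()
    pool.join()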
@@ -67,59 +69,52 @@ def _sample_chain(args):


class OptGPSampler(HRSampler):
-     """A parallel optimized sampler.
+     """
+     Improved Artificial Centering Hit-and-Run sampler.

-     A parallel sampler with fast convergence and parallel execution. See [1]_
-     for details.
+     A parallel sampler with fast convergence and parallel execution.
+     See [1]_ for details.

    Parameters
    ----------
    model : cobra.Model
        The cobra model from which to generate samples.
-     processes: int, optional (default Configuration.processes)
-         The number of processes used during sampling.
+     processes: int, optional
+         The number of processes used during sampling
+         (default cobra.Configuration.processes).
    thinning : int, optional
-         The thinning factor of the generated sampling chain. A thinning of 10
-         means samples are returned every 10 steps.
+         The thinning factor of the generated sampling chain. A thinning of
+         10 means samples are returned every 10 steps (default 100).
    nproj : int > 0, optional
-         How often to reproject the sampling point into the feasibility space.
-         Avoids numerical issues at the cost of lower sampling. If you observe
-         many equality constraint violations with `sampler.validate` you should
-         lower this number.
+         How often to reproject the sampling point into the feasibility
+         space. Avoids numerical issues at the cost of lower sampling. If
+         you observe many equality constraint violations with
+         `sampler.validate` you should lower this number (default None).
    seed : int > 0, optional
-         Sets the random number seed. Initialized to the current time stamp if
-         None.
+         Sets the random number seed. Initialized to the current time stamp
+         if None (default None).

    Attributes
    ----------
-     model : cobra.Model
-         The cobra model from which the samples get generated.
-     thinning : int
-         The currently used thinning factor.
    n_samples : int
        The total number of samples that have been generated by this
        sampler instance.
-     problem : collections.namedtuple
-         A python object whose attributes define the entire sampling problem in
-         matrix form. See docstring of `Problem`.
+     problem : typing.NamedTuple
+         A NamedTuple whose attributes define the entire sampling problem in
+         matrix form.
    warmup : numpy.matrix
-         A matrix of with as many columns as reactions in the model and more
-         than 3 rows containing a warmup sample in each row. None if no warmup
-         points have been generated yet.
+         A numpy matrix with as many columns as reactions in the model and
+         more than 3 rows containing a warmup sample in each row. None if no
+         warmup points have been generated yet.
    retries : int
        The overall number of sampling retries the sampler has observed.
        Larger values indicate numerical instabilities.
-     seed : int > 0, optional
-         Sets the random number seed. Initialized to the current time stamp if
-         None.
-     nproj : int
-         How often to reproject the sampling point into the feasibility space.
    fwd_idx : numpy.array
-         Has one entry for each reaction in the model containing the index of
-         the respective forward variable.
+         A numpy array having one entry for each reaction in the model,
+         containing the index of the respective forward variable.
    rev_idx : numpy.array
-         Has one entry for each reaction in the model containing the index of
-         the respective reverse variable.
+         A numpy array having one entry for each reaction in the model,
+         containing the index of the respective reverse variable.
    prev : numpy.array
        The current/last flux sample generated.
    center : numpy.array
@@ -129,20 +124,20 @@ class OptGPSampler(HRSampler):
    Notes
    -----
    The sampler is very similar to artificial centering where each process
-     samples its own chain. Initial points are chosen randomly from the warmup
-     points followed by a linear transformation that pulls the points a little
-     bit towards the center of the sampling space.
+     samples its own chain. Initial points are chosen randomly from the
+     warmup points followed by a linear transformation that pulls the points
+     a little bit towards the center of the sampling space.

    If the number of processes used is larger than one, the requested
    number of samples is adjusted to the smallest multiple of the number of
    processes larger than the requested sample number. For instance, if you
-     have 3 processes and request 8 samples you will receive 9.
+     have 3 processes and request 8 samples, you will receive 9.

-     Memory usage is roughly in the order of (2 * number reactions)^2
-     due to the required nullspace matrices and warmup points. So large
-     models easily take up a few GB of RAM. However, most of the large matrices
-     are kept in shared memory. So the RAM usage is independent of the number
-     of processes.
+     Memory usage is roughly in the order of (2 * number of reactions)^2
+     due to the required nullspace matrices and warmup points. So, large
+     models easily take up a few GBs of RAM. However, most of the large
+     matrices are kept in shared memory. So the RAM usage is independent of
+     the number of processes.

    References
    ----------
@@ -154,9 +149,17 @@ class OptGPSampler(HRSampler):

    """

-     def __init__(self, model, processes=None, thinning=100, nproj=None, seed=None):
+     def __init__(
+         self,
+         model: "Model",
+         thinning: int = 100,
+         processes: Optional[int] = None,
+         nproj: Optional[int] = None,
+         seed: Optional[int] = None,
+         **kwargs
+     ) -> None:
        """Initialize a new OptGPSampler."""
-         super(OptGPSampler, self).__init__(model, thinning, seed=seed)
+         super().__init__(model, thinning, nproj=nproj, seed=seed, **kwargs)
        self.generate_fva_warmup()

        if processes is None:
@@ -170,37 +173,37 @@ def __init__(self, model, processes=None, thinning=100, nproj=None, seed=None):
            (len(self.model.variables),), self.warmup.mean(axis=0)
        )

-     def sample(self, n, fluxes=True):
+     def sample(self, n: int, fluxes: bool = True) -> pd.DataFrame:
        """Generate a set of samples.

        This is the basic sampling function for all hit-and-run samplers.

        Parameters
        ----------
        n : int
-             The minimum number of samples that are generated at once
-             (see Notes).
-         fluxes : boolean
-             Whether to return fluxes or the internal solver variables. If set
-             to False will return a variable for each forward and backward flux
-             as well as all additional variables you might have defined in the
-             model.
+             The minimum number of samples that are generated at once.
+         fluxes : bool, optional
+             Whether to return fluxes or the internal solver variables. If
+             set to False, will return a variable for each forward and
+             backward flux as well as all additional variables you might
+             have defined in the model (default True).

        Returns
        -------
-         numpy.matrix
-             Returns a matrix with `n` rows, each containing a flux sample.
+         pandas.DataFrame
+             Returns a pandas DataFrame with `n` rows, each containing a
+             flux sample.

        Notes
        -----
        Performance of this function linearly depends on the number
        of reactions in your model and the thinning factor.

        If the number of processes is larger than one, computation is split
-         across as the CPUs of your machine. This may shorten computation time.
-         However, there is also overhead in setting up parallel computation so
-         we recommend to calculate large numbers of samples at once
-         (`n` > 1000).
+         across the CPU cores of your machine. This may shorten computation
+         time. However, there is also overhead in setting up parallel
+         computation primitives, so we recommend calculating large numbers
+         of samples at once (`n` > 1000).

        """
        if self.processes > 1:
@@ -234,17 +237,17 @@ def sample(self, n, fluxes=True):
        if fluxes:
            names = [r.id for r in self.model.reactions]

-             return pandas.DataFrame(
+             return pd.DataFrame(
                chains[:, self.fwd_idx] - chains[:, self.rev_idx],
                columns=names,
            )
        else:
            names = [v.name for v in self.model.variables]

-             return pandas.DataFrame(chains, columns=names)
+             return pd.DataFrame(chains, columns=names)

    # Models can be large so don't pass them around during multiprocessing
-     def __getstate__(self):
+     def __getstate__(self) -> Dict:
        """Return the object for serialization."""
        d = dict(self.__dict__)
        del d["model"]
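Taken together, the updated docstrings describe the public workflow: build the
sampler from a model, then call `sample`. A short usage sketch against that
documented API; `model` is assumed to be an existing cobra.Model, and the
sample counts are arbitrary:

    from cobra.sampling import OptGPSampler

    # Four parallel chains; thinning=100 keeps every 100th step (the default).
    sampler = OptGPSampler(model, processes=4, thinning=100, seed=42)

    # Returns a pandas.DataFrame with one column per reaction. Per the class
    # notes, the requested count is adjusted upward to a multiple of the
    # number of processes (e.g. 8 samples over 3 processes become 9).
    samples = sampler.sample(1000)

    # Solver-level variables (forward/reverse fluxes and any extra variables)
    # instead of net fluxes.
    raw = sampler.sample(1000, fluxes=False)

    # Check the samples for constraint violations, as the nproj docstring
    # suggests; "v" marks a valid sample.
    print(sampler.validate(samples))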