Skip to content

C++-based CHRR implementation #1453

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ cobra =
[options.extras_require]
array =
scipy
chrr =
hopsy
development =
black
bumpversion
Expand Down
6 changes: 6 additions & 0 deletions src/cobra/sampling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
from .hr_sampler import HRSampler, shared_np_array
from .achr import ACHRSampler
from .core import step

from .hopsy import hopsy_is_available

if hopsy_is_available:
from .hopsy import HopsySampler

from .optgp import OptGPSampler
from .sampling import sample
214 changes: 214 additions & 0 deletions src/cobra/sampling/hopsy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
"""Provide hopsy samplers."""

from typing import TYPE_CHECKING, Optional


if TYPE_CHECKING:
from cobra import Model

from ..core.configuration import Configuration
from .hr_sampler import HRSampler


__all__ = (
"HopsySampler",
"hopsy_is_available",
)

configuration = Configuration()

try:
import hopsy
import numpy as np
import pandas as pd

hopsy_is_available = True

# for the time being while the updated hopsy version isn't released
def back_transform(problem, samples):
"""
Transform samples from the sampling space to the original parameter space.

Parameters
----------
problem : hopsy.Problem
A hopsy.Problem with populated `problem.transformation` and `problem.shift` of
shape `(m, d)` and `(d, )`, respectively. If not populated, the identity
operation is performed.
samples : np.array
Parameter samples of shape `(..., d)` which are to be transformed.

Returns
-------
np.array
Transformed samples of shape `(..., m)`
"""
S, h = problem.transformation, problem.shift
transformed = samples @ S.T if S is not None else samples
shifted = transformed + h if h is not None else transformed
return shifted

class HopsySampler(HRSampler):
"""
Generic sampling wrapper for hopsy samplers.

Parameters
----------
model : cobra.Model
The cobra model from which to generate samples.
sampler : hopsy.Proposal, optional
The hopsy sampler for sampling the polytope
(default hopsy.UniformCoordinateHitAndRunProposal).
processes: int, optional
The number of processes used during sampling
(default cobra.Configuration.processes).
thinning : int, optional
The thinning factor of the generated sampling chain. A thinning of
10 means samples are returned every 10 steps (default 100).
nproj : int > 0, optional
How often to reproject the sampling point into the feasibility
space. Avoids numerical issues at the cost of lower sampling. If
you observe many equality constraint violations with
`sampler.validate` you should lower this number (default None).
rounding : bool, optional
Whether to precondition the sampling problem by applying an offline rounding
transformation.
seed : int > 0, optional
Sets the random number seed. Initialized to the current time stamp
if None (default None).

Attributes
----------
n_samples : int
The total number of samples that have been generated by this
sampler instance.
problem : typing.NamedTuple
A NamedTuple whose attributes define the entire sampling problem in
matrix form.
fwd_idx : numpy.array
A numpy array having one entry for each reaction in the model,
containing the index of the respective forward variable.
rev_idx : numpy.array
A numpy array having one entry for each reaction in the model,
containing the index of the respective reverse variable.
sampler :
The hopsy sampler type.
mcs : list
List of length `processes` of hopsy.MarkovChain objects.
rngs : list
List of length `processes` of hopsy.RandomNumberGenerator objects.

"""

def __init__(
self,
model: "Model",
sampler: hopsy.Proposal = hopsy.UniformCoordinateHitAndRunProposal,
thinning: int = 100,
processes: Optional[int] = None,
nproj: Optional[int] = None,
seed: Optional[int] = None,
rounding: bool = True,
**kwargs,
) -> None:
"""Initialize a new HopsySampler."""
super().__init__(model, thinning, nproj=nproj, seed=seed, **kwargs)

if self.problem.inequalities.shape[0] > 0:
A = self.problem.inequalities
b = self.problem.bounds
else:
# empty dummy constraint to set problem dimension
A = np.ones((0, self.problem.equalities.shape[1]))
b = np.ones(0)

problem = hopsy.Problem(A, b)
problem = hopsy.add_box_constraints(
problem,
self.problem.variable_bounds[0],
self.problem.variable_bounds[1],
simplify=False,
)
problem = hopsy.add_equality_constraints(
problem, self.problem.equalities, self.problem.b
)
problem = hopsy.round(problem, simplify=False) if rounding else problem

self.sampler = sampler

# batched transformation to better exploit vectorization
self.transformation = problem.transformation.copy()
self.shift = problem.shift.copy()

problem.transformation = None
problem.shift = None

self._problem = problem

if processes is None:
self.processes = configuration.processes
else:
self.processes = processes

self.mcs = None
self.rngs = None

self.mcs = [
hopsy.MarkovChain(self._problem, self.sampler) for i in range(processes)
]
self.rngs = [
hopsy.RandomNumberGenerator(self._seed, i) for i in range(processes)
]

def sample(self, n: int, fluxes: bool = True) -> pd.DataFrame:
"""Generate a set of samples by calling hopsy.

Parameters
----------
n : int
The minimum number of samples that are generated at once.
fluxes : bool, optional
Whether to return fluxes or the internal solver variables. If
set to False, will return a variable for each forward and
backward flux as well as all additional variables you might
have defined in the model (default True).

Returns
-------
pandas.DataFrame
Returns a pandas DataFrame with `n` rows, each containing a
flux sample.
"""
_, samples = hopsy.sample(
self.mcs,
self.rngs,
n_samples=n // self.processes,
thinning=self.thinning,
n_procs=self.processes,
)

# batched transformation to better exploit vectorization
self._problem.transformation = self.transformation
self._problem.shift = self.shift

samples = back_transform(self._problem, samples)

self._problem.transformation = None
self._problem.shift = None

samples = samples.reshape(-1, samples.shape[-1]) # flatten chain dim

if fluxes:
names = [r.id for r in self.model.reactions]

return pd.DataFrame(
samples[:, self.fwd_idx] - samples[:, self.rev_idx],
columns=names,
)
else:
names = [v.name for v in self.model.variables]

return pd.DataFrame(samples, columns=names)

except ModuleNotFoundError:
hopsy_is_available = False
77 changes: 63 additions & 14 deletions src/cobra/sampling/sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
import pandas as pd

from .achr import ACHRSampler
from .hopsy import hopsy_is_available


if hopsy_is_available:
from .hopsy import HopsySampler
else:
import warnings

from .optgp import OptGPSampler


Expand All @@ -15,22 +23,30 @@
def sample(
model: "Model",
n: int,
method: str = "optgp",
method: str = "chrr",
thinning: int = 100,
processes: int = 1,
seed: Optional[int] = None,
) -> pd.DataFrame:
"""Sample valid flux distributions from a cobra model.

Currently, two methods are supported:
Currently, three methods are supported:

1. 'chrr' (default) which uses a the coordinate Hit-and-Run algorithm
with rounding ([1]_), which has been shown to often outperform OptGP
and ACHR, refer [2]_, [3]_. The rounding transformation is performed
in an offline manner which inflicts a certain base cost, however
sampling is performed using the C++-based library ``hopsy`` ([4]_)
and is, thus, blazingly fast. Moreover, this method supports
parallel sampling.

1. 'optgp' (default) which uses the OptGPSampler that supports parallel
2. 'optgp' which uses the OptGPSampler that supports parallel
sampling. Requires large numbers of samples to be performant
(`n` > 1000). For smaller samples, 'achr' might be better suited.
For details, refer [1]_ .
For details, refer [5]_ .

2. 'achr' which uses artificial centering hit-and-run. This is a single
process method with good convergence. For details, refer [2]_ .
3. 'achr' which uses artificial centering hit-and-run. This is a single
process method with good convergence. For details, refer [6]_ .

Parameters
----------
Expand All @@ -48,8 +64,8 @@ def sample(
in benchmarks gives approximately uncorrelated samples. If set to 1
will return all iterates (default 100).
processes : int, optional
Only used for 'optgp'. The number of processes used to generate
samples (default 1).
Only used for 'optgp' and the hopsy samplers. The number of processes
used to generate samples (default 1).
seed : int > 0, optional
Sets the random number seed. Initialized to the current time stamp
if None (default None).
Expand All @@ -69,28 +85,61 @@ def sample(

References
----------
.. [1] Megchelenbrink W, Huynen M, Marchiori E (2014)
.. [1] Hulda S Haraldsdóttir, Ben Cousins, Ines Thiele, Ronan M.T Fleming,
Santosh Vempala,
CHRR: coordinate hit-and-run with rounding for uniform sampling of
constraint-based models,
Bioinformatics, Volume 33, Issue 11, June 2017, Pages 1741–1743,
https://doi.org/10.1093/bioinformatics/btx052

.. [2] Herrmann, H.A., Dyson, B.C., Vass, L. et al.
Flux sampling is a powerful tool to study metabolism under changing
environmental conditions.
npj Syst Biol Appl 5, 32 (2019).
https://doi.org/10.1038/s41540-019-0109-0

.. [3] Fallahi S, Skaug HJ, Alendal G (2020)
A comparison of Monte Carlo sampling methods for
metabolic network models.
PLoS ONE 15(7): e0235393.
https://doi.org/10.1371/journal.pone.0235393

.. [4] Richard D Paul, Johann F Jadebeck, Anton Stratmann, Wolfgang Wiechert,
Katharina Nöh,
hopsy — a methods marketplace for convex polytope sampling in Python,
Bioinformatics, Volume 40, Issue 7, July 2024, btae430,
https://doi.org/10.1093/bioinformatics/btae430

.. [5] Megchelenbrink W, Huynen M, Marchiori E (2014)
optGpSampler: An Improved Tool for Uniformly Sampling the Solution-Space
of Genome-Scale Metabolic Networks.
PLoS ONE 9(2): e86587.
https://doi.org/10.1371/journal.pone.0086587

.. [2] Direction Choice for Accelerated Convergence in Hit-and-Run Sampling
.. [6] Direction Choice for Accelerated Convergence in Hit-and-Run Sampling
David E. Kaufman, Robert L. Smith
Operations Research 199846:1 , 84-95
https://doi.org/10.1287/opre.46.1.84

"""
if not hopsy_is_available and method == "chrr":
method = "optgp"
warnings.warn(
"hopsy and thus chrr are not available, switching to optgp.", stacklevel=2
)

if method == "optgp":
sampler = OptGPSampler(model, processes=processes, thinning=thinning, seed=seed)
elif method == "achr":
sampler = ACHRSampler(model, thinning=thinning, seed=seed)
elif method == "chrr":
sampler = HopsySampler(
model, processes=processes, thinning=thinning, seed=seed, rounding=True
)
else:
raise ValueError(
f'Invalid value: "{method}" for method used. '
'The value must be "optgp" or "achr".'
'The value must be "optgp", "achr" or "chrr".'
)

return pd.DataFrame(
columns=[rxn.id for rxn in model.reactions], data=sampler.sample(n)
)
return sampler.sample(n)
Loading