Merge pull request #74 from COBREXA/mk-gapfill

implement gapfilling
COBREXA · Sep 24, 2024 · 9497ad2 · 9497ad2 · exaexa · Sep 24, 2024
2 parents 86b06b5 + 30f6b18
commit 9497ad2
Show file tree

Hide file tree

Showing 6 changed files with 357 additions and 1 deletion.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "COBREXA"
 uuid = "babc4406-5200-4a30-9033-bf5ae714c842"
 authors = ["The developers of COBREXA.jl"]
-version = "2.2.2"
+version = "2.3.0"
 
 [deps]
 AbstractFBCModels = "5a4f3dfa-1789-40f8-8221-69268c29937c"

diff --git a/docs/src/examples/05g-gapfilling.jl b/docs/src/examples/05g-gapfilling.jl
@@ -0,0 +1,138 @@
+
+# Copyright (c) 2024, University of Luxembourg                              #src
+# Copyright (c) 2024, Heinrich-Heine University Duesseldorf                 #src
+#                                                                           #src
+# Licensed under the Apache License, Version 2.0 (the "License");           #src
+# you may not use this file except in compliance with the License.          #src
+# You may obtain a copy of the License at                                   #src
+#                                                                           #src
+#     http://www.apache.org/licenses/LICENSE-2.0                            #src
+#                                                                           #src
+# Unless required by applicable law or agreed to in writing, software       #src
+# distributed under the License is distributed on an "AS IS" BASIS,         #src
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #src
+# See the License for the specific language governing permissions and       #src
+# limitations under the License.                                            #src
+
+# # Gap filling
+#
+# Gapfilling is used to find easiest additions to the models that would make
+# them feasible and capable of growth.
+#
+# Typically, an infeasible model ("with gaps") is used together with an
+# universal model (which contains "everything"), and the algorithm attempts to
+# find the minimal amount of reactions from the universal model that make the
+# gappy model happy. In turn, the gapfilling optimization problem becomes a
+# MILP.
+#
+# Gapfilling is sometimes used to produce "viable" genome-scale
+# reconstructions from partial ones, but without additional manual intervention
+# the gapfilling results are almost never biologically valid. A good use of
+# gapfilling is to find problems in a model that cause infeasibility as
+# follows: First the modeller makes a set of (unrealistic) universal reactions
+# that supply or remove metabolites, and after gapfilling, metabolites that had
+# to be supplied or removed to make the model feasible mark possible problems,
+# thus guiding the modeller towards correct solution.
+
+# We will use a partially crippled *E. coli* toy model and see the minimal
+# amount of reactions that may save it.
+
+using COBREXA
+
+download_model(
+    "http://bigg.ucsd.edu/static/models/e_coli_core.json",
+    "e_coli_core.json",
+    "7bedec10576cfe935b19218dc881f3fb14f890a1871448fc19a9b4ee15b448d8",
+)
+
+import JSONFBCModels, HiGHS
+model = load_model("e_coli_core.json")
+
+# First, let's produce an infeasible model:
+
+import AbstractFBCModels.CanonicalModel as CM
+infeasible_model = convert(CM.Model, model)
+
+for rxn in ["TALA", "PDH", "PGI", "PYK"]
+    infeasible_model.reactions[rxn].lower_bound = 0.0
+    infeasible_model.reactions[rxn].upper_bound = 0.0
+end
+
+# After removing the above reactions, the model will fail to solve:
+
+flux_balance_analysis(infeasible_model, optimizer = HiGHS.Optimizer) |> println
+
+# To avoid very subtle semantic issues, we are going to remove the ATP
+# maintenance pseudoreaction from the universal model:
+universal_model = convert(CM.Model, model)
+delete!(universal_model.reactions, "ATPM")
+
+# ## Making the model feasible with a minimal set of reactions
+
+# Which of the reactions we have to fill back to get the model working again?
+# First, let's run [`gap_filling_analysis`](@ref) to get a solution for a
+# system that implements the reaction patching:
+
+x = gap_filling_analysis(
+    infeasible_model,
+    universal_model,
+    0.05,
+    optimizer = HiGHS.Optimizer,
+)
+
+# The reactions that had to be re-added can be found from the `fill_flags`:
+
+filled_reactions = [k for (k, v) in x.fill_flags if v != 0]
+
+@test length(filled_reactions) == 1 #src
+
+# If we want to try to generate another solution, we have to explicitly ask the
+# system to avoid the previous solution. That is done via setting the argument
+# `known_fill`. We can also set the `max_cost` to avoid finding too benevolent
+# solutions:
+
+x2 = gap_filling_analysis(
+    infeasible_model,
+    universal_model,
+    0.05,
+    max_cost = 2.0,
+    known_fills = [x.fill_flags],
+    optimizer = HiGHS.Optimizer,
+)
+
+other_filled_reactions = [k for (k, v) in x2.fill_flags if v != 0]
+
+# ## Model debugging: which metabolite is missing?
+#
+# Gap-filling is great for detecting various broken links and imbalances in
+# metabolic models. We show how to find the metabolites are causing the
+# imbalance for our "broken" E. coli model.
+#
+# First, we construct a few completely unnatural reactions that create/remove
+# the metabolites from/to nowhere:
+
+magic_model = convert(CM.Model, model)
+empty!(magic_model.genes)
+empty!(magic_model.reactions)
+
+for mid in keys(magic_model.metabolites)
+    magic_model.reactions[mid] = CM.Reaction(
+        lower_bound = -100.0,
+        upper_bound = 100.0,
+        stoichiometry = Dict(mid => 1.0),
+    )
+end
+
+# Gapfilling now points to the metabolites that need to be somehow taken care
+# of by the modeller in order for the model to become feasible:
+
+xm = gap_filling_analysis(infeasible_model, magic_model, 0.05, optimizer = HiGHS.Optimizer)
+
+blocking_metabolites = [k for (k, v) in xm.fill_flags if v != 0]
+
+@test length(blocking_metabolites) == 1 #src
+
+# We can also have a look at how much of a given metabolite was used to make
+# the model feasible again:
+
+xm.universal_fluxes[first(blocking_metabolites)]
diff --git a/docs/src/reference/frontend.md b/docs/src/reference/frontend.md
@@ -57,6 +57,13 @@ Modules = [COBREXA]
 Pages = ["src/frontend/envelope.jl"]
 ```
 
+## Gap-filling
+
+```@autodocs
+Modules = [COBREXA]
+Pages = ["src/frontend/gapfill.jl"]
+```
+
 ## Knockout models
 
 ```@autodocs

diff --git a/src/COBREXA.jl b/src/COBREXA.jl
@@ -77,6 +77,7 @@ include("frontend/community.jl")
 include("frontend/concentrations.jl")
 include("frontend/envelope.jl")
 include("frontend/enzymes.jl")
+include("frontend/gapfill.jl")
 include("frontend/knockout.jl")
 include("frontend/loopless.jl")
 include("frontend/mmdf.jl")

diff --git a/src/frontend/gapfill.jl b/src/frontend/gapfill.jl
@@ -0,0 +1,190 @@
+
+# Copyright (c) 2024, University of Luxembourg
+# Copyright (c) 2024, Heinrich-Heine University Duesseldorf
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+$(TYPEDSIGNATURES)
+
+Make a gap-filling system from a FBC model with gaps and an universal
+FBC model that contains reactions to be added into the original model.
+
+The output system will be constrainted to reach at least `objective_target`
+flux through the objective function. Generally, this should be set to an
+arbitrary small value such as `0.05`.
+
+`universal_reaction_cost` should assign a numeric cost of inclusion of each of
+the reactions in the `universal_model`; by default all are assigned equal
+weight of `1`. `max_cost` puts an optional maximum limit on the cost, which may
+help the solver to avoid exploring unnecessarily complex solutions.
+`known_fills` may contain previous solutions of the same system; these will be
+made infeasible in the output constraint system in order to allow discovery of
+other ones.
+
+Additional arguments are forwarded to `flux_balance_constraints` that converts
+`model` to constraints.
+"""
+function gap_filling_constraints(
+    model::A.AbstractFBCModel,
+    universal_model::A.AbstractFBCModel,
+    objective_target::Float64;
+    universal_reaction_cost = _ -> 1.0,
+    max_cost = Inf,
+    known_fills::Vector{C.Tree{Float64}} = C.Tree{Float64}[],
+    kwargs...,
+)
+    m = flux_balance_constraints(model; kwargs...)
+    m.objective.bound = C.Between(objective_target, Inf)
+    s = m.flux_stoichiometry
+    delete!(m, :flux_stoichiometry)
+    u = flux_balance_constraints(universal_model)
+    gap_filling_constraints(;
+        system = m,
+        stoichiometry = s,
+        universal_fluxes = u.fluxes,
+        universal_stoichiometry = u.flux_stoichiometry,
+        flux_cost = i -> universal_reaction_cost(String(i)),
+        max_cost,
+        known_fills,
+    )
+end
+
+"""
+$(TYPEDSIGNATURES)
+
+Make a gap-fillign system from a non-solving `system` with a separated
+`stoichiometry`, filling in possible fluxes from `universal_fluxes` that are
+balanced with `universal_stoichiometry`
+
+`flux_cost` can be used to assign a weight to each given universal flux; the
+total cost is bounded by `max_cost`.
+
+`known_fills` may contain previous solutions to be avoided; these are processed
+by [`gap_filling_known_fill_constraint`](@ref) and attached to the system.
+
+`stoichiometry` needs to be extended to construct the final constraints, thus
+it should not be present in the original `system`.
+"""
+function gap_filling_constraints(;
+    system::C.ConstraintTree,
+    stoichiometry::C.ConstraintTree,
+    universal_fluxes::C.ConstraintTree,
+    universal_stoichiometry::C.ConstraintTree,
+    flux_cost = _ -> 1.0,
+    max_cost = Inf,
+    known_fills::Vector{C.Tree{Float64}} = C.Tree{Float64}[],
+)
+    joined =
+        C.ConstraintTree(:system => system, :stoichiometry => stoichiometry) +
+        :universal^C.ConstraintTree(
+            :fluxes => universal_fluxes,
+            :stoichiometry => universal_stoichiometry,
+        ) +
+        :fill_flags^C.variables_for(universal_fluxes) do _
+            Switch(0, 1)
+        end
+
+    return C.ConstraintTree(
+        :system => joined.system,
+        :universal_fluxes => joined.universal.fluxes,
+        :universal_flux_bounds => C.zip(joined.universal.fluxes, joined.fill_flags) do x, b
+            if x.bound isa C.Between
+                C.ConstraintTree(
+                    :lower => C.Constraint(
+                        x.value - x.bound.lower * b.value,
+                        C.Between(0, Inf),
+                    ),
+                    :upper => C.Constraint(
+                        x.value - x.bound.upper * b.value,
+                        C.Between(-Inf, 0),
+                    ),
+                )
+            elseif x.bound isa C.EqualTo
+                C.Constraint(x.value - x.bound.equal_to * b.value, 0)
+            elseif isnothing(x.bound)
+                C.ConstraintTree()
+            else
+                throw(DomainError(x.bound, "unsupported flux bound"))
+            end
+        end,
+        :stoichiometry =>
+            C.merge(joined.stoichiometry, joined.universal.stoichiometry) do a, b
+                ismissing(a) && return b
+                ismissing(b) && return a
+                @assert a.bound isa C.EqualTo &&
+                        a.bound.equal_to == 0.0 &&
+                        b.bound isa C.EqualTo &&
+                        b.bound.equal_to == 0.0 "Stoichiometries in both systems must only contain equal-to-zero-bounded constraints"
+                C.Constraint(a.value + b.value, 0.0)
+            end,
+        :fill_flags => joined.fill_flags,
+        (
+            Symbol(:known_fills_, i) =>
+                gap_filling_known_fill_constraint(joined.fill_flags, kf) for
+            (i, kf) in enumerate(known_fills)
+        )...,
+        :n_filled => C.Constraint(
+            C.sum(
+                (flux_cost(k) * v.value for (k, v) in joined.fill_flags),
+                init = zero(C.LinearValue),
+            ),
+            C.Between(0, max_cost),
+        ),
+    )
+end
+
+export gap_filling_constraints
+
+"""
+$(TYPEDSIGNATURES)
+
+Produce a constraint that can be added to the system made by
+[`gap_filling_constraints`](@ref) to avoid repeating of a solution that
+includes reactions already generated by another solution, as represented by the
+solved `fill_flags`.
+
+Parameter `fill_flags` are the gapfilling flags of the given constraint system,
+parameter `known_flags` is expected to contain the solved `fill_flags` for the
+solution that is to be avoided.
+"""
+gap_filling_known_fill_constraint(
+    fill_flags::C.ConstraintTree,
+    known_flags::C.Tree{Float64},
+) = C.Constraint(
+    C.sum(
+        values(C.zip(fill_flags, known_flags, C.Value) do f, k
+            k - f.value * k
+        end),
+        init = zero(C.LinearValue),
+    ),
+    (1 - eps(), Inf),
+)
+
+export gap_filling_known_fill_constraint
+
+"""
+$(TYPEDSIGNATURES)
+
+Run the gap-filling analysis on a constraint system specified by
+[`gap_filling_constraints`](@ref).
+"""
+gap_filling_analysis(args...; kwargs...) = frontend_optimized_values(
+    gap_filling_constraints,
+    args...;
+    objective = x -> x.n_filled.value,
+    sense = Minimal,
+    kwargs...,
+)
+
+export gap_filling_analysis
diff --git a/test/misc.jl b/test/misc.jl
@@ -104,3 +104,23 @@ end
     @test c.interface_balance.x.value.weights == [1.0, 2.0, 1.0, 3.0, -1.0]
 
 end
+
+@testset "Uncommon bounds in gapfilling" begin
+    vs = C.variables(keys = [:a, :b], bounds = [C.EqualTo(123), nothing])
+    stoi = C.variables(keys = [:c, :d], bounds = C.EqualTo(0))
+    x = gap_filling_constraints(
+        system = vs,
+        stoichiometry = stoi,
+        universal_fluxes = vs,
+        universal_stoichiometry = stoi,
+    )
+    @test x.universal_flux_bounds.a.bound isa C.EqualTo
+    @test isempty(x.universal_flux_bounds.b)
+    vs = C.variables(keys = [:a, :b], bounds = [C.EqualTo(123), Switch(1, 2)])
+    @test_throws DomainError gap_filling_constraints(
+        system = vs,
+        stoichiometry = stoi,
+        universal_fluxes = vs,
+        universal_stoichiometry = stoi,
+    )
+end