
Online quantization algorithm for gudhi #536

Draft · wants to merge 18 commits into master
8 changes: 8 additions & 0 deletions biblio/bibliography.bib
@@ -78,6 +78,14 @@ @article{boissonnatmariasimplextreealgorithmica
language={English}
}

@inproceedings{divolLacombe2021quantization,
title={Estimation and Quantization of Expected Persistence Diagrams},
author={Divol, Vincent and Lacombe, Th{\'e}o},
booktitle={International Conference on Machine Learning (ICML)},
year={2021},
url = {https://arxiv.org/abs/2105.04852}
}

@inproceedings{Garland,
author = {Garland, Michael and Heckbert, Paul S.},
title = {Surface simplification using quadric error metrics},
5 changes: 5 additions & 0 deletions src/python/CMakeLists.txt
@@ -530,6 +530,11 @@ if(PYTHONINTERP_FOUND)
endif()
endif()

# Quantization
if(SCIPY_FOUND)
add_gudhi_py_test(test_quantization)
endif()

# Representations
if(SKLEARN_FOUND AND MATPLOTLIB_FOUND AND OT_FOUND AND NOT CGAL_VERSION VERSION_LESS 4.11.0)
add_gudhi_py_test(test_representations)
Binary file added src/python/doc/img/quantiz.gif
73 changes: 73 additions & 0 deletions src/python/doc/wasserstein_distance_user.rst
@@ -193,9 +193,82 @@ The output is:
[0.7375 0.7625 ]
[0.2375 0.2625 ]]



Tutorial
********

This
`notebook <https://github.com/GUDHI/TDA-tutorial/blob/master/Tuto-GUDHI-Barycenters-of-persistence-diagrams.ipynb>`_
presents the concept of barycenter, or Fréchet mean, of a family of persistence diagrams.


Quantization
------------

:Since: GUDHI 3.5.0
:Requires: `SciPy <installation.html#scipy>`_

The quantization problem consists, given a measure :math:`\mu` (e.g. a persistence diagram) and a budget :math:`k`,
in finding a measure :math:`\nu` supported on :math:`k` points that approximates :math:`\mu` as well as possible;
the :math:`k`-means problem is a particular instance of the quantization problem.
The output of a quantization algorithm is often referred to as a codebook; it consists of :math:`k` points (centroids)
optimally positioned to summarize the input measure :math:`\mu`.
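
Formally (a sketch in our notation, following :cite:`divolLacombe2021quantization` rather than an exact statement),
the codebook :math:`c = (c_1, \dots, c_k)` is chosen to minimize a :math:`k`-means-like risk in which points may
also be matched to the diagonal :math:`\Delta` at no cost to the budget:

.. math::

    R(c) = \int \min\left( \min_{1 \leq j \leq k} \| x - c_j \|^2, \, \mathrm{dist}(x, \Delta)^2 \right) \mathrm{d}\mu(x).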

If one has access to a sample of measures instead, one may perform online quantization: given a sample
:math:`\mu_1,\dots,\mu_n` and an initial codebook :math:`c`, the codebook is progressively updated by going through
(possibly batches of) the :math:`(\mu_i)_i`. Theoretical properties (including convergence rates and guarantees) of the
final codebook are provided in :cite:`divolLacombe2021quantization`.
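
Concretely, writing :math:`V_j^{(t)}` for the set of points of the :math:`t`-th batch assigned to centroid
:math:`c_j^{(t)}` (points closer to the diagonal than to any centroid are left aside), the update rule implemented
below moves each centroid toward the barycenter of its cell with a decreasing step size:

.. math::

    c_j^{(t+1)} = c_j^{(t)} - \frac{1}{t+1} \cdot \frac{1}{|V_j^{(t)}|} \sum_{x \in V_j^{(t)}} \left( c_j^{(t)} - x \right).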

The figure below showcases the online-quantization approach provided by the
``gudhi.wasserstein.quantization`` method.
In this experiment, Rips persistence diagrams (in homology dimension 1) are built from random point clouds sampled on
different tori with some additional noise.
Starting from an initial codebook ``c0``, the centroids are iteratively updated as new diagrams are provided.
As we use the Wasserstein metric between persistence diagrams (denoted here by :math:`\mathrm{OT}_2`), points in the
diagrams that are close to the diagonal do not interfere with the codebook update process.
Member: So it is the same as having an implicit point on the diagonal in the codebook?

Contributor Author: More precisely, having a point in the codebook that represents "all the points on the diagonal" (or, formally, looking at the quotient space where you identify the points on the diagonal).

Doing so, the final codebook is able to properly retrieve the two 1-dimensional features of the underlying point clouds
(the two loops generating the tori).

.. figure::
    ./img/quantiz.gif
    :figclass: align-center

    Online quantization of a family of persistence diagrams built from random tori.

Member: On the one hand, the GIF is cool. On the other hand, I have trouble reading the doc with that thing moving on my screen...

.. autofunction:: gudhi.wasserstein.quantization

Basic example
*************

This example outputs a codebook for a set of diagrams with two prominent modes around :math:`(0,1)` and :math:`(0,3)`.
As is often the case, the diagrams also contain points near the diagonal. As we compare the codebook and the input
diagrams using the standard :math:`2`-Wasserstein metric between persistence diagrams (which maps points to the
diagonal rather than to any centroid when this reduces the matching cost), those points do not influence the centroid
positions; this is an improvement over using vanilla :math:`k`-means (Lloyd's algorithm) to quantize persistence
diagrams.

.. testcode::

    import numpy as np
    from gudhi.wasserstein import quantization

    d1 = np.array([[0, 1.001], [0, 3], [2, 2.001], [3, 3.001], [1, 1.01]])
    d2 = np.array([[0, 1], [0, 3.001], [0, 0.001]])
    d3 = np.array([[0, 0.999], [0, 3.002], [0, 2.998]])
    d4 = np.array([[0, 0.003], [0, 1.001], [0, 3.004], [4, 4.01]])
    pdiagset = [d1, d2, d3, d4]
    c_final = quantization(pdiagset, k=2)
    print("Final codebook obtained:")
    print(c_final)

.. testoutput::

    Final codebook obtained:
    [[0. 1.00025]
     [0. 3.00125]]
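
One may also pass an explicit initial codebook through ``init``, in which case ``k`` is ignored. A minimal sketch,
reusing the ``pdiagset`` above (the hand-picked centroids are purely illustrative):

.. code-block:: python

    # Start from centroids placed near the two expected modes.
    c0 = np.array([[0., 1.], [0., 3.]])
    c_final = quantization(pdiagset, init=c0)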


Tutorial
********


This `notebook <https://github.com/GUDHI/TDA-tutorial/blob/master/Tuto-GUDHI-Quantization-of-persistence-diagrams.ipynb>`_
presents the concept of quantization of a family of persistence diagrams and the resulting codebooks.

1 change: 1 addition & 0 deletions src/python/gudhi/wasserstein/__init__.py
@@ -1 +1,2 @@
from .wasserstein import wasserstein_distance
from .quantization import quantization
49 changes: 49 additions & 0 deletions src/python/gudhi/wasserstein/_utils.py
@@ -0,0 +1,49 @@
# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
# Author(s): Theo Lacombe
#
# Copyright (C) 2021 Université Gustave Eiffel
#
# Modification(s):
# - YYYY/MM Author: Description of the modification

import numpy as np
import scipy.spatial.distance as sc


def _dist_to_diag(X, internal_p):
"""
    :param X: (n x 2) array encoding the points of a persistence diagram.
    :param internal_p: Ground metric (i.e. norm L^p).
    :returns: (n) array encoding the (respective orthogonal) distances of the points to the diagonal.

.. note::
Assumes that the points are above the diagonal.
"""
return (X[:, 1] - X[:, 0]) * 2 ** (1.0 / internal_p - 1)


def _build_dist_matrix(X, Y, order, internal_p):
"""
:param X: (n x 2) numpy.array encoding the (points of the) first diagram.
:param Y: (m x 2) numpy.array encoding the second diagram.
:param order: exponent for the Wasserstein metric.
:param internal_p: Ground metric (i.e. norm L^p).
    :returns: (n+1) x (m+1) np.array encoding the cost matrix C.
              For 0 <= i < n, 0 <= j < m, C[i, j] encodes the distance between X[i] and Y[j] raised to the power
              ``order``, while C[i, m] (resp. C[n, j]) encodes the distance (raised to the same power) between X[i]
              (resp. Y[j]) and its orthogonal projection onto the diagonal.
              Note also that C[n, m] = 0 (it costs nothing to move from the diagonal to the diagonal).
"""
    Cxd = _dist_to_diag(X, internal_p)**order
    Cdy = _dist_to_diag(Y, internal_p)**order
    if np.isinf(internal_p):
        C = sc.cdist(X, Y, metric='chebyshev')**order
    else:
        C = sc.cdist(X, Y, metric='minkowski', p=internal_p)**order
    Cf = np.hstack((C, Cxd[:, None]))
    Cdy = np.append(Cdy, 0)

    Cf = np.vstack((Cf, Cdy[None, :]))

    return Cf
189 changes: 189 additions & 0 deletions src/python/gudhi/wasserstein/quantization.py
@@ -0,0 +1,189 @@
# This file is part of the Gudhi Library - https://gudhi.inria.fr/ - which is released under MIT.
# See file LICENSE or go to https://gudhi.inria.fr/licensing/ for full license details.
# Author(s): Theo Lacombe
#
# Copyright (C) 2021 Université Gustave Eiffel
#
# Modification(s):
# - YYYY/MM Author: Description of the modification


import numpy as np
from gudhi.wasserstein import _utils
import warnings


def _get_cells(X, c, withdiag, internal_p):
"""
Given a point cloud `X` and a codebook (set of centroids) `c`,
assign to each point in X its nearest centroid in c.
    If withdiag is True, points which are nearer to the diagonal than to any centroid are assigned to an
    additional cell.

    Parameters
    ----------
    :param X: Current set of points, of size (n x d) (n: number of points, and d=2 for persistence diagrams).
    :type X: `numpy.array`
    :param c: Current set of centroids (codebook), of size (k x d) (k centroids in dimension d, d=2 for PDs).
        It is assumed that k >= 1.
    :param withdiag: Indicates whether the diagonal is taken into account (True when working with PDs).
    :type withdiag: bool
    :param internal_p: The ground metric used to compute distances between points (norm L^p).

    :returns: list `cells` of assignments, such that cells[j] = points in `X` matched to `c[j]`,
        with the convention that `cells[k]` represents the diagonal whenever `withdiag=True`.

    .. note:: If the input X is empty (e.g. all diagrams in a batch are empty), a warning is raised
        and a list of empty cells is returned.

"""
k = c.shape[0]

    if X.shape[0] == 0:
        warnings.warn("Input point cloud is empty; cell assignment is a list of empty lists.")
        return [[]] * (k + withdiag)  # k or (k+1) cells, depending on the additional centroid for the diagonal.

    if X.shape[1] != 2:
        raise ValueError("Input batch must be of shape (n x 2), not (%s, %s)" % X.shape)

    # Note: the value of order is irrelevant here (argmin is unchanged by it), we just need nearest neighbors.
    M = _utils._build_dist_matrix(X, c, order=2, internal_p=internal_p)

if withdiag:
a = np.argmin(M[:-1, :], axis=1)
else:
a = np.argmin(M[:-1, :-1], axis=1)
Member: It feels a bit strange to call _build_dist_matrix, whose main difference with cdist is that it adds the diagonal, just to drop the diagonal immediately... But I don't think it really matters.
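
(A minimal sketch of the reviewer's alternative, not part of the PR; it assumes withdiag is False, a finite
internal_p, and that scipy.spatial.distance is imported as sc:)

    # Hypothetical: call cdist directly when the diagonal is not needed.
    a = np.argmin(sc.cdist(X, c, metric='minkowski', p=internal_p), axis=1)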


cells = [X[a == j] for j in range(k)]

if withdiag:
cells.append(X[a == k]) # this is the (k+1)-th centroid corresponding to the diagonal

return cells


def _from_batch(pdiagset, batches_indices):
"""
    Given a pdiagset and a list of indices, compute the "empirical expected persistence diagram"
    (in practice, a concatenation) on the corresponding batch of diagrams.
    Note that we may encounter empty diagrams, hence the specific check `.ndim == 2`.
    If the resulting list is empty (all diagrams in the batch are empty), returns an empty array.
"""
list_of_non_empty_diags = [pdiagset[i] for i in batches_indices if pdiagset[i].ndim==2]
if list_of_non_empty_diags: # There are some non-empty diagrams
X_batch = np.concatenate(list_of_non_empty_diags)
return X_batch
else:
return np.array([])
Member: It is sometimes useful to force the shape of empty arrays, to (0,2) for instance. I don't know if that's the case here.
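
(A minimal sketch of that suggestion, not applied in this PR:)

    # Hypothetical: force an explicit (0, 2) shape for the empty case,
    # so that downstream code can index columns uniformly.
    return np.empty((0, 2))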



def _init_c(pdiagset, k, internal_p=2):
"""
A naive heuristic to initialize a codebook: we take the k points with largest distances to the diagonal
Member: What if the first diagram has fewer than k points?

in the first diagram of the list.

Parameters
----------
    :param pdiagset: The list of diagrams we want to quantize. Assumed to be non-empty (checked by quantization).
    :param k: number of centroids in the codebook.
    :param internal_p: Internal ground metric. Should not play a specific role here.

    :returns: an initial codebook c0 with (at most) `k` entries.
    """
dgm = pdiagset[0]
w = _utils._dist_to_diag(dgm, internal_p)
s = np.argsort(w)
c0 = dgm[s[-k:]] # Elements in dgm with larger persistence are last
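    # Note: if dgm has fewer than k points, s[-k:] simply returns all of them,
    # so the initial codebook may contain fewer than k centroids.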
return c0


#####################
### Main method ###
#####################

def quantization(pdiagset, k=2, init=None, batch_size=1, order=2., internal_p=2.):
"""
This quantization algorithm takes a list of diagrams ``pdiagset``, an integer ``k``
(or an initial codebook guess ``c0``)
    and returns a codebook of ``k`` centroids that aims at minimizing the distance between the codebook and the
    expected persistence diagram underlying the diagrams in ``pdiagset``.
The codebook is iteratively updated by going through (batches of) persistence diagrams.

:param pdiagset: a set of persistence diagrams that we want to quantize.
:type pdiagset: list of n x 2 numpy arrays
:param k: number of centroids. Default is ``2``. A naive heuristic is used to initialize the codebook.
Not used if an initial codebook ``c0`` has been provided.
:type k: ``int``
    :param init: If provided, overrides the provided ``k`` (which becomes ``init.shape[0]``) and is used as the
        initialization for the algorithm. Default is ``None``.
    :type init: (k x 2) ``numpy.array``, or ``None``.
:param batch_size: Size of batches used during the online exploration of the ``pdiagset``.
Default is ``1``.
Member: As a user, should I stick to the default value of 1? If I already have all the diagrams, I may think that I don't need an online algorithm, which is for when data appears progressively, and consider using one huge batch under the impression that it disables the "online" stuff and gets the best result.
:type batch_size: ``int``
:param order: Order of the Wasserstein distance we want to minimize.
Only 2 is implemented for now. Default is ``2.``.
:type order: ``float``
    :param internal_p: Ground metric used for centroid assignment. Default is ``2.``.
:type internal_p: ``float``

    :returns: The final codebook obtained after going through all of ``pdiagset``.
    :rtype: (k x 2) ``numpy.array``
Member: :rtype: kx2 numpy array?


.. note:: The exact algorithm presented in the reference paper
requires more technical considerations in order to prove theoretical convergence rates.
A theoretically motivated value (asymptotically optimal) for the ``batch_size`` is
``int(log(len(pdiagset)))``.
However, numerical experiments suggest that this simplified version yields substantially similar results.
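        For instance (illustrative only): ``quantization(pdiagset, batch_size=max(1, int(np.log(len(pdiagset)))))``.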
"""

# We take the diagonal into account in the following.
# This could be turned into a parameter of the algorithm (so that it would encompass the standard
# "online-Lloyd algorithm"), but I thought this may be confusing.
withdiag = True

if not pdiagset:
raise ValueError("Input pdiagset is empty.")
# If no codebook has been provided, we propose an initialization.
if init is None:
init = _init_c(pdiagset, k)
# Number of centroids in the codebook (updated in the case c0 was provided by the user).
k = init.shape[0]

# Variable that will store the centroid evolution.
c_current = init.copy()

# number of diagrams
n = len(pdiagset)

# Building the batches (size batch_size, except possibly for the last one): a list of the form
# [[0,1,..., batch_size-1],[batch_size,..., 2batch_size-1],...,[remaining diags]]
    # Note: we have to add (n % batch_size != 0) to increase the number of batches by 1 if needed.
nb_step = n // batch_size + (n % batch_size != 0)
batches = np.array_split(np.arange(0, n, dtype=int), nb_step)

# We now iterate through the diagrams
for t in range(nb_step):
# Get the empirical expected persistence diagram on the current batch
X_bar_t = _from_batch(pdiagset, batches[t])
# We compute the corresponding cells
cells = _get_cells(X_bar_t, c_current, withdiag=withdiag, internal_p=internal_p)

if order == 2.:
# We update the position of each centroid.
for j in range(k):
lc1 = len(cells[j])
                # The cell may be empty (if c_current[j] does not catch any point),
                # in which case we do not update it.
if lc1 > 0:
# We compute the "gradient", i.e. the optimal descent direction for c_current[j]
# When order == 2, it just consists of pushing c_current[j] towards the barycenter
# of the cells[j] it defines.
grad = np.mean(c_current[j] - cells[j], axis=0)
# We then apply the gradient, with a 1/(1+t) factor to guarantee the convergence of this
# stochastic-gradient-descent like approach (decreasing learning rate).
c_current[j] = c_current[j] - grad / (t + 1)
        else:
            raise NotImplementedError('Order = %s is not available yet. Only order=2. is valid.' % order)
Member: I think you could error out earlier (or not provide this option at all and just say that it is W2).


return c_current