Merge branch 'similarity-measures' into 'main'

functionality to do time-series similarity calculations See merge request water/computational-tools/surface-water-work/hyswap!22
DOI-USGS · Aug 21, 2023 · dcc7916 · dcc7916
2 parents c646d0d + 3d0cbae
commit dcc7916
Show file tree

Hide file tree

Showing 11 changed files with 678 additions and 2 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -92,6 +92,7 @@
 from hyswap import percentiles
 from hyswap import cumulative
 from hyswap import plots
+from hyswap import similarity
 from hyswap import runoff
 import numpy as np
 import pandas as pd
@@ -144,5 +145,6 @@
 linkcheck_exclude_documents = [
     r'meta/disclaimer*',
     r'meta/contributing*',
-    r'meta/installation*'
+    r'meta/installation*',
+    r'examples/similarity_examples',
 ]
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
@@ -34,4 +34,12 @@ Cumulative Hydrograph Examples
 .. toctree::
     :maxdepth: 2
 
-    cumulative_hydrograph_examples
+    cumulative_hydrograph_examples
+
+Similarity Examples
+-------------------
+
+.. toctree::
+    :maxdepth: 2
+
+    similarity_examples
diff --git a/docs/source/examples/similarity_examples.rst b/docs/source/examples/similarity_examples.rst
@@ -0,0 +1,158 @@
+
+Similarity Measures
+-------------------
+
+These examples showcase the usage of the functions in the `similarity` module, with heatmap visualizations via the :obj:`hyswap.plots.plot_similarity_heatmap` function.
+Sometimes it is helpful to compare the relationships between a set of stations and their respective measurements.
+The `similarity` functions packaged in `hyswap` handle some of the data clean-up for you by ensuring the time-series of observations being compared at the same, and by removing any missing data.
+This ensures that your results are not skewed by missing data or gaps in one of the time-series.
+
+
+Correlations Between 5 Stations
+*******************************
+
+The following example shows the correlations between streamflow at 5 stations (07374525, 07374000, 07289000, 07032000, 07024175) along the Mississippi River, listed from downstream to upstream.
+First we have to fetch the streamflow data for these stations, to do this we will use the `dataretrieval` package to access the NWIS database.
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+Once we've collected the streamflow data, we will calculate the pair-wise correlations between the stations using the :obj:`hyswap.similarity.calculate_correlations` function and then plot the results using :obj:`hyswap.plots.plot_similarity_heatmap`.
+
+.. plot::
+    :context:
+    :include-source:
+
+    # calculate correlations
+    results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Pearson Correlation Coefficients for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River")
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
+
+If we'd like, we can display the specific values of the correlations by setting the `show_values` argument to `True` in the :obj:`hyswap.plots.plot_similarity_heatmap` function.
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+    # calculate correlations
+    results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Pearson Correlation Coefficients for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River",
+        show_values=True)
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
+
+
+Wasserstein Distances Between 5 Stations
+****************************************
+
+In this example we compare the same 5 time-series as the previous example, but instead of calculating correlations, we calculate the `Wasserstein Distance <wasserstein_doc>`_ between each pairing of time-series.
+The Wasserstein Distance is a measure of the distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in `hyswap`, we utilize the `scipy.stats.wasserstein_distance()` function, and are therefore specifically calculating the "first" Wasserstein Distance between two time-series.
+
+.. _wasserstein_doc: https://en.wikipedia.org/wiki/Wasserstein_metric
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+    # calculate Wasserstein Distances
+    results, n_obs = hyswap.similarity.calculate_wasserstein_distance(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Wasserstein Distances for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River",
+        show_values=True)
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
+
+
+Energy Distances Between 5 Stations
+***********************************
+
+In this example we compare the same 5 time-series as the previous example, but this time using another distance measure, the so-called "Energy Distance" between two time-series.
+The `Energy Distance <energy_dist>`_ is a statistical distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in `hyswap`, we utilize the `scipy.stats.energy_distance()` function.
+
+.. _energy_dist: https://en.wikipedia.org/wiki/Energy_distance
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+    # calculate Wasserstein Distances
+    results, n_obs = hyswap.similarity.calculate_energy_distance(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Energy Distances for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River",
+        show_values=True)
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst
@@ -62,3 +62,10 @@ Runoff Calculation Functions
 .. automodule:: hyswap.runoff
     :members:
     :special-members:
+
+Similarity Functions
+--------------------
+
+.. automodule:: hyswap.similarity
+    :members:
+    :special-members:
diff --git a/hyswap/__init__.py b/hyswap/__init__.py
@@ -6,6 +6,7 @@
 from hyswap.percentiles import *  # noqa
 from hyswap.cumulative import *  # noqa
 from hyswap.plots import *  # noqa
+from hyswap.similarity import *  # noqa
 from hyswap.runoff import *  # noqa
 
 try:

diff --git a/hyswap/plots.py b/hyswap/plots.py
@@ -613,3 +613,96 @@ def plot_hydrograph(df, data_col,
     ax.set_yticks(yticks[1:-1], labels=yticklabels[1:-1])
     # return
     return ax
+
+
+def plot_similarity_heatmap(sim_matrix, n_obs=None, cmap='inferno',
+                            show_values=False, ax=None,
+                            title='Similarity Matrix'):
+    """Plot a similarity matrix as a heatmap.
+
+    Parameters
+    ----------
+    sim_matrix : pandas.DataFrame
+        Similarity matrix to plot. Must be square. Can be the output of
+        :meth:`hyswap.similarity.calculate_correlations`,
+        :meth:`hyswap.similarity.calculate_wasserstein_distance`,
+        :meth:`hyswap.similarity.calculate_energy_distance`, or any other
+        square matrix represented as a pandas DataFrame.
+
+    cmap : str, optional
+        Colormap to use. Default is 'inferno'.
+
+    show_values : bool, optional
+        Whether to show the values of the matrix on the plot. Default is False.
+
+    ax : matplotlib.axes.Axes, optional
+        Axes object to plot on. If not provided, a new figure and axes will be
+        created.
+
+    title : str, optional
+        Title for the plot. Default is 'Similarity Matrix'.
+
+    Returns
+    -------
+    matplotlib.axes.Axes
+        Axes object containing the plot.
+
+    Examples
+    --------
+    Calculate the correlation matrix between two sites and plot it as a
+    heatmap.
+
+    .. plot::
+        :include-source:
+
+        >>> df, _ = dataretrieval.nwis.get_dv(site='06892350',
+        ...                                   parameterCd='00060',
+        ...                                   start='2010-01-01',
+        ...                                   end='2021-12-31')
+        >>> df2, _ = dataretrieval.nwis.get_dv(site='06892000',
+        ...                                    parameterCd='00060',
+        ...                                    start='2010-01-01',
+        ...                                    end='2021-12-31')
+        >>> corr_matrix, n_obs = hyswap.similarity.calculate_correlations(
+        ...     [df, df2], '00060_Mean')
+        >>> ax = hyswap.plots.plot_similarity_heatmap(corr_matrix,
+        ...                                           show_values=True)
+        >>> plt.show()
+    """
+    # Create axes if not provided
+    if ax is None:
+        _, ax = plt.subplots()
+    # plot heatmap using matplotlib
+    vmin = sim_matrix.min().min()
+    vmax = sim_matrix.max().max()
+    im = ax.imshow(sim_matrix, cmap=cmap,
+                   vmin=sim_matrix.min().min(),
+                   vmax=sim_matrix.max().max())
+    # show values if desired
+    if show_values:
+        for i in range(sim_matrix.shape[0]):
+            for j in range(sim_matrix.shape[1]):
+                # if below halfway point, make text white
+                if sim_matrix.iloc[i, j] < (vmax - vmin) / 2 + vmin:
+                    ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}',
+                            ha="center", va="center", color="w")
+                # otherwise, make text black
+                else:
+                    ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}',
+                            ha="center", va="center", color="k")
+    # set labels
+    if n_obs is not None:
+        title = f'{title} (n={n_obs})'
+    ax.set_title(title)
+    ax.set_xlabel('Site')
+    ax.set_ylabel('Site')
+    # set ticks at center of each cell
+    ax.set_xticks(np.arange(sim_matrix.shape[0]))
+    ax.set_yticks(np.arange(sim_matrix.shape[1]))
+    # set tick labels
+    ax.set_xticklabels(sim_matrix.columns)
+    ax.set_yticklabels(sim_matrix.index)
+    # add colorbar
+    plt.colorbar(im, ax=ax)
+    # return
+    return ax