From 2730167b301f05406b86ceabd94d86c6f50c3e9e Mon Sep 17 00:00:00 2001
From: elbeejay
Date: Thu, 15 Jun 2023 16:02:49 -0400
Subject: [PATCH 1/4] functionality to do time-series similarity calculations

---
 docs/source/conf.py                          |   1 +
 docs/source/examples/index.rst               |  10 +-
 docs/source/examples/similarity_examples.rst |  80 ++++++++
 docs/source/reference/index.rst              |   7 +
 hyswap/__init__.py                           |   1 +
 hyswap/plots.py                              |  91 +++++++++
 hyswap/similarity.py                         | 203 +++++++++++++++++++
 hyswap/utils.py                              |  59 ++++++
 8 files changed, 451 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/examples/similarity_examples.rst
 create mode 100644 hyswap/similarity.py

diff --git a/docs/source/conf.py b/docs/source/conf.py
index c3ad419..4987419 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -92,6 +92,7 @@
 from hyswap import percentiles
 from hyswap import cumulative
 from hyswap import plots
+from hyswap import similarity
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
index d3f9797..716ab3c 100644
--- a/docs/source/examples/index.rst
+++ b/docs/source/examples/index.rst
@@ -34,4 +34,12 @@ Cumulative Hydrograph Examples
 .. toctree::
    :maxdepth: 2
 
-   cumulative_hydrograph_examples
\ No newline at end of file
+   cumulative_hydrograph_examples
+
+Similarity Examples
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   similarity_examples
diff --git a/docs/source/examples/similarity_examples.rst b/docs/source/examples/similarity_examples.rst
new file mode 100644
index 0000000..fbb1367
--- /dev/null
+++ b/docs/source/examples/similarity_examples.rst
@@ -0,0 +1,80 @@
+
+Similarity Measures
+-------------------
+
+These examples showcase the usage of the functions in the `similarity` module, with heatmap visualizations via the :obj:`hyswap.plots.plot_similarity_heatmap` function.
+Sometimes it is helpful to compare the relationships between a set of stations and their respective measurements.
+The `similarity` functions packaged in `hyswap` handle some of the data clean-up for you by ensuring that the time-series of observations being compared cover the same dates, and by removing any missing data.
+This ensures that your results are not skewed by missing data or gaps in one of the time-series.
+
+
+Correlations Between 5 Stations
+*******************************
+
+The following example shows the correlations between streamflow at 5 stations (07374525, 07374000, 07289000, 07032000, 07024175) along the Mississippi River, listed from downstream to upstream.
+First we have to fetch the streamflow data for these stations; to do this we will use the `dataretrieval` package to access the NWIS database.
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+Once we've collected the streamflow data, we will calculate the pair-wise correlations between the stations using the :obj:`hyswap.similarity.calculate_correlations` function and then plot the results using :obj:`hyswap.plots.plot_similarity_heatmap`.
+
+..
plot:: + :context: + :include-source: + + # calculate correlations + results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean") + + # make plot + ax = hyswap.plots.plot_similarity_heatmap( + results, + title="Pearson Correlation Coefficients for Streamflow\n" + + "Between 5 Sites Along the Mississippi River") + + # show the plot + plt.tight_layout() + plt.show() + +If we'd like, we can display the specific values of the correlations by setting the `show_values` argument to `True` in the :obj:`hyswap.plots.plot_similarity_heatmap` function. + +.. plot:: + :context: reset + :include-source: + + # get the data from these 5 sites + site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"] + + # fetch some streamflow data from NWIS as a list of dataframes + df_list = [] + for site in site_list: + df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01", + end="2022-12-31", + parameterCd='00060') + df_list.append(df) + + # calculate correlations + results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean") + + # make plot + ax = hyswap.plots.plot_similarity_heatmap( + results, + title="Pearson Correlation Coefficients for Streamflow\n" + + "Between 5 Sites Along the Mississippi River", + show_values=True) + + # show the plot + plt.tight_layout() + plt.show() diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 150e118..0e7a148 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -55,3 +55,10 @@ Plotting Functions .. automodule:: hyswap.plots :members: :special-members: + +Similarity Functions +-------------------- + +.. automodule:: hyswap.similarity + :members: + :special-members: diff --git a/hyswap/__init__.py b/hyswap/__init__.py index 8c66d9b..1061dc8 100644 --- a/hyswap/__init__.py +++ b/hyswap/__init__.py @@ -6,6 +6,7 @@ from hyswap.percentiles import * # noqa from hyswap.cumulative import * # noqa from hyswap.plots import * # noqa +from hyswap.similarity import * # noqa try: __version__ = version('hyswap') diff --git a/hyswap/plots.py b/hyswap/plots.py index a66940a..1e6de8c 100644 --- a/hyswap/plots.py +++ b/hyswap/plots.py @@ -496,3 +496,94 @@ def plot_cumulative_hydrograph(cumulative_percentiles, target_years, # return return ax + + +def plot_similarity_heatmap(sim_matrix, cmap='inferno', + show_values=False, ax=None, + title='Similarity Matrix'): + """Plot a similarity matrix as a heatmap. + + Parameters + ---------- + sim_matrix : pandas.DataFrame + Similarity matrix to plot. Must be square. Can be the output of + :meth:`hyswap.similarity.calculate_correlations`, + :meth:`hyswap.similarity.calculate_wasserstein_distance`, + :meth:`hyswap.similarity.calculate_energy_distance`, or any other + square matrix represented as a pandas DataFrame. + + cmap : str, optional + Colormap to use. Default is 'inferno'. + + show_values : bool, optional + Whether to show the values of the matrix on the plot. Default is False. + + ax : matplotlib.axes.Axes, optional + Axes object to plot on. If not provided, a new figure and axes will be + created. + + title : str, optional + Title for the plot. Default is 'Similarity Matrix'. + + Returns + ------- + matplotlib.axes.Axes + Axes object containing the plot. + + Examples + -------- + Calculate the correlation matrix between two sites and plot it as a + heatmap. + + .. plot:: + :include-source: + + >>> df, _ = dataretrieval.nwis.get_dv(site='06892350', + ... parameterCd='00060', + ... start='2010-01-01', + ... 
end='2021-12-31') + >>> df2, _ = dataretrieval.nwis.get_dv(site='06892000', + ... parameterCd='00060', + ... start='2010-01-01', + ... end='2021-12-31') + >>> corr_matrix = hyswap.similarity.calculate_correlations( + ... [df, df2], '00060_Mean') + >>> ax = hyswap.plots.plot_similarity_heatmap(corr_matrix, + ... show_values=True) + >>> plt.show() + """ + # Create axes if not provided + if ax is None: + _, ax = plt.subplots() + # plot heatmap using matplotlib + vmin = sim_matrix.min().min() + vmax = sim_matrix.max().max() + im = ax.imshow(sim_matrix, cmap=cmap, + vmin=sim_matrix.min().min(), + vmax=sim_matrix.max().max()) + # show values if desired + if show_values: + for i in range(sim_matrix.shape[0]): + for j in range(sim_matrix.shape[1]): + # if below halfway point, make text white + if sim_matrix.iloc[i, j] < (vmax - vmin) / 2 + vmin: + ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}', + ha="center", va="center", color="w") + # otherwise, make text black + else: + ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}', + ha="center", va="center", color="k") + # set labels + ax.set_title(title) + ax.set_xlabel('Site') + ax.set_ylabel('Site') + # set ticks at center of each cell + ax.set_xticks(np.arange(sim_matrix.shape[0])) + ax.set_yticks(np.arange(sim_matrix.shape[1])) + # set tick labels + ax.set_xticklabels(sim_matrix.columns) + ax.set_yticklabels(sim_matrix.index) + # add colorbar + plt.colorbar(im, ax=ax) + # return + return ax diff --git a/hyswap/similarity.py b/hyswap/similarity.py new file mode 100644 index 0000000..5338043 --- /dev/null +++ b/hyswap/similarity.py @@ -0,0 +1,203 @@ +"""Similarity measures for hyswap.""" + +import numpy as np +import pandas as pd +from scipy import stats +from hyswap.utils import filter_to_common_time + + +def calculate_correlations(df_list, data_column_name, df_names=None): + """Calculate Pearson correlations between dataframes in df_list. + + This function is designed to calculate the Pearson correlation + coefficients between dataframes in df_list. The dataframes in df_list are + expected to have the same columns. The correlation coefficients are + calculated using the `numpy.corrcoeff` function. + + Parameters + ---------- + df_list : list + List of dataframes. The dataframes are expected to have the same + columns. Likely inputs are the output of a function like + dataretrieval.nwis.get_dv() or similar + + data_column_name : str + Name of the column to use for the correlation calculation. + + df_names : list, optional + List of names for the dataframes in df_list. If provided, the names + will be used to label the rows and columns of the output array. If + not provided, the column "site_no" will be used if available, if it is + not available, the index of the dataframe in the list will be used. + + Returns + ------- + correlations : pandas.DataFrame + Dataframe of correlation coefficients. The rows and columns are + labeled with the names of the dataframes in df_list as provided + by df_names argument. + + Examples + -------- + Calculate correlations between two synthetic dataframes. + + .. 
doctest:: + + >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + >>> results = similarity.calculate_correlations([df1, df2], 'a') + >>> results + 0 1 + 0 1.0 -1.0 + 1 -1.0 1.0 + """ + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) + # preprocess dataframe list so they have the same index/times + df_list = filter_to_common_time(df_list) + # calculate correlations between all pairs of dataframes in the list + correlations = np.empty((len(df_list), len(df_list))) + for i, df1 in enumerate(df_list): + for j, df2 in enumerate(df_list): + correlations[i, j] = np.corrcoef( + df1[data_column_name], df2[data_column_name])[0, 1] + # turn the correlations into a dataframe + correlations = pd.DataFrame( + correlations, index=df_names, columns=df_names) + return correlations + + +def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): + """Calculate Wasserstein distance between dataframes in df_list. + + This function is designed to calculate the Wasserstein distance between + dataframes in df_list. The dataframes in df_list are expected to have the + same columns. The Wasserstein distance is calculated using the + `scipy.stats.wasserstein_distance` function. + + Parameters + ---------- + df_list : list + List of dataframes. The dataframes are expected to have the same + columns. Likely inputs are the output of a function like + dataretrieval.nwis.get_dv() or similar + + data_column_name : str + Name of the column to use for the Wasserstein distance calculation. + + df_names : list, optional + List of names for the dataframes in df_list. If provided, the names + will be used to label the rows and columns of the output array. If + not provided, the column "site_no" will be used if available, if it is + not available, the index of the dataframe in the list will be used. + + Returns + ------- + wasserstein_distances : pandas.DataFrame + Dataframe of Wasserstein distances. The rows and columns are + labeled with the names of the dataframes in df_list as provided + by df_names argument. + + Examples + -------- + Calculate Wasserstein distances between two synthetic dataframes. + + .. doctest:: + + >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + >>> results = similarity.calculate_wasserstein_distance( + ... [df1, df2], 'a') + >>> results + 0 1 + 0 0.0 9.0 + 1 9.0 0.0 + """ + # preprocess dataframe list so they have the same index/times + df_list = filter_to_common_time(df_list) + # calculate distances between all pairs of dataframes in the list + wasserstein_distances = np.empty((len(df_list), len(df_list))) + for i, df1 in enumerate(df_list): + for j, df2 in enumerate(df_list): + wasserstein_distances[i, j] = stats.wasserstein_distance( + df1[data_column_name], df2[data_column_name]) + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) + # turn the distances into a dataframe + wasserstein_distances = pd.DataFrame( + wasserstein_distances, index=df_names, columns=df_names) + return wasserstein_distances + + +def calculate_energy_distance(df_list, data_column_name, df_names=None): + """Calculate energy distance between dataframes in df_list. + + This function is designed to calculate the energy distance between + dataframes in df_list. The dataframes in df_list are expected to have the + same columns. 
The energy distance is calculated using the + `scipy.stats.energy_distance` function. + + Parameters + ---------- + df_list : list + List of dataframes. The dataframes are expected to have the same + columns. Likely inputs are the output of a function like + dataretrieval.nwis.get_dv() or similar + + data_column_name : str + Name of the column to use for the energy distance calculation. + + df_names : list, optional + List of names for the dataframes in df_list. If provided, the names + will be used to label the rows and columns of the output array. If + not provided, the column "site_no" will be used if available, if it is + not available, the index of the dataframe in the list will be used. + + Returns + ------- + energy_distances : pandas.DataFrame + Dataframe of energy distances. The rows and columns are + labeled with the names of the dataframes in df_list as provided + by df_names argument. + + Examples + -------- + Calculate energy distances between two synthetic dataframes. + + .. doctest:: + + >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + >>> results = similarity.calculate_energy_distance( + ... [df1, df2], 'a') + >>> results + 0 1 + 0 0.000000 3.376389 + 1 3.376389 0.000000 + """ + # preprocess dataframe list so they have the same index/times + df_list = filter_to_common_time(df_list) + # calculate distances between all pairs of dataframes in the list + energy_distances = np.empty((len(df_list), len(df_list))) + for i, df1 in enumerate(df_list): + for j, df2 in enumerate(df_list): + energy_distances[i, j] = stats.energy_distance( + df1[data_column_name], df2[data_column_name]) + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) + # turn the distances into a dataframe + energy_distances = pd.DataFrame( + energy_distances, index=df_names, columns=df_names) + return energy_distances + + +def _name_handling(df_list, df_names): + """Private function to handle the names of the dataframes.""" + if df_names is None: + df_names = [] + for i, df in enumerate(df_list): + if 'site_no' in df.columns: + df_names.append(df['site_no'].iloc[0]) + else: + df_names.append(str(i)) + return df_names diff --git a/hyswap/utils.py b/hyswap/utils.py index 0c17fa8..489573f 100644 --- a/hyswap/utils.py +++ b/hyswap/utils.py @@ -430,3 +430,62 @@ def munge_nwis_stats(df, source_pct_col=None, target_pct_col=None, df_slim.columns = target_pct_col # return the dataframe return df_slim + + +def filter_to_common_time(df_list): + """Filter a list of dataframes to common times based on index. + + This function takes a list of dataframes and filters them to only include + the common times based on the index of the dataframes. This is necessary + before comparing the timeseries and calculating statistics between two or + more timeseries. + + Parameters + ---------- + df_list : list + List of pandas.DataFrame objects to filter to common times. + DataFrames assumed to have date-time information in the index. + Expect input to be the output from a function like + dataretrieval.nwis.get_dv() or similar. + + Returns + ------- + df_list : list + List of pandas.DataFrame objects filtered to common times. + + Examples + -------- + Get some NWIS data. + + .. doctest:: + + >>> df1, md1 = dataretrieval.nwis.get_dv( + ... "03586500", parameterCd="00060", + ... start="2018-12-15", end="2019-01-07") + >>> df2, md2 = dataretrieval.nwis.get_dv( + ... "01646500", parameterCd="00060", + ... 
start="2019-01-01", end="2019-01-14")
+        >>> type(df1)
+        <class 'pandas.core.frame.DataFrame'>
+        >>> type(df2)
+        <class 'pandas.core.frame.DataFrame'>
+
+    Filter the dataframes to common times.
+
+    .. doctest::
+
+        >>> df_list = utils.filter_to_common_time([df1, df2])
+        >>> df_list[0].shape
+        (7, 3)
+        >>> df_list[1].shape
+        (7, 3)
+    """
+    # get the common index
+    common_index = df_list[0].index
+    for df in df_list:
+        common_index = common_index.intersection(df.index)
+    # filter the dataframes to the common index
+    for i, df in enumerate(df_list):
+        df_list[i] = df.loc[common_index]
+    # return the list of dataframes
+    return df_list

From 49662ec670964c2b30b6996d0b7ecb4e9b286a0d Mon Sep 17 00:00:00 2001
From: elbeejay
Date: Wed, 21 Jun 2023 15:30:24 -0400
Subject: [PATCH 2/4] store number of common obs

---
 docs/source/conf.py                          |  8 +-
 docs/source/examples/similarity_examples.rst | 86 +++++++++++++++++++-
 hyswap/plots.py                              |  6 +-
 hyswap/similarity.py                         | 31 +++++--
 hyswap/utils.py                              | 10 ++-
 5 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 4987419..e991e4c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -134,13 +134,9 @@
 # -- Options for linkcheck -------------------------------------------
 
 # Links to not "check" because they are problematic for the link checker
-linkcheck_ignore = [
-    r'https://doi.org/10.3133/wsp1542A',
-    r'https://pypi.org/project/hyswap/'
-]
-
 linkcheck_exclude_documents = [
     r'meta/disclaimer*',
     r'meta/contributing*',
-    r'meta/installation*'
+    r'meta/installation*',
+    r'examples/similarity_examples',
 ]
diff --git a/docs/source/examples/similarity_examples.rst b/docs/source/examples/similarity_examples.rst
index fbb1367..0c58f3e 100644
--- a/docs/source/examples/similarity_examples.rst
+++ b/docs/source/examples/similarity_examples.rst
@@ -36,11 +36,11 @@ Once we've collected the streamflow data, we will calculate the pair-wise correl
     :include-source:
 
     # calculate correlations
-    results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+    results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
 
     # make plot
     ax = hyswap.plots.plot_similarity_heatmap(
-        results,
+        results, n_obs=n_obs,
         title="Pearson Correlation Coefficients for Streamflow\n" +
               "Between 5 Sites Along the Mississippi River")
 
     # show the plot
     plt.tight_layout()
     plt.show()
@@ -66,11 +66,11 @@ If we'd like, we can display the specific values of the correlations by setting
         df_list.append(df)
 
     # calculate correlations
-    results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+    results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
 
     # make plot
     ax = hyswap.plots.plot_similarity_heatmap(
-        results,
+        results, n_obs=n_obs,
         title="Pearson Correlation Coefficients for Streamflow\n" +
               "Between 5 Sites Along the Mississippi River",
         show_values=True)
@@ -78,3 +78,81 @@ If we'd like, we can display the specific values of the correlations by setting
     # show the plot
     plt.tight_layout()
     plt.show()
+
+
+Wasserstein Distances Between 5 Stations
+****************************************
+
+In this example we compare the same 5 time-series as the previous example, but instead of calculating correlations, we calculate the `Wasserstein Distance <wasserstein_doc_>`_ between each pairing of time-series.
+The Wasserstein Distance is a measure of the distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
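As a quick, hand-checkable sketch of the underlying metric (an editorial illustration that calls SciPy directly rather than the `hyswap` wrappers, reusing the synthetic values from the ``hyswap/similarity.py`` doctests in this series):

.. code-block:: python

    import numpy as np
    from scipy import stats

    u = np.arange(10)       # 0, 1, ..., 9
    v = -1 * np.arange(10)  # 0, -1, ..., -9

    # first Wasserstein distance: every sorted pair of values differs by 9
    print(stats.wasserstein_distance(u, v))  # 9.0

    # energy distance between the same samples (used later on this page)
    print(stats.energy_distance(u, v))  # approximately 3.376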
+Specifically in `hyswap`, we utilize the `scipy.stats.wasserstein_distance()` function, and are therefore calculating the "first" Wasserstein Distance between two time-series.
+
+.. _wasserstein_doc: https://en.wikipedia.org/wiki/Wasserstein_metric
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+    # calculate Wasserstein Distances
+    results, n_obs = hyswap.similarity.calculate_wasserstein_distance(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Wasserstein Distances for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River",
+        show_values=True)
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
+
+
+Energy Distances Between 5 Stations
+***********************************
+
+In this example we compare the same 5 time-series as the previous example, but this time using another distance measure, the so-called "Energy Distance" between two time-series.
+The `Energy Distance <energy_dist_>`_ is a statistical distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in `hyswap`, we utilize the `scipy.stats.energy_distance()` function.
+
+.. _energy_dist: https://en.wikipedia.org/wiki/Energy_distance
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+    # calculate Energy Distances
+    results, n_obs = hyswap.similarity.calculate_energy_distance(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Energy Distances for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River",
+        show_values=True)
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
diff --git a/hyswap/plots.py b/hyswap/plots.py
index 1e6de8c..0dfdb0c 100644
--- a/hyswap/plots.py
+++ b/hyswap/plots.py
@@ -498,7 +498,7 @@ def plot_cumulative_hydrograph(cumulative_percentiles, target_years,
     return ax
 
 
-def plot_similarity_heatmap(sim_matrix, cmap='inferno',
+def plot_similarity_heatmap(sim_matrix, n_obs=None, cmap='inferno',
                             show_values=False, ax=None,
                             title='Similarity Matrix'):
     """Plot a similarity matrix as a heatmap.
@@ -546,7 +546,7 @@ def plot_similarity_heatmap(sim_matrix, cmap='inferno',
     ...                                       parameterCd='00060',
     ...                                       start='2010-01-01',
     ...                                       end='2021-12-31')
-    >>> corr_matrix = hyswap.similarity.calculate_correlations(
+    >>> corr_matrix, n_obs = hyswap.similarity.calculate_correlations(
     ...     [df, df2], '00060_Mean')
     >>> ax = hyswap.plots.plot_similarity_heatmap(corr_matrix,
     ...
show_values=True) @@ -574,6 +574,8 @@ def plot_similarity_heatmap(sim_matrix, cmap='inferno', ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}', ha="center", va="center", color="k") # set labels + if n_obs is not None: + title = f'{title} (n={n_obs})' ax.set_title(title) ax.set_xlabel('Site') ax.set_ylabel('Site') diff --git a/hyswap/similarity.py b/hyswap/similarity.py index 5338043..c4c4454 100644 --- a/hyswap/similarity.py +++ b/hyswap/similarity.py @@ -37,6 +37,9 @@ def calculate_correlations(df_list, data_column_name, df_names=None): labeled with the names of the dataframes in df_list as provided by df_names argument. + n_obs : int + Number of observations used to calculate the energy distance. + Examples -------- Calculate correlations between two synthetic dataframes. @@ -45,7 +48,7 @@ def calculate_correlations(df_list, data_column_name, df_names=None): >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) - >>> results = similarity.calculate_correlations([df1, df2], 'a') + >>> results, n_obs = similarity.calculate_correlations([df1, df2], 'a') >>> results 0 1 0 1.0 -1.0 @@ -54,7 +57,7 @@ def calculate_correlations(df_list, data_column_name, df_names=None): # handle the names of the dataframes df_names = _name_handling(df_list, df_names) # preprocess dataframe list so they have the same index/times - df_list = filter_to_common_time(df_list) + df_list, n_obs = filter_to_common_time(df_list) # calculate correlations between all pairs of dataframes in the list correlations = np.empty((len(df_list), len(df_list))) for i, df1 in enumerate(df_list): @@ -64,7 +67,7 @@ def calculate_correlations(df_list, data_column_name, df_names=None): # turn the correlations into a dataframe correlations = pd.DataFrame( correlations, index=df_names, columns=df_names) - return correlations + return correlations, n_obs def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): @@ -98,6 +101,9 @@ def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): labeled with the names of the dataframes in df_list as provided by df_names argument. + n_obs : int + Number of observations used to calculate the energy distance. + Examples -------- Calculate Wasserstein distances between two synthetic dataframes. @@ -106,15 +112,17 @@ def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) - >>> results = similarity.calculate_wasserstein_distance( + >>> results, n_obs = similarity.calculate_wasserstein_distance( ... 
[df1, df2], 'a') >>> results 0 1 0 0.0 9.0 1 9.0 0.0 """ + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) # preprocess dataframe list so they have the same index/times - df_list = filter_to_common_time(df_list) + df_list, n_obs = filter_to_common_time(df_list) # calculate distances between all pairs of dataframes in the list wasserstein_distances = np.empty((len(df_list), len(df_list))) for i, df1 in enumerate(df_list): @@ -126,7 +134,7 @@ def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): # turn the distances into a dataframe wasserstein_distances = pd.DataFrame( wasserstein_distances, index=df_names, columns=df_names) - return wasserstein_distances + return wasserstein_distances, n_obs def calculate_energy_distance(df_list, data_column_name, df_names=None): @@ -160,6 +168,9 @@ def calculate_energy_distance(df_list, data_column_name, df_names=None): labeled with the names of the dataframes in df_list as provided by df_names argument. + n_obs : int + Number of observations used to calculate the energy distance. + Examples -------- Calculate energy distances between two synthetic dataframes. @@ -168,15 +179,17 @@ def calculate_energy_distance(df_list, data_column_name, df_names=None): >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) - >>> results = similarity.calculate_energy_distance( + >>> results, n_obs = similarity.calculate_energy_distance( ... [df1, df2], 'a') >>> results 0 1 0 0.000000 3.376389 1 3.376389 0.000000 """ + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) # preprocess dataframe list so they have the same index/times - df_list = filter_to_common_time(df_list) + df_list, n_obs = filter_to_common_time(df_list) # calculate distances between all pairs of dataframes in the list energy_distances = np.empty((len(df_list), len(df_list))) for i, df1 in enumerate(df_list): @@ -188,7 +201,7 @@ def calculate_energy_distance(df_list, data_column_name, df_names=None): # turn the distances into a dataframe energy_distances = pd.DataFrame( energy_distances, index=df_names, columns=df_names) - return energy_distances + return energy_distances, n_obs def _name_handling(df_list, df_names): diff --git a/hyswap/utils.py b/hyswap/utils.py index 489573f..ff12c83 100644 --- a/hyswap/utils.py +++ b/hyswap/utils.py @@ -349,6 +349,8 @@ def munge_nwis_stats(df, source_pct_col=None, target_pct_col=None, be used on Python dataretrieval dataframe returns for the nwis.get_stats() function for "daily" data, a single site, and a single parameter code. + Parameters + ---------- df : pandas.DataFrame DataFrame containing NWIS statistics data retrieved from the statistics web service. Assumed to come in as a dataframe retrieved with a @@ -452,6 +454,8 @@ def filter_to_common_time(df_list): ------- df_list : list List of pandas.DataFrame objects filtered to common times. + n_obs : int + Number of observations in the common time period. Examples -------- @@ -474,7 +478,7 @@ def filter_to_common_time(df_list): .. 
doctest:: - >>> df_list = utils.filter_to_common_time([df1, df2]) + >>> df_list, n_obs = utils.filter_to_common_time([df1, df2]) >>> df_list[0].shape (7, 3) >>> df_list[1].shape @@ -487,5 +491,7 @@ def filter_to_common_time(df_list): # filter the dataframes to the common index for i, df in enumerate(df_list): df_list[i] = df.loc[common_index] + # get the number of observations + n_obs = len(common_index) # return the list of dataframes - return df_list + return df_list, n_obs From 0e7ce4216e5cd426f7565c31c2b0548c02915458 Mon Sep 17 00:00:00 2001 From: elbeejay Date: Wed, 21 Jun 2023 15:49:49 -0400 Subject: [PATCH 3/4] update tests for similarity stuff --- tests/test_plots.py | 50 ++++++++++++++++++++++++++++++++++++++++ tests/test_similarity.py | 50 ++++++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 18 +++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 tests/test_similarity.py diff --git a/tests/test_plots.py b/tests/test_plots.py index 334758f..f307fcb 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -8,6 +8,7 @@ from hyswap import rasterhydrograph from hyswap import percentiles from hyswap import plots +from hyswap import similarity def test_plot_flow_duration_curve(): @@ -156,3 +157,52 @@ def test_plot_cumulative_hydrograph(): assert len(ax.collections) == 1 # close plot plt.close() + + +def test_plot_similarity_heatmat(): + """Testing the plot_similarity_heatmap function.""" + # make dummy dataframes + df_1 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-01', '2020-01-10')}) + df_1.set_index('date', inplace=True) + df_2 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-06', '2020-01-15')}) + df_2.set_index('date', inplace=True) + # calculate similarity matrix (using correlation) + df_corr, n_obs = similarity.calculate_correlations( + [df_1, df_2], 'data') + # plot + ax = plots.plot_similarity_heatmap(df_corr, n_obs=n_obs, + show_values=True, + title='Test Title') + # assertions + assert isinstance(ax, plt.Axes) + assert ax.get_xlabel() == 'Site' + assert ax.get_ylabel() == 'Site' + assert ax.get_title() == 'Test Title (n=5)' + assert len(ax.lines) == 0 + # close plot + plt.close() + + # make a set with negative values + df_2 = pd.DataFrame({ + 'data': np.random.random(10) * 10, + 'date': pd.date_range('2020-01-06', '2020-01-15')}) + df_2.set_index('date', inplace=True) + # calculate similarity matrix (using correlation) + df_corr, n_obs = similarity.calculate_correlations( + [df_1, df_2], 'data') + # plot + ax = plots.plot_similarity_heatmap(df_corr, n_obs=n_obs, + show_values=True, + title='Test Title') + # assertions + assert isinstance(ax, plt.Axes) + assert ax.get_xlabel() == 'Site' + assert ax.get_ylabel() == 'Site' + assert ax.get_title() == 'Test Title (n=5)' + assert len(ax.lines) == 0 + # close plot + plt.close() \ No newline at end of file diff --git a/tests/test_similarity.py b/tests/test_similarity.py new file mode 100644 index 0000000..d265d36 --- /dev/null +++ b/tests/test_similarity.py @@ -0,0 +1,50 @@ +"""Tests for the similarity functions.""" +import pytest +import numpy as np +import pandas as pd +from hyswap import similarity + + +def test_calculate_correlations(): + """Test the calculate_correlations function.""" + df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + results, n_obs = similarity.calculate_correlations([df1, df2], 'a') + assert n_obs == 10 + assert results.iloc[0, 0] == 
pytest.approx(1) + assert results.iloc[0, 1] == pytest.approx(-1) + assert results.iloc[1, 0] == pytest.approx(-1) + assert results.iloc[1, 1] == pytest.approx(1) + + +def test_calculate_wasserstein_distance(): + """Test the calculate_wasserstein_distance function.""" + df1 = pd.DataFrame({'a': np.arange(10), + 'b': np.arange(10), + 'site_no': np.zeros(10)}) + df2 = pd.DataFrame({'a': -1*np.arange(10), + 'b': np.arange(10), + 'site_no': np.ones(10)}) + results, n_obs = similarity.calculate_wasserstein_distance([df1, df2], 'a') + assert n_obs == 10 + assert results.iloc[0, 0] == pytest.approx(0) + assert results.iloc[0, 1] == pytest.approx(9) + assert results.iloc[1, 0] == pytest.approx(9) + assert results.iloc[1, 1] == pytest.approx(0) + assert results.index[0] == 0 + assert results.index[1] == 1 + + +def test_calculate_energy_distance(): + """Test the calculate_energy_distance function.""" + df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + results, n_obs = similarity.calculate_energy_distance( + [df1, df2], 'a', df_names=['one', 'two']) + assert n_obs == 10 + assert results.iloc[0, 0] == pytest.approx(0) + assert np.round(results.iloc[0, 1], 1) == 3.4 + assert np.round(results.iloc[1, 0], 1) == 3.4 + assert results.iloc[1, 1] == pytest.approx(0) + assert results.index[0] == 'one' + assert results.index[1] == 'two' diff --git a/tests/test_utils.py b/tests/test_utils.py index 200d4b9..4afeab8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -248,3 +248,21 @@ def test_munge_nwis_stats(): assert len(df.columns) > len(df_slim.columns) assert 0 in df_slim.columns assert df_slim.columns.tolist() == [0, 5, 10, 25, 75, 90, 95, 100] + + +def test_filter_to_common_time(): + """Test the filter_to_common_time function.""" + # make dummy dataframes + df_1 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-01', '2020-01-10')}) + df_1.set_index('date', inplace=True) + df_2 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-06', '2020-01-15')}) + df_2.set_index('date', inplace=True) + # apply the function + df_list, n_obs = utils.filter_to_common_time([df_1, df_2]) + # check the output + assert len(df_list) == 2 + assert n_obs == 5 From b669f729820daf079385c7dd44b06d0e25ba4a1d Mon Sep 17 00:00:00 2001 From: elbeejay Date: Wed, 21 Jun 2023 15:57:40 -0400 Subject: [PATCH 4/4] lint --- tests/test_plots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_plots.py b/tests/test_plots.py index f307fcb..4d9a74d 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -205,4 +205,4 @@ def test_plot_similarity_heatmat(): assert ax.get_title() == 'Test Title (n=5)' assert len(ax.lines) == 0 # close plot - plt.close() \ No newline at end of file + plt.close()
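Taken together, the four patches add the `similarity` module (correlations, Wasserstein and energy distances), the shared `filter_to_common_time`/`n_obs` bookkeeping, the heatmap plot, documentation, and tests. Below is a minimal end-to-end sketch of how the new pieces fit together; it is an editorial illustration using synthetic data patterned on ``tests/test_utils.py`` and ``tests/test_plots.py`` above, not part of the patch series itself.

.. code-block:: python

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from hyswap import similarity, plots

    # two daily series that only overlap on 2020-01-06 through 2020-01-10
    df_a = pd.DataFrame({'data': np.arange(10.0)},
                        index=pd.date_range('2020-01-01', '2020-01-10'))
    df_b = pd.DataFrame({'data': np.random.random(10) * 10},
                        index=pd.date_range('2020-01-06', '2020-01-15'))

    # correlations are computed on the 5 common days; n_obs reports that count
    corr, n_obs = similarity.calculate_correlations([df_a, df_b], 'data')

    # heatmap with the observation count appended to the title, e.g. "(n=5)"
    ax = plots.plot_similarity_heatmap(corr, n_obs=n_obs, show_values=True,
                                       title='Correlation Matrix')
    plt.show()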