From 2730167b301f05406b86ceabd94d86c6f50c3e9e Mon Sep 17 00:00:00 2001
From: elbeejay
Date: Thu, 15 Jun 2023 16:02:49 -0400
Subject: [PATCH 1/4] functionality to do time-series similarity calculations

---
 docs/source/conf.py                          |   1 +
 docs/source/examples/index.rst               |  10 +-
 docs/source/examples/similarity_examples.rst |  80 ++++++++
 docs/source/reference/index.rst              |   7 +
 hyswap/__init__.py                           |   1 +
 hyswap/plots.py                              |  91 +++++++++
 hyswap/similarity.py                         | 203 +++++++++++++++++++
 hyswap/utils.py                              |  59 ++++++
 8 files changed, 451 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/examples/similarity_examples.rst
 create mode 100644 hyswap/similarity.py

diff --git a/docs/source/conf.py b/docs/source/conf.py
index c3ad419..4987419 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -92,6 +92,7 @@
 from hyswap import percentiles
 from hyswap import cumulative
 from hyswap import plots
+from hyswap import similarity
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst
index d3f9797..716ab3c 100644
--- a/docs/source/examples/index.rst
+++ b/docs/source/examples/index.rst
@@ -34,4 +34,12 @@ Cumulative Hydrograph Examples
 .. toctree::
    :maxdepth: 2
 
-   cumulative_hydrograph_examples
\ No newline at end of file
+   cumulative_hydrograph_examples
+
+Similarity Examples
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   similarity_examples
diff --git a/docs/source/examples/similarity_examples.rst b/docs/source/examples/similarity_examples.rst
new file mode 100644
index 0000000..fbb1367
--- /dev/null
+++ b/docs/source/examples/similarity_examples.rst
@@ -0,0 +1,80 @@
+
+Similarity Measures
+-------------------
+
+These examples showcase the usage of the functions in the `similarity` module, with heatmap visualizations via the :obj:`hyswap.plots.plot_similarity_heatmap` function.
+Sometimes it is helpful to compare the relationships between a set of stations and their respective measurements.
+The `similarity` functions packaged in `hyswap` handle some of the data clean-up for you by ensuring that the time-series of observations being compared cover the same dates, and by removing any missing data.
+This ensures that your results are not skewed by missing data or gaps in one of the time-series.
+
+
+Correlations Between 5 Stations
+*******************************
+
+The following example shows the correlations between streamflow at 5 stations (07374525, 07374000, 07289000, 07032000, 07024175) along the Mississippi River, listed from downstream to upstream.
+First we have to fetch the streamflow data for these stations; to do this we will use the `dataretrieval` package to access the NWIS database.
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+Once we've collected the streamflow data, we will calculate the pair-wise correlations between the stations using the :obj:`hyswap.similarity.calculate_correlations` function and then plot the results using :obj:`hyswap.plots.plot_similarity_heatmap`.
+
+..
plot:: + :context: + :include-source: + + # calculate correlations + results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean") + + # make plot + ax = hyswap.plots.plot_similarity_heatmap( + results, + title="Pearson Correlation Coefficients for Streamflow\n" + + "Between 5 Sites Along the Mississippi River") + + # show the plot + plt.tight_layout() + plt.show() + +If we'd like, we can display the specific values of the correlations by setting the `show_values` argument to `True` in the :obj:`hyswap.plots.plot_similarity_heatmap` function. + +.. plot:: + :context: reset + :include-source: + + # get the data from these 5 sites + site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"] + + # fetch some streamflow data from NWIS as a list of dataframes + df_list = [] + for site in site_list: + df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01", + end="2022-12-31", + parameterCd='00060') + df_list.append(df) + + # calculate correlations + results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean") + + # make plot + ax = hyswap.plots.plot_similarity_heatmap( + results, + title="Pearson Correlation Coefficients for Streamflow\n" + + "Between 5 Sites Along the Mississippi River", + show_values=True) + + # show the plot + plt.tight_layout() + plt.show() diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 150e118..0e7a148 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -55,3 +55,10 @@ Plotting Functions .. automodule:: hyswap.plots :members: :special-members: + +Similarity Functions +-------------------- + +.. automodule:: hyswap.similarity + :members: + :special-members: diff --git a/hyswap/__init__.py b/hyswap/__init__.py index 8c66d9b..1061dc8 100644 --- a/hyswap/__init__.py +++ b/hyswap/__init__.py @@ -6,6 +6,7 @@ from hyswap.percentiles import * # noqa from hyswap.cumulative import * # noqa from hyswap.plots import * # noqa +from hyswap.similarity import * # noqa try: __version__ = version('hyswap') diff --git a/hyswap/plots.py b/hyswap/plots.py index a66940a..1e6de8c 100644 --- a/hyswap/plots.py +++ b/hyswap/plots.py @@ -496,3 +496,94 @@ def plot_cumulative_hydrograph(cumulative_percentiles, target_years, # return return ax + + +def plot_similarity_heatmap(sim_matrix, cmap='inferno', + show_values=False, ax=None, + title='Similarity Matrix'): + """Plot a similarity matrix as a heatmap. + + Parameters + ---------- + sim_matrix : pandas.DataFrame + Similarity matrix to plot. Must be square. Can be the output of + :meth:`hyswap.similarity.calculate_correlations`, + :meth:`hyswap.similarity.calculate_wasserstein_distance`, + :meth:`hyswap.similarity.calculate_energy_distance`, or any other + square matrix represented as a pandas DataFrame. + + cmap : str, optional + Colormap to use. Default is 'inferno'. + + show_values : bool, optional + Whether to show the values of the matrix on the plot. Default is False. + + ax : matplotlib.axes.Axes, optional + Axes object to plot on. If not provided, a new figure and axes will be + created. + + title : str, optional + Title for the plot. Default is 'Similarity Matrix'. + + Returns + ------- + matplotlib.axes.Axes + Axes object containing the plot. + + Examples + -------- + Calculate the correlation matrix between two sites and plot it as a + heatmap. + + .. plot:: + :include-source: + + >>> df, _ = dataretrieval.nwis.get_dv(site='06892350', + ... parameterCd='00060', + ... start='2010-01-01', + ... 
end='2021-12-31') + >>> df2, _ = dataretrieval.nwis.get_dv(site='06892000', + ... parameterCd='00060', + ... start='2010-01-01', + ... end='2021-12-31') + >>> corr_matrix = hyswap.similarity.calculate_correlations( + ... [df, df2], '00060_Mean') + >>> ax = hyswap.plots.plot_similarity_heatmap(corr_matrix, + ... show_values=True) + >>> plt.show() + """ + # Create axes if not provided + if ax is None: + _, ax = plt.subplots() + # plot heatmap using matplotlib + vmin = sim_matrix.min().min() + vmax = sim_matrix.max().max() + im = ax.imshow(sim_matrix, cmap=cmap, + vmin=sim_matrix.min().min(), + vmax=sim_matrix.max().max()) + # show values if desired + if show_values: + for i in range(sim_matrix.shape[0]): + for j in range(sim_matrix.shape[1]): + # if below halfway point, make text white + if sim_matrix.iloc[i, j] < (vmax - vmin) / 2 + vmin: + ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}', + ha="center", va="center", color="w") + # otherwise, make text black + else: + ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}', + ha="center", va="center", color="k") + # set labels + ax.set_title(title) + ax.set_xlabel('Site') + ax.set_ylabel('Site') + # set ticks at center of each cell + ax.set_xticks(np.arange(sim_matrix.shape[0])) + ax.set_yticks(np.arange(sim_matrix.shape[1])) + # set tick labels + ax.set_xticklabels(sim_matrix.columns) + ax.set_yticklabels(sim_matrix.index) + # add colorbar + plt.colorbar(im, ax=ax) + # return + return ax diff --git a/hyswap/similarity.py b/hyswap/similarity.py new file mode 100644 index 0000000..5338043 --- /dev/null +++ b/hyswap/similarity.py @@ -0,0 +1,203 @@ +"""Similarity measures for hyswap.""" + +import numpy as np +import pandas as pd +from scipy import stats +from hyswap.utils import filter_to_common_time + + +def calculate_correlations(df_list, data_column_name, df_names=None): + """Calculate Pearson correlations between dataframes in df_list. + + This function is designed to calculate the Pearson correlation + coefficients between dataframes in df_list. The dataframes in df_list are + expected to have the same columns. The correlation coefficients are + calculated using the `numpy.corrcoeff` function. + + Parameters + ---------- + df_list : list + List of dataframes. The dataframes are expected to have the same + columns. Likely inputs are the output of a function like + dataretrieval.nwis.get_dv() or similar + + data_column_name : str + Name of the column to use for the correlation calculation. + + df_names : list, optional + List of names for the dataframes in df_list. If provided, the names + will be used to label the rows and columns of the output array. If + not provided, the column "site_no" will be used if available, if it is + not available, the index of the dataframe in the list will be used. + + Returns + ------- + correlations : pandas.DataFrame + Dataframe of correlation coefficients. The rows and columns are + labeled with the names of the dataframes in df_list as provided + by df_names argument. + + Examples + -------- + Calculate correlations between two synthetic dataframes. + + .. 
doctest:: + + >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + >>> results = similarity.calculate_correlations([df1, df2], 'a') + >>> results + 0 1 + 0 1.0 -1.0 + 1 -1.0 1.0 + """ + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) + # preprocess dataframe list so they have the same index/times + df_list = filter_to_common_time(df_list) + # calculate correlations between all pairs of dataframes in the list + correlations = np.empty((len(df_list), len(df_list))) + for i, df1 in enumerate(df_list): + for j, df2 in enumerate(df_list): + correlations[i, j] = np.corrcoef( + df1[data_column_name], df2[data_column_name])[0, 1] + # turn the correlations into a dataframe + correlations = pd.DataFrame( + correlations, index=df_names, columns=df_names) + return correlations + + +def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): + """Calculate Wasserstein distance between dataframes in df_list. + + This function is designed to calculate the Wasserstein distance between + dataframes in df_list. The dataframes in df_list are expected to have the + same columns. The Wasserstein distance is calculated using the + `scipy.stats.wasserstein_distance` function. + + Parameters + ---------- + df_list : list + List of dataframes. The dataframes are expected to have the same + columns. Likely inputs are the output of a function like + dataretrieval.nwis.get_dv() or similar + + data_column_name : str + Name of the column to use for the Wasserstein distance calculation. + + df_names : list, optional + List of names for the dataframes in df_list. If provided, the names + will be used to label the rows and columns of the output array. If + not provided, the column "site_no" will be used if available, if it is + not available, the index of the dataframe in the list will be used. + + Returns + ------- + wasserstein_distances : pandas.DataFrame + Dataframe of Wasserstein distances. The rows and columns are + labeled with the names of the dataframes in df_list as provided + by df_names argument. + + Examples + -------- + Calculate Wasserstein distances between two synthetic dataframes. + + .. doctest:: + + >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + >>> results = similarity.calculate_wasserstein_distance( + ... [df1, df2], 'a') + >>> results + 0 1 + 0 0.0 9.0 + 1 9.0 0.0 + """ + # preprocess dataframe list so they have the same index/times + df_list = filter_to_common_time(df_list) + # calculate distances between all pairs of dataframes in the list + wasserstein_distances = np.empty((len(df_list), len(df_list))) + for i, df1 in enumerate(df_list): + for j, df2 in enumerate(df_list): + wasserstein_distances[i, j] = stats.wasserstein_distance( + df1[data_column_name], df2[data_column_name]) + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) + # turn the distances into a dataframe + wasserstein_distances = pd.DataFrame( + wasserstein_distances, index=df_names, columns=df_names) + return wasserstein_distances + + +def calculate_energy_distance(df_list, data_column_name, df_names=None): + """Calculate energy distance between dataframes in df_list. + + This function is designed to calculate the energy distance between + dataframes in df_list. The dataframes in df_list are expected to have the + same columns. 
The energy distance is calculated using the + `scipy.stats.energy_distance` function. + + Parameters + ---------- + df_list : list + List of dataframes. The dataframes are expected to have the same + columns. Likely inputs are the output of a function like + dataretrieval.nwis.get_dv() or similar + + data_column_name : str + Name of the column to use for the energy distance calculation. + + df_names : list, optional + List of names for the dataframes in df_list. If provided, the names + will be used to label the rows and columns of the output array. If + not provided, the column "site_no" will be used if available, if it is + not available, the index of the dataframe in the list will be used. + + Returns + ------- + energy_distances : pandas.DataFrame + Dataframe of energy distances. The rows and columns are + labeled with the names of the dataframes in df_list as provided + by df_names argument. + + Examples + -------- + Calculate energy distances between two synthetic dataframes. + + .. doctest:: + + >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + >>> results = similarity.calculate_energy_distance( + ... [df1, df2], 'a') + >>> results + 0 1 + 0 0.000000 3.376389 + 1 3.376389 0.000000 + """ + # preprocess dataframe list so they have the same index/times + df_list = filter_to_common_time(df_list) + # calculate distances between all pairs of dataframes in the list + energy_distances = np.empty((len(df_list), len(df_list))) + for i, df1 in enumerate(df_list): + for j, df2 in enumerate(df_list): + energy_distances[i, j] = stats.energy_distance( + df1[data_column_name], df2[data_column_name]) + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) + # turn the distances into a dataframe + energy_distances = pd.DataFrame( + energy_distances, index=df_names, columns=df_names) + return energy_distances + + +def _name_handling(df_list, df_names): + """Private function to handle the names of the dataframes.""" + if df_names is None: + df_names = [] + for i, df in enumerate(df_list): + if 'site_no' in df.columns: + df_names.append(df['site_no'].iloc[0]) + else: + df_names.append(str(i)) + return df_names diff --git a/hyswap/utils.py b/hyswap/utils.py index 0c17fa8..489573f 100644 --- a/hyswap/utils.py +++ b/hyswap/utils.py @@ -430,3 +430,62 @@ def munge_nwis_stats(df, source_pct_col=None, target_pct_col=None, df_slim.columns = target_pct_col # return the dataframe return df_slim + + +def filter_to_common_time(df_list): + """Filter a list of dataframes to common times based on index. + + This function takes a list of dataframes and filters them to only include + the common times based on the index of the dataframes. This is necessary + before comparing the timeseries and calculating statistics between two or + more timeseries. + + Parameters + ---------- + df_list : list + List of pandas.DataFrame objects to filter to common times. + DataFrames assumed to have date-time information in the index. + Expect input to be the output from a function like + dataretrieval.nwis.get_dv() or similar. + + Returns + ------- + df_list : list + List of pandas.DataFrame objects filtered to common times. + + Examples + -------- + Get some NWIS data. + + .. doctest:: + + >>> df1, md1 = dataretrieval.nwis.get_dv( + ... "03586500", parameterCd="00060", + ... start="2018-12-15", end="2019-01-07") + >>> df2, md2 = dataretrieval.nwis.get_dv( + ... "01646500", parameterCd="00060", + ... 
start="2019-01-01", end="2019-01-14")
+        >>> type(df1)
+        <class 'pandas.core.frame.DataFrame'>
+        >>> type(df2)
+        <class 'pandas.core.frame.DataFrame'>
+
+    Filter the dataframes to common times.
+
+    .. doctest::
+
+        >>> df_list = utils.filter_to_common_time([df1, df2])
+        >>> df_list[0].shape
+        (7, 3)
+        >>> df_list[1].shape
+        (7, 3)
+    """
+    # get the common index
+    common_index = df_list[0].index
+    for df in df_list:
+        common_index = common_index.intersection(df.index)
+    # filter the dataframes to the common index
+    for i, df in enumerate(df_list):
+        df_list[i] = df.loc[common_index]
+    # return the list of dataframes
+    return df_list

From 49662ec670964c2b30b6996d0b7ecb4e9b286a0d Mon Sep 17 00:00:00 2001
From: elbeejay
Date: Wed, 21 Jun 2023 15:30:24 -0400
Subject: [PATCH 2/4] store number of common obs

---
 docs/source/conf.py                          |  8 +-
 docs/source/examples/similarity_examples.rst | 86 +++++++++++++++++++-
 hyswap/plots.py                              |  6 +-
 hyswap/similarity.py                         | 31 +++++--
 hyswap/utils.py                              | 10 ++-
 5 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 4987419..e991e4c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -134,13 +134,9 @@
 # -- Options for linkcheck -------------------------------------------
 
 # Links to not "check" because they are problematic for the link checker
-linkcheck_ignore = [
-    r'https://doi.org/10.3133/wsp1542A',
-    r'https://pypi.org/project/hyswap/'
-]
-
 linkcheck_exclude_documents = [
     r'meta/disclaimer*',
     r'meta/contributing*',
-    r'meta/installation*'
+    r'meta/installation*',
+    r'examples/similarity_examples',
 ]
diff --git a/docs/source/examples/similarity_examples.rst b/docs/source/examples/similarity_examples.rst
index fbb1367..0c58f3e 100644
--- a/docs/source/examples/similarity_examples.rst
+++ b/docs/source/examples/similarity_examples.rst
@@ -36,11 +36,11 @@ Once we've collected the streamflow data, we will calculate the pair-wise correl
     :include-source:
 
     # calculate correlations
-    results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+    results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
 
     # make plot
     ax = hyswap.plots.plot_similarity_heatmap(
-        results,
+        results, n_obs=n_obs,
         title="Pearson Correlation Coefficients for Streamflow\n" +
               "Between 5 Sites Along the Mississippi River")
 
     # show the plot
     plt.tight_layout()
     plt.show()
@@ -66,11 +66,11 @@ If we'd like, we can display the specific values of the correlations by setting
         df_list.append(df)
 
     # calculate correlations
-    results = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+    results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
 
     # make plot
     ax = hyswap.plots.plot_similarity_heatmap(
-        results,
+        results, n_obs=n_obs,
         title="Pearson Correlation Coefficients for Streamflow\n" +
               "Between 5 Sites Along the Mississippi River",
         show_values=True)
@@ -78,3 +78,81 @@ If we'd like, we can display the specific values of the correlations by setting
     # show the plot
     plt.tight_layout()
     plt.show()
+
+
+Wasserstein Distances Between 5 Stations
+****************************************
+
+In this example we compare the same 5 time-series as the previous example, but instead of calculating correlations, we calculate the `Wasserstein Distance <wasserstein_doc_>`_ between each pairing of time-series.
+The Wasserstein Distance is a measure of the distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
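As a quick, hand-checkable sketch of the underlying metric (an editorial illustration that calls SciPy directly rather than the `hyswap` wrappers, reusing the synthetic values from the ``hyswap/similarity.py`` doctests in this series):

.. code-block:: python

    import numpy as np
    from scipy import stats

    u = np.arange(10)       # 0, 1, ..., 9
    v = -1 * np.arange(10)  # 0, -1, ..., -9

    # first Wasserstein distance: every sorted pair of values differs by 9
    print(stats.wasserstein_distance(u, v))  # 9.0

    # energy distance between the same samples (used later on this page)
    print(stats.energy_distance(u, v))  # approximately 3.376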
+Specifically in `hyswap`, we utilize the `scipy.stats.wasserstein_distance()` function, and are therefore calculating the "first" Wasserstein Distance between two time-series.
+
+.. _wasserstein_doc: https://en.wikipedia.org/wiki/Wasserstein_metric
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+    # calculate Wasserstein Distances
+    results, n_obs = hyswap.similarity.calculate_wasserstein_distance(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Wasserstein Distances for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River",
+        show_values=True)
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
+
+
+Energy Distances Between 5 Stations
+***********************************
+
+In this example we compare the same 5 time-series as the previous example, but this time using another distance measure, the so-called "Energy Distance" between two time-series.
+The `Energy Distance <energy_dist_>`_ is a statistical distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in `hyswap`, we utilize the `scipy.stats.energy_distance()` function.
+
+.. _energy_dist: https://en.wikipedia.org/wiki/Energy_distance
+
+.. plot::
+    :context: reset
+    :include-source:
+
+    # get the data from these 5 sites
+    site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+    # fetch some streamflow data from NWIS as a list of dataframes
+    df_list = []
+    for site in site_list:
+        df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+                                          end="2022-12-31",
+                                          parameterCd='00060')
+        df_list.append(df)
+
+    # calculate Energy Distances
+    results, n_obs = hyswap.similarity.calculate_energy_distance(df_list, "00060_Mean")
+
+    # make plot
+    ax = hyswap.plots.plot_similarity_heatmap(
+        results, n_obs=n_obs,
+        title="Energy Distances for Streamflow\n" +
+              "Between 5 Sites Along the Mississippi River",
+        show_values=True)
+
+    # show the plot
+    plt.tight_layout()
+    plt.show()
diff --git a/hyswap/plots.py b/hyswap/plots.py
index 1e6de8c..0dfdb0c 100644
--- a/hyswap/plots.py
+++ b/hyswap/plots.py
@@ -498,7 +498,7 @@ def plot_cumulative_hydrograph(cumulative_percentiles, target_years,
     return ax
 
 
-def plot_similarity_heatmap(sim_matrix, cmap='inferno',
+def plot_similarity_heatmap(sim_matrix, n_obs=None, cmap='inferno',
                             show_values=False, ax=None,
                             title='Similarity Matrix'):
     """Plot a similarity matrix as a heatmap.
@@ -546,7 +546,7 @@ def plot_similarity_heatmap(sim_matrix, cmap='inferno',
     ...                                       parameterCd='00060',
     ...                                       start='2010-01-01',
     ...                                       end='2021-12-31')
-    >>> corr_matrix = hyswap.similarity.calculate_correlations(
+    >>> corr_matrix, n_obs = hyswap.similarity.calculate_correlations(
     ...     [df, df2], '00060_Mean')
     >>> ax = hyswap.plots.plot_similarity_heatmap(corr_matrix,
     ...
show_values=True) @@ -574,6 +574,8 @@ def plot_similarity_heatmap(sim_matrix, cmap='inferno', ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}', ha="center", va="center", color="k") # set labels + if n_obs is not None: + title = f'{title} (n={n_obs})' ax.set_title(title) ax.set_xlabel('Site') ax.set_ylabel('Site') diff --git a/hyswap/similarity.py b/hyswap/similarity.py index 5338043..c4c4454 100644 --- a/hyswap/similarity.py +++ b/hyswap/similarity.py @@ -37,6 +37,9 @@ def calculate_correlations(df_list, data_column_name, df_names=None): labeled with the names of the dataframes in df_list as provided by df_names argument. + n_obs : int + Number of observations used to calculate the energy distance. + Examples -------- Calculate correlations between two synthetic dataframes. @@ -45,7 +48,7 @@ def calculate_correlations(df_list, data_column_name, df_names=None): >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) - >>> results = similarity.calculate_correlations([df1, df2], 'a') + >>> results, n_obs = similarity.calculate_correlations([df1, df2], 'a') >>> results 0 1 0 1.0 -1.0 @@ -54,7 +57,7 @@ def calculate_correlations(df_list, data_column_name, df_names=None): # handle the names of the dataframes df_names = _name_handling(df_list, df_names) # preprocess dataframe list so they have the same index/times - df_list = filter_to_common_time(df_list) + df_list, n_obs = filter_to_common_time(df_list) # calculate correlations between all pairs of dataframes in the list correlations = np.empty((len(df_list), len(df_list))) for i, df1 in enumerate(df_list): @@ -64,7 +67,7 @@ def calculate_correlations(df_list, data_column_name, df_names=None): # turn the correlations into a dataframe correlations = pd.DataFrame( correlations, index=df_names, columns=df_names) - return correlations + return correlations, n_obs def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): @@ -98,6 +101,9 @@ def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): labeled with the names of the dataframes in df_list as provided by df_names argument. + n_obs : int + Number of observations used to calculate the energy distance. + Examples -------- Calculate Wasserstein distances between two synthetic dataframes. @@ -106,15 +112,17 @@ def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) - >>> results = similarity.calculate_wasserstein_distance( + >>> results, n_obs = similarity.calculate_wasserstein_distance( ... 
[df1, df2], 'a') >>> results 0 1 0 0.0 9.0 1 9.0 0.0 """ + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) # preprocess dataframe list so they have the same index/times - df_list = filter_to_common_time(df_list) + df_list, n_obs = filter_to_common_time(df_list) # calculate distances between all pairs of dataframes in the list wasserstein_distances = np.empty((len(df_list), len(df_list))) for i, df1 in enumerate(df_list): @@ -126,7 +134,7 @@ def calculate_wasserstein_distance(df_list, data_column_name, df_names=None): # turn the distances into a dataframe wasserstein_distances = pd.DataFrame( wasserstein_distances, index=df_names, columns=df_names) - return wasserstein_distances + return wasserstein_distances, n_obs def calculate_energy_distance(df_list, data_column_name, df_names=None): @@ -160,6 +168,9 @@ def calculate_energy_distance(df_list, data_column_name, df_names=None): labeled with the names of the dataframes in df_list as provided by df_names argument. + n_obs : int + Number of observations used to calculate the energy distance. + Examples -------- Calculate energy distances between two synthetic dataframes. @@ -168,15 +179,17 @@ def calculate_energy_distance(df_list, data_column_name, df_names=None): >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) - >>> results = similarity.calculate_energy_distance( + >>> results, n_obs = similarity.calculate_energy_distance( ... [df1, df2], 'a') >>> results 0 1 0 0.000000 3.376389 1 3.376389 0.000000 """ + # handle the names of the dataframes + df_names = _name_handling(df_list, df_names) # preprocess dataframe list so they have the same index/times - df_list = filter_to_common_time(df_list) + df_list, n_obs = filter_to_common_time(df_list) # calculate distances between all pairs of dataframes in the list energy_distances = np.empty((len(df_list), len(df_list))) for i, df1 in enumerate(df_list): @@ -188,7 +201,7 @@ def calculate_energy_distance(df_list, data_column_name, df_names=None): # turn the distances into a dataframe energy_distances = pd.DataFrame( energy_distances, index=df_names, columns=df_names) - return energy_distances + return energy_distances, n_obs def _name_handling(df_list, df_names): diff --git a/hyswap/utils.py b/hyswap/utils.py index 489573f..ff12c83 100644 --- a/hyswap/utils.py +++ b/hyswap/utils.py @@ -349,6 +349,8 @@ def munge_nwis_stats(df, source_pct_col=None, target_pct_col=None, be used on Python dataretrieval dataframe returns for the nwis.get_stats() function for "daily" data, a single site, and a single parameter code. + Parameters + ---------- df : pandas.DataFrame DataFrame containing NWIS statistics data retrieved from the statistics web service. Assumed to come in as a dataframe retrieved with a @@ -452,6 +454,8 @@ def filter_to_common_time(df_list): ------- df_list : list List of pandas.DataFrame objects filtered to common times. + n_obs : int + Number of observations in the common time period. Examples -------- @@ -474,7 +478,7 @@ def filter_to_common_time(df_list): .. 
doctest:: - >>> df_list = utils.filter_to_common_time([df1, df2]) + >>> df_list, n_obs = utils.filter_to_common_time([df1, df2]) >>> df_list[0].shape (7, 3) >>> df_list[1].shape @@ -487,5 +491,7 @@ def filter_to_common_time(df_list): # filter the dataframes to the common index for i, df in enumerate(df_list): df_list[i] = df.loc[common_index] + # get the number of observations + n_obs = len(common_index) # return the list of dataframes - return df_list + return df_list, n_obs From 0e7ce4216e5cd426f7565c31c2b0548c02915458 Mon Sep 17 00:00:00 2001 From: elbeejay Date: Wed, 21 Jun 2023 15:49:49 -0400 Subject: [PATCH 3/4] update tests for similarity stuff --- tests/test_plots.py | 50 ++++++++++++++++++++++++++++++++++++++++ tests/test_similarity.py | 50 ++++++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 18 +++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 tests/test_similarity.py diff --git a/tests/test_plots.py b/tests/test_plots.py index 334758f..f307fcb 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -8,6 +8,7 @@ from hyswap import rasterhydrograph from hyswap import percentiles from hyswap import plots +from hyswap import similarity def test_plot_flow_duration_curve(): @@ -156,3 +157,52 @@ def test_plot_cumulative_hydrograph(): assert len(ax.collections) == 1 # close plot plt.close() + + +def test_plot_similarity_heatmat(): + """Testing the plot_similarity_heatmap function.""" + # make dummy dataframes + df_1 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-01', '2020-01-10')}) + df_1.set_index('date', inplace=True) + df_2 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-06', '2020-01-15')}) + df_2.set_index('date', inplace=True) + # calculate similarity matrix (using correlation) + df_corr, n_obs = similarity.calculate_correlations( + [df_1, df_2], 'data') + # plot + ax = plots.plot_similarity_heatmap(df_corr, n_obs=n_obs, + show_values=True, + title='Test Title') + # assertions + assert isinstance(ax, plt.Axes) + assert ax.get_xlabel() == 'Site' + assert ax.get_ylabel() == 'Site' + assert ax.get_title() == 'Test Title (n=5)' + assert len(ax.lines) == 0 + # close plot + plt.close() + + # make a set with negative values + df_2 = pd.DataFrame({ + 'data': np.random.random(10) * 10, + 'date': pd.date_range('2020-01-06', '2020-01-15')}) + df_2.set_index('date', inplace=True) + # calculate similarity matrix (using correlation) + df_corr, n_obs = similarity.calculate_correlations( + [df_1, df_2], 'data') + # plot + ax = plots.plot_similarity_heatmap(df_corr, n_obs=n_obs, + show_values=True, + title='Test Title') + # assertions + assert isinstance(ax, plt.Axes) + assert ax.get_xlabel() == 'Site' + assert ax.get_ylabel() == 'Site' + assert ax.get_title() == 'Test Title (n=5)' + assert len(ax.lines) == 0 + # close plot + plt.close() \ No newline at end of file diff --git a/tests/test_similarity.py b/tests/test_similarity.py new file mode 100644 index 0000000..d265d36 --- /dev/null +++ b/tests/test_similarity.py @@ -0,0 +1,50 @@ +"""Tests for the similarity functions.""" +import pytest +import numpy as np +import pandas as pd +from hyswap import similarity + + +def test_calculate_correlations(): + """Test the calculate_correlations function.""" + df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + results, n_obs = similarity.calculate_correlations([df1, df2], 'a') + assert n_obs == 10 + assert results.iloc[0, 0] == 
pytest.approx(1) + assert results.iloc[0, 1] == pytest.approx(-1) + assert results.iloc[1, 0] == pytest.approx(-1) + assert results.iloc[1, 1] == pytest.approx(1) + + +def test_calculate_wasserstein_distance(): + """Test the calculate_wasserstein_distance function.""" + df1 = pd.DataFrame({'a': np.arange(10), + 'b': np.arange(10), + 'site_no': np.zeros(10)}) + df2 = pd.DataFrame({'a': -1*np.arange(10), + 'b': np.arange(10), + 'site_no': np.ones(10)}) + results, n_obs = similarity.calculate_wasserstein_distance([df1, df2], 'a') + assert n_obs == 10 + assert results.iloc[0, 0] == pytest.approx(0) + assert results.iloc[0, 1] == pytest.approx(9) + assert results.iloc[1, 0] == pytest.approx(9) + assert results.iloc[1, 1] == pytest.approx(0) + assert results.index[0] == 0 + assert results.index[1] == 1 + + +def test_calculate_energy_distance(): + """Test the calculate_energy_distance function.""" + df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)}) + df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)}) + results, n_obs = similarity.calculate_energy_distance( + [df1, df2], 'a', df_names=['one', 'two']) + assert n_obs == 10 + assert results.iloc[0, 0] == pytest.approx(0) + assert np.round(results.iloc[0, 1], 1) == 3.4 + assert np.round(results.iloc[1, 0], 1) == 3.4 + assert results.iloc[1, 1] == pytest.approx(0) + assert results.index[0] == 'one' + assert results.index[1] == 'two' diff --git a/tests/test_utils.py b/tests/test_utils.py index 200d4b9..4afeab8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -248,3 +248,21 @@ def test_munge_nwis_stats(): assert len(df.columns) > len(df_slim.columns) assert 0 in df_slim.columns assert df_slim.columns.tolist() == [0, 5, 10, 25, 75, 90, 95, 100] + + +def test_filter_to_common_time(): + """Test the filter_to_common_time function.""" + # make dummy dataframes + df_1 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-01', '2020-01-10')}) + df_1.set_index('date', inplace=True) + df_2 = pd.DataFrame({ + 'data': np.arange(10), + 'date': pd.date_range('2020-01-06', '2020-01-15')}) + df_2.set_index('date', inplace=True) + # apply the function + df_list, n_obs = utils.filter_to_common_time([df_1, df_2]) + # check the output + assert len(df_list) == 2 + assert n_obs == 5 From b669f729820daf079385c7dd44b06d0e25ba4a1d Mon Sep 17 00:00:00 2001 From: elbeejay Date: Wed, 21 Jun 2023 15:57:40 -0400 Subject: [PATCH 4/4] lint --- tests/test_plots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_plots.py b/tests/test_plots.py index f307fcb..4d9a74d 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -205,4 +205,4 @@ def test_plot_similarity_heatmat(): assert ax.get_title() == 'Test Title (n=5)' assert len(ax.lines) == 0 # close plot - plt.close() \ No newline at end of file + plt.close()
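Taken together, the four patches add the `similarity` module (correlations, Wasserstein and energy distances), the shared `filter_to_common_time`/`n_obs` bookkeeping, the heatmap plot, documentation, and tests. Below is a minimal end-to-end sketch of how the new pieces fit together; it is an editorial illustration using synthetic data patterned on ``tests/test_utils.py`` and ``tests/test_plots.py`` above, not part of the patch series itself.

.. code-block:: python

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from hyswap import similarity, plots

    # two daily series that only overlap on 2020-01-06 through 2020-01-10
    df_a = pd.DataFrame({'data': np.arange(10.0)},
                        index=pd.date_range('2020-01-01', '2020-01-10'))
    df_b = pd.DataFrame({'data': np.random.random(10) * 10},
                        index=pd.date_range('2020-01-06', '2020-01-15'))

    # correlations are computed on the 5 common days; n_obs reports that count
    corr, n_obs = similarity.calculate_correlations([df_a, df_b], 'data')

    # heatmap with the observation count appended to the title, e.g. "(n=5)"
    ax = plots.plot_similarity_heatmap(corr, n_obs=n_obs, show_values=True,
                                       title='Correlation Matrix')
    plt.show()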