+[docs]
+defcalculate_exceedance_probability_from_distribution(x,dist,*args,**kwargs):""" Calculate the exceedance probability of a value relative to a distribution.
@@ -118,7 +120,10 @@
+[docs]
+defcalculate_exceedance_probability_from_values(x,values_to_compare):""" Calculate the exceedance probability of a value compared to several values.
@@ -175,7 +180,10 @@
+[docs]
+defcalculate_exceedance_probability_from_values_multiple(values,values_to_compare):""" Calculate the exceedance probability of multiple values vs a set of values.
@@ -275,6 +286,7 @@
+[docs]
def plot_similarity_heatmap(sim_matrix, n_obs=None, cmap='inferno',
                            show_values=False, ax=None,
                            title='Similarity Matrix'):
    """Plot a similarity matrix as a heatmap.

    Parameters
    ----------
    sim_matrix : pandas.DataFrame
        Similarity matrix to plot. Must be square. Can be the output of
        :meth:`hyswap.similarity.calculate_correlations`,
        :meth:`hyswap.similarity.calculate_wasserstein_distance`,
        :meth:`hyswap.similarity.calculate_energy_distance`, or any other
        square matrix represented as a pandas DataFrame.

    n_obs : int, optional
        Number of observations used to compute the similarity matrix.
        If provided, it is appended to the plot title as "(n=n_obs)".

    cmap : str, optional
        Colormap to use. Default is 'inferno'.

    show_values : bool, optional
        Whether to show the values of the matrix on the plot. Default is False.

    ax : matplotlib.axes.Axes, optional
        Axes object to plot on. If not provided, a new figure and axes will be
        created.

    title : str, optional
        Title for the plot. Default is 'Similarity Matrix'.

    Returns
    -------
    matplotlib.axes.Axes
        Axes object containing the plot.

    Examples
    --------
    Calculate the correlation matrix between two sites and plot it as a
    heatmap.

    .. plot::
        :include-source:

        >>> df, _ = dataretrieval.nwis.get_dv(site='06892350',
        ...                                   parameterCd='00060',
        ...                                   start='2010-01-01',
        ...                                   end='2021-12-31')
        >>> df2, _ = dataretrieval.nwis.get_dv(site='06892000',
        ...                                    parameterCd='00060',
        ...                                    start='2010-01-01',
        ...                                    end='2021-12-31')
        >>> corr_matrix, n_obs = hyswap.similarity.calculate_correlations(
        ...     [df, df2], '00060_Mean')
        >>> ax = hyswap.plots.plot_similarity_heatmap(corr_matrix,
        ...                                           show_values=True)
        >>> plt.show()
    """
    # Create axes if not provided
    if ax is None:
        _, ax = plt.subplots()
    # Color limits are computed once and reused both for the heatmap and
    # for choosing a readable text color for the overlaid values.
    vmin = sim_matrix.min().min()
    vmax = sim_matrix.max().max()
    im = ax.imshow(sim_matrix, cmap=cmap, vmin=vmin, vmax=vmax)
    # show values if desired
    if show_values:
        for i in range(sim_matrix.shape[0]):
            for j in range(sim_matrix.shape[1]):
                # below the midpoint of the color scale the cell is dark,
                # so use white text; otherwise use black text
                if sim_matrix.iloc[i, j] < (vmax - vmin) / 2 + vmin:
                    ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}',
                            ha="center", va="center", color="w")
                else:
                    ax.text(j, i, f'{sim_matrix.iloc[i, j]:.2f}',
                            ha="center", va="center", color="k")
    # set labels
    if n_obs is not None:
        title = f'{title} (n={n_obs})'
    ax.set_title(title)
    ax.set_xlabel('Site')
    ax.set_ylabel('Site')
    # set ticks at center of each cell
    # (x ticks span the columns, y ticks span the rows)
    ax.set_xticks(np.arange(sim_matrix.shape[1]))
    ax.set_yticks(np.arange(sim_matrix.shape[0]))
    # set tick labels
    ax.set_xticklabels(sim_matrix.columns)
    ax.set_yticklabels(sim_matrix.index)
    # add colorbar
    plt.colorbar(im, ax=ax)
    # return
    return ax
+[docs]
+def_check_inputs(df,data_column_name,date_column_name,data_type,year_type,begin_year,end_year):"""Private function to check inputs for the format_data function.
@@ -274,7 +279,10 @@
+[docs]
+def_calculate_date_range(df,year_type,begin_year,end_year):"""Private function to calculate the date range and set the index. Parameters
@@ -319,6 +327,7 @@
+[docs]
+defconvert_cfs_to_runoff(cfs,drainage_area,frequency="annual"):"""Convert cfs to runoff values for some drainage area. Parameters
@@ -97,7 +99,10 @@
+[docs]
+defstreamflow_to_runoff(df,data_col,drainage_area,frequency="annual"):"""Convert streamflow to runoff for a given drainage area. For a given gage/dataframe, convert streamflow to runoff using the
@@ -143,7 +148,10 @@
+[docs]
+defcalculate_geometric_runoff(geom_id,df_list,weights_matrix,start_date=None,end_date=None,data_col='runoff'):"""Function to calculate the runoff for a specified geometry.
@@ -214,7 +222,10 @@
+[docs]
+def_get_date_range(df_list,start_date,end_date):"""Get date range for runoff calculation. This is an internal function used by the :obj:`calculate_geometric_runoff`
@@ -265,7 +276,10 @@
+[docs]
+defidentify_sites_from_weights(geom_id,weights_matrix):"""Identify sites for a specified geometry. Function to identify sites with non-zero weights for a given
@@ -293,7 +307,10 @@
+[docs]
def calculate_correlations(df_list, data_column_name, df_names=None):
    """Calculate Pearson correlations between dataframes in df_list.

    This function is designed to calculate the Pearson correlation
    coefficients between dataframes in df_list. The dataframes in df_list are
    expected to have the same columns. The correlation coefficients are
    calculated using the `numpy.corrcoef` function.

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar

    data_column_name : str
        Name of the column to use for the correlation calculation.

    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    correlations : pandas.DataFrame
        Dataframe of correlation coefficients. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.

    n_obs : int
        Number of observations used to calculate the correlation
        coefficients.

    Examples
    --------
    Calculate correlations between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_correlations([df1, df2], 'a')
        >>> results
              0    1
        0   1.0 -1.0
        1  -1.0  1.0
    """
    # handle the names of the dataframes
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # calculate correlations between all pairs of dataframes in the list
    correlations = np.empty((len(df_list), len(df_list)))
    for i, df1 in enumerate(df_list):
        for j, df2 in enumerate(df_list):
            # off-diagonal entry of the 2x2 correlation matrix is the
            # pairwise Pearson coefficient
            correlations[i, j] = np.corrcoef(
                df1[data_column_name], df2[data_column_name])[0, 1]
    # turn the correlations into a dataframe
    correlations = pd.DataFrame(
        correlations, index=df_names, columns=df_names)
    return correlations, n_obs
+
+
+
+
+[docs]
def calculate_wasserstein_distance(df_list, data_column_name, df_names=None):
    """Calculate Wasserstein distance between dataframes in df_list.

    This function is designed to calculate the Wasserstein distance between
    dataframes in df_list. The dataframes in df_list are expected to have the
    same columns. The Wasserstein distance is calculated using the
    `scipy.stats.wasserstein_distance` function.

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar

    data_column_name : str
        Name of the column to use for the Wasserstein distance calculation.

    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    wasserstein_distances : pandas.DataFrame
        Dataframe of Wasserstein distances. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.

    n_obs : int
        Number of observations used to calculate the Wasserstein distances.

    Examples
    --------
    Calculate Wasserstein distances between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_wasserstein_distance(
        ...     [df1, df2], 'a')
        >>> results
             0    1
        0  0.0  9.0
        1  9.0  0.0
    """
    # handle the names of the dataframes (must happen before filtering,
    # once only; a second call after filtering would be redundant)
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # calculate distances between all pairs of dataframes in the list
    wasserstein_distances = np.empty((len(df_list), len(df_list)))
    for i, df1 in enumerate(df_list):
        for j, df2 in enumerate(df_list):
            wasserstein_distances[i, j] = stats.wasserstein_distance(
                df1[data_column_name], df2[data_column_name])
    # turn the distances into a dataframe
    wasserstein_distances = pd.DataFrame(
        wasserstein_distances, index=df_names, columns=df_names)
    return wasserstein_distances, n_obs
+
+
+
+
+[docs]
def calculate_energy_distance(df_list, data_column_name, df_names=None):
    """Calculate energy distance between dataframes in df_list.

    This function is designed to calculate the energy distance between
    dataframes in df_list. The dataframes in df_list are expected to have the
    same columns. The energy distance is calculated using the
    `scipy.stats.energy_distance` function.

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar

    data_column_name : str
        Name of the column to use for the energy distance calculation.

    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    energy_distances : pandas.DataFrame
        Dataframe of energy distances. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.

    n_obs : int
        Number of observations used to calculate the energy distance.

    Examples
    --------
    Calculate energy distances between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_energy_distance(
        ...     [df1, df2], 'a')
        >>> results
                  0         1
        0  0.000000  3.376389
        1  3.376389  0.000000
    """
    # handle the names of the dataframes (must happen before filtering,
    # once only; a second call after filtering would be redundant)
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # calculate distances between all pairs of dataframes in the list
    energy_distances = np.empty((len(df_list), len(df_list)))
    for i, df1 in enumerate(df_list):
        for j, df2 in enumerate(df_list):
            energy_distances[i, j] = stats.energy_distance(
                df1[data_column_name], df2[data_column_name])
    # turn the distances into a dataframe
    energy_distances = pd.DataFrame(
        energy_distances, index=df_names, columns=df_names)
    return energy_distances, n_obs
+
+
+
+
+[docs]
+def_name_handling(df_list,df_names):
+"""Private function to handle the names of the dataframes."""
+ ifdf_namesisNone:
+ df_names=[]
+ fori,dfinenumerate(df_list):
+ if'site_no'indf.columns:
+ df_names.append(df['site_no'].iloc[0])
+ else:
+ df_names.append(str(i))
+ returndf_names
+[docs]
+deffilter_approved_data(data,filter_column=None):"""Filter a dataframe to only return approved "A" (or "A, e") data. Parameters
@@ -89,7 +91,10 @@
+[docs]
+defrolling_average(df,data_column_name,data_type,**kwargs):"""Calculate a rolling average for a dataframe. Default behavior right-aligns the window used for the rolling average,
@@ -122,7 +127,10 @@
+[docs]
+deffilter_data_by_time(df,value,data_column_name,date_column_name=None,time_interval='day',leading_values=0,trailing_values=0):"""Filter data by some time interval.
@@ -224,7 +232,10 @@
+[docs]
+defdefine_year_doy_columns(df,date_column_name=None,year_type='calendar',clip_leap_day=False):"""Function to add year, day of year, and month-day columns to a DataFrame.
@@ -343,7 +357,10 @@
+[docs]
+defleap_year_adjustment(df,year_type='calendar'):"""Function to adjust leap year days in a DataFrame. Adjust for a leap year by removing February 29 from the DataFrame and
@@ -385,7 +402,10 @@
+[docs]
+defmunge_nwis_stats(df,source_pct_col=None,target_pct_col=None,year_type='calendar'):"""Function to munge and reformat NWIS statistics data.
@@ -394,6 +414,8 @@
Source code for hyswap.utils
be used on Python dataretrieval dataframe returns for the nwis.get_stats() function for "daily" data, a single site, and a single parameter code.
+ Parameters
+ ---------- df : pandas.DataFrame DataFrame containing NWIS statistics data retrieved from the statistics web service. Assumed to come in as a dataframe retrieved with a
@@ -477,7 +499,10 @@
+[docs]
def filter_to_common_time(df_list):
    """Filter a list of dataframes to common times based on index.

    This function takes a list of dataframes and filters them to only include
    the common times based on the index of the dataframes. This is necessary
    before comparing the timeseries and calculating statistics between two or
    more timeseries.

    Parameters
    ----------
    df_list : list
        List of pandas.DataFrame objects to filter to common times.
        DataFrames assumed to have date-time information in the index.
        Expect input to be the output from a function like
        dataretrieval.nwis.get_dv() or similar.

    Returns
    -------
    df_list : list
        List of pandas.DataFrame objects filtered to common times.
    n_obs : int
        Number of observations in the common time period.

    Examples
    --------
    Get some NWIS data.

    .. doctest::

        >>> df1, md1 = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="2018-12-15", end="2019-01-07")
        >>> df2, md2 = dataretrieval.nwis.get_dv(
        ...     "01646500", parameterCd="00060",
        ...     start="2019-01-01", end="2019-01-14")
        >>> type(df1)
        <class 'pandas.core.frame.DataFrame'>
        >>> type(df2)
        <class 'pandas.core.frame.DataFrame'>

    Filter the dataframes to common times.

    .. doctest::

        >>> df_list, n_obs = utils.filter_to_common_time([df1, df2])
        >>> df_list[0].shape
        (7, 3)
        >>> df_list[1].shape
        (7, 3)
    """
    # intersect every index to find the timestamps shared by all frames
    shared_index = df_list[0].index
    for frame in df_list:
        shared_index = shared_index.intersection(frame.index)
    # restrict each dataframe (in place within the list) to those times
    for position, frame in enumerate(df_list):
        df_list[position] = frame.loc[shared_index]
    # the length of the shared index is the common observation count
    return df_list, len(shared_index)
+
+
+
+
+[docs]
+defset_data_type(data_type):"""Function to set the data type for rolling averages. Parameters
@@ -579,6 +673,7 @@
\ No newline at end of file
diff --git a/_sources/examples/index.rst.txt b/_sources/examples/index.rst.txt
index d3f9797..716ab3c 100644
--- a/_sources/examples/index.rst.txt
+++ b/_sources/examples/index.rst.txt
@@ -34,4 +34,12 @@ Cumulative Hydrograph Examples
.. toctree::
:maxdepth: 2
- cumulative_hydrograph_examples
\ No newline at end of file
+ cumulative_hydrograph_examples
+
+Similarity Examples
+-------------------
+
+.. toctree::
+ :maxdepth: 2
+
+ similarity_examples
diff --git a/_sources/examples/similarity_examples.rst.txt b/_sources/examples/similarity_examples.rst.txt
new file mode 100644
index 0000000..0c58f3e
--- /dev/null
+++ b/_sources/examples/similarity_examples.rst.txt
@@ -0,0 +1,158 @@
+
+Similarity Measures
+-------------------
+
+These examples showcase the usage of the functions in the `similarity` module, with heatmap visualizations via the :obj:`hyswap.plots.plot_similarity_heatmap` function.
+Sometimes it is helpful to compare the relationships between a set of stations and their respective measurements.
+The `similarity` functions packaged in `hyswap` handle some of the data clean-up for you by ensuring the time-series of observations being compared cover the same time period, and by removing any missing data.
+This ensures that your results are not skewed by missing data or gaps in one of the time-series.
+
+
+Correlations Between 5 Stations
+*******************************
+
+The following example shows the correlations between streamflow at 5 stations (07374525, 07374000, 07289000, 07032000, 07024175) along the Mississippi River, listed from downstream to upstream.
+First we have to fetch the streamflow data for these stations, to do this we will use the `dataretrieval` package to access the NWIS database.
+
+.. plot::
+ :context: reset
+ :include-source:
+
+ # get the data from these 5 sites
+ site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+ # fetch some streamflow data from NWIS as a list of dataframes
+ df_list = []
+ for site in site_list:
+ df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
+Once we've collected the streamflow data, we will calculate the pair-wise correlations between the stations using the :obj:`hyswap.similarity.calculate_correlations` function and then plot the results using :obj:`hyswap.plots.plot_similarity_heatmap`.
+
+.. plot::
+ :context:
+ :include-source:
+
+ # calculate correlations
+ results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+
+ # make plot
+ ax = hyswap.plots.plot_similarity_heatmap(
+ results, n_obs=n_obs,
+ title="Pearson Correlation Coefficients for Streamflow\n" +
+ "Between 5 Sites Along the Mississippi River")
+
+ # show the plot
+ plt.tight_layout()
+ plt.show()
+
+If we'd like, we can display the specific values of the correlations by setting the `show_values` argument to `True` in the :obj:`hyswap.plots.plot_similarity_heatmap` function.
+
+.. plot::
+ :context: reset
+ :include-source:
+
+ # get the data from these 5 sites
+ site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+ # fetch some streamflow data from NWIS as a list of dataframes
+ df_list = []
+ for site in site_list:
+ df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
+ # calculate correlations
+ results, n_obs = hyswap.similarity.calculate_correlations(df_list, "00060_Mean")
+
+ # make plot
+ ax = hyswap.plots.plot_similarity_heatmap(
+ results, n_obs=n_obs,
+ title="Pearson Correlation Coefficients for Streamflow\n" +
+ "Between 5 Sites Along the Mississippi River",
+ show_values=True)
+
+ # show the plot
+ plt.tight_layout()
+ plt.show()
+
+
+Wasserstein Distances Between 5 Stations
+****************************************
+
+In this example we compare the same 5 time-series as the previous example, but instead of calculating correlations, we calculate the `Wasserstein Distance <wasserstein_doc_>`_ between each pairing of time-series.
+The Wasserstein Distance is a measure of the distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in `hyswap`, we utilize the `scipy.stats.wasserstein_distance()` function, and are therefore specifically calculating the "first" Wasserstein Distance between two time-series.
+
+.. _wasserstein_doc: https://en.wikipedia.org/wiki/Wasserstein_metric
+
+.. plot::
+ :context: reset
+ :include-source:
+
+ # get the data from these 5 sites
+ site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+ # fetch some streamflow data from NWIS as a list of dataframes
+ df_list = []
+ for site in site_list:
+ df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
+ # calculate Wasserstein Distances
+ results, n_obs = hyswap.similarity.calculate_wasserstein_distance(df_list, "00060_Mean")
+
+ # make plot
+ ax = hyswap.plots.plot_similarity_heatmap(
+ results, n_obs=n_obs,
+ title="Wasserstein Distances for Streamflow\n" +
+ "Between 5 Sites Along the Mississippi River",
+ show_values=True)
+
+ # show the plot
+ plt.tight_layout()
+ plt.show()
+
+
+Energy Distances Between 5 Stations
+***********************************
+
+In this example we compare the same 5 time-series as the previous example, but this time using another distance measure, the so-called "Energy Distance" between two time-series.
+The `Energy Distance <energy_dist_>`_ is a statistical distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in `hyswap`, we utilize the `scipy.stats.energy_distance()` function.
+
+.. _energy_dist: https://en.wikipedia.org/wiki/Energy_distance
+
+.. plot::
+ :context: reset
+ :include-source:
+
+ # get the data from these 5 sites
+ site_list = ["07374525", "07374000", "07289000", "07032000", "07024175"]
+
+ # fetch some streamflow data from NWIS as a list of dataframes
+ df_list = []
+ for site in site_list:
+ df, _ = dataretrieval.nwis.get_dv(site, start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
+    # calculate Energy Distances
+ results, n_obs = hyswap.similarity.calculate_energy_distance(df_list, "00060_Mean")
+
+ # make plot
+ ax = hyswap.plots.plot_similarity_heatmap(
+ results, n_obs=n_obs,
+ title="Energy Distances for Streamflow\n" +
+ "Between 5 Sites Along the Mississippi River",
+ show_values=True)
+
+ # show the plot
+ plt.tight_layout()
+ plt.show()
diff --git a/_sources/reference/index.rst.txt b/_sources/reference/index.rst.txt
index 0ae5053..4f2b093 100644
--- a/_sources/reference/index.rst.txt
+++ b/_sources/reference/index.rst.txt
@@ -62,3 +62,10 @@ Runoff Calculation Functions
.. automodule:: hyswap.runoff
:members:
:special-members:
+
+Similarity Functions
+--------------------
+
+.. automodule:: hyswap.similarity
+ :members:
+ :special-members:
diff --git a/_static/basic.css b/_static/basic.css
index 2a1ca75..c8079f4 100644
--- a/_static/basic.css
+++ b/_static/basic.css
@@ -237,6 +237,10 @@ a.headerlink {
visibility: hidden;
}
+a:visited {
+ color: #551A8B;
+}
+
h1:hover > a.headerlink,
h2:hover > a.headerlink,
h3:hover > a.headerlink,
diff --git a/_static/bizstyle.css b/_static/bizstyle.css
index 5e46037..8f1ce71 100644
--- a/_static/bizstyle.css
+++ b/_static/bizstyle.css
@@ -172,6 +172,10 @@ a:hover {
text-decoration: underline;
}
+a:visited {
+ color: #551a8b;
+}
+
div.body a {
text-decoration: underline;
}
diff --git a/_static/bizstyle.js b/_static/bizstyle.js
index ce40ff6..40af1ab 100644
--- a/_static/bizstyle.js
+++ b/_static/bizstyle.js
@@ -23,7 +23,7 @@ const initialiseBizStyle = () => {
}
window.addEventListener("resize",
- () => (document.querySelector("li.nav-item-0 a").innerText = (window.innerWidth <= 776) ? "Top" : "hyswap 0.1.dev1+gc646d0d documentation")
+ () => (document.querySelector("li.nav-item-0 a").innerText = (window.innerWidth <= 776) ? "Top" : "hyswap 0.1.dev1+gdcc7916 documentation")
)
if (document.readyState !== "loading") initialiseBizStyle()
diff --git a/_static/documentation_options.js b/_static/documentation_options.js
index 3579915..244331e 100644
--- a/_static/documentation_options.js
+++ b/_static/documentation_options.js
@@ -1,6 +1,5 @@
-var DOCUMENTATION_OPTIONS = {
- URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'),
- VERSION: '0.1.dev1+gc646d0d',
+const DOCUMENTATION_OPTIONS = {
+ VERSION: '0.1.dev1+gdcc7916',
LANGUAGE: 'en',
COLLAPSE_INDEX: false,
BUILDER: 'html',
diff --git a/_static/searchtools.js b/_static/searchtools.js
index 97d56a7..7918c3f 100644
--- a/_static/searchtools.js
+++ b/_static/searchtools.js
@@ -57,12 +57,12 @@ const _removeChildren = (element) => {
const _escapeRegExp = (string) =>
string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
-const _displayItem = (item, searchTerms) => {
+const _displayItem = (item, searchTerms, highlightTerms) => {
const docBuilder = DOCUMENTATION_OPTIONS.BUILDER;
- const docUrlRoot = DOCUMENTATION_OPTIONS.URL_ROOT;
const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX;
const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX;
const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY;
+ const contentRoot = document.documentElement.dataset.content_root;
const [docName, title, anchor, descr, score, _filename] = item;
@@ -75,20 +75,24 @@ const _displayItem = (item, searchTerms) => {
if (dirname.match(/\/index\/$/))
dirname = dirname.substring(0, dirname.length - 6);
else if (dirname === "index/") dirname = "";
- requestUrl = docUrlRoot + dirname;
+ requestUrl = contentRoot + dirname;
linkUrl = requestUrl;
} else {
// normal html builders
- requestUrl = docUrlRoot + docName + docFileSuffix;
+ requestUrl = contentRoot + docName + docFileSuffix;
linkUrl = docName + docLinkSuffix;
}
let linkEl = listItem.appendChild(document.createElement("a"));
linkEl.href = linkUrl + anchor;
linkEl.dataset.score = score;
linkEl.innerHTML = title;
- if (descr)
+ if (descr) {
listItem.appendChild(document.createElement("span")).innerHTML =
" (" + descr + ")";
+ // highlight search terms in the description
+ if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
+ highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
+ }
else if (showSearchSummary)
fetch(requestUrl)
.then((responseData) => responseData.text())
@@ -97,6 +101,9 @@ const _displayItem = (item, searchTerms) => {
listItem.appendChild(
Search.makeSearchSummary(data, searchTerms)
);
+ // highlight search terms in the summary
+ if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
+ highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
});
Search.output.appendChild(listItem);
};
@@ -115,14 +122,15 @@ const _finishSearch = (resultCount) => {
const _displayNextItem = (
results,
resultCount,
- searchTerms
+ searchTerms,
+ highlightTerms,
) => {
// results left, load the summary and display it
// this is intended to be dynamic (don't sub resultsCount)
if (results.length) {
- _displayItem(results.pop(), searchTerms);
+ _displayItem(results.pop(), searchTerms, highlightTerms);
setTimeout(
- () => _displayNextItem(results, resultCount, searchTerms),
+ () => _displayNextItem(results, resultCount, searchTerms, highlightTerms),
5
);
}
@@ -360,7 +368,7 @@ const Search = {
// console.info("search results:", Search.lastresults);
// print the results
- _displayNextItem(results, results.length, searchTerms);
+ _displayNextItem(results, results.length, searchTerms, highlightTerms);
},
/**
diff --git a/_static/sphinx_highlight.js b/_static/sphinx_highlight.js
index aae669d..8a96c69 100644
--- a/_static/sphinx_highlight.js
+++ b/_static/sphinx_highlight.js
@@ -29,14 +29,19 @@ const _highlight = (node, addItems, text, className) => {
}
span.appendChild(document.createTextNode(val.substr(pos, text.length)));
+ const rest = document.createTextNode(val.substr(pos + text.length));
parent.insertBefore(
span,
parent.insertBefore(
- document.createTextNode(val.substr(pos + text.length)),
+ rest,
node.nextSibling
)
);
node.nodeValue = val.substr(0, pos);
+ /* There may be more occurrences of search term in this node. So call this
+ * function recursively on the remaining fragment.
+ */
+ _highlight(rest, addItems, text, className);
if (isInSVG) {
const rect = document.createElementNS(
@@ -140,5 +145,10 @@ const SphinxHighlight = {
},
};
-_ready(SphinxHighlight.highlightSearchWords);
-_ready(SphinxHighlight.initEscapeListener);
+_ready(() => {
+ /* Do not call highlightSearchWords() when we are on the search page.
+ * It will highlight words from the *previous* search query.
+ */
+ if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords();
+ SphinxHighlight.initEscapeListener();
+});
diff --git a/examples/cumulative_hydrograph_examples.html b/examples/cumulative_hydrograph_examples.html
index de3231e..98786f9 100644
--- a/examples/cumulative_hydrograph_examples.html
+++ b/examples/cumulative_hydrograph_examples.html
@@ -2,23 +2,23 @@
-
+
- Cumulative Streamflow Hydrographs — hyswap 0.1.dev1+gc646d0d documentation
+ Cumulative Streamflow Hydrographs — hyswap 0.1.dev1+gdcc7916 documentation
-
+
-
+
-
+
-
+
+
+
These examples showcase the usage of the functions in the similarity module, with heatmap visualizations via the hyswap.plots.plot_similarity_heatmap function.
+Sometimes it is helpful to compare the relationships between a set of stations and their respective measurements.
+The similarity functions packaged in hyswap handle some of the data clean-up for you by ensuring the time-series of observations being compared cover the same time period, and by removing any missing data.
+This ensures that your results are not skewed by missing data or gaps in one of the time-series.
The following example shows the correlations between streamflow at 5 stations (07374525, 07374000, 07289000, 07032000, 07024175) along the Mississippi River, listed from downstream to upstream.
+First we have to fetch the streamflow data for these stations, to do this we will use the dataretrieval package to access the NWIS database.
+
# get the data from these 5 sites
+site_list=["07374525","07374000","07289000","07032000","07024175"]
+
+# fetch some streamflow data from NWIS as a list of dataframes
+df_list=[]
+forsiteinsite_list:
+ df,_=dataretrieval.nwis.get_dv(site,start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
If we’d like, we can display the specific values of the correlations by setting the show_values argument to True in the hyswap.plots.plot_similarity_heatmap function.
+
# get the data from these 5 sites
+site_list=["07374525","07374000","07289000","07032000","07024175"]
+
+# fetch some streamflow data from NWIS as a list of dataframes
+df_list=[]
+forsiteinsite_list:
+ df,_=dataretrieval.nwis.get_dv(site,start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
+# calculate correlations
+results,n_obs=hyswap.similarity.calculate_correlations(df_list,"00060_Mean")
+
+# make plot
+ax=hyswap.plots.plot_similarity_heatmap(
+ results,n_obs=n_obs,
+ title="Pearson Correlation Coefficients for Streamflow\n"+
+ "Between 5 Sites Along the Mississippi River",
+ show_values=True)
+
+# show the plot
+plt.tight_layout()
+plt.show()
+
In this example we compare the same 5 time-series as the previous example, but instead of calculating correlations, we calculate the Wasserstein Distance between each pairing of time-series.
+The Wasserstein Distance is a measure of the distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in hyswap, we utilize the scipy.stats.wasserstein_distance() function, and are therefore specifically calculating the “first” Wasserstein Distance between two time-series.
+
# get the data from these 5 sites
+site_list=["07374525","07374000","07289000","07032000","07024175"]
+
+# fetch some streamflow data from NWIS as a list of dataframes
+df_list=[]
+forsiteinsite_list:
+ df,_=dataretrieval.nwis.get_dv(site,start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
+# calculate Wasserstein Distances
+results,n_obs=hyswap.similarity.calculate_wasserstein_distance(df_list,"00060_Mean")
+
+# make plot
+ax=hyswap.plots.plot_similarity_heatmap(
+ results,n_obs=n_obs,
+ title="Wasserstein Distances for Streamflow\n"+
+ "Between 5 Sites Along the Mississippi River",
+ show_values=True)
+
+# show the plot
+plt.tight_layout()
+plt.show()
+
In this example we compare the same 5 time-series as the previous example, but this time using another distance measure, the so-called “Energy Distance” between two time-series.
+The Energy Distance is a statistical distance between two probability distributions, in this case the probability distributions of the streamflow values at each station.
+Specifically in hyswap, we utilize the scipy.stats.energy_distance() function.
+
# get the data from these 5 sites
+site_list=["07374525","07374000","07289000","07032000","07024175"]
+
+# fetch some streamflow data from NWIS as a list of dataframes
+df_list=[]
+forsiteinsite_list:
+ df,_=dataretrieval.nwis.get_dv(site,start="2012-01-01",
+ end="2022-12-31",
+ parameterCd='00060')
+ df_list.append(df)
+
+# calculate Energy Distances
+results,n_obs=hyswap.similarity.calculate_energy_distance(df_list,"00060_Mean")
+
+# make plot
+ax=hyswap.plots.plot_similarity_heatmap(
+ results,n_obs=n_obs,
+ title="Energy Distances for Streamflow\n"+
+ "Between 5 Sites Along the Mississippi River",
+ show_values=True)
+
+# show the plot
+plt.tight_layout()
+plt.show()
+
These examples show how a streamflow hydrograph can be constructed by fetching historical streamflow data from NWIS using dataretrieval, and then calculating daily percentiles of streamflow for each day of the year.
The resulting hydrographs show the streamflow values for all of 2022 plotted on top of the historical percentiles which are shown as shaded regions.
Fetching Percentiles from the NWIS Statistics Service¶
You don’t have to compute the percentiles using hyswap.
If you’d rather use the NWIS web service daily percentiles, you can use those values instead.
We provide a convenience utility function to help make this possible, hyswap.utils.munge_nwis_stats.
@@ -143,7 +143,7 @@
Fetching Percentiles from the NWIS Statistics Service
The examples above show how to plot the percentiles by day of year using the calendar year.
In this example, we will plot the percentiles by day of water year, as water years are commonly used by hydrologists.
The only change this requires from above is specifying the type of year we are planning to use when calculating the daily percentile thresholds.
The examples above show how to plot the percentiles by day of year using the calendar year.
In this example, we will plot the percentiles by day of climate year.
The only change this requires from above is specifying the type of year we are planning to use when calculating the daily percentile thresholds.
In this example we will calculate and plot a unique set of daily percentile thresholds.
We will also specify the colors to be used for the percentile envelopes.
# fetch historic data from NWIS
@@ -261,7 +261,7 @@
Rolling Averages for Historic Daily Percentile Calculations¶
In this example, rather than calculating historic daily percentile values based solely on the past values from that day of the year, we will calculate the historic daily percentile values based on rolling averages of the past values around that day.
Under the hood this uses the pandas.DataFrame.rolling() method to calculate the rolling average, with the default parameters.
To show the effect of this, we will plot the historic daily percentile values for the daily (default) rolling average, 7-day rolling average, and the 28-day rolling average.
@@ -331,7 +331,7 @@
Rolling Averages for Historic Daily Percentile Calculations
-
In this example we will customize the fill areas between the percentile thresholds by passing keyword arguments to the hyswap.plots.plot_duration_hydrograph function that are then passed through to the matplotlib.axes.Axes.fill_between() function.
Specifically we will set the alpha argument to 1.0 to make the fill areas opaque (the default value is 0.5 for some transparency).
# fetch historic data from NWIS
@@ -440,14 +440,14 @@
Welcome to the documentation for the Python hyswap package.
hyswap (HYdrologic Surface Water Analysis Package), is a Python package which provides a set of functions for manipulating and visualizing USGS water data.
Specifically, a number of functions for calculating statistics (e.g., exceedance probabilities, daily historic percentiles) and generating related plots (e.g., flow duration curves, streamflow duration hydrographs) are available.
@@ -56,7 +56,7 @@
See the Examples section for fairly standard examples of using the functions in hyswap to perform typical hydrologic calculations and visualizations.
Read the API Reference if you’d like to see the full set of functions that are available.
-