From 6adf018e5322b54fadfafd24a11452e3752c461b Mon Sep 17 00:00:00 2001 From: yentyu Date: Thu, 9 Sep 2021 13:01:54 -0600 Subject: [PATCH 1/2] modified evaluate_pdf for scipy gkde --- bet/sample.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/bet/sample.py b/bet/sample.py index b4543b81..8ad6f40e 100644 --- a/bet/sample.py +++ b/bet/sample.py @@ -67,8 +67,9 @@ def evaluate_pdf(prob_type, prob_parameters, vals): Evaluate the probability density function defined by `prob_type` and `prob_parameters` at points defined by `vals`. - :param prob_type: Type of probability description. Options are 'kde' (weighted kernel - density estimate), 'rv' (random variable), 'gmm' (Gaussian mixture model), and 'voronoi'. + :param prob_type: Type of probability description. Options are 'kde_marginals' (weighted kernel + density estimate), 'kde' (multidimensional kde), 'rv' (random variable), + 'gmm' (Gaussian mixture model), and 'voronoi'. :type prob_type: str :param prob_parameters: Parameters that define the probability measure of type `prob_type` :param vals: Values at which to evaluate the PDF. @@ -77,11 +78,23 @@ def evaluate_pdf(prob_type, prob_parameters, vals): :rtype `numpy.ndarray` """ dim = vals.shape[1] - if prob_type == "kde": + if prob_type == "kde_marginals": mar = np.ones((vals.shape[0], )) for i in range(dim): mar *= evaluate_pdf_marginal(prob_type, prob_parameters, vals, i) return mar + elif prob_type == "kde": + mar = np.ones((vals.shape[0],)) + if type(prob_parameters) == "scipy.stats.kde.gaussian_kde": + if prob_parameters.d == dim: + # vals are neval x dim, gkde from scipy takes dim x neval + mar = prob_parameters.pdf(vals.T) + else: + raise wrong_input('Dimension of kde and vals do not match.') + else: + raise wrong_input('This kde must be a scipy.stats gkde object' + \ + '(note that previous version of "kde" has been renamed "kde_marginals")') + return mar elif prob_type == "rv": mar = np.ones((vals.shape[0],)) for i in range(dim): @@ -126,7 +139,7 @@ def evaluate_pdf_marginal(prob_type, prob_parameters, vals, i): elif len(vals.shape) == 1: x = vals - if prob_type == "kde": + if prob_type == "kde_marginals": param_marginals, cluster_weights = prob_parameters num_clusters = len(cluster_weights) mar = np.zeros(x.shape[0]) @@ -530,7 +543,7 @@ def get_prob_parameters_init(self): def set_prob_type(self, prob_type): """ Set the type of updated probability measure. - :param prob_type: Type of updated probability measure ('kde', 'gmm', 'voronoi', 'rv') + :param prob_type: Type of updated probability measure ('kde_marginals', 'kde', 'gmm', 'voronoi', 'rv') :type prob_type: str """ self._prob_type = prob_type From e79faf08db95c99ff72645435ab57cb91f1f1aba Mon Sep 17 00:00:00 2001 From: yentyu Date: Sat, 11 Sep 2021 08:31:47 -0600 Subject: [PATCH 2/2] completed mods to evaluate_pdf and introduced generate densities to calculateR --- bet/calculateP/calculateR.py | 108 ++++++++++++++++++++++++++++++++++- bet/sample.py | 16 +++++- 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/bet/calculateP/calculateR.py b/bet/calculateP/calculateR.py index 05bf6cbe..6dd7340e 100644 --- a/bet/calculateP/calculateR.py +++ b/bet/calculateP/calculateR.py @@ -16,6 +16,112 @@ import numpy as np import logging +def generate_densities(discretization, which_samples, density_method, method_args=None): + """ + Generate density estimates on sample set(s) in the discretization indicated by "which_sample". + + :param discretization: Discretization used to calculate density estimates + :type discretization: :class:`bet.sample.discretization` + :param which_samples: indicates which samples to compute the density estimate. Options are + 'predict' (for predicted sample set), 'observe' (for observed sample set), and 'output' (for both) + :type which_samples: str + :param density_method: name of the method used to implement density estimation. Options are + 'kde' (`scipy.stats.gaussian_kde` implementation), + 'clustered_kde' (computes gkdes on clusters defined by cluster_maps in the discretization), + 'gmm' (fits a Gaussian Mixture model using `sklearn.mixture.GaussianMixture`) + :type density_method: str + :param method_args: dictionary of arguments to be passed to the density estimation method + (e.g., bandwidth params, etc.). See specific density estimation implementation for details. + :type method_args: dict + + :modifies: discretization object. Specifically, this function saves density estimates to the + appropriate `sample_set_base` objects associated with the discretization object. + + :returns: list of density estimate objects from density estimation method. + :rtype: list of :class: depends on the density estimation method chosen. + """ + # allows modification of discretization + discretization.local_to_global() + + # determine which sample sets to compute the density estimation + if which_samples == "predict": + this_sample_list = [discretization.get_output_sample_set()] + elif which_samples == "observe": + this_sample_list = [discretization.get_output_observed_set()] + elif which_samples == "output": + this_sample_list = [discretization.get_output_sample_set(), discretization.get_output_observed_set()] + else: + raise ValueError("Must specify the specific sample set to compute the density estimation.") + + # evaluates density estimation method on the samples + output_densities = [] + output_density_type = None # this must be correct keyword for `sample.evaluate_pdf` + + if density_method == "kde": + from scipy.stats import gaussian_kde + for this_sample_set in this_sample_list: + output_densities.append(gaussian_kde(this_sample_set.get_values().T, **method_args)) + output_density_type = "kde" + + + elif density_method == "clustered_kde": + for this_sample_set in this_sample_list: + output_densities.append(generate_clustered_kdes(this_sample_set, **method_args)) + output_density_type = "clustered_kde" + + # save output densities to sample set + for this_sample_set, dens in zip(this_sample_list,output_densities): + this_sample_set.set_prob_type(output_density_type) + this_sample_set.set_prob_parameters(dens) + + + return output_density_type, output_densities + +def generate_clustered_kdes(sample_set_base, kde_args=None): + """ + Generates clustered kernel density estimate (similar to function `generate_output_kde`). + *Note that initial weights must now be passed as weights through the dictionary kde_args. + + :param sample_set_base: Sample set object used to calculate density estimates + :type sample_set_base: :class:`bet.sample.sample_set_base` + :param kde_args: Dictionary of arguments passed to `scipy.stats.kde.gaussian_kde`. Note that + this includes bandwidth parameters as well as weights for the kde estimate. + :type kde_args: dict + + :returns: list of `gaussian_kde` and corresponding list of cluster weights + :rtype: list, list + """ + from scipy.stats import gaussian_kde + + if sample_set_base.get_region() is None: + sample_set_base.set_region(np.array([0] * sample_set_base.check_num())) + + if sample_set_base.get_cluster_maps() is None: + num_clusters = int(np.max(sample_set_base.get_region()) + 1) + else: + num_clusters = len(sample_set_base.get_cluster_maps()) + + clustered_kdes = [] + cluster_weights = [] + num_obs = sample_set_base.check_num() + for i in range(num_clusters): + if sample_set_base.get_cluster_maps() is not None: + if len(sample_set_base.get_cluster_maps()) > 1: + clustered_kdes.append(gaussian_kde(sample_set_base.get_cluster_maps()[i].T, **kde_args)) + cluster_weights.append(len(np.where(sample_set_base.get_region() == i)[0]) / num_obs) + else: + clustered_kdes.append(None) + cluster_weights.append(None) + else: + values_pointer = np.where(sample_set_base.get_region() == i)[0] + if len(values_pointer) > 1: + clustered_kdes.append(gaussian_kde(sample_set_base.get_values()[values_pointer].T, **kde_args)) + cluster_weights.append(len(values_pointer)/num_obs) + else: + clustered_kdes.append(None) + cluster_weights.append(None) + + return clustered_kdes, cluster_weights def generate_output_kdes(discretization, bw_method=None): """ @@ -161,7 +267,7 @@ def invert_to_kde(discretization, bw_method=None): param_marginals[i].append(gaussian_kde(params[lam_ptr[j], i], weights=r[j], bw_method=bw_method)) else: param_marginals[i].append(None) - discretization.get_input_sample_set().set_prob_type("kde") + discretization.get_input_sample_set().set_prob_type("kde_marginals") discretization.get_input_sample_set().set_prob_parameters((param_marginals, cluster_weights)) print('Diagnostic for clusters [sample average of ratios in each cluster]: ', rs) return param_marginals, cluster_weights diff --git a/bet/sample.py b/bet/sample.py index 8ad6f40e..132fb187 100644 --- a/bet/sample.py +++ b/bet/sample.py @@ -85,7 +85,7 @@ def evaluate_pdf(prob_type, prob_parameters, vals): return mar elif prob_type == "kde": mar = np.ones((vals.shape[0],)) - if type(prob_parameters) == "scipy.stats.kde.gaussian_kde": + if isinstance(prob_parameters,scipy.stats.kde.gaussian_kde): if prob_parameters.d == dim: # vals are neval x dim, gkde from scipy takes dim x neval mar = prob_parameters.pdf(vals.T) @@ -95,9 +95,19 @@ def evaluate_pdf(prob_type, prob_parameters, vals): raise wrong_input('This kde must be a scipy.stats gkde object' + \ '(note that previous version of "kde" has been renamed "kde_marginals")') return mar + elif prob_type == "clustered_kde": + cluster_kdes, cluster_weights = prob_parameters + num_clusters = len(cluster_weights) + mar = np.zeros((vals.shape[0],)) + for j in range(num_clusters): + if cluster_kdes[j].d != dim: + raise wrong_input('Dimension of clustered kde {} does not match vals.'.format(j)) + else: + mar += cluster_kdes[j](vals.T) * cluster_weights[j] + return mar elif prob_type == "rv": mar = np.ones((vals.shape[0],)) - for i in range(dim): + for i in range(dim): #this assumes independence of each dimension mar *= evaluate_pdf_marginal(prob_type, prob_parameters, vals, i) return mar elif prob_type == "gmm": @@ -120,7 +130,7 @@ def evaluate_pdf_marginal(prob_type, prob_parameters, vals, i): Evaluate the marginal probability density function of index `i` defined by `prob_type` and `prob_parameters` at points defined by `vals`. - :param prob_type: Type of probability description. Options are 'kde' (weighted kernel + :param prob_type: Type of probability description. Options are 'kde_marginals' (weighted kernel density estimate), 'rv' (random variable), 'gmm' (Gaussian mixture model), and 'voronoi'. :type prob_type: str :param prob_parameters: Parameters that define the probability measure of type `prob_type`