Density estimation features #403

Draft · wants to merge 2 commits into base: master
108 changes: 107 additions & 1 deletion bet/calculateP/calculateR.py
@@ -16,6 +16,112 @@
import numpy as np
import logging

def generate_densities(discretization, which_samples, density_method, method_args=None):
"""
Generate density estimates on the sample set(s) of the discretization indicated by `which_samples`.

:param discretization: Discretization used to calculate density estimates
:type discretization: :class:`bet.sample.discretization`
:param which_samples: indicates the sample set(s) on which to compute the density estimate. Options are
'predict' (the predicted sample set), 'observe' (the observed sample set), and 'output' (both)
:type which_samples: str
:param density_method: name of the method used to implement density estimation. Options are
'kde' (`scipy.stats.gaussian_kde` implementation),
'clustered_kde' (computes gkdes on clusters defined by cluster_maps in the discretization),
'gmm' (fits a Gaussian Mixture model using `sklearn.mixture.GaussianMixture`)
:type density_method: str
:param method_args: dictionary of arguments to be passed to the density estimation method
(e.g., bandwidth parameters). See the specific density estimation implementation for details.
:type method_args: dict

:modifies: discretization object. Specifically, this function saves density estimates to the
appropriate `sample_set_base` objects associated with the discretization object.

:returns: the prob_type keyword used and the list of density estimate objects
:rtype: str, list (element type depends on the chosen density estimation method)
"""
# allows modification of discretization
discretization.local_to_global()

# determine which sample sets to compute the density estimation
if which_samples == "predict":
this_sample_list = [discretization.get_output_sample_set()]
elif which_samples == "observe":
this_sample_list = [discretization.get_output_observed_set()]
elif which_samples == "output":
this_sample_list = [discretization.get_output_sample_set(), discretization.get_output_observed_set()]
else:
raise ValueError("Must specify the specific sample set to compute the density estimation.")

# evaluate the density estimation method on the samples
if method_args is None:
method_args = {}
output_densities = []
output_density_type = None  # must be a prob_type keyword recognized by `sample.evaluate_pdf`

if density_method == "kde":
from scipy.stats import gaussian_kde
for this_sample_set in this_sample_list:
output_densities.append(gaussian_kde(this_sample_set.get_values().T, **method_args))
output_density_type = "kde"


elif density_method == "clustered_kde":
for this_sample_set in this_sample_list:
output_densities.append(generate_clustered_kdes(this_sample_set, **method_args))
output_density_type = "clustered_kde"

# save output densities to sample set
for this_sample_set, dens in zip(this_sample_list,output_densities):
this_sample_set.set_prob_type(output_density_type)
this_sample_set.set_prob_parameters(dens)


return output_density_type, output_densities
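
A minimal usage sketch for the function above (hypothetical setup: `disc` is a populated `bet.sample.discretization`; the `method_args` values are purely illustrative):

    # estimate clustered KDEs on both the predicted and observed output sets
    prob_type, densities = generate_densities(
        disc, which_samples="output", density_method="clustered_kde",
        method_args={"kde_args": {"bw_method": "scott"}})
    # each entry of `densities` is the (kde list, cluster weight list) pair that was
    # also stored on the corresponding sample set via set_prob_type/set_prob_parameters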

def generate_clustered_kdes(sample_set_base, kde_args=None):
"""
Generates a clustered kernel density estimate (similar to `generate_output_kdes`).
Note that initial weights must now be passed via the `weights` entry of the dictionary `kde_args`.

:param sample_set_base: Sample set object used to calculate density estimates
:type sample_set_base: :class:`bet.sample.sample_set_base`
:param kde_args: Dictionary of arguments passed to `scipy.stats.gaussian_kde`. Note that
this includes the bandwidth parameter (`bw_method`) as well as sample weights for the kde estimate.
:type kde_args: dict

:returns: list of `gaussian_kde` and corresponding list of cluster weights
:rtype: list, list
"""
from scipy.stats import gaussian_kde

# fall back to scipy's defaults if no kde arguments were given
if kde_args is None:
kde_args = {}

if sample_set_base.get_region() is None:
sample_set_base.set_region(np.array([0] * sample_set_base.check_num()))

if sample_set_base.get_cluster_maps() is None:
num_clusters = int(np.max(sample_set_base.get_region()) + 1)
else:
num_clusters = len(sample_set_base.get_cluster_maps())

clustered_kdes = []
cluster_weights = []
num_obs = sample_set_base.check_num()
for i in range(num_clusters):
if sample_set_base.get_cluster_maps() is not None:
if sample_set_base.get_cluster_maps()[i].shape[0] > 1:  # a KDE needs at least two samples in the cluster
clustered_kdes.append(gaussian_kde(sample_set_base.get_cluster_maps()[i].T, **kde_args))
cluster_weights.append(len(np.where(sample_set_base.get_region() == i)[0]) / num_obs)
else:
clustered_kdes.append(None)
cluster_weights.append(None)
else:
values_pointer = np.where(sample_set_base.get_region() == i)[0]
if len(values_pointer) > 1:
clustered_kdes.append(gaussian_kde(sample_set_base.get_values()[values_pointer].T, **kde_args))
cluster_weights.append(len(values_pointer)/num_obs)
else:
clustered_kdes.append(None)
cluster_weights.append(None)

return clustered_kdes, cluster_weights
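
The pair returned above defines a mixture density: the estimate at a point is the cluster-weighted sum of the per-cluster KDEs. A sketch of evaluating it by hand (assuming `sample_set` is a populated `sample_set_base` and `vals` is an `(n, dim)` array):

    kdes, weights = generate_clustered_kdes(sample_set, kde_args={})
    density = np.zeros(vals.shape[0])
    for kde, w in zip(kdes, weights):
        if kde is not None:  # clusters with too few samples are skipped
            density += w * kde(vals.T)  # scipy's gaussian_kde expects dim x n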

def generate_output_kdes(discretization, bw_method=None):
"""
@@ -161,7 +267,7 @@ def invert_to_kde(discretization, bw_method=None):
param_marginals[i].append(gaussian_kde(params[lam_ptr[j], i], weights=r[j], bw_method=bw_method))
else:
param_marginals[i].append(None)
- discretization.get_input_sample_set().set_prob_type("kde")
+ discretization.get_input_sample_set().set_prob_type("kde_marginals")
discretization.get_input_sample_set().set_prob_parameters((param_marginals, cluster_weights))
print('Diagnostic for clusters [sample average of ratios in each cluster]: ', rs)
return param_marginals, cluster_weights
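
Under the renamed "kde_marginals" keyword, the stored `(param_marginals, cluster_weights)` pair is consumed by `evaluate_pdf`/`evaluate_pdf_marginal`: each 1-D marginal is the cluster-weighted sum of the per-cluster KDEs for that dimension, and the joint density is the product of the marginals. A sketch of the marginal for dimension `i` at points `x` (names as above):

    mar_i = np.zeros(x.shape[0])
    for j, w in enumerate(cluster_weights):
        if param_marginals[i][j] is not None:  # cluster may have been skipped
            mar_i += w * param_marginals[i][j](x)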
37 changes: 30 additions & 7 deletions bet/sample.py
@@ -67,8 +67,9 @@ def evaluate_pdf(prob_type, prob_parameters, vals):
Evaluate the probability density function defined by `prob_type` and `prob_parameters`
at points defined by `vals`.

- :param prob_type: Type of probability description. Options are 'kde' (weighted kernel
- density estimate), 'rv' (random variable), 'gmm' (Gaussian mixture model), and 'voronoi'.
+ :param prob_type: Type of probability description. Options are 'kde_marginals' (weighted kernel
+ density estimate), 'kde' (multidimensional kde), 'clustered_kde' (clustered kde), 'rv' (random
+ variable), 'gmm' (Gaussian mixture model), and 'voronoi'.
:type prob_type: str
:param prob_parameters: Parameters that define the probability measure of type `prob_type`
:param vals: Values at which to evaluate the PDF.
@@ -77,14 +78,36 @@
:rtype: `numpy.ndarray`
"""
dim = vals.shape[1]
if prob_type == "kde":
if prob_type == "kde_marginals":
mar = np.ones((vals.shape[0], ))
for i in range(dim):
mar *= evaluate_pdf_marginal(prob_type, prob_parameters, vals, i)
return mar
elif prob_type == "kde":
mar = np.ones((vals.shape[0],))
if isinstance(prob_parameters, scipy.stats.gaussian_kde):
if prob_parameters.d == dim:
# vals are neval x dim, gkde from scipy takes dim x neval
mar = prob_parameters.pdf(vals.T)
else:
raise wrong_input('Dimension of kde and vals do not match.')
else:
raise wrong_input('This kde must be a scipy.stats gaussian_kde object ' + \
'(note that the previous behavior of "kde" has been renamed "kde_marginals").')
return mar
elif prob_type == "clustered_kde":
cluster_kdes, cluster_weights = prob_parameters
num_clusters = len(cluster_weights)
mar = np.zeros((vals.shape[0],))
for j in range(num_clusters):
if cluster_kdes[j].d != dim:
raise wrong_input('Dimension of clustered kde {} does not match vals.'.format(j))
else:
mar += cluster_kdes[j](vals.T) * cluster_weights[j]
return mar
elif prob_type == "rv":
mar = np.ones((vals.shape[0],))
- for i in range(dim):
+ for i in range(dim):  # this assumes independence of each dimension
mar *= evaluate_pdf_marginal(prob_type, prob_parameters, vals, i)
return mar
elif prob_type == "gmm":
@@ -107,7 +130,7 @@ def evaluate_pdf_marginal(prob_type, prob_parameters, vals, i):
Evaluate the marginal probability density function of index `i` defined by `prob_type`
and `prob_parameters` at points defined by `vals`.

- :param prob_type: Type of probability description. Options are 'kde' (weighted kernel
+ :param prob_type: Type of probability description. Options are 'kde_marginals' (weighted kernel
density estimate), 'rv' (random variable), 'gmm' (Gaussian mixture model), and 'voronoi'.
:type prob_type: str
:param prob_parameters: Parameters that define the probability measure of type `prob_type`
@@ -126,7 +149,7 @@ def evaluate_pdf_marginal(prob_type, prob_parameters, vals, i):
elif len(vals.shape) == 1:
x = vals

- if prob_type == "kde":
+ if prob_type == "kde_marginals":
param_marginals, cluster_weights = prob_parameters
num_clusters = len(cluster_weights)
mar = np.zeros(x.shape[0])
@@ -530,7 +553,7 @@ def get_prob_parameters_init(self):
def set_prob_type(self, prob_type):
"""
Set the type of updated probability measure.
- :param prob_type: Type of updated probability measure ('kde', 'gmm', 'voronoi', 'rv')
+ :param prob_type: Type of updated probability measure ('kde_marginals', 'kde', 'clustered_kde', 'gmm', 'voronoi', 'rv')
:type prob_type: str
"""
self._prob_type = prob_type
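
To see how the keywords fit together after this change, a sketch of evaluating a stored density on fresh points (hypothetical names: `disc` is a discretization processed by `generate_densities` above, `pts` is an `(n, dim)` array; the getters are assumed to mirror the setters used in this diff):

    from bet.sample import evaluate_pdf

    out_set = disc.get_output_sample_set()
    dens = evaluate_pdf(out_set.get_prob_type(), out_set.get_prob_parameters(), pts)
    # "kde"           -> a single scipy gaussian_kde evaluated at pts.T
    # "clustered_kde" -> cluster-weighted sum of per-cluster KDEs
    # "kde_marginals" -> product over dimensions of cluster-weighted marginal KDEs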