fix groupby.mean for only numeric values (#363)
* Fix groupby.mean to use only numeric values (the default behaviour changed in pandas 2.0)

* Bump pandas to 1.1

* Black

* Add `numeric_only=True` to groupby mean

* Fix any unit test

* Fix bool in get_dummies

---------

Co-authored-by: Raphael Vallat <raphaelvallat9@gmail.com>
jajcayn and raphaelvallat authored Jun 4, 2023
1 parent faad22a commit 7923141
Showing 6 changed files with 23 additions and 21 deletions.
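
A note on the underlying issue (a minimal sketch, not part of the diff): pandas 2.0 changed the effective default of `numeric_only` for grouped aggregations from True to False, so `DataFrameGroupBy.mean()` now raises a TypeError when the grouped frame still contains non-numeric columns, instead of silently dropping them. The toy example below uses made-up column names to illustrate; passing `numeric_only=True` restores the old behaviour and is presumably why the minimum pandas requirement is bumped to 1.1 in this commit.

import pandas as pd

# Illustrative data mixing a numeric dependent variable with a string column.
df = pd.DataFrame(
    {
        "subject": [1, 1, 2, 2],
        "condition": ["pre", "post", "pre", "post"],  # non-numeric
        "score": [1.5, 2.0, 3.0, 4.5],
    }
)

grp = df.groupby("subject")
# pandas < 2.0: grp.mean() silently dropped "condition" (FutureWarning in 1.5).
# pandas >= 2.0: grp.mean() raises TypeError because "condition" cannot be averaged.
means = grp.mean(numeric_only=True)  # works on both old and new pandas
print(means)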
12 changes: 7 additions & 5 deletions pingouin/pairwise.py
@@ -475,7 +475,9 @@ def pairwise_tests(
         # designs. Indeed, a similar groupby is applied by default on
         # each within-subject factor of a two-way repeated measures design.
         if all([agg[i], marginal]):
-            tmp = data.groupby([subject, f], as_index=False, observed=True, sort=True).mean()
+            tmp = data.groupby([subject, f], as_index=False, observed=True, sort=True).mean(
+                numeric_only=True
+            )
         else:
             tmp = data
         pt = pairwise_tests(
@@ -880,7 +882,7 @@ def pairwise_tukey(data=None, dv=None, between=None, effsize="hedges"):
     # See https://github.com/raphaelvallat/pingouin/issues/111
     labels = np.array(list(grp.groups.keys()))
     n = grp.count().to_numpy()
-    gmeans = grp.mean().to_numpy()
+    gmeans = grp.mean(numeric_only=True).to_numpy()
     gvar = aov.at[1, "MS"] / n

     # Pairwise combinations
@@ -1046,8 +1048,8 @@ def pairwise_gameshowell(data=None, dv=None, between=None, effsize="hedges"):
     # See https://github.com/raphaelvallat/pingouin/issues/111
     labels = np.array(list(grp.groups.keys()))
     n = grp.count().to_numpy()
-    gmeans = grp.mean().to_numpy()
-    gvars = grp.var().to_numpy()
+    gmeans = grp.mean(numeric_only=True).to_numpy()
+    gvars = grp.var().to_numpy()  # numeric_only=True added in pandas 1.5, set to False in 2.0

     # Pairwise combinations
     g1, g2 = np.array(list(combinations(np.arange(ng), 2))).T
@@ -1425,7 +1427,7 @@ def traverse(o, tree_types=(list, tuple)):
             [c in keys for c in covar]
         ), "Covariate(s) are either not in data or not numeric."
         # And we make sure that X or Y does not contain covar
-        stats = stats[~stats[["X", "Y"]].isin(covar).any(1)]
+        stats = stats[~stats[["X", "Y"]].isin(covar).any(axis=1)]
         stats = stats.reset_index(drop=True)
         if stats.shape[0] == 0:
             raise ValueError(
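
The last hunk in this file is a related pandas 2.0 fix: DataFrame.any no longer accepts the axis as a positional argument, so `.any(1)` must become `.any(axis=1)`. A small sketch of the filtering pattern from the hunk above, with made-up stand-ins for the `stats` frame and `covar` list:

import pandas as pd

# Stand-ins for the `stats` frame and `covar` list used in the hunk above.
stats = pd.DataFrame({"X": ["a", "b", "c"], "Y": ["b", "z", "d"]})
covar = ["b"]

# Drop rows where either X or Y is a covariate; axis must be passed by keyword.
mask = stats[["X", "Y"]].isin(covar).any(axis=1)
stats = stats[~mask].reset_index(drop=True)
print(stats)  # only the row ("c", "d") remains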
22 changes: 11 additions & 11 deletions pingouin/parametric.py
@@ -552,13 +552,13 @@ def rm_anova(
     grandmean = data[dv].mean()

     # Calculate sums of squares
-    ss_with = ((grp_with.mean() - grandmean) ** 2 * grp_with.count()).sum()
+    ss_with = ((grp_with.mean(numeric_only=True) - grandmean) ** 2 * grp_with.count()).sum()
     ss_resall = grp_with.apply(lambda x: (x - x.mean()) ** 2).sum()
     # sstotal = sstime + ss_resall = sstime + (sssubj + sserror)
     # ss_total = ((data[dv] - grandmean)**2).sum()
     # We can further divide the residuals into a within and between component:
     grp_subj = data.groupby(subject, observed=True)[dv]
-    ss_resbetw = n_rm * np.sum((grp_subj.mean() - grandmean) ** 2)
+    ss_resbetw = n_rm * np.sum((grp_subj.mean(numeric_only=True) - grandmean) ** 2)
     ss_reswith = ss_resall - ss_resbetw

     # Calculate degrees of freedom
@@ -702,12 +702,12 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="ng2"):
     # Groupby means
     # I think that observed=True is actually not needed here since we have already used
     # `observed=True` in pivot_table.
-    grp_s = data.groupby(subject, observed=True)[dv].mean()
-    grp_a = data.groupby([a], observed=True)[dv].mean()
-    grp_b = data.groupby([b], observed=True)[dv].mean()
-    grp_ab = data.groupby([a, b], observed=True)[dv].mean()
-    grp_as = data.groupby([a, subject], observed=True)[dv].mean()
-    grp_bs = data.groupby([b, subject], observed=True)[dv].mean()
+    grp_s = data.groupby(subject, observed=True)[dv].mean(numeric_only=True)
+    grp_a = data.groupby([a], observed=True)[dv].mean(numeric_only=True)
+    grp_b = data.groupby([b], observed=True)[dv].mean(numeric_only=True)
+    grp_ab = data.groupby([a, b], observed=True)[dv].mean(numeric_only=True)
+    grp_as = data.groupby([a, subject], observed=True)[dv].mean(numeric_only=True)
+    grp_bs = data.groupby([b, subject], observed=True)[dv].mean(numeric_only=True)

     # Sums of squares
     ss_tot = np.sum((data[dv] - mu) ** 2)
@@ -991,7 +991,7 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False, effsize="
     # Calculate sums of squares
     grp = data.groupby(between, observed=True, group_keys=False)[dv]
     # Between effect
-    ssbetween = ((grp.mean() - data[dv].mean()) ** 2 * grp.count()).sum()
+    ssbetween = ((grp.mean(numeric_only=True) - data[dv].mean()) ** 2 * grp.count()).sum()
     # Within effect (= error between)
     # = (grp.var(ddof=0) * grp.count()).sum()
     sserror = grp.transform(lambda x: (x - x.mean()) ** 2).sum()
@@ -1346,8 +1346,8 @@ def welch_anova(data=None, dv=None, between=None):

     # Sums of squares (regular and adjusted)
     ss_res = grp.apply(lambda x: (x - x.mean()) ** 2).sum()
-    ss_bet = ((grp.mean() - data[dv].mean()) ** 2 * grp.count()).sum()
-    ss_betadj = np.sum(weights * np.square(grp.mean() - adj_grandmean))
+    ss_bet = ((grp.mean(numeric_only=True) - data[dv].mean()) ** 2 * grp.count()).sum()
+    ss_betadj = np.sum(weights * np.square(grp.mean(numeric_only=True) - adj_grandmean))
     ms_betadj = ss_betadj / ddof1

     # Calculate lambda, F-value, p-value and np2
2 changes: 1 addition & 1 deletion pingouin/regression.py
@@ -727,7 +727,7 @@ def logistic_regression(
     first level of our categorical variable (species = Adelie) which will be
     used as the reference level:

-    >>> df = pd.get_dummies(df, columns=['species'], drop_first=True)
+    >>> df = pd.get_dummies(df, columns=['species'], dtype=float, drop_first=True)
     >>> X = df[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']]
     >>> y = df['male']
     >>> lom = pg.logistic_regression(X, y, remove_na=True)
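
The get_dummies edit addresses another pandas 2.0 change: dummy columns are now returned as bool rather than a numeric dtype, which can break code expecting a numeric design matrix. A minimal sketch with made-up data (not the penguins dataset from the docstring):

import pandas as pd

df = pd.DataFrame({"species": ["Adelie", "Chinstrap", "Gentoo", "Adelie"]})

# pandas >= 2.0 returns bool dummies by default; dtype=float keeps them numeric.
dummies = pd.get_dummies(df, columns=["species"], drop_first=True, dtype=float)
print(dummies.dtypes)  # species_Chinstrap and species_Gentoo are float64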
2 changes: 1 addition & 1 deletion pingouin/tests/test_regression.py
@@ -349,7 +349,7 @@ def test_logistic_regression(self):
        # R: >>> glm("male ~ body_mass_kg + species", family=binomial, ...)
        # >>> confint.default(model) # Wald CI
        # See https://stats.stackexchange.com/a/275421/253579
-       data_dum = pd.get_dummies(data, columns=["species"], drop_first=True)
+       data_dum = pd.get_dummies(data, columns=["species"], drop_first=True, dtype=float)
        X = data_dum[["body_mass_kg", "species_Chinstrap", "species_Gentoo"]]
        y = data_dum["male"]
        lom = logistic_regression(X, y, as_dataframe=False)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
 numpy>=1.19
 scipy>=1.7
-pandas>=1.0
+pandas>=1.1
 matplotlib>=3.0.2
 seaborn>=0.11
 statsmodels>=0.13
4 changes: 2 additions & 2 deletions setup.py
@@ -24,7 +24,7 @@ def read(fname):
 INSTALL_REQUIRES = [
     "numpy>=1.19",
     "scipy>=1.7",
-    "pandas>=1.0",
+    "pandas>=1.1",
     "matplotlib>=3.0.2",
     "seaborn>=0.11",
     "statsmodels>=0.13",
@@ -41,10 +41,10 @@ def read(fname):

 CLASSIFIERS = [
     "Intended Audience :: Science/Research",
-    "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
     "Topic :: Scientific/Engineering :: Mathematics",
     "Operating System :: POSIX",
     "Operating System :: Unix",
