fix groupby.mean for only numeric values (#363)
* Fix groupby.mean to use only numeric values (the default behaviour changed in pandas 2.0)

* Bump pandas to 1.1

* Black

* Add `numeric_only=True` to groupby mean

* Fix any unit test

* Fix bool in get_dummies

---------

Co-authored-by: Raphael Vallat <raphaelvallat9@gmail.com>
jajcayn and raphaelvallat authored Jun 4, 2023
1 parent faad22a commit 7923141
Showing 6 changed files with 23 additions and 21 deletions.
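
A note on the underlying issue (a minimal sketch, not part of the diff): pandas 2.0 changed the effective default of `numeric_only` for grouped aggregations from True to False, so `DataFrameGroupBy.mean()` now raises a TypeError when the grouped frame still contains non-numeric columns, instead of silently dropping them. The toy example below uses made-up column names to illustrate; passing `numeric_only=True` restores the old behaviour and is presumably why the minimum pandas requirement is bumped to 1.1 in this commit.

import pandas as pd

# Illustrative data mixing a numeric dependent variable with a string column.
df = pd.DataFrame(
    {
        "subject": [1, 1, 2, 2],
        "condition": ["pre", "post", "pre", "post"],  # non-numeric
        "score": [1.5, 2.0, 3.0, 4.5],
    }
)

grp = df.groupby("subject")
# pandas < 2.0: grp.mean() silently dropped "condition" (FutureWarning in 1.5).
# pandas >= 2.0: grp.mean() raises TypeError because "condition" cannot be averaged.
means = grp.mean(numeric_only=True)  # works on both old and new pandas
print(means)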
12 changes: 7 additions & 5 deletions pingouin/pairwise.py
@@ -475,7 +475,9 @@ def pairwise_tests(
         # designs. Indeed, a similar groupby is applied by default on
         # each within-subject factor of a two-way repeated measures design.
         if all([agg[i], marginal]):
-            tmp = data.groupby([subject, f], as_index=False, observed=True, sort=True).mean()
+            tmp = data.groupby([subject, f], as_index=False, observed=True, sort=True).mean(
+                numeric_only=True
+            )
         else:
             tmp = data
         pt = pairwise_tests(
@@ -880,7 +882,7 @@ def pairwise_tukey(data=None, dv=None, between=None, effsize="hedges"):
     # See https://github.com/raphaelvallat/pingouin/issues/111
     labels = np.array(list(grp.groups.keys()))
     n = grp.count().to_numpy()
-    gmeans = grp.mean().to_numpy()
+    gmeans = grp.mean(numeric_only=True).to_numpy()
     gvar = aov.at[1, "MS"] / n

     # Pairwise combinations
@@ -1046,8 +1048,8 @@ def pairwise_gameshowell(data=None, dv=None, between=None, effsize="hedges"):
     # See https://github.com/raphaelvallat/pingouin/issues/111
     labels = np.array(list(grp.groups.keys()))
     n = grp.count().to_numpy()
-    gmeans = grp.mean().to_numpy()
-    gvars = grp.var().to_numpy()
+    gmeans = grp.mean(numeric_only=True).to_numpy()
+    gvars = grp.var().to_numpy()  # numeric_only=True added in pandas 1.5, set to False in 2.0

     # Pairwise combinations
     g1, g2 = np.array(list(combinations(np.arange(ng), 2))).T
@@ -1425,7 +1427,7 @@ def traverse(o, tree_types=(list, tuple)):
             [c in keys for c in covar]
         ), "Covariate(s) are either not in data or not numeric."
         # And we make sure that X or Y does not contain covar
-        stats = stats[~stats[["X", "Y"]].isin(covar).any(1)]
+        stats = stats[~stats[["X", "Y"]].isin(covar).any(axis=1)]
         stats = stats.reset_index(drop=True)
         if stats.shape[0] == 0:
             raise ValueError(
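
The last hunk in this file is a related pandas 2.0 fix: DataFrame.any no longer accepts the axis as a positional argument, so `.any(1)` must become `.any(axis=1)`. A small sketch of the filtering pattern from the hunk above, with made-up stand-ins for the `stats` frame and `covar` list:

import pandas as pd

# Stand-ins for the `stats` frame and `covar` list used in the hunk above.
stats = pd.DataFrame({"X": ["a", "b", "c"], "Y": ["b", "z", "d"]})
covar = ["b"]

# Drop rows where either X or Y is a covariate; axis must be passed by keyword.
mask = stats[["X", "Y"]].isin(covar).any(axis=1)
stats = stats[~mask].reset_index(drop=True)
print(stats)  # only the row ("c", "d") remains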
22 changes: 11 additions & 11 deletions pingouin/parametric.py
@@ -552,13 +552,13 @@ def rm_anova(
     grandmean = data[dv].mean()

     # Calculate sums of squares
-    ss_with = ((grp_with.mean() - grandmean) ** 2 * grp_with.count()).sum()
+    ss_with = ((grp_with.mean(numeric_only=True) - grandmean) ** 2 * grp_with.count()).sum()
     ss_resall = grp_with.apply(lambda x: (x - x.mean()) ** 2).sum()
     # sstotal = sstime + ss_resall = sstime + (sssubj + sserror)
     # ss_total = ((data[dv] - grandmean)**2).sum()
     # We can further divide the residuals into a within and between component:
     grp_subj = data.groupby(subject, observed=True)[dv]
-    ss_resbetw = n_rm * np.sum((grp_subj.mean() - grandmean) ** 2)
+    ss_resbetw = n_rm * np.sum((grp_subj.mean(numeric_only=True) - grandmean) ** 2)
     ss_reswith = ss_resall - ss_resbetw

     # Calculate degrees of freedom
@@ -702,12 +702,12 @@ def rm_anova2(data=None, dv=None, within=None, subject=None, effsize="ng2"):
     # Groupby means
     # I think that observed=True is actually not needed here since we have already used
     # `observed=True` in pivot_table.
-    grp_s = data.groupby(subject, observed=True)[dv].mean()
-    grp_a = data.groupby([a], observed=True)[dv].mean()
-    grp_b = data.groupby([b], observed=True)[dv].mean()
-    grp_ab = data.groupby([a, b], observed=True)[dv].mean()
-    grp_as = data.groupby([a, subject], observed=True)[dv].mean()
-    grp_bs = data.groupby([b, subject], observed=True)[dv].mean()
+    grp_s = data.groupby(subject, observed=True)[dv].mean(numeric_only=True)
+    grp_a = data.groupby([a], observed=True)[dv].mean(numeric_only=True)
+    grp_b = data.groupby([b], observed=True)[dv].mean(numeric_only=True)
+    grp_ab = data.groupby([a, b], observed=True)[dv].mean(numeric_only=True)
+    grp_as = data.groupby([a, subject], observed=True)[dv].mean(numeric_only=True)
+    grp_bs = data.groupby([b, subject], observed=True)[dv].mean(numeric_only=True)

     # Sums of squares
     ss_tot = np.sum((data[dv] - mu) ** 2)
@@ -991,7 +991,7 @@ def anova(data=None, dv=None, between=None, ss_type=2, detailed=False, effsize="
     # Calculate sums of squares
     grp = data.groupby(between, observed=True, group_keys=False)[dv]
     # Between effect
-    ssbetween = ((grp.mean() - data[dv].mean()) ** 2 * grp.count()).sum()
+    ssbetween = ((grp.mean(numeric_only=True) - data[dv].mean()) ** 2 * grp.count()).sum()
     # Within effect (= error between)
     # = (grp.var(ddof=0) * grp.count()).sum()
     sserror = grp.transform(lambda x: (x - x.mean()) ** 2).sum()
@@ -1346,8 +1346,8 @@ def welch_anova(data=None, dv=None, between=None):

     # Sums of squares (regular and adjusted)
     ss_res = grp.apply(lambda x: (x - x.mean()) ** 2).sum()
-    ss_bet = ((grp.mean() - data[dv].mean()) ** 2 * grp.count()).sum()
-    ss_betadj = np.sum(weights * np.square(grp.mean() - adj_grandmean))
+    ss_bet = ((grp.mean(numeric_only=True) - data[dv].mean()) ** 2 * grp.count()).sum()
+    ss_betadj = np.sum(weights * np.square(grp.mean(numeric_only=True) - adj_grandmean))
     ms_betadj = ss_betadj / ddof1

     # Calculate lambda, F-value, p-value and np2
2 changes: 1 addition & 1 deletion pingouin/regression.py
@@ -727,7 +727,7 @@ def logistic_regression(
     first level of our categorical variable (species = Adelie) which will be
     used as the reference level:

-    >>> df = pd.get_dummies(df, columns=['species'], drop_first=True)
+    >>> df = pd.get_dummies(df, columns=['species'], dtype=float, drop_first=True)
     >>> X = df[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']]
     >>> y = df['male']
     >>> lom = pg.logistic_regression(X, y, remove_na=True)
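
The get_dummies edit addresses another pandas 2.0 change: dummy columns are now returned as bool rather than a numeric dtype, which can break code expecting a numeric design matrix. A minimal sketch with made-up data (not the penguins dataset from the docstring):

import pandas as pd

df = pd.DataFrame({"species": ["Adelie", "Chinstrap", "Gentoo", "Adelie"]})

# pandas >= 2.0 returns bool dummies by default; dtype=float keeps them numeric.
dummies = pd.get_dummies(df, columns=["species"], drop_first=True, dtype=float)
print(dummies.dtypes)  # species_Chinstrap and species_Gentoo are float64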
2 changes: 1 addition & 1 deletion pingouin/tests/test_regression.py
@@ -349,7 +349,7 @@ def test_logistic_regression(self):
        # R: >>> glm("male ~ body_mass_kg + species", family=binomial, ...)
        # >>> confint.default(model) # Wald CI
        # See https://stats.stackexchange.com/a/275421/253579
-       data_dum = pd.get_dummies(data, columns=["species"], drop_first=True)
+       data_dum = pd.get_dummies(data, columns=["species"], drop_first=True, dtype=float)
        X = data_dum[["body_mass_kg", "species_Chinstrap", "species_Gentoo"]]
        y = data_dum["male"]
        lom = logistic_regression(X, y, as_dataframe=False)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
 numpy>=1.19
 scipy>=1.7
-pandas>=1.0
+pandas>=1.1
 matplotlib>=3.0.2
 seaborn>=0.11
 statsmodels>=0.13
4 changes: 2 additions & 2 deletions setup.py
@@ -24,7 +24,7 @@ def read(fname):
 INSTALL_REQUIRES = [
     "numpy>=1.19",
     "scipy>=1.7",
-    "pandas>=1.0",
+    "pandas>=1.1",
     "matplotlib>=3.0.2",
     "seaborn>=0.11",
     "statsmodels>=0.13",
@@ -41,10 +41,10 @@ def read(fname):

 CLASSIFIERS = [
     "Intended Audience :: Science/Research",
-    "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
     "Topic :: Scientific/Engineering :: Mathematics",
     "Operating System :: POSIX",
     "Operating System :: Unix",
