Skip to content

Commit

Permalink
Group by dictionary encoded arrays.
Browse files Browse the repository at this point in the history
Casting a table column consumes memory, but casting an in-memory dataset field appears to have no performance impact. This could simplify and optimize the functional use cases which dictionary arrays do not support.

Binary only preview builds.
  • Loading branch information
coady committed Aug 17, 2024
1 parent c85f48c commit ad7c0cb
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
arrow-version: ['']
include:
- python-version: 3.x
arrow-version: '--pre '
arrow-version: '--pre --only-binary :all: '
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
Expand Down
10 changes: 5 additions & 5 deletions graphique/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,13 +421,13 @@ def group(self, *names: str, counts: str = '', **funcs: Sequence[Agg]) -> pa.Tab
"""
prefix = 'hash_' if names else ''
aggs = [agg.astuple(prefix + func) for func in funcs for agg in funcs[func]]
columns = list(names) + [agg[0] for agg in aggs]
columns = {
name: pc.field(name).cast(Column.scalar_type(self.schema.field(name)))
for name in list(names) + [agg[0] for agg in aggs]
}
if counts:
aggs.append(([], 'hash_count_all', None, counts))
if isinstance(self, pa.Table):
decl = Declaration('table_source', self.unify_dictionaries())
else:
decl = Declaration.scan(self, columns=columns)
decl = Declaration.scan(ds.dataset(self) if isinstance(self, pa.Table) else self, columns)
return decl.aggregate(aggs, names).to_table(use_threads=False)

def aggregate(self, counts: str = '', **funcs: Sequence[Agg]) -> dict:
Expand Down
3 changes: 1 addition & 2 deletions graphique/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,7 @@ def group(
table, aggs = self.table, dict(aggregate)
refs = {agg.name for values in aggs.values() for agg in values}
fragments = set(T.fragment_keys(self.table))
dicts = (pa.types.is_dictionary(table.schema.field(name).type) for name in set(by) | refs)
if isinstance(table, ds.Scanner) or any(dicts):
if isinstance(table, ds.Scanner):
table = self.select(info)
if fragments and set(by) <= fragments:
if set(by) == fragments:
Expand Down

0 comments on commit ad7c0cb

Please sign in to comment.