From 23d0fc4783642e328da42fa52d2643b73f175f3d Mon Sep 17 00:00:00 2001 From: Aric Coady Date: Sat, 26 Oct 2024 15:48:30 -0700 Subject: [PATCH] Documentation updates. --- README.md | 5 +++-- docs/api.md | 17 ++++++----------- docs/reference.md | 2 ++ graphique/interface.py | 12 ++++++++++-- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a64b415..8623603 100644 --- a/README.md +++ b/README.md @@ -40,8 +40,9 @@ Supply a mapping of names to datasets for multiple roots, and to enable federati import pyarrow.dataset as ds from graphique import GraphQL -app = GraphQL(ds.dataset(...)) # Table is root query type -app = GraphQL.federated({<name>: ds.dataset(...), ...}, keys={...}) # Tables on federated fields +source = ds.dataset(...) +app = GraphQL(source) # Table is root query type +app = GraphQL.federated({<name>: source, ...}, keys={<name>: [<keys>], ...}) # Tables on federated fields ``` Start like any ASGI app. diff --git a/docs/api.md b/docs/api.md index bec736a..c5423d5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -42,9 +42,8 @@ Note list inputs allow passing a single value, [coercing the input](https://spec ## Batches Datasets and scanners are processed in batches when possible, instead of loading the table into memory. -* `scan` and `filter` - native parallel batch processing +* `group`, `scan`, and `filter` - native parallel batch processing * `sort` with `length` -* `group` with associated aggregates * `apply` with `list` functions * `rank` * `flatten` @@ -52,24 +51,20 @@ Datasets and scanners are processed in batches when possible, instead of loading ## Partitions Partitioned datasets use fragment keys when possible. -* `group` on fragment keys with associated aggregates -* `rank` on fragment key +* `group` on fragment keys with counts +* `rank` and `sort` with length on fragment keys ## Column selection Each field resolver transforms a table or array as needed. 
When working with an embedded library like [pandas](https://pandas.pydata.org), it's common to select a working set of columns for efficiency. Whereas GraphQL has the advantage of knowing the entire query up front, so there is no `select` field because it's done automatically at every level of resolvers. ## List Arrays -Arrow ListArrays are supported as ListColumns. `group: {aggregate: {list: ...}}` and `partition` leverage that feature to transform columns into ListColumns, which can be accessed via inline fragments and further aggregated. Though `group` hash aggregate functions are more efficient than creating lists. +Arrow ListArrays are supported as ListColumns. `group: {aggregate: {list: ...}}` and `runs` leverage that feature to transform columns into ListColumns, which can be accessed via inline fragments and further aggregated. Though `group` hash aggregate functions are more efficient than creating lists. * `tables` returns a list of tables based on the list scalars. * `flatten` flattens the list columns and repeats the scalar columns as needed. -* `apply(list: {...})` applies vector functions to the list scalars. +* `apply(list: {filter: ..., sort: ..., rank: ...})` applies vector functions to the list scalars. -ListColumns support sorting and filtering within their list scalars. They must all have the same value lengths, which is naturally the case when the result of grouping. Iterating scalars (in Python) is not ideal, but it can be faster than re-aggregating, depending on the average list size. Alternatively, `flatten` can be used to transform lists, ignoring null or empty scalars. - -1. `flatten` with `indices` -1. `scan`, `filter`, or `sort(by: ["", ...])` -1. `partition(by: ["", ...])` or `group(by: "", aggregate: {...})` +The lists in use must all have the same value lengths, which is naturally the case when the result of grouping. 
Iterating scalars (in Python) is not ideal, but it can be faster than re-aggregating, depending on the average list size. ## Dictionary Arrays Arrow has dictionary-encoded arrays as a space optimization, but doesn't natively support some builtin functions on them. Support for dictionaries is extended, and often faster by only having to apply functions to the unique values. diff --git a/docs/reference.md b/docs/reference.md index dc5a82c..13f8162 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -4,6 +4,8 @@ ::: graphique.core.Table +::: graphique.core.Nodes + ::: graphique.interface.Dataset ::: graphique.middleware.GraphQL diff --git a/graphique/interface.py b/graphique/interface.py index a680ec7..4b0dc9f 100644 --- a/graphique/interface.py +++ b/graphique/interface.py @@ -224,7 +224,10 @@ def column( def slice( self, info: Info, offset: Long = 0, length: Optional[Long] = None, reverse: bool = False ) -> Self: - """Return zero-copy slice of table.""" + """Return zero-copy slice of table. + + Can also be used to force loading a dataset. + """ table = self.to_table(info, length and (offset + length if offset >= 0 else None)) table = table[offset:][:length] # `slice` bug: ARROW-15412 return type(self)(table[::-1] if reverse else table) @@ -232,7 +235,7 @@ def slice( @doc_field( by="column names; empty will aggregate into a single row table", counts="optionally include counts in an aliased column", - ordered="optinally disable parallelization to maintain ordering", + ordered="optionally disable parallelization to maintain ordering", aggregate="aggregation functions applied to other columns", ) def group( self, info: Info, by: list[str] = [], counts: str = '', ordered: bool = False, aggregate: HashAggregates = {}, # type: ignore ) -> Self: + """Return table grouped by columns. + + See `column` for accessing any column which has changed type. See `tables` to split on any + aggregated list columns. 
+ """ if not any(aggregate.keys()): fragments = T.fragments(self.source, *by, counts=counts) if set(fragments.schema.names) >= set(by):