Skip to content

Commit

Permalink
Fix combine_similar for read_parquet if one branch does not project c…
Browse files Browse the repository at this point in the history
…olumns (#254)
  • Loading branch information
phofl authored Aug 3, 2023
1 parent 53be146 commit 6880388
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
8 changes: 7 additions & 1 deletion dask_expr/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,11 +447,17 @@ def _combine_similar(self, root: Expr):
rps = [self] + alike
for rp in rps:
if rp.operand("columns"):
columns |= set(rp.operand("columns"))
cols = rp.operand("columns")
else:
# No column projection on this branch, so keep all of them
cols = rp.columns
columns |= set(cols)
columns = sorted(columns)

# Can bail if we are not changing columns or the "_series" operand
columns_operand = self.operand("columns")
if columns_operand is None:
columns_operand = self.columns
if columns_operand == columns and (len(columns) > 1 or not self._series):
return

Expand Down
12 changes: 12 additions & 0 deletions dask_expr/io/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,15 @@ def test_combine_similar(tmpdir):
# All Replace operations should also be the same
replace_nodes = list(got.optimize(fuse=False).find_operations(Replace))
assert len(replace_nodes) == 1


def test_combine_similar_no_projection_on_one_branch(tmpdir):
pdf = lib.DataFrame(
{"x": [0, 1, 2, 3] * 4, "y": range(16), "z": [None, 1, 2, 3] * 4}
)
fn = _make_file(tmpdir, format="parquet", df=pdf)
df = read_parquet(fn)
df["xx"] = df.x != 0

pdf["xx"] = pdf.x != 0
assert_eq(df, pdf)

0 comments on commit 6880388

Please sign in to comment.