Skip to content

Commit

Permalink
BUG: Fix inconsistency for MultiIndex with empty values (pandas-dev#5…
Browse files: browse the repository at this point in the history
  • Loading branch information
brandonmonge committed Dec 3, 2024
1 parent 8911238 commit 1348c3b
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 3 deletions.
22 changes: 19 additions & 3 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,19 @@ def extract(r):
return tuple(r[i] for i in range(field_count) if i not in sic)

columns = list(zip(*(extract(r) for r in header)))
# Normalize placeholder header levels to empty strings: None, empty or
# whitespace-only values, and names starting with 'Unnamed: ' (the
# placeholders used in multi-index headers).
columns = [
tuple(
""
if level is None
or str(level).strip() == ""
or (isinstance(level, str) and level.startswith("Unnamed: "))
else level
for level in col
)
for col in columns
]
names = columns.copy()
for single_ic in sorted(ic):
names.insert(single_ic, single_ic)
Expand Down Expand Up @@ -357,7 +370,7 @@ def _agg_index(self, index) -> Index:
)
else:
col_na_values, col_na_fvalues = set(), set()

col_na_values.discard("")
cast_type = None
index_converter = False
if self.index_names is not None:
Expand Down Expand Up @@ -694,8 +707,11 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis

# Only clean index names that were placeholders.
for i, name in enumerate(index_names):
if isinstance(name, str) and name in self.unnamed_cols:
index_names[i] = None
if isinstance(name, str):
if name.strip() == "":
index_names[i] = ""
elif name in self.unnamed_cols:
index_names[i] = None

return index_names, columns, index_col

Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,3 +375,24 @@ def test_multiindex_columns_not_leading_index_col(all_parsers):
)
expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
tm.assert_frame_equal(result, expected)


def test_multiindex_empty_values_handling(all_parsers):
    """Check that blank cells in a two-row header and a two-column index parse as "".

    The CSV below has two header rows and uses its first two columns as the
    row index. Blank or whitespace-only cells in the second header row and
    in the second index column are expected to come back as empty strings,
    and both column-level names are expected to be ``None``.
    """
    # GH#59560
    if all_parsers.engine == "pyarrow":
        pytest.skip(
            "PyArrow engine does not support multiple header rows for MultiIndex cols."
        )

    # One literal per CSV row; adjacent literals concatenate into one string.
    csv_text = (
        ", ,a,b,b\n"
        ", ,, ,b2\n"
        "i1,,0,1,2\n"
        "i2,,3,4,5\n"
    )

    # Build the expected frame first: the second level of the row index and
    # of the first two columns is the empty string.
    row_index = MultiIndex.from_tuples([("i1", ""), ("i2", "")])
    column_index = MultiIndex.from_tuples(
        [("a", ""), ("b", ""), ("b", "b2")], names=[None, None]
    )
    expected = DataFrame(
        [[0, 1, 2], [3, 4, 5]],
        index=row_index,
        columns=column_index,
    )

    result = all_parsers.read_csv(
        StringIO(csv_text), header=[0, 1], index_col=[0, 1]
    )
    tm.assert_frame_equal(result, expected)

0 comments on commit 1348c3b

Please sign in to comment.