diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec8c8116e5aee..878653a02c4b0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4517,7 +4517,20 @@ def _set_item(self, key, value) -> None: Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ - value, refs = self._sanitize_column(value) + # Check if we're setting a new column with a tuple key in a MultiIndex DataFrame + # and the value is a scalar. In this case, we need to create a Series with the + # proper name to ensure the name attribute matches the key. + if ( + isinstance(key, tuple) + and isinstance(self.columns, MultiIndex) + and not is_list_like(value) + and key not in self.columns + ): + # Create a Series with the proper name + value = Series([value] * len(self.index), index=self.index, name=key) + value, refs = self._sanitize_column(value) + else: + value, refs = self._sanitize_column(value) if ( key in self.columns diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 39607d74c0dc8..388f7114e6d75 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2070,6 +2070,13 @@ def _wrap_applied_output( result = self.obj._constructor(index=res_index, columns=data.columns) result = result.astype(data.dtypes) + + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, "_metadata"): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + return result # GH12824 @@ -2081,14 +2088,29 @@ def _wrap_applied_output( # GH57775 - Ensure that columns and dtypes from original frame are kept. 
result = self.obj._constructor(columns=data.columns) result = result.astype(data.dtypes) + + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, "_metadata"): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + return result elif isinstance(first_not_none, DataFrame): - return self._concat_objects( + result = self._concat_objects( values, not_indexed_same=not_indexed_same, is_transform=is_transform, ) + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, "_metadata"): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result + key_index = self._grouper.result_index if self.as_index else None if isinstance(first_not_none, (np.ndarray, Index)): @@ -2105,21 +2127,44 @@ def _wrap_applied_output( # (expression has type "Hashable", variable # has type "Tuple[Any, ...]") name = self._selection # type: ignore[assignment] - return self.obj._constructor_sliced(values, index=key_index, name=name) + result = self.obj._constructor_sliced(values, index=key_index, name=name) + + # Preserve metadata for subclassed Series + if hasattr(self.obj, "_metadata"): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result elif not isinstance(first_not_none, Series): # values are not series or array-like but scalars # self._selection not passed through to Series as the # result should not take the name of original selection # of columns if self.as_index: - return self.obj._constructor_sliced(values, index=key_index) + result = self.obj._constructor_sliced(values, index=key_index) + + # Preserve metadata for subclassed Series + if hasattr(self.obj, "_metadata"): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result else: result = self.obj._constructor(values, 
columns=[self._selection]) result = self._insert_inaxis_grouper(result) + + # Preserve metadata for subclassed DataFrames + if hasattr(self.obj, "_metadata"): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + return result else: # values are Series - return self._wrap_applied_output_series( + result = self._wrap_applied_output_series( values, not_indexed_same, first_not_none, @@ -2127,6 +2172,14 @@ def _wrap_applied_output( is_transform, ) + # Preserve metadata for subclassed DataFrames/Series + if hasattr(self.obj, "_metadata"): + for attr in self.obj._metadata: + if hasattr(self.obj, attr): + setattr(result, attr, getattr(self.obj, attr)) + + return result + def _wrap_applied_output_series( self, values: list[Series], diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 20dd7b0c4d3e7..985870c3593cb 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -607,6 +607,63 @@ def test_setitem_multi_index(self): df[("joe", "last")] = df[("jolie", "first")].loc[i, j] tm.assert_frame_equal(df[("joe", "last")], df[("jolie", "first")]) + def test_setitem_multiindex_scalar_indexer(self): + # GH#62135: Fix DataFrame.__setitem__ with MultiIndex columns and scalar indexer + # Test scalar key assignment with MultiIndex columns + columns = MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a")]) + df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) + + # Test setting new column with scalar tuple key + df[("C", "c")] = 100 + expected_new = DataFrame( + np.array( + [ + [0, 1, 2, 100], + [3, 4, 5, 100], + [6, 7, 8, 100], + [9, 10, 11, 100], + [12, 13, 14, 100], + ], + dtype=np.int64, + ), + columns=MultiIndex.from_tuples( + [("A", "a"), ("A", "b"), ("B", "a"), ("C", "c")] + ), + ) + tm.assert_frame_equal(df, expected_new) + + # Test setting existing column with scalar tuple key + df[("A", "a")] = 
999 + expected_existing = expected_new.copy() + expected_existing[("A", "a")] = 999 + tm.assert_frame_equal(df, expected_existing) + + # Test setting with Series using scalar tuple key + series_data = Series([10, 20, 30, 40, 50], dtype=np.int64) + df[("D", "d")] = series_data + expected_series = expected_existing.copy() + expected_series[("D", "d")] = series_data + tm.assert_frame_equal(df, expected_series) + + # Test with 3-level MultiIndex + columns_3level = MultiIndex.from_tuples( + [("X", "A", "1"), ("X", "A", "2"), ("Y", "B", "1")] + ) + df_3level = DataFrame(np.arange(12).reshape(4, 3), columns=columns_3level) + + # Test scalar assignment with 3-level MultiIndex + df_3level[("Z", "C", "3")] = 42 + assert ("Z", "C", "3") in df_3level.columns + tm.assert_series_equal( + df_3level[("Z", "C", "3")], + Series([42, 42, 42, 42], name=("Z", "C", "3"), dtype=np.int64), + ) + + # Test Series assignment with 3-level MultiIndex + new_series = Series([1, 2, 3, 4], name=("W", "D", "4"), dtype=np.int64) + df_3level[("W", "D", "4")] = new_series + tm.assert_series_equal(df_3level[("W", "D", "4")], new_series) + @pytest.mark.parametrize( "columns,box,expected", [ diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index f93105498ac79..b599be5d042fe 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -168,7 +168,7 @@ def test_query_duplicate_column_name(self, engine, parser): } ).rename(columns={"B": "A"}) - res = df.query('C == 1', engine=engine, parser=parser) + res = df.query("C == 1", engine=engine, parser=parser) expect = DataFrame( [[1, 1, 1]], @@ -1411,7 +1411,7 @@ def test_expr_with_column_name_with_backtick_and_hash(self): def test_expr_with_column_name_with_backtick(self): # GH 59285 df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) - result = df.query("`a``b` < 2") # noqa + result = df.query("`a``b` < 2")
expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby_metadata.py b/pandas/tests/groupby/test_groupby_metadata.py new file mode 100644 index 0000000000000..83368346c8759 --- /dev/null +++ b/pandas/tests/groupby/test_groupby_metadata.py @@ -0,0 +1,32 @@ +""" +Tests for metadata preservation in groupby operations. +""" + +import numpy as np + +import pandas._testing as tm + + +class TestGroupByMetadataPreservation: + def test_groupby_apply_preserves_metadata(self): + """Test that groupby.apply() preserves _metadata from subclassed DataFrame.""" + # Create a subclassed DataFrame with metadata + subdf = tm.SubclassedDataFrame( + {"X": [1, 1, 2, 2, 3], "Y": np.arange(0, 5), "Z": np.arange(10, 15)} + ) + subdf.testattr = "test" + + # Apply groupby operation + result = subdf.groupby("X").apply(np.sum, axis=0, include_groups=False) + + # Check that metadata is preserved + assert hasattr(result, "testattr"), ( + "Metadata attribute 'testattr' should be preserved" + ) + assert result.testattr == "test", "Metadata value should be preserved" + + # Compare with equivalent operation that preserves metadata + expected = subdf.groupby("X").sum() + assert expected.testattr == "test", ( + "Equivalent operation should preserve metadata" + )