MultiIndex.difference not working with PyArrow timestamps (pandas-dev#61382)

NEREUScode · NEREUScode · commit dee7a47a9042 · 2025-05-02T12:16:38.000+01:00
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -3891,6 +3891,60 @@ def equal_levels(self, other: MultiIndex) -> bool:
     # --------------------------------------------------------------------
     # Set Methods
 
+    def difference(self, other, sort=None):
+        """
+        Return a new MultiIndex with elements from the index not in `other`.
+
+        Parameters
+        ----------
+        other : MultiIndex or array-like of tuples
+            Keys to drop; non-MultiIndex input is built with ``from_tuples``.
+        sort : bool or None, default None
+            None or True sorts the result when comparable; False keeps order.
+
+        Returns
+        -------
+        MultiIndex
+            Entries of ``self`` whose key does not occur in `other`.
+
+        Raises
+        ------
+        ValueError
+            If `other` does not have the same number of levels as ``self``.
+        """
+        if not isinstance(other, MultiIndex):
+            other = MultiIndex.from_tuples(other, names=self.names)
+        if other.nlevels != self.nlevels:
+            raise ValueError("other must have the same number of levels as self")
+
+        # Look every key up by label through the public ``get_loc``; level
+        # codes are positions inside a level, not labels, so they are not keys.
+        mask = np.ones(len(self), dtype=bool)
+        for key in other:
+            try:
+                loc = self.get_loc(key)
+            except KeyError:  # key absent from self, nothing to drop
+                continue
+            if isinstance(loc, np.ndarray):
+                # Boolean mask flagging every occurrence of a repeated key.
+                mask &= ~loc
+            else:  # a single integer position or a slice of positions
+                mask[loc] = False
+
+        result = MultiIndex(
+            levels=self.levels,
+            codes=[level_codes[mask] for level_codes in self.codes],
+            names=self.names,
+            verify_integrity=False,
+        )
+        if sort is None or sort is True:
+            try:
+                return result.sort_values()
+            except TypeError:
+                # Values that cannot be ordered: keep the unsorted result.
+                pass
+        return result
+
     def _union(self, other, sort) -> MultiIndex:
         other, result_names = self._convert_can_do_setop(other)
         if other.has_duplicates:
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
@@ -160,21 +160,13 @@ def test_query_empty_string(self):
             df.query("")
 
     def test_query_duplicate_column_name(self, engine, parser):
-        df = DataFrame(
-            {
-                "A": range(3),
-                "B": range(3),
-                "C": range(3)
-            }
-        ).rename(columns={"B": "A"})
+        df = DataFrame({"A": range(3), "B": range(3), "C": range(3)}).rename(
+            columns={"B": "A"}
+        )
 
-        res = df.query('C == 1', engine=engine, parser=parser)
+        res = df.query("C == 1", engine=engine, parser=parser)
 
-        expect = DataFrame(
-            [[1, 1, 1]],
-            columns=["A", "A", "C"],
-            index=[1]
-        )
+        expect = DataFrame([[1, 1, 1]], columns=["A", "A", "C"], index=[1])
 
         tm.assert_frame_equal(res, expect)
 
@@ -1140,9 +1132,7 @@ def test_query_with_nested_special_character(self, parser, engine):
             [">=", operator.ge],
         ],
     )
-    def test_query_lex_compare_strings(
-        self, parser, engine, op, func
-    ):
+    def test_query_lex_compare_strings(self, parser, engine, op, func):
         a = Series(np.random.default_rng(2).choice(list("abcde"), 20))
         b = Series(np.arange(a.size))
         df = DataFrame({"X": a, "Y": b})
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
@@ -195,6 +195,39 @@ def test_difference(idx, sort):
         first.difference([1, 2, 3, 4, 5], sort=sort)
 
 
+def test_multiindex_difference_pyarrow_timestamp():
+    # GH#61382: difference() must drop keys from pyarrow-backed levels
+    pytest.importorskip("pyarrow")
+
+    df = (
+        DataFrame(
+            [(1, "1900-01-01", "a"), (2, "1900-01-01", "b")],
+            columns=["id", "date", "val"],
+        )
+        .astype(
+            {
+                "id": "int64[pyarrow]",
+                "date": "timestamp[ns][pyarrow]",
+                "val": "string[pyarrow]",
+            }
+        )
+        .set_index(["id", "date"])
+    )
+
+    idx = df.index
+    idx_val = idx[0]
+    # The key must be present before it is removed
+    assert idx_val in idx
+
+    # Remove idx_val using difference()
+    new_idx = idx.difference([idx_val])
+
+    # Only the second row's key may remain
+    assert len(new_idx) == 1
+    assert idx_val not in new_idx
+    assert new_idx.equals(MultiIndex.from_tuples([(2, pd.Timestamp("1900-01-01"))]))
+
+
 def test_difference_sort_special():
     # GH-24959
     idx = MultiIndex.from_product([[1, 0], ["a", "b"]])
diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py
@@ -30,8 +30,7 @@
 from typing import NamedTuple
 
 ERROR_MESSAGE = (
-    "{path}:{lineno}:{col_offset}: "
-    "Found both '{prefix}.{name}' and '{name}' in {path}"
+    "{path}:{lineno}:{col_offset}: Found both '{prefix}.{name}' and '{name}' in {path}"
 )
 
 
diff --git a/scripts/check_test_naming.py b/scripts/check_test_naming.py
@@ -8,6 +8,7 @@
 NOTE: if this finds a false positive, you can add the comment `# not a test` to the
 class or function definition. Though hopefully that shouldn't be necessary.
 """
+
 from __future__ import annotations
 
 import argparse
diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py
@@ -12,6 +12,7 @@
     generated with this script:
     $ python scripts/generate_pip_deps_from_conda.py --compare
 """
+
 import argparse
 import pathlib
 import re
diff --git a/scripts/pandas_errors_documented.py b/scripts/pandas_errors_documented.py
@@ -6,6 +6,7 @@
 
     pre-commit run pandas-errors-documented --all-files
 """
+
 from __future__ import annotations
 
 import argparse
diff --git a/scripts/sort_whatsnew_note.py b/scripts/sort_whatsnew_note.py
@@ -23,6 +23,7 @@
 
     pre-commit run sort-whatsnew-items --all-files
 """
+
 from __future__ import annotations
 
 import argparse
diff --git a/scripts/tests/test_check_test_naming.py b/scripts/tests/test_check_test_naming.py
@@ -24,10 +24,7 @@
             0,
         ),
         (
-            "class Foo:  # not a test\n"
-            "    pass\n"
-            "def test_foo():\n"
-            "    Class.foo()\n",
+            "class Foo:  # not a test\n    pass\ndef test_foo():\n    Class.foo()\n",
             "",
             0,
         ),
diff --git a/scripts/tests/test_inconsistent_namespace_check.py b/scripts/tests/test_inconsistent_namespace_check.py
@@ -5,14 +5,10 @@
 )
 
 BAD_FILE_0 = (
-    "from pandas import Categorical\n"
-    "cat_0 = Categorical()\n"
-    "cat_1 = pd.Categorical()"
+    "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = pd.Categorical()"
 )
 BAD_FILE_1 = (
-    "from pandas import Categorical\n"
-    "cat_0 = pd.Categorical()\n"
-    "cat_1 = Categorical()"
+    "from pandas import Categorical\ncat_0 = pd.Categorical()\ncat_1 = Categorical()"
 )
 BAD_FILE_2 = (
     "from pandas import Categorical\n"
diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
@@ -34,8 +34,7 @@ def redundant_import(self, paramx=None, paramy=None) -> None:
         --------
         >>> import numpy as np
         >>> import pandas as pd
-        >>> df = pd.DataFrame(np.ones((3, 3)),
-        ...                   columns=('a', 'b', 'c'))
+        >>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c"))
         >>> df.all(axis=1)
         0    True
         1    True
@@ -50,14 +49,14 @@ def unused_import(self) -> None:
         Examples
         --------
         >>> import pandas as pdf
-        >>> df = pd.DataFrame(np.ones((3, 3)), columns=('a', 'b', 'c'))
+        >>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c"))
         """
 
     def missing_whitespace_around_arithmetic_operator(self) -> None:
         """
         Examples
         --------
-        >>> 2+5
+        >>> 2 + 5
         7
         """
 
@@ -66,14 +65,14 @@ def indentation_is_not_a_multiple_of_four(self) -> None:
         Examples
         --------
         >>> if 2 + 5:
-        ...   pass
+        ...     pass
         """
 
     def missing_whitespace_after_comma(self) -> None:
         """
         Examples
         --------
-        >>> df = pd.DataFrame(np.ones((3,3)),columns=('a','b', 'c'))
+        >>> df = pd.DataFrame(np.ones((3, 3)), columns=("a", "b", "c"))
         """
 
     def write_array_like_with_hyphen_not_underscore(self) -> None:
@@ -227,13 +226,13 @@ def test_validate_all_ignore_errors(self, monkeypatch):
                 "errors": [
                     ("ER01", "err desc"),
                     ("ER02", "err desc"),
-                    ("ER03", "err desc")
+                    ("ER03", "err desc"),
                 ],
                 "warnings": [],
                 "examples_errors": "",
                 "deprecated": True,
                 "file": "file1",
-                "file_line": "file_line1"
+                "file_line": "file_line1",
             },
         )
         monkeypatch.setattr(
@@ -272,14 +271,13 @@ def test_validate_all_ignore_errors(self, monkeypatch):
                 None: {"ER03"},
                 "pandas.DataFrame.align": {"ER01"},
                 # ignoring an error that is not requested should be of no effect
-                "pandas.Index.all": {"ER03"}
-            }
+                "pandas.Index.all": {"ER03"},
+            },
         )
         # two functions * two not global ignored errors - one function ignored error
         assert exit_status == 2 * 2 - 1
 
 
-
 class TestApiItems:
     @property
     def api_doc(self):
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
diff --git a/scripts/validate_exception_location.py b/scripts/validate_exception_location.py
diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py

Original file line number	Diff line number	Diff line change
`@@ -30,8 +30,7 @@`
`30`	`30`	`from typing import NamedTuple`
`31`	`31`
`32`	`32`	`ERROR_MESSAGE = (`
`33`		`- "{path}:{lineno}:{col_offset}: "`
`34`		`- "Found both '{prefix}.{name}' and '{name}' in {path}"`
	`33`	`+ "{path}:{lineno}:{col_offset}: Found both '{prefix}.{name}' and '{name}' in {path}"`
`35`	`34`	`)`
`36`	`35`
`37`	`36`
Original file line number	Diff line number	Diff line change
`@@ -5,14 +5,10 @@`
`5`	`5`	`)`
`6`	`6`
`7`	`7`	`BAD_FILE_0 = (`
`8`		`- "from pandas import Categorical\n"`
`9`		`- "cat_0 = Categorical()\n"`
`10`		`- "cat_1 = pd.Categorical()"`
	`8`	`+ "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = pd.Categorical()"`
`11`	`9`	`)`
`12`	`10`	`BAD_FILE_1 = (`
`13`		`- "from pandas import Categorical\n"`
`14`		`- "cat_0 = pd.Categorical()\n"`
`15`		`- "cat_1 = Categorical()"`
	`11`	`+ "from pandas import Categorical\ncat_0 = pd.Categorical()\ncat_1 = Categorical()"`
`16`	`12`	`)`
`17`	`13`	`BAD_FILE_2 = (`
`18`	`14`	`"from pandas import Categorical\n"`