Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String dtype: use ObjectEngine for indexing for now correctness over performance #60329

Merged
merged 13 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
class MaskedUInt8Engine(MaskedIndexEngine): ...
class MaskedBoolEngine(MaskedUInt8Engine): ...

class StringObjectEngine(ObjectEngine):
    # Unlike the other engine stubs shown nearby, construction also takes the
    # missing-value sentinel (`na_value`) alongside the values.
    def __init__(self, values: object, na_value: object) -> None: ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # np.ndarray[..., ndim=1]
Expand Down
25 changes: 25 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,31 @@ cdef class StringEngine(IndexEngine):
raise KeyError(val)
return str(val)

cdef class StringObjectEngine(ObjectEngine):
    # ObjectEngine subclass whose scalar-key validation accepts strings and
    # the one missing-value sentinel it was constructed with; any other key
    # is rejected with KeyError.

    cdef:
        # Sentinel returned by _check_type when the key is a matching null.
        object na_value
        # True when na_value is the C_NA singleton; selects which null test
        # _checknull applies.
        bint uses_na

    def __init__(self, ndarray values, na_value):
        # values is forwarded unchanged to ObjectEngine.__init__.
        super().__init__(values)
        self.na_value = na_value
        # Identity comparison: only the C_NA object itself enables NA mode.
        self.uses_na = na_value is C_NA

    cdef bint _checknull(self, object val):
        # Exact-match null test: in NA mode only C_NA itself counts as
        # missing; otherwise defer to util.is_nan.
        # NOTE(review): util.is_nan presumably matches float NaN only, so
        # None / C_NA would not count in that mode -- confirm.
        if self.uses_na:
            return val is C_NA
        else:
            return util.is_nan(val)

    cdef _check_type(self, object val):
        # Validate/normalise a lookup key:
        #   - str instances are returned as-is,
        #   - a null accepted by _checknull is replaced by self.na_value,
        #   - everything else raises KeyError(val).
        if isinstance(val, str):
            return val
        elif self._checknull(val):
            return self.na_value
        else:
            raise KeyError(val)


cdef class DatetimeEngine(Int64Engine):

Expand Down
3 changes: 1 addition & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ def _engine(
# ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr]
target_values = self._data._ndarray # type: ignore[union-attr]
elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
return libindex.StringEngine(target_values)
return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr]

# error: Argument 1 to "ExtensionEngine" has incompatible type
# "ndarray[Any, Any]"; expected "ExtensionArray"
Expand Down Expand Up @@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool:
def get_indexer_non_unique(
self, target
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
target = ensure_index(target)
target = self._maybe_cast_listlike_indexer(target)

if not self._should_compare(target) and not self._should_partial_index(target):
Expand Down
104 changes: 93 additions & 11 deletions pandas/tests/indexes/string/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,51 @@
import pandas._testing as tm


def _isnan(val):
try:
return val is not pd.NA and np.isnan(val)
except TypeError:
return False


class TestGetLoc:
def test_get_loc(self, any_string_dtype):
index = Index(["a", "b", "c"], dtype=any_string_dtype)
assert index.get_loc("b") == 1

def test_get_loc_raises(self, any_string_dtype):
    """A label absent from the index raises ``KeyError`` mentioning it."""
    idx = Index(["a", "b", "c"], dtype=any_string_dtype)
    absent_label = "d"
    with pytest.raises(KeyError, match=absent_label):
        idx.get_loc(absent_label)

def test_get_loc_invalid_value(self, any_string_dtype):
    """A non-string key (here an int) raises ``KeyError`` mentioning it."""
    idx = Index(["a", "b", "c"], dtype=any_string_dtype)
    non_string_key = 1
    with pytest.raises(KeyError, match="1"):
        idx.get_loc(non_string_key)

def test_get_loc_non_unique(self, any_string_dtype):
index = Index(["a", "b", "a"], dtype=any_string_dtype)
result = index.get_loc("a")
expected = np.array([True, False, True])
tm.assert_numpy_array_equal(result, expected)

def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture):
    """Looking up a null in an index holding no missing values raises."""
    idx = Index(["a", "b", "c"], dtype=any_string_dtype)
    null_key = nulls_fixture
    with pytest.raises(KeyError):
        idx.get_loc(null_key)

def test_get_loc_missing(self, any_string_dtype, nulls_fixture):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this test now means that you can use np.nan and pd.NA interchangeably when indexing? If that's correct, I'm not sure I agree that we should be going that far

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that we are coercing any missing value indicator to NaN upon construction, and so to preserve back compat, I think I prefer we do the same for input to indexing operations.

To express it in terms of get_loc, this works now:

>>> pd.options.future.infer_string = False
>>> pd.Index(["a", "b", None]).get_loc(None)
2

but the same on main with enabling the string dtype:

>>> pd.options.future.infer_string = True
>>> pd.Index(["a", "b", None]).get_loc(None)
...
KeyError: None

That is because now the None is no longer in the object dtype index, but has been coerced to NaN.
(on main, trying the above with np.nan also fails (see the issue #59879), but that's because the StringEngine simply wasn't set up to work with missing values, so that is the initial reason I replaced it now with the StringObjectEngine)

The above is with None, but essentially the same happens with any other missing value indicator, like pd.NA. None and np.nan are probably the most important ones, but I would at least prefer that indexing with None keeps working for now (we can always start deprecating it, but I wouldn't do that as a breaking change for 3.0).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this is also already quite inconsistent depending on the data type .. See #59765 for an overview (e.g. also for datetimelike and categorical, we treat all NA-likes as the same in indexing lookups)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW this is also already quite inconsistent depending on the data type .. See #59765 for an overview (e.g. also for datetimelike and categorical, we treat all NA-likes as the same in indexing lookups)

Nice - that's a great issue. Thanks for opening it.

To express it in terms of get_loc, this works now:

Hmm I'm a bit confused by how this relates to all of the missing indicators becoming essentially equal though. On main, this does not work (?):

>>> pd.options.future.infer_string = False
>>> pd.Index(["a", "b", None]).get_loc(np.nan)
KeyError: nan

Definitely understand that there is not an ideal solution here given the inconsistent history, but I don't want to go too far and just start making all of the missing value indicators interchangeable. I think containment logic should land a little closer to equality logic, and in the latter we obviously don't allow this

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On main, this does not work (?):

Yes, that's the first bug that this PR is solving: right now no missing value lookup works, not even NaN itself (which is what is stored in the array). This is because the StringEngine simply doesn't handle missing values correctly (when building the hash table, it actually converts it to a sentinel string, but then for any of the lookup methods it doesn't take that into account; it's a bit of an incomplete implementation).

So by using the ObjectEngine (subclass), that fixes that first issue: ensuring NaN can be found

I think containment logic should land a little closer to equality logic, and in the latter we obviously don't allow this

Missing values don't compare equal (well, None does, but we specifically didn't choose that long term as the sentinel moving forward; np.nan and pd.NA don't compare equal), so containment is already a bit of a special case anyway compared to equality, when it comes to missing values.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair point on the equality. I guess I'm still hung up on the indexing behavior being the same though.

I've lost track of the nuance a bit, but haven't np.nan and pd.NA always had different indexing behavior? I'm just wary of glossing over that as part of this.

Maybe worth some input from @pandas-dev/pandas-core if anyone else has thoughts

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the PR to, for now, just enable exact matching of missing values in get_loc, so this PR can already be merged (and fix the most glaring bug), and then we can have the discussion around backwards compatibility in #59879 (I don't think the above thread is very easy to follow for other people to chime in, so I will do a write-up on the issue -> see #59879 (comment))

index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype)
if any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture))
):
with pytest.raises(KeyError):
index.get_loc(nulls_fixture)
else:
assert index.get_loc(nulls_fixture) == 2


class TestGetIndexer:
@pytest.mark.parametrize(
"method,expected",
Expand Down Expand Up @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype):
["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2]
)

@pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string):
# NaT and Decimal("NaN") from null_fixture are not supported for string dtype
index = Index(["a", "b", null], dtype=any_string_dtype)
result = index.get_indexer(["a", null, "c"])
if using_infer_string:
expected = np.array([0, 2, -1], dtype=np.intp)
elif any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
):
expected = np.array([0, -1, -1], dtype=np.intp)
else:
expected = np.array([0, 2, -1], dtype=np.intp)

class TestGetIndexerNonUnique:
@pytest.mark.xfail(reason="TODO(infer_string)", strict=False)
def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture):
index = Index(["a", "b", None], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique([nulls_fixture])
tm.assert_numpy_array_equal(result, expected)

expected_indexer = np.array([2], dtype=np.intp)
expected_missing = np.array([], dtype=np.intp)

class TestGetIndexerNonUnique:
@pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
def test_get_indexer_non_unique_nas(
self, any_string_dtype, null, using_infer_string
):
index = Index(["a", "b", null], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique(["a", null])

if using_infer_string:
expected_indexer = np.array([0, 2], dtype=np.intp)
expected_missing = np.array([], dtype=np.intp)
elif any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
):
expected_indexer = np.array([0, -1], dtype=np.intp)
expected_missing = np.array([1], dtype=np.intp)
else:
expected_indexer = np.array([0, 2], dtype=np.intp)
expected_missing = np.array([], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)

# actually non-unique
index = Index(["a", None, "b", None], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique([nulls_fixture])

expected_indexer = np.array([1, 3], dtype=np.intp)
index = Index(["a", null, "b", null], dtype=any_string_dtype)
indexer, missing = index.get_indexer_non_unique(["a", null])

if using_infer_string:
expected_indexer = np.array([0, 1, 3], dtype=np.intp)
elif any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
):
pass
else:
expected_indexer = np.array([0, 1, 3], dtype=np.intp)
tm.assert_numpy_array_equal(indexer, expected_indexer)
tm.assert_numpy_array_equal(missing, expected_missing)

Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.errors import (
EmptyDataError,
ParserError,
Expand Down Expand Up @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
# GH39017
Expand Down