Skip to content
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ Bug fixes
- Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
- Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
- Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame` objects of incorrect size when called on slices (:issue:`52824`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Guess this needs to be moved to 2.0.2 now

- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on categorical dtypes (:issue:`49889`)
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on large string dtypes (:issue:`52795`)
- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
Expand Down
104 changes: 34 additions & 70 deletions pandas/core/interchange/from_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import numpy as np

from pandas.compat._optional import import_optional_dependency

import pandas as pd
from pandas.core.interchange.dataframe_protocol import (
Buffer,
Expand All @@ -23,7 +25,7 @@
DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
DtypeKind.BOOL: {8: bool},
DtypeKind.BOOL: {1: bool, 8: bool},
}


Expand Down Expand Up @@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
buffers = col.get_buffers()

data_buff, data_dtype = buffers["data"]
data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size())
data = buffer_to_ndarray(
data_buff, data_dtype, offset=col.offset, length=col.size()
)

data = set_nulls(data, col, buffers["validity"])
return data, buffers
Expand Down Expand Up @@ -192,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
buffers = col.get_buffers()

codes_buff, codes_dtype = buffers["data"]
codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size())
codes = buffer_to_ndarray(
codes_buff, codes_dtype, offset=col.offset, length=col.size()
)

# Doing module in order to not get ``IndexError`` for
# out-of-bounds sentinel values in `codes`
Expand Down Expand Up @@ -252,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
Endianness.NATIVE,
)
# Specify zero offset as we don't want to chunk the string data
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
Comment on lines -255 to +261
Copy link
Member Author

@MarcoGorelli MarcoGorelli Apr 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

had to change this one for the large-string test, where these are different:

(Pdb) data_buff.bufsize
6
(Pdb) col.size()
2

If I don't change it, then we get:

In [5]:     arr = ["Mon", "Tue"]
   ...:     table = pa.table({"weekday": pa.array(arr, "large_string")})
   ...:     exchange_df = table.__dataframe__()
   ...:     result = from_dataframe(exchange_df)

In [6]: result
Out[6]: 
  weekday
0      Mo
1        

this is the only test where data_buff.bufsize != col.size()


# Retrieve the offsets buffer containing the index offsets demarcating
# the beginning and the ending of each string
Expand All @@ -261,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
# meaning that it has more elements than in the data buffer, do `col.size() + 1`
# here to pass a proper offsets buffer size
offsets = buffer_to_ndarray(
offset_buff, offset_dtype, col.offset, length=col.size() + 1
offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
)

null_pos = None
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
assert buffers["validity"], "Validity buffers cannot be empty for masks"
valid_buff, valid_dtype = buffers["validity"]
null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
null_pos = buffer_to_ndarray(
valid_buff, valid_dtype, offset=col.offset, length=col.size()
)
if sentinel_val == 0:
null_pos = ~null_pos

Expand Down Expand Up @@ -356,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
getattr(ArrowCTypes, f"UINT{dtype[1]}"),
Endianness.NATIVE,
),
col.offset,
col.size(),
offset=col.offset,
length=col.size(),
)

data = parse_datetime_format_str(format_str, data)
Expand All @@ -368,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
def buffer_to_ndarray(
buffer: Buffer,
dtype: tuple[DtypeKind, int, str, str],
*,
length: int,
offset: int = 0,
length: int | None = None,
) -> np.ndarray:
"""
Build a NumPy array from the passed buffer.
Expand Down Expand Up @@ -406,74 +415,27 @@ def buffer_to_ndarray(
# and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
# it since https://github.com/numpy/numpy/pull/19083
ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
data_pointer = ctypes.cast(
buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
)

if bit_width == 1:
assert length is not None, "`length` must be specified for a bit-mask buffer."
arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
pa = import_optional_dependency("pyarrow")
arr = pa.BooleanArray.from_buffers(
pa.bool_(),
length,
[None, pa.foreign_buffer(buffer.ptr, length)],
offset=offset,
)
return np.asarray(arr)
else:
data_pointer = ctypes.cast(
buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
)
return np.ctypeslib.as_array(
data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
data_pointer,
shape=(length,),
)


def bitmask_to_bool_ndarray(
    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
) -> np.ndarray:
    """
    Convert a packed bit-mask to a boolean NumPy array.

    Bits are read in LSB-first order within each byte, matching the
    Arrow validity-bitmap layout.

    Parameters
    ----------
    bitmask : np.ndarray[uint8]
        NumPy array of uint8 dtype representing the bitmask. It may contain
        more (trailing) bytes than ``mask_length`` bits require; the extra
        bytes are ignored.
    mask_length : int
        Number of elements in the mask to interpret.
    first_byte_offset : int, default: 0
        Number of bits to skip from the start of the bitmask.

    Returns
    -------
    np.ndarray[bool]
    """
    # Skip whole bytes up-front so the remaining offset fits within one byte.
    bitmask = bitmask[first_byte_offset // 8 :]
    first_byte_offset %= 8

    bool_mask = np.zeros(mask_length, dtype=bool)
    for i in range(mask_length):
        # Address every bit by its absolute position. This fixes two defects
        # of the previous byte-chunked implementation: it no longer raises
        # IndexError for mask_length == 0, and it no longer reads the
        # remainder bits from bitmask[-1] (the last byte of the *buffer*,
        # which is wrong whenever the buffer carries unused trailing bytes).
        bit_pos = first_byte_offset + i
        if bitmask[bit_pos // 8] & (1 << (bit_pos % 8)):
            bool_mask[i] = True

    return bool_mask


def set_nulls(
data: np.ndarray | pd.Series,
col: Column,
Expand Down Expand Up @@ -509,7 +471,9 @@ def set_nulls(
elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
assert validity, "Expected to have a validity buffer for the mask"
valid_buff, valid_dtype = validity
null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
null_pos = buffer_to_ndarray(
valid_buff, valid_dtype, offset=col.offset, length=col.size()
)
if sentinel_val == 0:
null_pos = ~null_pos
elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,32 @@ def test_large_string_pyarrow():
assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


@pytest.mark.parametrize(
    ("offset", "length", "expected_values"),
    [
        (0, None, [3.3, float("nan"), 2.1]),
        (1, None, [float("nan"), 2.1]),
        (2, None, [2.1]),
        (0, 2, [3.3, float("nan")]),
        (0, 1, [3.3]),
        (1, 1, [float("nan")]),
    ],
)
def test_bitmasks_pyarrow(offset, length, expected_values):
    """
    Round-trip a sliced pyarrow table through the interchange protocol.

    Slicing the table produces non-zero buffer offsets, so this exercises
    the validity-bitmask offset handling in ``from_dataframe``.
    """
    # GH 52795
    pa = pytest.importorskip("pyarrow", "11.0.0")

    arr = [3.3, None, 2.1]
    # slice(offset, length); length=None takes everything from offset onward
    table = pa.table({"arr": arr}).slice(offset, length)
    exchange_df = table.__dataframe__()
    result = from_dataframe(exchange_df)
    expected = pd.DataFrame({"arr": expected_values})
    tm.assert_frame_equal(result, expected)

    # check round-trip
    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


@pytest.mark.parametrize(
"data", [int_data, uint_data, float_data, bool_data, datetime_data]
)
Expand Down