ARROW-16838: [Python] Improve schema inference for pandas indexes with extension dtypes (#14080)

jrbourbeau · web-flow · commit afd3c40a42aa · 2022-09-21T09:52:53.000+02:00
Possible fix for https://issues.apache.org/jira/browse/ARROW-16838. `pd.Index` objects don't have a `.head` method, while `pd.DataFrame`, `pd.Series`, and `pd.Index` all support indexing with `[:0]` to return an empty object of the same type. Authored-by: James Bourbeau <jrbourbeau@gmail.com> Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
@@ -541,7 +541,9 @@ def dataframe_to_types(df, preserve_index, columns=None):
         if _pandas_api.is_categorical(values):
             type_ = pa.array(c, from_pandas=True).type
         elif _pandas_api.is_extension_array_dtype(values):
-            type_ = pa.array(c.head(0), from_pandas=True).type
+            empty = c.head(0) if isinstance(
+                c, _pandas_api.pd.Series) else c[:0]
+            type_ = pa.array(empty, from_pandas=True).type
         else:
             values, type_ = get_datetimetz_type(values, c.dtype, None)
             type_ = pa.lib._ndarray_to_arrow_type(values, type_)
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
@@ -663,7 +663,7 @@ def test_schema_from_pandas():
     if Version(pd.__version__) >= Version('1.0.0'):
         inputs.append(pd.array([1, 2, None], dtype=pd.Int32Dtype()))
     for data in inputs:
-        df = pd.DataFrame({'a': data})
+        df = pd.DataFrame({'a': data}, index=data)
         schema = pa.Schema.from_pandas(df)
         expected = pa.Table.from_pandas(df).schema
         assert schema == expected