rapidsai · mike-wendt · Jun 26, 2020 · Mar 16, 2020 · Mar 16, 2020 · Mar 16, 2020
diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx
@@ -75,8 +75,6 @@ def rolling(Column source_column, Column pre_column_window,
                     agg)
             )
     else:
-        if op == "count":
-            min_periods = 0
         c_min_periods = min_periods
         if center:
             c_window = (window // 2) + 1

@@ -501,24 +501,28 @@ def find_and_replace(self, to_replace, replacement, all_nan):
         """
         replaced = column.as_column(self.cat().codes)
 
+        to_replace_col, replacement_col = [], []
+        new_cats = cudf.Series(self.dtype.categories)
+        for old_val, new_val in zip(to_replace, replacement):
+            if new_val not in self.dtype.categories:
+                new_cats = new_cats.replace(old_val, new_val)
+            else:
+                to_replace_col.append(self._encode(old_val))
+                replacement_col.append(self._encode(new_val))
+
         to_replace_col = column.as_column(
-            np.asarray(
-                [self._encode(val) for val in to_replace], dtype=replaced.dtype
-            )
+            np.array(to_replace_col, dtype=replaced.dtype)
         )
         replacement_col = column.as_column(
-            np.asarray(
-                [self._encode(val) for val in replacement],
-                dtype=replaced.dtype,
-            )
+            np.array(replacement_col, dtype=replaced.dtype)
         )
 
         output = libcudf.replace.replace(
             replaced, to_replace_col, replacement_col
         )
 
         return column.build_categorical_column(
-            categories=self.dtype.categories,
+            categories=new_cats,
             codes=column.as_column(output.base_data, dtype=output.dtype),
             mask=output.base_mask,
             offset=output.offset,

@@ -1271,11 +1271,12 @@ def as_column(arbitrary, nan_as_null=None, dtype=None, length=None):
             )
 
         elif isinstance(arbitrary, pa.NullArray):
-            new_dtype = pd.api.types.pandas_dtype(dtype)
             if (type(dtype) == str and dtype == "empty") or dtype is None:
                 new_dtype = pd.api.types.pandas_dtype(
                     arbitrary.type.to_pandas_dtype()
                 )
+            else:
+                new_dtype = pd.api.types.pandas_dtype(dtype)
 
             if is_categorical_dtype(new_dtype):
                 arbitrary = arbitrary.dictionary_encode()

@@ -137,7 +137,8 @@ def as_numerical_column(self, dtype, **kwargs):
 
     def as_string_column(self, dtype, **kwargs):
         from cudf.core.column import string
-
+        if not kwargs.get("format"):
+            kwargs["format"] = "%Y-%m-%d %H:%M:%S.%f"
         if len(self) > 0:
             return string._numeric_to_str_typecast_functions[
                 np.dtype(self.dtype)

@@ -68,7 +68,6 @@ def __init__(
             names,
             (
                 Sequence,
-                pd.core.indexes.frozen.FrozenNDArray,
                 pd.core.indexes.frozen.FrozenList,
             ),
         ):
@@ -85,7 +84,7 @@ def __init__(
             raise ValueError("Must pass non-zero number of levels/codes")
 
         if not isinstance(codes, DataFrame) and not isinstance(
-            codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)
+            codes[0], (Sequence, np.ndarray)
         ):
             raise TypeError("Codes is not a Sequence of sequences")
 
@@ -470,7 +469,9 @@ def _index_and_downcast(self, result, index, index_key):
     def _get_row_major(self, df, row_tuple):
         from cudf import Series
 
-        if pd.api.types.is_bool_dtype(row_tuple):
+        if pd.api.types.is_bool_dtype(
+            list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple
+        ):
             return df[row_tuple]
 
         valid_indices = self._get_valid_indices_by_tuple(

@@ -1417,7 +1417,7 @@ def as_mask(self):
         """
         return self._column.as_mask()
 
-    def astype(self, dtype, copy=False, errors="raise", **kwargs):
+    def astype(self, dtype, copy=False, errors="raise"):
         """
         Cast the Series to the given dtype
 
@@ -1439,7 +1439,6 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs):
             object.
             - ``warn`` : prints last exceptions as warnings and
             return original object.
-        **kwargs : extra arguments to pass on to the constructor
 
         Returns
         -------
@@ -1461,7 +1460,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs):
         if pd.api.types.is_dtype_equal(dtype, self.dtype):
             return self.copy(deep=copy)
         try:
-            data = self._column.astype(dtype, **kwargs)
+            data = self._column.astype(dtype)
 
             return self._copy_construct(
                 data=data.copy(deep=True) if copy else data, index=self.index

@@ -145,7 +145,7 @@ def test_categorical_binary_add():
 
     with pytest.raises(TypeError) as raises:
         pdsr + pdsr
-    raises.match(r"Series cannot perform the operation \+")
+    raises.match("unsupported operand")
 
     with pytest.raises(TypeError) as raises:
         sr + sr

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -395,7 +395,7 @@ def test_csv_reader_usecols_int_char(tmpdir):
     assert len(out.columns) == len(df_out.columns)
     assert len(out) == len(df_out)
     pd.util.testing.assert_frame_equal(
-        df_out, out.to_pandas(), check_names=False
+            df_out, out.to_pandas(), check_names=False
     )
 
 

@@ -826,7 +826,6 @@ def test_dataframe_hash_partition_masked_value(nrows):
             got_value = row.val
             assert expected_value == got_value
 
-
 @pytest.mark.parametrize("nrows", [3, 10, 50])
 def test_dataframe_hash_partition_masked_keys(nrows):
     gdf = DataFrame()
@@ -1439,13 +1438,6 @@ def test_dataframe_transpose_category(num_cols, num_rows):
     got_function = gdf.transpose()
     got_property = gdf.T
 
-    # materialize our categoricals because pandas
-    for name, col in got_function._data.items():
-        got_function[name] = col.astype(col.dtype.type)
-
-    for name, col in got_property._data.items():
-        got_property[name] = col.astype(col.dtype.type)
-
     expect = pdf.transpose()
 
     assert_eq(expect, got_function.to_pandas())
@@ -3314,13 +3306,11 @@ def test_series_astype_numeric_to_other(dtype, as_dtype):
 def test_series_astype_string_to_other(as_dtype):
     if "datetime64" in as_dtype:
         data = ["2001-01-01", "2002-02-02", "2000-01-05"]
-        kwargs = {"format": "%Y-%m-%d"}
     else:
         data = ["1", "2", "3"]
-        kwargs = {}
     psr = pd.Series(data)
     gsr = gd.from_pandas(psr)
-    assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype, **kwargs))
+    assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype))
 
 
 @pytest.mark.parametrize(
@@ -3338,7 +3328,7 @@ def test_series_astype_datetime_to_other(as_dtype):
     data = ["2001-01-01", "2002-02-02", "2001-01-05"]
     psr = pd.Series(data)
     gsr = gd.from_pandas(psr)
-    assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype, format="%Y-%m-%d"))
+    assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype))
 
 
 @pytest.mark.parametrize(
@@ -3357,22 +3347,20 @@ def test_series_astype_datetime_to_other(as_dtype):
 def test_series_astype_categorical_to_other(as_dtype):
     if "datetime64" in as_dtype:
         data = ["2001-01-01", "2002-02-02", "2000-01-05", "2001-01-01"]
-        kwargs = {"format": "%Y-%m-%d"}
     else:
         data = [1, 2, 3, 1]
-        kwargs = {}
     psr = pd.Series(data, dtype="category")
     gsr = gd.from_pandas(psr)
-    assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype, **kwargs))
+    assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype))
 
 
 @pytest.mark.parametrize("ordered", [True, False])
 def test_series_astype_to_categorical_ordered(ordered):
     psr = pd.Series([1, 2, 3, 1], dtype="category")
     gsr = gd.from_pandas(psr)
     assert_eq(
-        psr.astype("int32", ordered=ordered),
-        gsr.astype("int32", ordered=ordered),
+        psr.astype("int32"),
+        gsr.astype("int32"),
     )
 
 
@@ -3393,7 +3381,7 @@ def test_series_astype_null_cases():
 
     assert_eq(
         gd.Series(data, dtype="datetime64[ms]"),
-        gd.Series(data).astype("datetime64[ms]", format="%Y-%m-%d"),
+        gd.Series(data).astype("datetime64[ms]"),
     )
 
     # categorical to other
@@ -3410,7 +3398,7 @@ def test_series_astype_null_cases():
     assert_eq(
         gd.Series(data, dtype="datetime64[ms]"),
         gd.Series(data, dtype="category").astype(
-            "datetime64[ms]", format="%Y-%m-%d"
+            "datetime64[ms]"
         ),
     )
 
@@ -3426,7 +3414,7 @@ def test_series_astype_null_cases():
             dtype="datetime64[ms]",
         ),
         gd.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype(
-            "datetime64[ms]", format="%Y-%m-%d"
+            "datetime64[ms]"
         ),
     )
 
@@ -3436,12 +3424,16 @@ def test_series_astype_null_cases():
     )
 
     # datetime to other
-    data = ["2001-01-01", "2001-02-01", None, "2001-03-01"]
-
+    data = [
+        "2001-01-01 00:00:00.000000",
+        "2001-02-01 00:00:00.000000",
+        None,
+        "2001-03-01 00:00:00.000000",
+    ]
     assert_eq(
         gd.from_pandas(pd.Series(data)),
         gd.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype(
-            "str", format="%Y-%m-%d"
+            "str"
         ),
     )
 
@@ -4023,10 +4010,16 @@ def test_isin_dataframe(data, values):
         with pytest.raises(TypeError):
             gdf.isin(values)
     else:
-        expected = pdf.isin(values)
-        if isinstance(values, (pd.DataFrame, pd.Series)):
-            values = gd.from_pandas(values)
-        got = gdf.isin(values)
+        try:
+            expected = pdf.isin(values)
+            if isinstance(values, (pd.DataFrame, pd.Series)):
+                values = gd.from_pandas(values)
+            got = gdf.isin(values)
+        except ValueError as e:
+            if str(e) == "Lengths must match.":
+                pytest.xfail(reason='xref https://github.com/pandas-dev/pandas/issues/34256')
+            # Any other ValueError is a genuine failure; do not swallow it.
+            raise
 
         assert_eq(got, expected)
 
@@ -4173,13 +4164,11 @@ def test_df_astype_string_to_other(as_dtype):
         # change None to "NaT" after this issue is fixed:
         # https://github.com/rapidsai/cudf/issues/5117
         data = ["2001-01-01", "2002-02-02", "2000-01-05", None]
-        kwargs = {"format": "%Y-%m-%d"}
     elif as_dtype == "int32":
         data = [1, 2, 3]
         kwargs = {}
     elif as_dtype == "category":
         data = ["1", "2", "3", None]
-        kwargs = {}
     elif "float" in as_dtype:
         data = [1.0, 2.0, 3.0, np.nan]
         kwargs = {}
@@ -4196,7 +4185,7 @@ def test_df_astype_string_to_other(as_dtype):
     expect["foo"] = expect_data
     expect["bar"] = expect_data
 
-    got = gdf.astype(as_dtype, **kwargs)
+    got = gdf.astype(as_dtype)
     assert_eq(expect, got)
 
 
@@ -4212,7 +4201,12 @@ def test_df_astype_string_to_other(as_dtype):
     ],
 )
 def test_df_astype_datetime_to_other(as_dtype):
-    data = ["1991-11-20", "2004-12-04", "2016-09-13", None]
+    data = [
+        "1991-11-20 00:00:00.000000",
+        "2004-12-04 00:00:00.000000",
+        "2016-09-13 00:00:00.000000",
+        None,
+    ]
 
     gdf = DataFrame()
     expect = DataFrame()
@@ -4237,7 +4226,7 @@ def test_df_astype_datetime_to_other(as_dtype):
         expect["foo"] = Series(data, dtype=as_dtype)
         expect["bar"] = Series(data, dtype=as_dtype)
 
-    got = gdf.astype(as_dtype, format="%Y-%m-%d")
+    got = gdf.astype(as_dtype)
 
     assert_eq(expect, got)
 
@@ -4258,7 +4247,6 @@ def test_df_astype_datetime_to_other(as_dtype):
 def test_df_astype_categorical_to_other(as_dtype):
     if "datetime64" in as_dtype:
         data = ["2001-01-01", "2002-02-02", "2000-01-05", "2001-01-01"]
-        kwargs = {"format": "%Y-%m-%d"}
     else:
         data = [1, 2, 3, 1]
         kwargs = {}
@@ -4267,7 +4255,7 @@ def test_df_astype_categorical_to_other(as_dtype):
     pdf["foo"] = psr
     pdf["bar"] = psr
     gdf = DataFrame.from_pandas(pdf)
-    assert_eq(pdf.astype(as_dtype), gdf.astype(as_dtype, **kwargs))
+    assert_eq(pdf.astype(as_dtype), gdf.astype(as_dtype))
 
 
 @pytest.mark.parametrize("ordered", [True, False])
@@ -4279,8 +4267,8 @@ def test_df_astype_to_categorical_ordered(ordered):
     gdf = DataFrame.from_pandas(pdf)
 
     assert_eq(
-        gdf.astype("int32", ordered=ordered),
-        gdf.astype("int32", ordered=ordered),
+        pdf.astype("int32"),
+        gdf.astype("int32"),
     )
 
 

@@ -362,7 +362,7 @@ def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype):
 @pytest.mark.parametrize("data", [numerical_data()])
 @pytest.mark.parametrize("nulls", ["some", "all"])
 def test_to_from_pandas_nulls(data, nulls):
-    pd_data = pd.Series(data.copy())
+    pd_data = pd.Series(data.copy().astype('datetime64[ns]'))
     if nulls == "some":
         # Fill half the values with NaT
         pd_data[list(range(0, len(pd_data), 2))] = np.datetime64("nat", "ns")
@@ -419,10 +419,7 @@ def test_datetime_unique(data, nulls):
     expected = psr.unique()
     got = gsr.unique()
 
-    # convert to int64 for equivalence testing
-    np.testing.assert_array_almost_equal(
-        got.to_pandas().astype(int), expected.astype(int)
-    )
+    assert_eq(pd.Series(expected), got.to_pandas())
 
 
 @pytest.mark.parametrize(

@@ -37,7 +37,7 @@ def pdf(request):
         nrows=nrows, ncols=ncols, data_gen_f=lambda r, c: r, r_idx_type="i"
     )
     # Delete the name of the column index, and rename the row index
-    del test_pdf.columns.name
+    test_pdf.columns.name = None
     test_pdf.index.name = "index"
 
     # Cast all the column dtypes to objects, rename them, and then cast to

@@ -870,6 +870,16 @@ def test_series_setitem_datetime():
     psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]")
     gsr = cudf.from_pandas(psr)
 
+    psr[0] = np.datetime64("2005")
+    gsr[0] = np.datetime64("2005")
+
+    assert_eq(psr, gsr)
+
+@pytest.mark.xfail(reason='Pandas will coerce to object datatype here')
+def test_series_setitem_datetime_coerced():
+    psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]")
+    gsr = cudf.from_pandas(psr)
+
     psr[0] = "2005"
     gsr[0] = "2005"