From 2316539c48c45a7bfc28d86dfde207eda2be10f6 Mon Sep 17 00:00:00 2001 From: makbigc Date: Mon, 29 Apr 2019 16:40:07 +0800 Subject: [PATCH 1/6] Add is_coerce arg to array_to_datetime_object --- pandas/_libs/tslib.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 50e3fb1c38cc7..dc377819df1bf 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -682,7 +682,8 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', return ignore_errors_out_of_bounds_fallback(values), tz_out except TypeError: - return array_to_datetime_object(values, is_raise, dayfirst, yearfirst) + return array_to_datetime_object(values, is_raise, is_coerce, + dayfirst, yearfirst) if seen_datetime and seen_integer: # we have mixed datetimes & integers @@ -697,7 +698,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', elif is_raise: raise ValueError("mixed datetimes and integers in passed array") else: - return array_to_datetime_object(values, is_raise, + return array_to_datetime_object(values, is_raise, is_coerce, dayfirst, yearfirst) if seen_datetime_offset and not utc_convert: @@ -709,7 +710,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # (with individual dateutil.tzoffsets) are returned is_same_offsets = len(out_tzoffset_vals) == 1 if not is_same_offsets: - return array_to_datetime_object(values, is_raise, + return array_to_datetime_object(values, is_raise, is_coerce, dayfirst, yearfirst) else: tz_offset = out_tzoffset_vals.pop() @@ -757,7 +758,8 @@ cdef inline ignore_errors_out_of_bounds_fallback(ndarray[object] values): @cython.wraparound(False) @cython.boundscheck(False) -cdef array_to_datetime_object(ndarray[object] values, bint is_raise, +cdef array_to_datetime_object(ndarray[object] values, + bint is_raise, bint is_coerce, bint dayfirst=False, bint yearfirst=False): """ Fall back function for array_to_datetime @@ -806,6 +808,9 @@ 
cdef array_to_datetime_object(ndarray[object] values, bint is_raise, pydatetime_to_dt64(oresult[i], &dts) check_dts_bounds(&dts) except (ValueError, OverflowError): + if is_coerce: + oresult[i] = NaT + continue if is_raise: raise return values, None From 35addbf6ce330f185425601f8132dc26c3c4ba04 Mon Sep 17 00:00:00 2001 From: makbigc Date: Mon, 29 Apr 2019 16:40:25 +0800 Subject: [PATCH 2/6] Add one test --- pandas/tests/indexes/datetimes/test_tools.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index d6627f0fb8b72..e5d5b28e2f200 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -762,6 +762,18 @@ def test_iso_8601_strings_with_different_offsets(self): NaT], tz='UTC') tm.assert_index_equal(result, expected) + # GH 26122 + ts_strings = ['March 1, 2018 12:00:00+0400', + 'March 1, 2018 12:00:00+0500', + '20100240'] + result = to_datetime(ts_strings, errors='coerce') + expected = Index([datetime(2018, 3, 1, 12, 0, + tzinfo=tzoffset(None, 14400)), + datetime(2018, 3, 1, 12, 0, + tzinfo=tzoffset(None, 18000)), + NaT]) + tm.assert_index_equal(result, expected) + def test_iso8601_strings_mixed_offsets_with_naive(self): # GH 24992 result = pd.to_datetime([ From 8264fb754e6b3bcee894d3b0ce684e04b3ecc9a4 Mon Sep 17 00:00:00 2001 From: makbigc Date: Wed, 1 May 2019 23:18:02 +0800 Subject: [PATCH 3/6] Add whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3e559e771f126..d644bd381e0f5 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -273,7 +273,7 @@ Datetimelike - Bug in :class:`DataFrame` and :class:`Series` where timezone aware data with ``dtype='datetime64[ns]`` was not cast to naive (:issue:`25843`) - Improved :class:`Timestamp` type 
checking in various datetime functions to prevent exceptions when using a subclassed ``datetime`` (:issue:`25851`) - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`) -- +- Bug in :meth:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`) Timedelta ^^^^^^^^^ From b1313fd2868ce4ec0417aa7ccd4d0501c50bb006 Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 2 May 2019 12:17:36 +0800 Subject: [PATCH 4/6] Amend the entry in whatsnew note --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d644bd381e0f5..a4c5f1bdb439d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -273,7 +273,7 @@ Datetimelike - Bug in :class:`DataFrame` and :class:`Series` where timezone aware data with ``dtype='datetime64[ns]`` was not cast to naive (:issue:`25843`) - Improved :class:`Timestamp` type checking in various datetime functions to prevent exceptions when using a subclassed ``datetime`` (:issue:`25851`) - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`) -- Bug in :meth:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`) +- Bug in :func:`to_datetime` where invalid values were not replaced with ``NaT`` when ``errors='coerce'`` was passed (:issue:`26122`) Timedelta ^^^^^^^^^ From 8400007245c7f5ce28d43b04e791e95d2888a503 Mon Sep 17 00:00:00 2001 From: makbigc Date: Thu, 2 May 2019 12:18:21 +0800 Subject: [PATCH 5/6] Change the args is_raise and is_coerce to errors --- pandas/_libs/tslib.pyx | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 
deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index dc377819df1bf..83e803aa9bf4a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -682,7 +682,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', return ignore_errors_out_of_bounds_fallback(values), tz_out except TypeError: - return array_to_datetime_object(values, is_raise, is_coerce, + return array_to_datetime_object(values, errors, dayfirst, yearfirst) if seen_datetime and seen_integer: @@ -698,7 +698,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', elif is_raise: raise ValueError("mixed datetimes and integers in passed array") else: - return array_to_datetime_object(values, is_raise, is_coerce, + return array_to_datetime_object(values, errors, dayfirst, yearfirst) if seen_datetime_offset and not utc_convert: @@ -710,7 +710,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # (with individual dateutil.tzoffsets) are returned is_same_offsets = len(out_tzoffset_vals) == 1 if not is_same_offsets: - return array_to_datetime_object(values, is_raise, is_coerce, + return array_to_datetime_object(values, errors, dayfirst, yearfirst) else: tz_offset = out_tzoffset_vals.pop() @@ -758,8 +758,7 @@ cdef inline ignore_errors_out_of_bounds_fallback(ndarray[object] values): @cython.wraparound(False) @cython.boundscheck(False) -cdef array_to_datetime_object(ndarray[object] values, - bint is_raise, bint is_coerce, +cdef array_to_datetime_object(ndarray[object] values, str errors, bint dayfirst=False, bint yearfirst=False): """ Fall back function for array_to_datetime @@ -771,7 +770,7 @@ cdef array_to_datetime_object(ndarray[object] values, ---------- values : ndarray of object date-like objects to convert - is_raise : bool + errors : str, default 'raise' error behavior when parsing dayfirst : bool, default False dayfirst parsing behavior when encountering datetime strings @@ -785,9 +784,14 @@ cdef 
array_to_datetime_object(ndarray[object] values, cdef: Py_ssize_t i, n = len(values) object val, + bint is_ignore = errors == 'ignore' + bint is_coerce = errors == 'coerce' + bint is_raise = errors == 'raise' ndarray[object] oresult npy_datetimestruct dts + assert is_raise or is_ignore or is_coerce + oresult = np.empty(n, dtype=object) # We return an object array and only attempt to parse: From 03be04d52939f3bdcd139324d5a81d7bff3e0b52 Mon Sep 17 00:00:00 2001 From: makbigc Date: Mon, 6 May 2019 20:27:50 +0800 Subject: [PATCH 6/6] Rename the new test --- pandas/tests/indexes/datetimes/test_tools.py | 25 ++++++++++---------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index e5d5b28e2f200..fea2f1e9f3ef2 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -712,6 +712,19 @@ def test_week_without_day_and_calendar_year(self, date, format): with pytest.raises(ValueError, match=msg): pd.to_datetime(date, format=format) + def test_to_datetime_coerce(self): + # GH 26122 + ts_strings = ['March 1, 2018 12:00:00+0400', + 'March 1, 2018 12:00:00+0500', + '20100240'] + result = to_datetime(ts_strings, errors='coerce') + expected = Index([datetime(2018, 3, 1, 12, 0, + tzinfo=tzoffset(None, 14400)), + datetime(2018, 3, 1, 12, 0, + tzinfo=tzoffset(None, 18000)), + NaT]) + tm.assert_index_equal(result, expected) + def test_iso_8601_strings_with_same_offset(self): # GH 17697, 11736 ts_str = "2015-11-18 15:30:00+05:30" @@ -762,18 +775,6 @@ def test_iso_8601_strings_with_different_offsets(self): NaT], tz='UTC') tm.assert_index_equal(result, expected) - # GH 26122 - ts_strings = ['March 1, 2018 12:00:00+0400', - 'March 1, 2018 12:00:00+0500', - '20100240'] - result = to_datetime(ts_strings, errors='coerce') - expected = Index([datetime(2018, 3, 1, 12, 0, - tzinfo=tzoffset(None, 14400)), - datetime(2018, 3, 1, 12, 0, - 
tzinfo=tzoffset(None, 18000)), - NaT]) - tm.assert_index_equal(result, expected) - def test_iso8601_strings_mixed_offsets_with_naive(self): # GH 24992 result = pd.to_datetime([