From 5349052667163ea2baa8897b050950c9215bf434 Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Sat, 17 Nov 2018 20:17:23 -0800 Subject: [PATCH 01/12] TST: Add test case for GH14080 for overflow exception --- .../tests/scalar/timestamp/test_arithmetic.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 0f8ddc53734f6..6120d024cfd69 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -7,11 +7,12 @@ import pandas.util.testing as tm from pandas.compat import long from pandas.tseries import offsets +from pandas.tseries.frequencies import to_offset from pandas import Timestamp, Timedelta class TestTimestampArithmetic(object): - def test_overflow_offset(self): + def test_overflow_offset1(self): # xref https://github.com/statsmodels/statsmodels/issues/3374 # ends up multiplying really large numbers which overflow @@ -27,6 +28,30 @@ def test_overflow_offset(self): with pytest.raises(OverflowError): stamp - offset + def test_overflow_offset2(self): + # xref https://github.com/pandas-dev/pandas/issues/14080 + + stamp = Timestamp("2000/1/1") + offset_overflow = to_offset("D")*100**25 + offset_no_overflow = to_offset("D")*100 + + with pytest.raises(OverflowError): + stamp + offset_overflow + + with pytest.raises(OverflowError): + offset_overflow + stamp + + with pytest.raises(OverflowError): + stamp - offset_overflow + + expected = Timestamp("2000/04/10") + assert stamp + offset_no_overflow == expected + + assert offset_no_overflow + stamp == expected + + expected = Timestamp("1999/09/23") + assert stamp - offset_no_overflow == expected + def test_delta_preserve_nanos(self): val = Timestamp(long(1337299200000000123)) result = val + timedelta(1) From a7df6eade76f0f190332696ae1d9871fd17389aa Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Sat, 17 Nov 2018 20:39:37 -0800 Subject: [PATCH 02/12] TST: Add test case for GH14080 for overflow exception --- pandas/tests/scalar/timestamp/test_arithmetic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 6120d024cfd69..e8f4d93613f76 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -30,11 +30,13 @@ def test_overflow_offset1(self): def test_overflow_offset2(self): # xref https://github.com/pandas-dev/pandas/issues/14080 + # used to crash, so check for proper overflow exception stamp = Timestamp("2000/1/1") offset_overflow = to_offset("D")*100**25 offset_no_overflow = to_offset("D")*100 + # overflow expected with pytest.raises(OverflowError): stamp + offset_overflow @@ -44,6 +46,7 @@ def test_overflow_offset2(self): with pytest.raises(OverflowError): stamp - offset_overflow + # no overflow expected expected = Timestamp("2000/04/10") assert stamp + offset_no_overflow == expected From f7be8f3828f9eddd204af577216e388d7cf5da12 Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Sat, 17 Nov 2018 20:53:20 -0800 Subject: [PATCH 03/12] TST: Add test case for GH14080 for overflow exception --- pandas/tests/scalar/timestamp/test_arithmetic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index e8f4d93613f76..a79b39b641064 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -33,8 +33,8 @@ def test_overflow_offset2(self): # used to crash, so check for proper overflow exception stamp = Timestamp("2000/1/1") - offset_overflow = to_offset("D")*100**25 - offset_no_overflow = to_offset("D")*100 + offset_overflow = to_offset("D") * 100 ** 25 + offset_no_overflow = to_offset("D") * 100 # overflow expected with pytest.raises(OverflowError): From c727003fd4f9619dffe3c555de8cce3158497919 Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Sun, 18 Nov 2018 08:45:33 -0800 Subject: [PATCH 04/12] TST: For GH14080, break up tests, test message --- .../tests/scalar/timestamp/test_arithmetic.py | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index a79b39b641064..207bd103105ea 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -12,49 +12,54 @@ class TestTimestampArithmetic(object): - def test_overflow_offset1(self): + def test_overflow_offset(self): + # no overflow expected + + stamp = Timestamp("2000/1/1") + offset_no_overflow = to_offset("D") * 100 + + expected = Timestamp("2000/04/10") + assert stamp + offset_no_overflow == expected + + assert offset_no_overflow + stamp == expected + + expected = Timestamp("1999/09/23") + assert stamp - offset_no_overflow == expected + + def test_overflow_offset_raises(self): # xref https://github.com/statsmodels/statsmodels/issues/3374 # ends up multiplying really large numbers which overflow stamp = Timestamp('2017-01-13 00:00:00', freq='D') - offset = 20169940 * offsets.Day(1) + offset_overflow = 20169940 * offsets.Day(1) + msg = ("the add operation between " + r"\<-?\d+ \* Days\> and \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} " + "will overflow") - with pytest.raises(OverflowError): - stamp + offset + with pytest.raises(OverflowError, match=msg): + stamp + offset_overflow - with pytest.raises(OverflowError): - offset + stamp + with pytest.raises(OverflowError, match=msg): + offset_overflow + stamp - with pytest.raises(OverflowError): - stamp - offset + with pytest.raises(OverflowError, match=msg): + stamp - offset_overflow - def test_overflow_offset2(self): # xref https://github.com/pandas-dev/pandas/issues/14080 # used to crash, so check for proper overflow exception stamp = Timestamp("2000/1/1") offset_overflow = to_offset("D") * 100 ** 25 - offset_no_overflow = to_offset("D") * 100 - # overflow expected - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): stamp + offset_overflow - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): offset_overflow + stamp - with pytest.raises(OverflowError): + with pytest.raises(OverflowError, match=msg): stamp - offset_overflow - # no overflow expected - expected = Timestamp("2000/04/10") - assert stamp + offset_no_overflow == expected - - assert offset_no_overflow + stamp == expected - - expected = Timestamp("1999/09/23") - assert stamp - offset_no_overflow == expected - def test_delta_preserve_nanos(self): val = Timestamp(long(1337299200000000123)) result = val + timedelta(1) From eecede19e4ac4afc97802f2496e7fe98343b8039 Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Sun, 18 Nov 2018 22:33:28 -0800 Subject: [PATCH 05/12] TST: For GH4861, Period and datetime in multiindex --- pandas/tests/indexes/period/test_period.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index ddb3fe686534a..4a18ce425be66 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -570,3 +572,20 @@ def test_maybe_convert_timedelta(): offset = offsets.BusinessDay() with pytest.raises(ValueError, match='freq'): pi._maybe_convert_timedelta(offset) + + +def test_multiindex_period_datetime(): + # GH4861, using datetime in period of multiindex raises exception + + idx1 = Index(['a', 'a', 'a', 'b', 'b']) + idx2 = period_range('2012-01', periods=len(idx1), freq='M') + s = Series(np.random.randn(len(idx1)), [idx1, idx2]) + + # try Period as index + expected = s.iloc[0] + result = s.loc['a', Period('2012-01')] + assert result == expected + + # try datetime as index + result = s.loc['a', datetime(2012, 1, 1)] + assert result == expected From ae90f93757898814ded05f78549123e2e94ce834 Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Mon, 19 Nov 2018 07:19:44 -0800 Subject: [PATCH 06/12] TST: GH4861 move changes to different test file --- pandas/tests/indexes/period/test_period.py | 19 ------------------- pandas/tests/indexing/test_multiindex.py | 22 +++++++++++++++++++++- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 4a18ce425be66..ddb3fe686534a 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -1,5 +1,3 @@ -from datetime import datetime - import numpy as np import pytest @@ -572,20 +570,3 @@ def test_maybe_convert_timedelta(): offset = offsets.BusinessDay() with pytest.raises(ValueError, match='freq'): pi._maybe_convert_timedelta(offset) - - -def test_multiindex_period_datetime(): - # GH4861, using datetime in period of multiindex raises exception - - idx1 = Index(['a', 'a', 'a', 'b', 'b']) - idx2 = period_range('2012-01', periods=len(idx1), freq='M') - s = Series(np.random.randn(len(idx1)), [idx1, idx2]) - - # try Period as index - expected = s.iloc[0] - result = s.loc['a', Period('2012-01')] - assert result == expected - - # try datetime as index - result = s.loc['a', datetime(2012, 1, 1)] - assert result == expected diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index ea17844a75033..17fb9952c8680 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -1,5 +1,7 @@ from warnings import catch_warnings +from datetime import datetime + import numpy as np import pytest @@ -7,7 +9,8 @@ import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Panel, Series, Timestamp, date_range) + DataFrame, date_range, Index, MultiIndex, Panel, Period, period_range, + Series, Timestamp) from pandas.tests.indexing.common import _mklbl from pandas.util import testing as tm @@ -1340,3 +1343,20 @@ def test_panel_setitem_with_multiindex(self): p5.iloc[0, :, 0] = [1, 2] expected = Panel(arr, **axes) tm.assert_panel_equal(p5, expected) + + +def test_multiindex_period_datetime(): + # GH4861, using datetime in period of multiindex raises exception + + idx1 = Index(['a', 'a', 'a', 'b', 'b']) + idx2 = period_range('2012-01', periods=len(idx1), freq='M') + s = Series(np.random.randn(len(idx1)), [idx1, idx2]) + + # try Period as index + expected = s.iloc[0] + result = s.loc['a', Period('2012-01')] + assert result == expected + + # try datetime as index + result = s.loc['a', datetime(2012, 1, 1)] + assert result == expected From 4f16c8b9d9af83880144fc5d3046e20519ae554c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 19 Nov 2018 20:58:40 -0500 Subject: [PATCH 07/12] fix sorting --- pandas/tests/indexing/test_multiindex.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 17fb9952c8680..f4caf17b60d65 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -1,6 +1,5 @@ -from warnings import catch_warnings - from datetime import datetime +from warnings import catch_warnings import numpy as np import pytest @@ -9,8 +8,8 @@ import pandas as pd from pandas import ( - DataFrame, date_range, Index, MultiIndex, Panel, Period, period_range, - Series, Timestamp) + DataFrame, Index, MultiIndex, Panel, Period, Series, Timestamp, date_range, + period_range) from pandas.tests.indexing.common import _mklbl from pandas.util import testing as tm From 3e96734483868d3d92da0cd9f43540841280885b Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Sat, 24 Nov 2018 14:46:31 -0800 Subject: [PATCH 08/12] PERF: For GH23814, return early in Categorical.__init__ --- asv_bench/benchmarks/categoricals.py | 4 ++++ pandas/core/arrays/categorical.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 8a0fbc48755b5..6b86f5ab6b3f1 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -46,6 +46,7 @@ def setup(self): self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) self.values_all_int8 = np.ones(N, 'int8') + self.categorical = pd.Categorical(self.values, self.categories) def time_regular(self): pd.Categorical(self.values, self.categories) @@ -68,6 +69,9 @@ def time_all_nan(self): def time_from_codes_all_int8(self): pd.Categorical.from_codes(self.values_all_int8, self.categories) + def time_existing_categorical(self): + pd.Categorical(self.categorical) + class ValueCounts(object): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6dc3a960dc817..011f97d067a72 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -314,6 +314,16 @@ class Categorical(ExtensionArray, PandasObject): def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False): + # GH23814, for perf, if no optional params used and values already an + # instance of Categorical, simply return the same dtype/codes + if categories is None and ordered is None and dtype is None: + if isinstance(values, (ABCSeries, ABCIndexClass)): + values = values._values + if isinstance(values, type(self)): + self._dtype = values.dtype + self._codes = values.codes.copy() + return + # Ways of specifying the dtype (prioritized ordered) # 1. dtype is a CategoricalDtype # a.) with known categories, use dtype.categories From f6d10b8e8a4cbb93f8c3a499a64735c12898f76d Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Sat, 24 Nov 2018 14:54:05 -0800 Subject: [PATCH 09/12] PERF: For GH23814, add whatsnew entry --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index ba29a17057d02..a4b2c097d8d9c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1150,7 +1150,7 @@ Performance Improvements - Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`) - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) - +- Improved performance of :meth:`Categorical.__init__` (:issue:`23814`) .. _whatsnew_0240.docs: From 2173c896059b13034fb4be30db1c0845c30ea07b Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Wed, 28 Nov 2018 21:20:17 -0800 Subject: [PATCH 10/12] Consolidating code to fit existing pattern better --- pandas/core/arrays/categorical.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 011f97d067a72..c084531c2f5d7 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -314,16 +314,6 @@ class Categorical(ExtensionArray, PandasObject): def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False): - # GH23814, for perf, if no optional params used and values already an - # instance of Categorical, simply return the same dtype/codes - if categories is None and ordered is None and dtype is None: - if isinstance(values, (ABCSeries, ABCIndexClass)): - values = values._values - if isinstance(values, type(self)): - self._dtype = values.dtype - self._codes = values.codes.copy() - return - # Ways of specifying the dtype (prioritized ordered) # 1. dtype is a CategoricalDtype # a.) with known categories, use dtype.categories @@ -357,6 +347,16 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # the "ordered" and "categories" arguments dtype = values.dtype._from_categorical_dtype(values.dtype, categories, ordered) + + # GH23814, for perf, if no optional params used and values already + # an instance of Categorical, set values to codes like fastpath + if (isinstance(values, (ABCSeries, ABCIndexClass)) and + isinstance(values._values, type(self))): + values = values._values.codes.copy() + if categories is None: + categories = dtype.categories + fastpath = True + else: # If dtype=None and values is not categorical, create a new dtype dtype = CategoricalDtype(categories, ordered) From 325be9205576f6aac0d31381703baa8e450c927c Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Wed, 28 Nov 2018 21:45:57 -0800 Subject: [PATCH 11/12] Fix comment --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c084531c2f5d7..22ed0b9c5454e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -348,8 +348,8 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, dtype = values.dtype._from_categorical_dtype(values.dtype, categories, ordered) - # GH23814, for perf, if no optional params used and values already - # an instance of Categorical, set values to codes like fastpath + # GH23814, for perf, if values._values already an instance of + # Categorical, set values to codes, and run fastpath if (isinstance(values, (ABCSeries, ABCIndexClass)) and isinstance(values._values, type(self))): values = values._values.codes.copy() From 9e270e9a9d27eec1455e462e36dcbe43c615d5fc Mon Sep 17 00:00:00 2001 From: Erik Oveson Date: Thu, 29 Nov 2018 07:44:13 -0800 Subject: [PATCH 12/12] Add asv test that exercises code, update docs --- asv_bench/benchmarks/categoricals.py | 4 ++++ doc/source/whatsnew/v0.24.0.rst | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 6b86f5ab6b3f1..7318b40efc8fb 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -47,6 +47,7 @@ def setup(self): self.values_all_nan = [np.nan] * len(self.values) self.values_all_int8 = np.ones(N, 'int8') self.categorical = pd.Categorical(self.values, self.categories) + self.series = pd.Series(self.categorical) def time_regular(self): pd.Categorical(self.values, self.categories) @@ -72,6 +73,9 @@ def time_from_codes_all_int8(self): def time_existing_categorical(self): pd.Categorical(self.categorical) + def time_existing_series(self): + pd.Categorical(self.series) + class ValueCounts(object): diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a4b2c097d8d9c..f8e85d7d315d4 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1150,7 +1150,7 @@ Performance Improvements - Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`) - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) -- Improved performance of :meth:`Categorical.__init__` (:issue:`23814`) +- Improved performance of :class:`Categorical` constructor for `Series` objects (:issue:`23814`) .. _whatsnew_0240.docs: