diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a36420556ae24..d87fa5203bd52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,53 +23,53 @@ jobs: - name: Looking for unwanted patterns run: ci/code_checks.sh patterns - if: true + if: always() - name: Setup environment and build pandas run: ci/setup_env.sh - if: true + if: always() - name: Linting run: | source activate pandas-dev ci/code_checks.sh lint - if: true + if: always() - name: Dependencies consistency run: | source activate pandas-dev ci/code_checks.sh dependencies - if: true + if: always() - name: Checks on imported code run: | source activate pandas-dev ci/code_checks.sh code - if: true + if: always() - name: Running doctests run: | source activate pandas-dev ci/code_checks.sh doctests - if: true + if: always() - name: Docstring validation run: | source activate pandas-dev ci/code_checks.sh docstrings - if: true + if: always() - name: Typing validation run: | source activate pandas-dev ci/code_checks.sh typing - if: true + if: always() - name: Testing docstring validation script run: | source activate pandas-dev pytest --capture=no --strict scripts - if: true + if: always() - name: Running benchmarks run: | @@ -87,7 +87,7 @@ jobs: else echo "Benchmarks did not run, no changes detected" fi - if: true + if: always() - name: Publish benchmarks artifact uses: actions/upload-artifact@master @@ -95,3 +95,65 @@ jobs: name: Benchmarks log path: asv_bench/benchmarks.log if: failure() + + web_and_docs: + name: Web and docs + runs-on: ubuntu-latest + steps: + + - name: Setting conda path + run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}" + + - name: Checkout + uses: actions/checkout@v1 + + - name: Setup environment and build pandas + run: ci/setup_env.sh + + - name: Build website + run: | + source activate pandas-dev + python web/pandas_web.py web/pandas --target-path=web/build + + - name: Build documentation + run: | + source activate pandas-dev + doc/make.py --warnings-are-errors | tee sphinx.log ; exit ${PIPESTATUS[0]} + + # This can be removed when the ipython directive fails when there are errors, + # including the `tee sphinx.log` in the previous step (https://github.com/ipython/ipython/issues/11547) + - name: Check ipython directive errors + run: "!
grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" + + - name: Merge website and docs + run: | + mkdir -p pandas_web/docs + cp -r web/build/* pandas_web/ + cp -r doc/build/html/* pandas_web/docs/ + if: github.event_name == 'push' + + - name: Install Rclone + run: sudo apt install rclone -y + if: github.event_name == 'push' + + - name: Set up Rclone + run: | + RCLONE_CONFIG_PATH=$HOME/.config/rclone/rclone.conf + mkdir -p `dirname $RCLONE_CONFIG_PATH` + echo "[ovh_cloud_pandas_web]" > $RCLONE_CONFIG_PATH + echo "type = swift" >> $RCLONE_CONFIG_PATH + echo "env_auth = false" >> $RCLONE_CONFIG_PATH + echo "auth_version = 3" >> $RCLONE_CONFIG_PATH + echo "auth = https://auth.cloud.ovh.net/v3/" >> $RCLONE_CONFIG_PATH + echo "endpoint_type = public" >> $RCLONE_CONFIG_PATH + echo "tenant_domain = default" >> $RCLONE_CONFIG_PATH + echo "tenant = 2977553886518025" >> $RCLONE_CONFIG_PATH + echo "domain = default" >> $RCLONE_CONFIG_PATH + echo "user = w4KGs3pmDxpd" >> $RCLONE_CONFIG_PATH + echo "key = ${{ secrets.ovh_object_store_key }}" >> $RCLONE_CONFIG_PATH + echo "region = BHS" >> $RCLONE_CONFIG_PATH + if: github.event_name == 'push' + + - name: Sync web + run: rclone sync pandas_web ovh_cloud_pandas_web:dev + if: github.event_name == 'push' diff --git a/.travis.yml b/.travis.yml index 0c7740295b637..a11cd469e9b9c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: python -python: 3.5 +python: 3.7 # To turn off cached cython files and compiler cache # set NOCACHE-true @@ -48,17 +48,12 @@ matrix: - mysql - postgresql - # In allow_failures - env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" services: - mysql - postgresql - allow_failures: - - env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" - before_install: - echo "before_install" # set non-blocking IO on travis diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index cb0b17e3553a4..55e8e839f4fae 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -19,18 +19,24 @@ jobs: ENV_FILE: ci/deps/azure-36-minimum_versions.yaml CONDA_PY: "36" PATTERN: "not slow and not network" + py36_locale_slow_old_np: ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" PATTERN: "slow" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + # pandas does not use the language (zh_CN), but should support different encodings (utf8) + # we should test with encodings other than utf8, but Ubuntu doesn't seem to support any + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans" py36_locale: ENV_FILE: ci/deps/azure-36-locale.yaml CONDA_PY: "36" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "it_IT.UTF-8" + LANG: "it_IT.utf8" + LC_ALL: "it_IT.utf8" + EXTRA_APT: "language-pack-it" py36_32bit: ENV_FILE: ci/deps/azure-36-32bit.yaml @@ -42,7 +48,9 @@ jobs: ENV_FILE: ci/deps/azure-37-locale.yaml CONDA_PY: "37" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" + EXTRA_APT: "language-pack-zh-hans" py37_np_dev: ENV_FILE: ci/deps/azure-37-numpydev.yaml @@ -54,10 +62,16 @@ jobs: steps: - script: | - if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi - echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - echo "Creating Environment" - ci/setup_env.sh + if [ "$(uname)" == "Linux" ]; then + sudo apt-get update + sudo apt-get install -y libc6-dev-i386 $EXTRA_APT + fi + displayName: 'Install extra packages' + + - script:
echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' + displayName: 'Set conda path' + + - script: ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 03529bd6569c6..187a5db99802f 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -34,7 +34,7 @@ jobs: - bash: | source activate pandas-dev conda list - python setup.py build_ext -q -i + python setup.py build_ext -q -i -j 4 python -m pip install --no-build-isolation -e . displayName: 'Build' diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 2bb2b00319382..48ac50c001715 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -13,7 +13,7 @@ dependencies: - pytest-azurepipelines # pandas dependencies - - beautifulsoup4==4.6.0 + - beautifulsoup4=4.6.0 - bottleneck=1.2.* - lxml - matplotlib=2.2.2 diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 0b68164e5767e..8020680d617d7 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -5,17 +5,6 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE" - export LANG="$LOCALE_OVERRIDE" - PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'` - if [[ "$LOCALE_OVERRIDE" != "$PANDAS_LOCALE" ]]; then - echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE" - # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed - # exit 1 - fi -fi - if [[ "not network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 2b488295b5cc2..db28eaea8956e 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,15 +1,15 @@ #!/bin/bash -e # edit the locale file if needed -if [ -n "$LOCALE_OVERRIDE" ]; then +if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" rm -f pandas/__init__.pyc - SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n" sed -i "$SEDC" pandas/__init__.py + echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py echo - sudo locale-gen "$LOCALE_OVERRIDE" fi MINICONDA_DIR="$HOME/miniconda3" diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 66e500131b316..3055a22129b91 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -697,8 +697,9 @@ Plotting See the :ref:`Plotting ` docs. +We use the standard convention for referencing the matplotlib API: + .. ipython:: python - :suppress: import matplotlib.pyplot as plt plt.close('all') diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 77568f3bcb244..a45d7a4fa1547 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -15,6 +15,10 @@ Nullable integer data type IntegerArray is currently experimental. Its API or implementation may change without warning. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent missing data. 
Because ``NaN`` is a float, this forces an array of integers with @@ -23,6 +27,9 @@ much. But if your integer column is, say, an identifier, casting to float can be problematic. Some integers cannot even be represented as floating point numbers. +Construction +------------ + Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension type ` implemented within pandas. @@ -39,6 +46,12 @@ NumPy's ``'int64'`` dtype: pd.array([1, 2, np.nan], dtype="Int64") +All NA-like values are replaced with :attr:`pandas.NA`. + +.. ipython:: python + + pd.array([1, 2, np.nan, None, pd.NA], dtype="Int64") + This array can be stored in a :class:`DataFrame` or :class:`Series` like any NumPy array. @@ -78,6 +91,9 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +Operations +---------- + Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another dtype if needed. @@ -123,3 +139,15 @@ Reduction and groupby operations such as 'sum' work as well. df.sum() df.groupby('B').A.sum() + +Scalar NA Value +--------------- + +:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar +missing value. Slicing a single element that's missing will return +:attr:`pandas.NA`. + +.. ipython:: python + + a = pd.array([1, None], dtype="Int64") + a[1] diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 96ea682dd3caf..0da6d7b20a5c0 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -56,7 +56,7 @@ Dedicated string data type ^^^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`StringDtype`, an extension type dedicated to string data. -Previously, strings were typically stored in object-dtype NumPy arrays. +Previously, strings were typically stored in object-dtype NumPy arrays. (:issue:`29975`) .. warning:: @@ -221,8 +221,8 @@ Other enhancements - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) - :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` added (:issue:`11052`) - - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) +- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`) Build Changes ^^^^^^^^^^^^^ @@ -365,6 +365,64 @@ The following methods now also correctly output values for unobserved categories As a reminder, you can specify the ``dtype`` to disable all inference. +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` rather than +:attr:`numpy.nan` as its missing value marker (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + <IntegerArray> + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a[2] + nan + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a[2] + +See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` +and :attr:`numpy.nan`.
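As an aside, a quick sketch of the key semantic difference between the two markers (assuming the usual ``import numpy as np`` and ``import pandas as pd`` aliases; this illustration is not part of the change above):

.. code-block:: python

   >>> np.nan == np.nan      # NaN always compares unequal, even to itself
   False
   >>> pd.NA == pd.NA        # NA propagates through comparisons
   <NA>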
+ +:class:`arrays.IntegerArray` comparisons return :class:`arrays.BooleanArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Comparison operations on a :class:`arrays.IntegerArray` now return a +:class:`arrays.BooleanArray` rather than a NumPy array (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + <IntegerArray> + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a > 1 + array([False, True, False]) + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a > 1 + +Note that missing values now propagate, rather than always comparing unequal +like :attr:`numpy.nan`. See :ref:`missing_data.NA` for more. + By default :meth:`Categorical.min` now returns the minimum instead of np.nan ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -723,6 +781,7 @@ Datetimelike - Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) - Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`) - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`) +- Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an integer dtype (:issue:`28492`) - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) - Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`) - Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) @@ -736,6 +795,7 @@ Datetimelike - Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`) - Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`) - Bug in :meth:`DataFrame.append` would remove the timezone-awareness of new data (:issue:`30238`) +- Bug in :meth:`Series.cummin` and :meth:`Series.cummax` with timezone-aware dtype incorrectly dropping its timezone (:issue:`15553`) - Bug in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` where inplace addition and subtraction did not actually operate inplace (:issue:`24115`) Timedelta @@ -765,6 +825,7 @@ Numeric - Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`) - Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`) - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`) +- Bug in :class:`DataFrame` cumulative operations (e.g.
cumsum, cummax) incorrectly casting to object-dtype (:issue:`19296`) Conversion ^^^^^^^^^^ @@ -784,6 +845,7 @@ Interval - Bug in :meth:`IntervalIndex.get_indexer` where a :class:`Categorical` or :class:`CategoricalIndex` ``target`` would incorrectly raise a ``TypeError`` (:issue:`30063`) - Bug in ``pandas.core.dtypes.cast.infer_dtype_from_scalar`` where passing ``pandas_dtype=True`` did not infer :class:`IntervalDtype` (:issue:`30337`) +- Bug in :class:`IntervalDtype` where the ``kind`` attribute was incorrectly set as ``None`` instead of ``"O"`` (:issue:`30568`) Indexing ^^^^^^^^ @@ -832,6 +894,7 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`) +- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`) Plotting ^^^^^^^^ @@ -847,12 +910,13 @@ Plotting - :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`) - :meth:`DataFrame.plot` now allow a ``backend`` keyword argument to allow changing between backends in one session (:issue:`28619`). - Bug in color validation incorrectly raising for non-color styles (:issue:`29122`). +- Allow :meth:`DataFrame.plot.scatter` to plot ``objects`` and ``datetime`` type data (:issue:`18755`, :issue:`30391`); see the sketch after this list - Bug in :meth:`DataFrame.hist`, ``xrot=0`` does not work with ``by`` and subplots (:issue:`30288`).
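A brief sketch of the scatter enhancement noted above (the column names and data are illustrative only; matplotlib is assumed to be installed):

.. code-block:: python

   >>> import pandas as pd
   >>> df = pd.DataFrame({"when": pd.date_range("2020-01-01", periods=3),
   ...                    "value": [1.0, 2.0, 3.0]})
   >>> ax = df.plot.scatter(x="when", y="value")  # datetime x data now plots instead of raising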
Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`DataFrame.groupby.apply` only showing output from a single group when the function returns an :class:`Index` (:issue:`28652`) - Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) - Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty series or dataframe (:issue:`28427`) - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`) @@ -869,6 +933,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`) - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) +- Bug in :meth:`GroupBy.pct_change` and :meth:`SeriesGroupBy.pct_change` causing ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) Reshaping ^^^^^^^^^ @@ -911,13 +976,16 @@ Other - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) +- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`) - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) - Fixed :class:`IntegerArray` returning ``inf`` rather than ``NaN`` for operations dividing by 0 (:issue:`27398`) - Fixed ``pow`` operations for :class:`IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`) - Bug in :meth:`Series.count` raises if use_inf_as_na is enabled (:issue:`29478`) -- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:29069`) +- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`) +- Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`) +- Bug in :meth:`DataFrame.to_csv` where, when supplied a series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters (:issue:`29975`) .. _whatsnew_1000.contributors: diff --git a/environment.yml b/environment.yml index ab10d8b7e0b20..46fb5e7a19078 100644 --- a/environment.yml +++ b/environment.yml @@ -70,7 +70,7 @@ dependencies: - blosc - bottleneck>=1.2.1 - ipykernel - - ipython>=5.6.0 + - ipython>=5.6.0,<=7.10.1 # see gh-30527 - jinja2 # pandas.Styler - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 6844df495547a..0a3009f74492f 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -197,7 +197,7 @@ def __setattr__(self, key, val): else: raise OptionError("You can only set the value of existing options") - def __getattr__(self, key): + def __getattr__(self, key: str): prefix = object.__getattribute__(self, "prefix") if prefix: prefix += "."
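The ``hashing.pyx`` hunk below implements the tuple-hashing fix listed under ``Other`` above (:issue:`28969`). A minimal sketch of the user-facing effect; the ``uint64`` hash values are elided here because they are input-dependent:

.. code-block:: python

   >>> import pandas as pd
   >>> s = pd.Series([(1, 2), (3, 4)])
   >>> pd.util.hash_pandas_object(s)  # previously raised TypeError
   0    ...
   1    ...
   dtype: uint64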
diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index d735890f7d07e..5298d8c5ed34e 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -70,6 +70,12 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): # null, stringify and encode data = str(val).encode(encoding) + elif isinstance(val, tuple): + # GH#28969 we could have a tuple, but need to ensure that + # the tuple entries are themselves hashable before converting + # to str + hash(val) + data = str(val).encode(encoding) else: raise TypeError(f"{val} of type {type(val)} is not a valid type " "for hashing, must be string or null") diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 333c05f7c0dc5..d09413bfa5210 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -6,12 +6,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.algos import is_monotonic -ctypedef fused scalar_t: - float64_t - float32_t +ctypedef fused int_scalar_t: int64_t - int32_t + float64_t + +ctypedef fused uint_scalar_t: uint64_t + float64_t + +ctypedef fused scalar_t: + int_scalar_t + uint_scalar_t # ---------------------------------------------------------------------- # IntervalTree @@ -128,7 +133,12 @@ cdef class IntervalTree(IntervalMixin): result = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) elif result.data.n > old_len + 1: @@ -150,7 +160,12 @@ cdef class IntervalTree(IntervalMixin): missing = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) missing.append(i) @@ -194,7 +209,7 @@ cdef sort_values_and_indices(all_values, all_indices, subset): {{py: nodes = [] -for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: +for dtype in ['float64', 'int64', 'uint64']: for closed, cmp_left, cmp_right in [ ('left', '<=', '<'), ('right', '<', '<='), @@ -202,19 +217,26 @@ for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: ('neither', '<', '<')]: cmp_left_converse = '<' if cmp_left == '<=' else '<=' cmp_right_converse = '<' if cmp_right == '<=' else '<=' + if dtype.startswith('int'): + fused_prefix = 'int_' + elif dtype.startswith('uint'): + fused_prefix = 'uint_' + elif dtype.startswith('float'): + fused_prefix = '' nodes.append((dtype, dtype.title(), closed, closed.title(), cmp_left, cmp_right, cmp_left_converse, - cmp_right_converse)) + cmp_right_converse, + fused_prefix)) }} NODE_CLASSES = {} {{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, - cmp_left_converse, cmp_right_converse in nodes}} + cmp_left_converse, cmp_right_converse, fused_prefix in nodes}} cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree @@ -317,7 +339,7 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar_t point): + cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point): """Recursively query this node and its sub-nodes for intervals that 
overlap with the query point. """ diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0019fc4b36d20..8571761f77265 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,3 +1,4 @@ +from copy import copy from distutils.version import LooseVersion from cython import Py_ssize_t @@ -15,7 +16,7 @@ from numpy cimport (ndarray, cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.lib import maybe_convert_objects +from pandas._libs.lib import maybe_convert_objects, is_scalar cdef _check_result_array(object obj, Py_ssize_t cnt): @@ -492,14 +493,19 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if piece.index is chunk.index: - piece = piece.copy(deep='all') - else: + if piece.index is not chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int pass + if not is_scalar(piece): + # Need to copy data to avoid appending references + if hasattr(piece, "copy"): + piece = piece.copy(deep="all") + else: + piece = copy(piece) + results.append(piece) # If the data was modified inplace we need to diff --git a/pandas/_typing.py b/pandas/_typing.py index 69b08c581cff9..7b89486751f12 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -23,21 +23,29 @@ from pandas.core.indexes.base import Index # noqa: F401 from pandas.core.series import Series # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 +# array-like AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) ArrayLike = TypeVar("ArrayLike", "ExtensionArray", np.ndarray) + +# scalars + +PythonScalar = Union[str, int, float, bool] DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", "Period", "Timestamp", "Timedelta") +PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] +Scalar = Union[PythonScalar, PandasScalar] + +# other + Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] - FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") -Scalar = Union[str, int, float, bool] Axis = Union[str, int] Ordered = Optional[bool] -JSONSerializable = Union[Scalar, List, Dict] - +JSONSerializable = Union[PythonScalar, List, Dict] Axes = Collection # to maintain type information across generic functions and parametrization -_T = TypeVar("_T") +T = TypeVar("T") diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index fffe09a74571e..7158f251ad805 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -169,13 +169,6 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -COMPRESS_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() -COMPRESS_DEFAULTS["axis"] = None -COMPRESS_DEFAULTS["out"] = None -validate_compress = CompatValidator( - COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1 -) - CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index e8fd390456f82..0a1a1376bfc8d 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -169,9 +169,9 @@ def __new__(cls) -> "DataFrame": # type: ignore # our Unpickler sub-class to override methods and some dispatcher -# functions for compat - +# functions for compat and uses a 
non-public class of the pickle module. +# error: Name 'pkl._Unpickler' is not defined class Unpickler(pkl._Unpickler): # type: ignore def find_class(self, module, name): # override superclass diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index a8fcd6d03847c..7301c0ab434a0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -524,7 +524,7 @@ def astype(self, dtype, copy=True): na_value = np.nan # coerce data = self._coerce_to_ndarray(na_value=na_value) - return astype_nansafe(data, dtype, copy=None) + return astype_nansafe(data, dtype, copy=False) def value_counts(self, dropna=True): """ @@ -730,7 +730,6 @@ def all(self, skipna: bool = True, **kwargs): @classmethod def _create_logical_method(cls, op): def logical_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. return NotImplemented @@ -755,9 +754,8 @@ def logical_method(self, other): if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): raise TypeError( - "'other' should be pandas.NA or a bool. Got {} instead.".format( - type(other).__name__ - ) + "'other' should be pandas.NA or a bool. " + f"Got {type(other).__name__} instead." ) if not other_is_scalar and len(self) != len(other): @@ -772,14 +770,17 @@ def logical_method(self, other): return BooleanArray(result, mask) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return set_function_name(logical_method, name, cls) @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other): + from pandas.arrays import IntegerArray - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): # Rely on pandas to unbox and dispatch to us. return NotImplemented @@ -819,7 +820,7 @@ def cmp_method(self, other): return BooleanArray(result, mask, copy=False) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) def _reduce(self, name, skipna=True, **kwargs): @@ -922,7 +923,7 @@ def boolean_arithmetic_method(self, other): return self._maybe_mask_result(result, mask, other, op_name) - name = "__{name}__".format(name=op_name) + name = f"__{op_name}__" return set_function_name(boolean_arithmetic_method, name, cls) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4d6be8221557d..53051baa8e67e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,6 +1,6 @@ import operator from shutil import get_terminal_size -from typing import Type, Union, cast +from typing import Dict, Hashable, List, Type, Union, cast from warnings import warn import numpy as np @@ -8,7 +8,7 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, hashtable as htable -from pandas._typing import ArrayLike, Dtype, Ordered +from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -511,7 +511,7 @@ def itemsize(self) -> int: """ return self.categories.itemsize - def tolist(self) -> list: + def tolist(self) -> List[Scalar]: """ Return a list of the values.
@@ -2067,7 +2067,7 @@ def __setitem__(self, key, value): lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer - def _reverse_indexer(self): + def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ Compute the inverse of a categorical, returning a dict of categories -> indexers. @@ -2097,8 +2097,8 @@ def _reverse_indexer(self): self.codes.astype("int64"), categories.size ) counts = counts.cumsum() - result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, result)) + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + result = dict(zip(categories, _result)) return result # reduction ops # diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ceeaf018eb5f3..763a6fe560283 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -27,7 +27,6 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_offsetlike, is_period_dtype, is_string_dtype, is_timedelta64_dtype, @@ -1075,8 +1074,6 @@ def _sub_period_array(self, other): f"cannot subtract {other.dtype}-dtype from {type(self).__name__}" ) - if len(self) != len(other): - raise ValueError("cannot subtract arrays/indices of unequal length") if self.freq != other.freq: msg = DIFFERENT_FREQ.format( cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr @@ -1093,47 +1090,13 @@ def _sub_period_array(self, other): new_values[mask] = NaT return new_values - def _addsub_int_array(self, other, op): - """ - Add or subtract array-like of integers equivalent to applying - `_time_shift` pointwise. - - Parameters - ---------- - other : Index, ExtensionArray, np.ndarray - integer-dtype - op : {operator.add, operator.sub} - - Returns - ------- - result : same class as self - """ - # _addsub_int_array is overridden by PeriodArray - assert not is_period_dtype(self) - assert op in [operator.add, operator.sub] - - if self.freq is None: - # GH#19123 - raise NullFrequencyError("Cannot shift with no freq") - - elif isinstance(self.freq, Tick): - # easy case where we can convert to timedelta64 operation - td = Timedelta(self.freq) - return op(self, td * other) - - # We should only get here with DatetimeIndex; dispatch - # to _addsub_offset_array - assert not is_timedelta64_dtype(self) - return op(self, np.array(other) * self.freq) - - def _addsub_offset_array(self, other, op): + def _addsub_object_array(self, other: np.ndarray, op): """ Add or subtract array-like of DateOffset objects Parameters ---------- - other : Index, np.ndarray - object-dtype containing pd.DateOffset objects + other : np.ndarray[object] op : {operator.add, operator.sub} Returns @@ -1157,7 +1120,12 @@ def _addsub_offset_array(self, other, op): kwargs = {} if not is_period_dtype(self): kwargs["freq"] = "infer" - return self._from_sequence(res_values, **kwargs) + try: + res = type(self)._from_sequence(res_values, **kwargs) + except ValueError: + # e.g. we've passed a Timestamp to TimedeltaArray + res = res_values + return res def _time_shift(self, periods, freq=None): """ @@ -1220,9 +1188,9 @@ def __add__(self, other): elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.add) + elif is_object_dtype(other): + # e.g. 
Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) @@ -1275,9 +1243,9 @@ def __sub__(self, other): elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.sub) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datetime_arraylike(other) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3f5a4ca49702f..62f31addedc0b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,10 +1,10 @@ import numbers -from typing import Type +from typing import Any, Tuple, Type import warnings import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -44,7 +44,7 @@ class _IntegerDtype(ExtensionDtype): name: str base = None type: Type - na_value = np.nan + na_value = libmissing.NA def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" @@ -263,6 +263,11 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): .. versionadded:: 0.24.0 + .. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. + .. warning:: IntegerArray is currently experimental, and its API or internal @@ -358,14 +363,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) - def _formatter(self, boxed=False): - def fmt(x): - if isna(x): - return "NaN" - return str(x) - - return fmt - def __getitem__(self, item): if is_integer(item): if self._mask[item]: @@ -373,14 +370,30 @@ def __getitem__(self, item): return self._data[item] return type(self)(self._data[item], self._mask[item]) - def _coerce_to_ndarray(self): + def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default): """ coerce to an ndarray of object dtype """ + if dtype is None: + dtype = object + + if na_value is lib._no_default and is_float_dtype(dtype): + na_value = np.nan + elif na_value is lib._no_default: + na_value = libmissing.NA + + if is_integer_dtype(dtype): + # Specifically, a NumPy integer dtype, not a pandas integer dtype, + # since we're coercing to a numpy dtype by definition in this function.
+ if not self.isna().any(): + return self._data.astype(dtype) + else: + raise ValueError( + "cannot convert to integer NumPy array with missing values" + ) - # TODO(jreback) make this better - data = self._data.astype(object) - data[self._mask] = self._na_value + data = self._data.astype(dtype) + data[self._mask] = na_value return data __array_priority__ = 1000 # higher than ndarray so ops dispatch to us @@ -390,7 +403,7 @@ def __array__(self, dtype=None): the array interface, return my values We return an object array here to preserve our scalar values """ - return self._coerce_to_ndarray() + return self._coerce_to_ndarray(dtype=dtype) def __arrow_array__(self, type=None): """ @@ -506,7 +519,7 @@ def isna(self): @property def _na_value(self): - return np.nan + return self.dtype.na_value @classmethod def _concat_same_type(cls, to_concat): @@ -545,8 +558,8 @@ def astype(self, dtype, copy=True): return type(self)(result, mask=self._mask, copy=False) # coerce - data = self._coerce_to_ndarray() - return astype_nansafe(data, dtype, copy=None) + data = self._coerce_to_ndarray(dtype=dtype) + return astype_nansafe(data, dtype, copy=False) @property def _ndarray_values(self) -> np.ndarray: @@ -600,12 +613,19 @@ def value_counts(self, dropna=True): # w/o passing the dtype array = np.append(array, [self._mask.sum()]) index = Index( - np.concatenate([index.values, np.array([np.nan], dtype=object)]), + np.concatenate( + [index.values, np.array([self.dtype.na_value], dtype=object)] + ), dtype=object, ) return Series(array, index=index) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + # TODO: https://github.com/pandas-dev/pandas/issues/30037 + # use masked algorithms, rather than object-dtype / np.nan. + return self._coerce_to_ndarray(na_value=np.nan), np.nan + def _values_for_argsort(self) -> np.ndarray: """Return values for sorting. @@ -629,9 +649,11 @@ def _create_comparison_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): + from pandas.arrays import BooleanArray + mask = None - if isinstance(other, IntegerArray): + if isinstance(other, (BooleanArray, IntegerArray)): other, mask = other._data, other._mask elif is_list_like(other): @@ -643,25 +665,35 @@ def cmp_method(self, other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - method = getattr(self._data, f"__{op_name}__") - result = method(other) + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. 
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) if result is NotImplemented: result = invalid_comparison(self._data, other, op) # nans propagate if mask is None: - mask = self._mask + mask = self._mask.copy() else: mask = self._mask | mask - result[mask] = op_name == "ne" - return result + return BooleanArray(result, mask) name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) @@ -673,7 +705,8 @@ def _reduce(self, name, skipna=True, **kwargs): # coerce to a nan-aware float if needed if mask.any(): data = self._data.astype("float64") - data[mask] = self._na_value + # We explicitly use NaN within reductions. + data[mask] = np.nan op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) @@ -739,12 +772,13 @@ def integer_arithmetic_method(self, other): raise TypeError("can only perform ops with numeric values") else: - if not (is_float(other) or is_integer(other)): + if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") - # nans propagate if omask is None: mask = self._mask.copy() + if other is libmissing.NA: + mask |= True else: mask = self._mask | omask @@ -754,20 +788,23 @@ def integer_arithmetic_method(self, other): # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: mask = np.where((other == 1) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. mask = np.where((self._data == 0) & ~self._mask, False, mask) - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": @@ -790,6 +827,11 @@ def integer_arithmetic_method(self, other): _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. + Attributes ---------- None diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index df057ce5a0104..1eeb9ddc8e064 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -476,11 +476,6 @@ def to_timestamp(self, freq=None, how="start"): # -------------------------------------------------------------------- # Array-like / EA-Interface Methods - def _formatter(self, boxed=False): - if boxed: - return str - return "'{}'".format - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) def _validate_fill_value(self, fill_value): if isna(fill_value): @@ -492,6 +487,9 @@ def _validate_fill_value(self, fill_value): raise ValueError(f"'fill_value' should be a Period. 
Got '{fill_value}'.") return fill_value + def _values_for_argsort(self): + return self._data + # -------------------------------------------------------------------- def _time_shift(self, periods, freq=None): @@ -582,6 +580,11 @@ def asfreq(self, freq=None, how="E"): # ------------------------------------------------------------------ # Rendering Methods + def _formatter(self, boxed=False): + if boxed: + return str + return "'{}'".format + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): """ actually format my specific types @@ -634,12 +637,23 @@ def _sub_period(self, other): return new_data - @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__) def _addsub_int_array( - self, - other: Union[ABCPeriodArray, ABCSeries, ABCPeriodIndex, np.ndarray], - op: Callable[[Any], Any], - ) -> ABCPeriodArray: + self, other: np.ndarray, op: Callable[[Any], Any], + ) -> "PeriodArray": + """ + Add or subtract array of integers; equivalent to applying + `_time_shift` pointwise. + + Parameters + ---------- + other : np.ndarray[integer-dtype] + op : {operator.add, operator.sub} + + Returns + ------- + result : PeriodArray + """ + assert op in [operator.add, operator.sub] if op is operator.sub: other = -other @@ -774,9 +788,6 @@ def _check_timedeltalike_freq_compat(self, other): _raise_on_incompatible(self, other) - def _values_for_argsort(self): - return self._data - PeriodArray._add_comparison_ops() diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index e3e0064c84da3..55de41794b30e 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -64,7 +64,7 @@ class SparseDtype(ExtensionDtype): # hash(nan) is (sometimes?) 0. _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None): if isinstance(dtype, type(self)): if fill_value is None: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b95dfc9ba7580..11f4131df62a6 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -378,6 +378,9 @@ def astype(self, dtype, copy=True): return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + # ---------------------------------------------------------------- + # Reductions + def sum( self, axis=None, @@ -507,13 +510,13 @@ def _add_datetimelike_scalar(self, other): dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE return DatetimeArray(result, dtype=dtype, freq=self.freq) - def _addsub_offset_array(self, other, op): - # Add or subtract Array-like of DateOffset objects + def _addsub_object_array(self, other, op): + # Add or subtract Array-like of objects try: # TimedeltaIndex can only operate with a subset of DateOffset # subclasses. 
Incompatible classes will raise AttributeError, # which we re-raise as TypeError - return super()._addsub_offset_array(other, op) + return super()._addsub_object_array(other, op) except AttributeError: raise TypeError( f"Cannot add/subtract non-tick DateOffset to {type(self).__name__}" diff --git a/pandas/core/base.py b/pandas/core/base.py index 948b80fef4032..064a51bf0ce74 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -597,7 +597,7 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 _deprecations: FrozenSet[str] = frozenset( - ["tolist", "item"] # tolist is not deprecated, just suppressed in the __dir__ + ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ ) def transpose(self, *args, **kwargs): diff --git a/pandas/core/common.py b/pandas/core/common.py index 9017584171850..8a430a4aa7d11 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -9,11 +9,12 @@ from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Iterable, Union +from typing import Any, Collection, Iterable, Union import numpy as np from pandas._libs import lib, tslibs +from pandas._typing import T from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -270,7 +271,7 @@ def maybe_make_list(obj): return obj -def maybe_iterable_to_list(obj: Union[Iterable, Any]) -> Union[list, Any]: +def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T]: """ If obj is Iterable but not list-like, consume into list. """ diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 57348ad3b81a0..a1b1cffdd1d76 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -2,10 +2,12 @@ """ from functools import partial, wraps +from typing import Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np +from pandas._typing import FrameOrSeries from pandas.errors import PerformanceWarning from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -15,22 +17,27 @@ from pandas.core.computation.common import result_type_many -def _align_core_single_unary_op(term): +def _align_core_single_unary_op( + term, +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: + + typ: Union[partial, Type[FrameOrSeries]] + axes: Optional[Dict[str, int]] = None + if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) - ret = (typ,) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) - if not hasattr(term.value, "axes"): - ret += (None,) - else: - ret += (_zip_axes_from_type(typ, term.value.axes),) - return ret + return typ, axes -def _zip_axes_from_type(typ, new_axes): - axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()} +def _zip_axes_from_type( + typ: Type[FrameOrSeries], new_axes: Sequence[int] +) -> Dict[str, int]: + axes = {name: new_axes[i] for i, name in typ._AXIS_NAMES.items()} return axes diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index dbfd6c04eee32..9c5388faae1bd 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -3,6 +3,7 @@ """ import abc +from typing import Dict, Type from pandas.core.computation.align import align_terms, reconstruct_object from pandas.core.computation.ops import _mathops, _reductions @@ -53,7 +54,7 @@ def convert(self) 
-> str: """ return printing.pprint_thing(self.expr) - def evaluate(self): + def evaluate(self) -> object: """ Run the engine on the expression. @@ -62,7 +63,7 @@ def evaluate(self): Returns ------- - obj : object + object The result of the passed expression. """ if not self._is_aligned: @@ -101,12 +102,6 @@ class NumExprEngine(AbstractEngine): has_neg_frac = True - def __init__(self, expr): - super().__init__(expr) - - def convert(self) -> str: - return str(super().convert()) - def _evaluate(self): import numexpr as ne @@ -128,14 +123,14 @@ class PythonEngine(AbstractEngine): has_neg_frac = False - def __init__(self, expr): - super().__init__(expr) - def evaluate(self): return self.expr() - def _evaluate(self): + def _evaluate(self) -> None: pass -_engines = {"numexpr": NumExprEngine, "python": PythonEngine} +_engines: Dict[str, Type[AbstractEngine]] = { + "numexpr": NumExprEngine, + "python": PythonEngine, +} diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 2e5a563b815b3..7599a82ddffed 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -5,6 +5,7 @@ """ import tokenize +from typing import Optional import warnings from pandas._libs.lib import _no_default @@ -17,7 +18,7 @@ from pandas.io.formats.printing import pprint_thing -def _check_engine(engine): +def _check_engine(engine: Optional[str]) -> str: """ Make sure a valid engine is passed. @@ -168,7 +169,7 @@ def _check_for_locals(expr: str, stack_level: int, parser: str): def eval( expr, parser="pandas", - engine=None, + engine: Optional[str] = None, truediv=_no_default, local_dict=None, global_dict=None, diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index ba0a4d81a88d3..afdd8a01ee003 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -300,14 +300,15 @@ def table_schema_cb(key): _enable_data_resource_formatter(cf.get_option(key)) -def is_terminal(): +def is_terminal() -> bool: """ Detect if Python is running in a terminal. Returns True if Python is running in a terminal or False if not. 
""" try: - ip = get_ipython() + # error: Name 'get_ipython' is not defined + ip = get_ipython() # type: ignore except NameError: # assume standard Python interpreter in a terminal return True else: diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 1dda51da49ffb..4a06ea9500770 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -276,10 +276,12 @@ def is_dtype(cls, dtype) -> bool: return False elif isinstance(dtype, cls): return True - try: - return cls.construct_from_string(dtype) is not None - except TypeError: - return False + if isinstance(dtype, str): + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + return False @property def _is_numeric(self) -> bool: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 946070f8fad98..0579c97747bae 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,6 +6,7 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas._libs.tslibs.timezones import tz_compare from pandas.util._validators import validate_bool_kwarg from .common import ( @@ -409,6 +410,14 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = NaT + elif not isinstance(fill_value, datetime): + dtype = np.dtype(np.object_) + elif fill_value.tzinfo is None: + dtype = np.dtype(np.object_) + elif not tz_compare(fill_value.tzinfo, dtype.tz): + # TODO: sure we want to cast here? + dtype = np.dtype(np.object_) + elif is_extension_array_dtype(dtype) and isna(fill_value): fill_value = dtype.na_value @@ -814,6 +823,8 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): if is_object_dtype(dtype): return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) # allow frequency conversions @@ -826,6 +837,8 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): if is_object_dtype(dtype): return tslibs.ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) if dtype not in [_INT64_DTYPE, _TD_DTYPE]: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index dc22a79a2f3fe..8fc8b8300d21c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -18,7 +18,6 @@ ) from pandas.core.dtypes.generic import ( ABCCategorical, - ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, ABCPeriodArray, @@ -368,37 +367,6 @@ def is_categorical(arr) -> bool: return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_offsetlike(arr_or_obj) -> bool: - """ - Check if obj or all elements of list-like is DateOffset - - Parameters - ---------- - arr_or_obj : object - - Returns - ------- - boolean - Whether the object is a DateOffset or listlike of DatetOffsets - - Examples - -------- - >>> is_offsetlike(pd.DateOffset(days=1)) - True - >>> is_offsetlike('offset') - False - >>> is_offsetlike([pd.offsets.Minute(4), pd.offsets.MonthEnd()]) - True - >>> is_offsetlike(np.array([pd.DateOffset(months=3), pd.Timestamp.now()])) - False - """ - if isinstance(arr_or_obj, ABCDateOffset): - return True - elif is_list_like(arr_or_obj) and len(arr_or_obj) and is_object_dtype(arr_or_obj): - return all(isinstance(x, ABCDateOffset) for x in arr_or_obj) - 
return False - - def is_datetime64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the datetime64 dtype. @@ -633,7 +601,14 @@ def is_string_dtype(arr_or_dtype) -> bool: # TODO: gh-15585: consider making the checks stricter. def condition(dtype) -> bool: - return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) + return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype) + + def is_excluded_dtype(dtype) -> bool: + """ + These have kind = "O" but aren't string dtypes so need to be explicitly excluded + """ + is_excluded_checks = (is_period_dtype, is_interval_dtype) + return any(is_excluded(dtype) for is_excluded in is_excluded_checks) return _is_dtype(arr_or_dtype, condition) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index b77cd34700f10..eed4514baa817 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -882,7 +882,11 @@ def construct_from_string(cls, string): return cls(freq=string) except ValueError: pass - raise TypeError(f"Cannot construct a 'PeriodDtype' from '{string}'") + if isinstance(string, str): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + else: + msg = f"'construct_from_string' expects a string, got {type(string)}" + raise TypeError(msg) def __str__(self) -> str_type: return self.name @@ -974,7 +978,7 @@ class IntervalDtype(PandasExtensionDtype): """ name = "interval" - kind: Optional[str_type] = None + kind: str_type = "O" str = "|O08" base = np.dtype("O") num = 103 diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index aa0f7d2aba1fc..4c3f8b7374465 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -4,7 +4,10 @@ # define abstract base classes to enable isinstance type checking on our # objects def create_pandas_abc_type(name, attr, comp): - @classmethod + + # https://github.com/python/mypy/issues/1006 + # error: 'classmethod' used with a non-method + @classmethod # type: ignore def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fc39b264d1598..d4676a998c948 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -100,6 +100,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes.datetimes import DatetimeIndex @@ -1929,14 +1930,17 @@ def to_stata( >>> df.to_stata('animals.dta') # doctest: +SKIP """ kwargs = {} - if version not in (114, 117): - raise ValueError("Only formats 114 and 117 supported.") + if version not in (114, 117, 118): + raise ValueError("Only formats 114, 117 and 118 are supported.") if version == 114: if convert_strl is not None: - raise ValueError("strl support is only available when using format 117") + raise ValueError("strl is not supported in format 114") from pandas.io.stata import StataWriter as statawriter else: - from pandas.io.stata import StataWriter117 as statawriter + if version == 117: + from pandas.io.stata import StataWriter117 as statawriter + else: + from pandas.io.stata import StataWriter118 as statawriter kwargs["convert_strl"] = convert_strl @@ -4014,8 +4018,7 @@ def fillna( inplace=False, 
limit=None, downcast=None, - **kwargs, - ): + ) -> Optional["DataFrame"]: return super().fillna( value=value, method=method, @@ -4023,7 +4026,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs, ) @Appender(_shared_docs["replace"] % _shared_doc_kwargs) @@ -5148,7 +5150,7 @@ def reorder_levels(self, order, axis=0): # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join="outer", level=level, copy=False) + # at this point we have `self._indexed_same(other)` if fill_value is None: # since _arith_op may be called in a loop, avoid function call @@ -5164,14 +5166,15 @@ def _arith_op(left, right): left, right = ops.fill_binop(left, right, fill_value) return func(left, right) - if ops.should_series_dispatch(this, other, func): + if ops.should_series_dispatch(self, other, func): # iterate over columns - new_data = ops.dispatch_to_series(this, other, _arith_op) + new_data = ops.dispatch_to_series(self, other, _arith_op) else: with np.errstate(all="ignore"): - res_values = _arith_op(this.values, other.values) - new_data = dispatch_fill_zeros(func, this.values, other.values, res_values) - return this._construct_result(new_data) + res_values = _arith_op(self.values, other.values) + new_data = dispatch_fill_zeros(func, self.values, other.values, res_values) + + return new_data def _combine_match_index(self, other, func): # at this point we have `self.index.equals(other.index)` @@ -5599,6 +5602,82 @@ def update( # ---------------------------------------------------------------------- # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... 
index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.DataFrameGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) _shared_docs[ "pivot" @@ -7662,6 +7741,26 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data + if numeric_only is not None and axis in [0, 1]: + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) + if axis == 1: + df = df.T + axis = 0 + + out_dtype = "bool" if filter_type == "bool" else None + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager._reduce + res = df._data.reduce(op, axis=1, skipna=skipna, **kwds) + assert isinstance(res, dict) + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns + return out + if numeric_only is None: values = self.values try: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 08c7f38ce4c82..85bbf9b553b0a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4652,6 +4652,9 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: on position. It is useful for quickly testing if your object has the right type of data in it. + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + Parameters ---------- n : int, default 5 @@ -4659,7 +4662,7 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: Returns ------- - obj_head : same type as caller + same type as caller The first `n` rows of the caller object. See Also @@ -4699,6 +4702,17 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: 0 alligator 1 bee 2 falcon + + For negative values of `n` + + >>> df.head(-3) + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot """ return self.iloc[:n] @@ -4711,6 +4725,9 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: position. It is useful for quickly verifying data, for example, after sorting or appending rows. + For negative values of `n`, this function returns all rows except + the first `n` rows, equivalent to ``df[n:]``. + Parameters ---------- n : int, default 5 @@ -4758,6 +4775,17 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: 6 shark 7 whale 8 zebra + + For negative values of `n` + + >>> df.tail(-3) + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra """ if n == 0: @@ -7273,19 +7301,10 @@ def clip( return result - def groupby( - self, - by=None, - axis=0, - level=None, - as_index: bool_t = True, - sort: bool_t = True, - group_keys: bool_t = True, - squeeze: bool_t = False, - observed: bool_t = False, - ): - """ - Group DataFrame or Series using a mapper or by a Series of columns. 
+ _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be @@ -7330,9 +7349,8 @@ def groupby( Returns ------- - DataFrameGroupBy or SeriesGroupBy - Depends on the calling object and returns groupby object that - contains information about the groups. + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. See Also -------- @@ -7343,69 +7361,7 @@ def groupby( ----- See the `user guide `_ for more. - - Examples - -------- - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - **Hierarchical Indexes** - - We can groupby different levels of a hierarchical index - using the `level` parameter: - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - ... index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level=1).mean() - Max Speed - Type - Captive 210.0 - Wild 185.0 - """ - from pandas.core.groupby.groupby import get_groupby - - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") - axis = self._get_axis_number(axis) - - return get_groupby( - self, - by=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze, - observed=observed, - ) + """ def asfreq( self, @@ -11086,44 +11042,67 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - y = com.values_from_object(self).copy() - d = self._construct_axes_dict() - d["copy"] = False + if axis == 1: + return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T + + def na_accum_func(blk_values): + # We will be applying this function to block values + if blk_values.dtype.kind in ["m", "M"]: + # GH#30460, GH#29058 + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. + orig_dtype = blk_values.dtype + + # We need to define mask before masking NaTs + mask = isna(blk_values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = blk_values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = blk_values + changed = False + + result = accum_func(y.view("i8"), axis) + if skipna: + np.putmask(result, mask, iNaT) + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? 
+ + if isinstance(blk_values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(blk_values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass( + blk_values.dtype.type, (np.integer, np.bool_) + ): + vals = blk_values.copy().T + mask = isna(vals) + np.putmask(vals, mask, mask_a) + result = accum_func(vals, axis) + np.putmask(result, mask, mask_b) + else: + result = accum_func(blk_values.T, axis) - if issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): - # numpy 1.18 started sorting NaTs at the end instead of beginning, - # so we need to work around to maintain backwards-consistency. - orig_dtype = y.dtype - if accum_func == np.minimum.accumulate: - # Note: the accum_func comparison fails as an "is" comparison - # Note that "y" is always a copy, so we can safely modify it - mask = isna(self) - y = y.view("i8") - y[mask] = np.iinfo(np.int64).max - - result = accum_func(y.view("i8"), axis).view(orig_dtype) - if skipna: - mask = isna(self) - np.putmask(result, mask, iNaT) - elif accum_func == np.minimum.accumulate: - # Restore NaTs that we masked previously - nz = (~np.asarray(mask)).nonzero()[0] - if len(nz): - # everything up to the first non-na entry stays NaT - result[: nz[0]] = iNaT + # transpose back for ndarray, not for EA + return result.T if hasattr(result, "T") else result - if self.ndim == 1: - # restore dt64tz dtype - d["dtype"] = self.dtype - - elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): - mask = isna(self) - np.putmask(y, mask, mask_a) - result = accum_func(y, axis) - np.putmask(result, mask, mask_b) - else: - result = accum_func(y, axis) + result = self._data.apply(na_accum_func) + d = self._construct_axes_dict() + d["copy"] = False return self._constructor(result, **d).__finalize__(self) return set_function_name(cum_func, name, cls) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6b110a0c80c07..be94fa5484496 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -809,6 +809,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): periods=periods, fill_method=fill_method, limit=limit, freq=freq ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 227547daf3668..81a9145318cb5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -325,7 +325,7 @@ def f(self): f.__name__ = "plot" return self._groupby.apply(f) - def __getattr__(self, name): + def __getattr__(self, name: str): def attr(*args, **kwargs): def f(self): return getattr(self.plot, name)(*args, **kwargs) @@ -570,7 +570,7 @@ def _set_result_index_ordered(self, result): def _dir_additions(self): return self.obj._dir_additions() | self._apply_whitelist - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: @@ -2362,6 +2362,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 axis=axis, ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) @@ -2377,6 +2380,8 @@ 
def head(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. + Returns ------- Series or DataFrame @@ -2390,6 +2395,10 @@ def head(self, n=5): A B 0 1 2 2 5 6 + >>> df.groupby('A').head(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array() < n @@ -2405,6 +2414,8 @@ def tail(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. + Returns ------- Series or DataFrame @@ -2418,6 +2429,10 @@ def tail(self, n=5): A B 1 a 2 3 b 2 + >>> df.groupby('A').tail(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n @@ -2528,9 +2543,9 @@ def get_groupby( squeeze: bool = False, observed: bool = False, mutated: bool = False, -): +) -> GroupBy: - klass: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] + klass: Type[GroupBy] if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2c224a1bef338..747a32ae816be 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -3,7 +3,7 @@ split-apply-combine paradigm. """ -from typing import Hashable, List, Optional, Tuple +from typing import Dict, Hashable, List, Optional, Tuple import numpy as np @@ -419,7 +419,7 @@ def _make_codes(self) -> None: self._group_index = uniques @cache_readonly - def groups(self) -> dict: + def groups(self) -> Dict[Hashable, np.ndarray]: return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a3808f6f4a37e..e9e3a5ef94a1f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import FrozenSet, Hashable, Optional, Union +from typing import Dict, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -349,41 +349,8 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced # GH 11836 - if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "integer": - data = maybe_cast_to_integer_array(data, dtype, copy=copy) - elif inferred in ["floating", "mixed-integer-float"]: - if isna(data).any(): - raise ValueError("cannot convert float NaN to integer") - - if inferred == "mixed-integer-float": - data = maybe_cast_to_integer_array(data, dtype) - - # If we are actually all equal to integers, - # then coerce to integer. - try: - return cls._try_convert_to_int_index( - data, copy, name, dtype - ) - except ValueError: - pass - - # Return an actual float index. - return Float64Index(data, copy=copy, name=name) - - elif inferred == "string": - pass - else: - data = data.astype(dtype) - elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "string": - pass - else: - data = data.astype(dtype) - else: - data = np.array(data, dtype=dtype, copy=copy) + data = _maybe_cast_with_dtype(data, dtype, copy) + dtype = data.dtype # TODO: maybe not for object? 
# maybe coerce to a sub-class if is_signed_integer_dtype(data.dtype): @@ -403,43 +370,12 @@ def __new__( subarr = subarr.copy() if dtype is None: - inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "integer": - try: - return cls._try_convert_to_int_index(subarr, copy, name, dtype) - except ValueError: - pass - - return Index(subarr, copy=copy, dtype=object, name=name) - elif inferred in ["floating", "mixed-integer-float", "integer-na"]: - # TODO: Returns IntegerArray for integer-na case in the future - return Float64Index(subarr, copy=copy, name=name) - elif inferred == "interval": - try: - return IntervalIndex(subarr, name=name, copy=copy) - except ValueError: - # GH27172: mixed closed Intervals --> object dtype - pass - elif inferred == "boolean": - # don't support boolean explicitly ATM - pass - elif inferred != "string": - if inferred.startswith("datetime"): - try: - return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - except (ValueError, OutOfBoundsDatetime): - # GH 27011 - # If we have mixed timezones, just send it - # down the base constructor - pass - - elif inferred.startswith("timedelta"): - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) - elif inferred == "period": - try: - return PeriodIndex(subarr, name=name, **kwargs) - except IncompatibleFrequency: - pass + new_data, new_dtype = _maybe_cast_data_without_dtype(subarr) + if new_dtype is not None: + return cls( + new_data, dtype=new_dtype, copy=False, name=name, **kwargs + ) + if kwargs: raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}") return cls._simple_new(subarr, name, **kwargs) @@ -3839,50 +3775,6 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desired - try: - res = data.astype("i8", copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype("u8", copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - @classmethod def _scalar_data_error(cls, data): # We return the TypeError so that we can raise it from the constructor @@ -4594,7 +4486,7 @@ def _maybe_promote(self, other): return self.astype("object"), other.astype("object") return self, other - def groupby(self, values): + def groupby(self, values) -> Dict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. 
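For context on the `Index.groupby` annotation in the hunks around here: the method maps each distinct value in `values` to the index labels found at the matching positions, which is what the new `Dict[Hashable, np.ndarray]` return type captures. A minimal sketch of that behaviour (illustrative only, with made-up data; output shown roughly):

import numpy as np
import pandas as pd

idx = pd.Index(["a", "b", "c", "d"])
# group the index labels by the value at each position
print(idx.groupby(np.array([1, 2, 1, 2])))
# roughly: {1: array(['a', 'c'], dtype=object), 2: array(['b', 'd'], dtype=object)}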
@@ -4605,7 +4497,7 @@ def groupby(self, values): Returns ------- - groups : dict + dict {group name -> group labels} """ @@ -5486,3 +5378,172 @@ def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: raise TypeError(f"{cls.__name__}.name must be a hashable type") return name + + +def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + If a dtype is passed, cast to the closest matching dtype that is supported + by Index. + + Parameters + ---------- + data : np.ndarray + dtype : np.dtype + copy : bool + + Returns + ------- + np.ndarray + """ + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + data = _try_convert_to_int_array(data, copy, dtype) + except ValueError: + data = np.array(data, dtype=np.float64, copy=copy) + + elif inferred == "string": + pass + else: + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) + + return data + + +def _maybe_cast_data_without_dtype(subarr): + """ + If we have an arraylike input but no passed dtype, try to infer + a supported dtype. 
+ + Parameters + ---------- + subarr : np.ndarray, Index, or Series + + Returns + ------- + converted : np.ndarray or ExtensionArray + dtype : np.dtype or ExtensionDtype + """ + # Runtime import needed bc IntervalArray imports Index + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + DatetimeArray, + TimedeltaArray, + ) + + inferred = lib.infer_dtype(subarr, skipna=False) + + if inferred == "integer": + try: + data = _try_convert_to_int_array(subarr, False, None) + return data, data.dtype + except ValueError: + pass + + return subarr, object + + elif inferred in ["floating", "mixed-integer-float", "integer-na"]: + # TODO: Returns IntegerArray for integer-na case in the future + return subarr, np.float64 + + elif inferred == "interval": + try: + data = IntervalArray._from_sequence(subarr, copy=False) + return data, data.dtype + except ValueError: + # GH27172: mixed closed Intervals --> object dtype + pass + elif inferred == "boolean": + # don't support boolean explicitly ATM + pass + elif inferred != "string": + if inferred.startswith("datetime"): + try: + data = DatetimeArray._from_sequence(subarr, copy=False) + return data, data.dtype + except (ValueError, OutOfBoundsDatetime): + # GH 27011 + # If we have mixed timezones, just send it + # down the base constructor + pass + + elif inferred.startswith("timedelta"): + data = TimedeltaArray._from_sequence(subarr, copy=False) + return data, data.dtype + elif inferred == "period": + try: + data = PeriodArray._from_sequence(subarr) + return data, data.dtype + except IncompatibleFrequency: + pass + + return subarr, subarr.dtype + + +def _try_convert_to_int_array( + data: np.ndarray, copy: bool, dtype: np.dtype +) -> np.ndarray: + """ + Attempt to convert an array of data into an integer array. + + Parameters + ---------- + data : The data to convert. + copy : bool + Whether to copy the data or not. + dtype : np.dtype + + Returns + ------- + int_array : data converted to either an ndarray[int64] or ndarray[uint64] + + Raises + ------ + ValueError if the conversion was not successful. + """ + + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype("i8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. 
+ try: + res = data.astype("u8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + raise ValueError diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ba476f9e25ee6..531014e4affec 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,5 +1,5 @@ import operator -from typing import Any +from typing import Any, List import numpy as np @@ -583,6 +583,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ibase.ensure_index(target) + missing: List[int] if self.equals(target): indexer = None missing = [] diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7ba04fc9d2fea..f957860240dd2 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -6,8 +6,9 @@ import numpy as np -from pandas._libs import NaT, iNaT, lib +from pandas._libs import NaT, iNaT, join as libjoin, lib from pandas._libs.algos import unique_deltas +from pandas._libs.tslibs import timezones from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly @@ -33,6 +34,8 @@ ) import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name from pandas.core.tools.timedeltas import to_timedelta from pandas.tseries.frequencies import DateOffset, to_offset @@ -71,34 +74,30 @@ def method(self, other): return method -class DatetimeTimedeltaMixin: +def _join_i8_wrapper(joinf, with_indexers: bool = True): """ - Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, - but not PeriodIndex + Create the join wrapper methods. """ - def _set_freq(self, freq): - """ - Set the _freq attribute on our underlying DatetimeArray. + @staticmethod # type: ignore + def wrapper(left, right): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + left = left.view("i8") + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + right = right.view("i8") - Parameters - ---------- - freq : DateOffset, None, or "infer" - """ - # GH#29843 - if freq is None: - # Always valid - pass - elif len(self) == 0 and isinstance(freq, DateOffset): - # Always valid. In the TimedeltaIndex case, we assume this - # is a Tick offset. 
- pass - else: - # As an internal method, we can ensure this assertion always holds - assert freq == "infer" - freq = to_offset(self.inferred_freq) + results = joinf(left, right) + if with_indexers: + # dtype should be timedelta64[ns] for TimedeltaIndex + # and datetime64[ns] for DatetimeIndex + dtype = left.dtype.base - self._data._freq = freq + join_index, left_indexer, right_indexer = results + join_index = join_index.view(dtype) + return join_index, left_indexer, right_indexer + return results + + return wrapper class DatetimeIndexOpsMixin(ExtensionOpsMixin): @@ -122,10 +121,13 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin): ) resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore - _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results) __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) mean = ea_passthrough(DatetimeLikeArrayMixin.mean) + @property + def is_all_dates(self) -> bool: + return True + @property def freq(self): """ @@ -234,32 +236,6 @@ def equals(self, other): return np.array_equal(self.asi8, other.asi8) - @staticmethod - def _join_i8_wrapper(joinf, dtype, with_indexers=True): - """ - Create the join wrapper methods. - """ - from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin - - @staticmethod - def wrapper(left, right): - if isinstance( - left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - left = left.view("i8") - if isinstance( - right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - right = right.view("i8") - results = joinf(left, right) - if with_indexers: - join_index, left_indexer, right_indexer = results - join_index = join_index.view(dtype) - return join_index, left_indexer, right_indexer - return results - - return wrapper - def _ensure_localized( self, arg, ambiguous="raise", nonexistent="raise", from_utc=False ): @@ -606,66 +582,6 @@ def isin(self, values, level=None): return algorithms.isin(self.asi8, values.asi8) - def intersection(self, other, sort=False): - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - - if self.equals(other): - return self._get_reconciled_name_object(other) - - if len(self) == 0: - return self.copy() - if len(other) == 0: - return other.copy() - - if not isinstance(other, type(self)): - result = Index.intersection(self, other, sort=sort) - if isinstance(result, type(self)): - if result.freq is None: - result._set_freq("infer") - return result - - elif ( - other.freq is None - or self.freq is None - or other.freq != self.freq - or not other.freq.is_anchored() - or (not self.is_monotonic or not other.is_monotonic) - ): - result = Index.intersection(self, other, sort=sort) - - # Invalidate the freq of `result`, which may not be correct at - # this point, depending on the values. 
- - result._set_freq(None) - if hasattr(self, "tz"): - result = self._shallow_copy( - result._values, name=result.name, tz=result.tz, freq=None - ) - else: - result = self._shallow_copy(result._values, name=result.name, freq=None) - if result.freq is None: - result._set_freq("infer") - return result - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - # after sorting, the intersection always starts with the right index - # and ends with the index of which the last elements is smallest - end = min(left[-1], right[-1]) - start = right[0] - - if end < start: - return type(self)(data=[]) - else: - lslice = slice(*left.slice_locs(start, end)) - left_chunk = left.values[lslice] - return self._shallow_copy(left_chunk) - @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) @@ -778,6 +694,253 @@ def shift(self, periods=1, freq=None): return type(self)(result, name=self.name) +class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): + """ + Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, + but not PeriodIndex + """ + + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + + def _set_freq(self, freq): + """ + Set the _freq attribute on our underlying DatetimeArray. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, DateOffset): + # Always valid. In the TimedeltaIndex case, we assume this + # is a Tick offset. + pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) + + self._data._freq = freq + + # -------------------------------------------------------------------- + # Set Operation Methods + + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx._set_freq(None) + return new_idx + + def intersection(self, other, sort=False): + """ + Specialized intersection for DatetimeIndex/TimedeltaIndex. + + May be much faster than Index.intersection + + Parameters + ---------- + other : Same type as self or array-like + sort : False or None, default False + Sort the resulting index if possible. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + .. 
versionchanged:: 0.25.0 + + The `sort` keyword is added + + Returns + ------- + y : Index or same type as self + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if len(self) == 0: + return self.copy() + if len(other) == 0: + return other.copy() + + if not isinstance(other, type(self)): + result = Index.intersection(self, other, sort=sort) + if isinstance(result, type(self)): + if result.freq is None: + result._set_freq("infer") + return result + + elif ( + other.freq is None + or self.freq is None + or other.freq != self.freq + or not other.freq.is_anchored() + or (not self.is_monotonic or not other.is_monotonic) + ): + result = Index.intersection(self, other, sort=sort) + + # Invalidate the freq of `result`, which may not be correct at + # this point, depending on the values. + + result._set_freq(None) + if hasattr(self, "tz"): + result = self._shallow_copy( + result._values, name=result.name, tz=result.tz, freq=None + ) + else: + result = self._shallow_copy(result._values, name=result.name, freq=None) + if result.freq is None: + result._set_freq("infer") + return result + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + # after sorting, the intersection always starts with the right index + # and ends with the index of which the last elements is smallest + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._shallow_copy(left_chunk) + + def _can_fast_union(self, other) -> bool: + if not isinstance(other, type(self)): + return False + + freq = self.freq + + if freq is None or freq != other.freq: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + try: + return (right_start == left_end + freq) or right_start in left + except ValueError: + # if we are comparing a freq that does not propagate timezones + # this will raise + return False + + # -------------------------------------------------------------------- + # Join Methods + _join_precedence = 10 + + _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) + _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) + _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) + _left_indexer_unique = _join_i8_wrapper( + libjoin.left_join_indexer_unique, with_indexers=False + ) + + def join( + self, other, how: str = "left", level=None, return_indexers=False, sort=False + ): + """ + See Index.join + """ + if self._is_convertible_to_index_for_join(other): + try: + other = type(self)(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join( + this, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + + def _maybe_utc_convert(self, other): + this = self + if not hasattr(self, "tz"): + return this, other + + if isinstance(other, type(self)): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + elif other.tz 
is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other + + @classmethod + def _is_convertible_to_index_for_join(cls, other: Index) -> bool: + """ + return a boolean whether I can attempt conversion to a + DatetimeIndex/TimedeltaIndex + """ + if isinstance(other, cls): + return False + elif len(other) > 0 and other.inferred_type not in ( + "floating", + "mixed-integer", + "integer", + "integer-na", + "mixed-integer-float", + "mixed", + ): + return True + return False + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if ( + isinstance(other, type(self)) + and self.freq == other.freq + and self._can_fast_union(other) + ): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + kwargs = {} + if hasattr(self, "tz"): + kwargs["tz"] = getattr(other, "tz", None) + return self._simple_new(joined, name, **kwargs) + + def wrap_arithmetic_op(self, other, result): if result is NotImplemented: return NotImplemented diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 108e24ffee820..f8e8a7037b9c4 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -5,7 +5,6 @@ import numpy as np from pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts -import pandas._libs.join as libjoin from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -32,12 +31,9 @@ import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, DatetimeTimedeltaMixin, - ea_passthrough, ) -from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools @@ -94,9 +90,7 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): typ="method", overwrite=False, ) -class DatetimeIndex( - DatetimeTimedeltaMixin, DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin -): +class DatetimeIndex(DatetimeTimedeltaMixin, DatetimeDelegateMixin): """ Immutable ndarray of datetime64 data, represented internally as int64, and which can be boxed to Timestamp objects that are subclasses of datetime and @@ -199,17 +193,6 @@ class DatetimeIndex( """ _typ = "datetimeindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True @@ -411,12 +394,6 @@ def _convert_for_op(self, value): return _to_M8(value) raise ValueError("Passed item and index have different timezone") - @Appender(Index.difference.__doc__) - def difference(self, other, sort=None): - new_idx = super().difference(other, sort=sort) - new_idx._set_freq(None) - return new_idx - # -------------------------------------------------------------------- # Rendering Methods @@ -469,7 +446,7 @@ def _union(self, other, sort): if 
result.freq is None and ( this.freq is not None or other.freq is not None ): - result._data._freq = to_offset(result.inferred_freq) + result._set_freq("infer") return result def union_many(self, others): @@ -502,39 +479,6 @@ def union_many(self, others): this._data._dtype = dtype return this - def _can_fast_union(self, other) -> bool: - if not isinstance(other, DatetimeIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - try: - return (right_start == left_end + freq) or right_start in left - except (ValueError): - - # if we are comparing a freq that does not propagate timezones - # this will raise - return False - def _fast_union(self, other, sort=None): if len(other) == 0: return self.view(type(self)) @@ -574,30 +518,6 @@ def _fast_union(self, other, sort=None): else: return left - def intersection(self, other, sort=False): - """ - Specialized intersection for DatetimeIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : DatetimeIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - Returns - ------- - Index or DatetimeIndex or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) return self._shallow_copy(result, name=name, freq=None, tz=self.tz) @@ -712,68 +632,6 @@ def snap(self, freq="S"): # we know it conforms; skip check return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq) - def join( - self, other, how: str = "left", level=None, return_indexers=False, sort=False - ): - """ - See Index.join - """ - if ( - not isinstance(other, DatetimeIndex) - and len(other) > 0 - and other.inferred_type - not in ( - "floating", - "integer", - "integer-na", - "mixed-integer", - "mixed-integer-float", - "mixed", - ) - ): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join( - this, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - elif other.tz is not None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert("UTC") - other = other.tz_convert("UTC") - return this, other - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, DatetimeIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, "tz", None) - return self._simple_new(joined, name, tz=tz) - def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds 
for parsed time string and its resolution. @@ -1126,17 +984,10 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- # Wrapping DatetimeArray - # Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore - _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) - def __getitem__(self, key): result = self._data.__getitem__(key) if is_scalar(result): @@ -1173,10 +1024,6 @@ def inferred_type(self) -> str: # sure we can't have ambiguous indexing return "datetime64" - @property - def is_all_dates(self) -> bool: - return True - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -1202,6 +1049,7 @@ def insert(self, loc, item): self._assert_can_do_op(item) if not self._has_same_tz(item) and not isna(item): raise ValueError("Passed item and index have different timezone") + # check freq can be preserved on edge cases if self.size and self.freq is not None: if item is NaT: @@ -1356,7 +1204,7 @@ def date_range( name=None, closed=None, **kwargs, -): +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex. @@ -1522,7 +1370,7 @@ def bdate_range( holidays=None, closed=None, **kwargs, -): +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex, with business day as the default frequency. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ce0716e36cdf3..52df491725504 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -103,19 +103,6 @@ def _get_prev_label(label): raise TypeError(f"cannot determine next label for type {repr(type(label))}") -def _get_interval_closed_bounds(interval): - """ - Given an Interval or IntervalIndex, return the corresponding interval with - closed bounds. - """ - left, right = interval.left, interval.right - if interval.open_left: - left = _get_next_label(left) - if interval.open_right: - right = _get_prev_label(right) - return left, right - - def _new_IntervalIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have @@ -675,26 +662,6 @@ def _convert_list_indexer(self, keyarr, kind=None): return locs - def _maybe_cast_indexed(self, key): - """ - we need to cast the key, which could be a scalar - or an array-like to the type of our subtype - """ - if isinstance(key, IntervalIndex): - return key - - subtype = self.dtype.subtype - if is_float_dtype(subtype): - if is_integer(key): - key = float(key) - elif isinstance(key, (np.ndarray, Index)): - key = key.astype("float64") - elif is_integer_dtype(subtype): - if is_integer(key): - key = int(key) - - return key - def _can_reindex(self, indexer: np.ndarray) -> None: """ Check if we are allowing reindexing with this particular indexer. 
@@ -827,34 +794,6 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) - def _find_non_overlapping_monotonic_bounds(self, key): - if isinstance(key, IntervalMixin): - start = self._searchsorted_monotonic( - key.left, "left", exclude_label=key.open_left - ) - stop = self._searchsorted_monotonic( - key.right, "right", exclude_label=key.open_right - ) - elif isinstance(key, slice): - # slice - start, stop = key.start, key.stop - if (key.step or 1) != 1: - raise NotImplementedError("cannot slice with a slice step") - if start is None: - start = 0 - else: - start = self._searchsorted_monotonic(start, "left") - if stop is None: - stop = len(self) - else: - stop = self._searchsorted_monotonic(stop, "right") - else: - # scalar or index-like - - start = self._searchsorted_monotonic(key, "left") - stop = self._searchsorted_monotonic(key, "right") - return start, stop - def get_loc( self, key: Any, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dac9b20104c36..017e1811e786a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,7 +1,7 @@ from collections import OrderedDict import datetime from sys import getsizeof -from typing import List, Optional +from typing import Hashable, List, Optional, Sequence, Union import warnings import numpy as np @@ -2432,7 +2432,53 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): return target, indexer - def get_slice_bound(self, label, side, kind): + def get_slice_bound( + self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str + ) -> int: + """ + For an ordered MultiIndex, compute slice bound + that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if `side=='right') position + of given label. + + Parameters + ---------- + label : object or tuple of objects + side : {'left', 'right'} + kind : {'loc', 'getitem'} + + Returns + ------- + int + Index of label. + + Notes + ----- + This method only works if level 0 index of the MultiIndex is lexsorted. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + + Get the locations from the leftmost 'b' in the first level + until the end of the multiindex: + + >>> mi.get_slice_bound('b', side="left", kind="loc") + 1 + + Like above, but if you get the locations from the rightmost + 'b' in the first level and 'f' in the second level: + + >>> mi.get_slice_bound(('b','f'), side="right", kind="loc") + 3 + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. 
+ """ if not isinstance(label, tuple): label = (label,) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 6465a0c1724af..0cd4b4d4bca8d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -20,7 +20,6 @@ ) from pandas.core.accessor import delegate_names -from pandas.core.algorithms import unique1d from pandas.core.arrays.period import PeriodArray, period_array, validate_dtype_freq from pandas.core.base import _shared_docs import pandas.core.common as com @@ -34,7 +33,8 @@ DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ) -from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index +from pandas.core.indexes.datetimes import DatetimeIndex, Index +from pandas.core.indexes.numeric import Int64Index from pandas.core.missing import isna from pandas.core.ops import get_op_result_name from pandas.core.tools.datetimes import DateParseError, parse_time_string @@ -511,10 +511,6 @@ def searchsorted(self, value, side="left", sorter=None): return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) - @property - def is_all_dates(self) -> bool: - return True - @property def is_full(self) -> bool: """ @@ -622,18 +618,6 @@ def _get_unique_index(self, dropna=False): res = res.dropna() return res - @Appender(Index.unique.__doc__) - def unique(self, level=None): - # override the Index.unique method for performance GH#23083 - if level is not None: - # this should never occur, but is retained to make the signature - # match Index.unique - self._validate_index_level(level) - - values = self._ndarray_values - result = unique1d(values) - return self._shallow_copy(result) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -808,10 +792,6 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return self._apply_meta(result), lidx, ridx return self._apply_meta(result) - @Appender(Index.intersection.__doc__) - def intersection(self, other, sort=False): - return Index.intersection(self, other, sort=sort) - def _assert_can_do_setop(self, other): super()._assert_can_do_setop(other) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 480a4ae34bfb7..8dd8bd8642354 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -3,7 +3,7 @@ import numpy as np -from pandas._libs import NaT, Timedelta, index as libindex, join as libjoin, lib +from pandas._libs import NaT, Timedelta, index as libindex, lib from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import ( @@ -30,10 +30,7 @@ DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, DatetimeTimedeltaMixin, - ea_passthrough, ) -from pandas.core.indexes.numeric import Int64Index -from pandas.core.ops import get_op_result_name from pandas.tseries.frequencies import to_offset @@ -50,9 +47,12 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): "__neg__", "__pos__", "__abs__", + "sum", + "std", + "median", ] _raw_properties = {"components"} - _raw_methods = {"to_pytimedelta"} + _raw_methods = {"to_pytimedelta", "sum", "std", "median"} @delegate_names( @@ -65,11 +65,7 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): overwrite=True, ) class TimedeltaIndex( - DatetimeTimedeltaMixin, - DatetimeIndexOpsMixin, - dtl.TimelikeOps, - Int64Index, - TimedeltaDelegateMixin, + DatetimeTimedeltaMixin, dtl.TimelikeOps, TimedeltaDelegateMixin, ): """ Immutable ndarray of timedelta64 data, represented internally as 
int64, and @@ -124,17 +120,6 @@ class TimedeltaIndex( """ _typ = "timedeltaindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.TimedeltaEngine @@ -151,9 +136,6 @@ def _join_i8_wrapper(joinf, **kwargs): _datetimelike_ops = TimedeltaArray._datetimelike_ops _datetimelike_methods = TimedeltaArray._datetimelike_methods _other_ops = TimedeltaArray._other_ops - sum = ea_passthrough(TimedeltaArray.sum) - std = ea_passthrough(TimedeltaArray.std) - median = ea_passthrough(TimedeltaArray.median) # ------------------------------------------------------------------- # Constructors @@ -255,11 +237,6 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ------------------------------------------------------------------- # Wrapping TimedeltaArray - # Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - @property def _box_func(self): return lambda x: Timedelta(x, unit="ns") @@ -305,98 +282,6 @@ def _union(self, other, sort): result._set_freq("infer") return result - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - if _is_convertible_to_index(other): - try: - other = TimedeltaIndex(other) - except (TypeError, ValueError): - pass - - return Index.join( - self, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def intersection(self, other, sort=False): - """ - Specialized intersection for TimedeltaIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : TimedeltaIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - .. 
versionchanged:: 0.25.0 - - The `sort` keyword is added - - Returns - ------- - y : Index or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - - @Appender(Index.difference.__doc__) - def difference(self, other, sort=None): - new_idx = super().difference(other, sort=sort) - new_idx._set_freq(None) - return new_idx - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, TimedeltaIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined, name=name) - return joined - else: - return self._simple_new(joined, name) - - def _can_fast_union(self, other): - if not isinstance(other, TimedeltaIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - return (right_start == left_end + freq) or right_start in left - def _fast_union(self, other): if len(other) == 0: return self.view(type(self)) @@ -557,10 +442,6 @@ def is_type_compatible(self, typ) -> bool: def inferred_type(self) -> str: return "timedelta64" - @property - def is_all_dates(self) -> bool: - return True - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -645,24 +526,6 @@ def delete(self, loc): TimedeltaIndex._add_datetimelike_methods() -def _is_convertible_to_index(other) -> bool: - """ - return a boolean whether I can attempt conversion to a TimedeltaIndex - """ - if isinstance(other, TimedeltaIndex): - return True - elif len(other) > 0 and other.inferred_type not in ( - "floating", - "mixed-integer", - "integer", - "integer-na", - "mixed-integer-float", - "mixed", - ): - return True - return False - - def timedelta_range( start=None, end=None, periods=None, freq=None, name=None, closed=None ) -> TimedeltaIndex: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b86293e78a80d..ebecb02e20e1a 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Hashable, List, Tuple, Union import numpy as np @@ -2224,7 +2224,7 @@ def _convert_key(self, key, is_setter: bool = False): return key -def _tuplify(ndim: int, loc) -> tuple: +def _tuplify(ndim: int, loc: Hashable) -> Tuple[Union[Hashable, slice], ...]: """ Given an indexer for the first dimension, create an equivalent tuple for indexing over all dimensions. 
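For context on the `_tuplify` signature change above (the body change follows in the next hunk): the helper pads a first-axis indexer out to a full-rank tuple, using a full slice for every other axis. A self-contained sketch mirroring the patched implementation:

from typing import Hashable, List, Tuple, Union

def _tuplify(ndim: int, loc: Hashable) -> Tuple[Union[Hashable, slice], ...]:
    # start with a full slice for every axis, then pin the first axis to `loc`
    _tup: List[Union[Hashable, slice]] = [slice(None, None) for _ in range(ndim)]
    _tup[0] = loc
    return tuple(_tup)

print(_tuplify(3, "a"))
# ('a', slice(None, None, None), slice(None, None, None))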
@@ -2238,9 +2238,10 @@ def _tuplify(ndim: int, loc) -> tuple: ------- tuple """ - tup = [slice(None, None) for _ in range(ndim)] - tup[0] = loc - return tuple(tup) + _tup: List[Union[Hashable, slice]] + _tup = [slice(None, None) for _ in range(ndim)] + _tup[0] = loc + return tuple(_tup) def convert_to_index_sliceable(obj, key): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 664f6ea75a3be..c0f1685076f69 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -380,7 +380,6 @@ def apply(self, func, **kwargs): return nbs if not isinstance(result, Block): - # Exclude the 0-dim case so we can do reductions result = self.make_block(values=_block_shape(result, ndim=self.ndim)) return result @@ -658,9 +657,9 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): if slicer is not None: values = values[:, slicer] mask = isna(values) + itemsize = writers.word_len(na_rep) - if not self.is_object and not quoting: - itemsize = writers.word_len(na_rep) + if not self.is_object and not quoting and itemsize: values = values.astype(f" 1: + # GH#12513 a EA dtype passed with a 2D array, split into + # multiple EAs that view the values + values = [values[:, n] for n in range(values.shape[1])] + else: + values = [values] + if columns is None: - columns = [0] - return arrays_to_mgr([values], columns, index, columns, dtype=dtype) + columns = list(range(len(values))) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9729f172183e7..0d2e2fbfd8ddd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -340,6 +340,32 @@ def _verify_integrity(self): f"tot_items: {tot_items}" ) + def reduce(self, func, *args, **kwargs): + # If 2D, we assume that we're operating column-wise + if self.ndim == 1: + # we'll be returning a scalar + blk = self.blocks[0] + return func(blk.values, *args, **kwargs) + + res = {} + for blk in self.blocks: + bres = func(blk.values, *args, **kwargs) + + if np.ndim(bres) == 0: + # EA + assert blk.shape[0] == 1 + new_res = zip(blk.mgr_locs.as_array, [bres]) + else: + assert bres.ndim == 1, bres.shape + assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs) + new_res = zip(blk.mgr_locs.as_array, bres) + + nr = dict(new_res) + assert not any(key in res for key in nr) + res.update(nr) + + return res + def apply(self, f, filter=None, **kwargs): """ Iterate over the blocks, collect and create a new BlockManager. 
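The new `BlockManager.reduce` above applies the reduction block by block and uses `blk.mgr_locs` to map each block's one-value-per-column result back to the original column positions. A self-contained sketch of that stitching step, with plain `(locs, values)` tuples standing in for real Block objects (illustrative names, not pandas internals):

    import numpy as np

    # Two "blocks": a float block owning columns 0 and 2, and an int block
    # owning column 1. Block values are (n_columns_in_block, n_rows), as in
    # pandas internals, so axis=1 reduces each column the block owns.
    blocks = [
        (np.array([0, 2]), np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])),
        (np.array([1]), np.array([[10, 20, 30]])),
    ]

    res = {}
    for locs, values in blocks:
        bres = values.sum(axis=1)      # one reduced value per owned column
        res.update(zip(locs, bres))    # stitch back to original positions

    # res == {0: 6.0, 2: 15.0, 1: 60}: column positions are recovered from
    # the block locations, not from block order.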
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 1079f516a4e40..584972f2b2dd5 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -831,7 +831,7 @@ def reduction(values, axis=None, skipna=True, mask=None): try: result = getattr(values, meth)(axis, dtype=dtype_max) result.fill(np.nan) - except (AttributeError, TypeError, ValueError, np.core._internal.AxisError): + except (AttributeError, TypeError, ValueError): result = np.nan else: result = getattr(values, meth)(axis) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index be5e53eaa6721..1b868f7c10602 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -302,7 +302,7 @@ def _get_op_name(op, special): """ opname = op.__name__.strip("_") if special: - opname = "__{opname}__".format(opname=opname) + opname = f"__{opname}__" return opname @@ -385,7 +385,7 @@ def column_op(a, b): return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries) and axis == "columns": - # We only get here if called via _combine_frame_series, + # We only get here if called via _combine_series_frame, # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) @@ -603,9 +603,7 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=N result : DataFrame """ if fill_value is not None: - raise NotImplementedError( - "fill_value {fill} not supported.".format(fill=fill_value) - ) + raise NotImplementedError(f"fill_value {fill_value} not supported.") if axis is None: # default axis is columns @@ -661,15 +659,13 @@ def to_series(right): else: raise ValueError( "Unable to coerce to DataFrame, shape " - "must be {req_shape}: given {given_shape}".format( - req_shape=left.shape, given_shape=right.shape - ) + f"must be {left.shape}: given {right.shape}" ) elif right.ndim > 2: raise ValueError( "Unable to coerce to Series/DataFrame, dim " - "must be <= 2: {dim}".format(dim=right.shape) + f"must be <= 2: {right.shape}" ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): @@ -702,7 +698,11 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): # Another DataFrame pass_op = op if should_series_dispatch(self, other, op) else na_op pass_op = pass_op if not is_logical else op - return self._combine_frame(other, pass_op, fill_value, level) + + left, right = self.align(other, join="outer", level=level, copy=False) + new_data = left._combine_frame(right, pass_op, fill_value) + return left._construct_result(new_data) + elif isinstance(other, ABCSeries): # For these values of `axis`, we end up dispatching to Series op, # so do not want the masked op. 
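The frame-with-frame arithmetic path above now performs the outer alignment explicitly (`self.align(other, join="outer", level=level, copy=False)`) and only then combines, rather than leaving alignment buried inside `_combine_frame`. A small public-API illustration of what that alignment step does, on toy frames:

    import pandas as pd

    left = pd.DataFrame({"a": [1, 2]}, index=[0, 1])
    right = pd.DataFrame({"b": [10]}, index=[1])

    l, r = left.align(right, join="outer")
    # l and r now share the union of both axes: columns ["a", "b"] and index
    # [0, 1], with missing entries as NaN, so the subsequent block-wise op
    # can proceed purely positionally.
    result = l + r  # all-NaN here, since no cell has data on both sides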
@@ -763,7 +763,7 @@ def _comp_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - @Appender("Wrapper for comparison method {name}".format(name=op_name)) + @Appender(f"Wrapper for comparison method {op_name}") def f(self, other): other = _align_method_FRAME(self, other, axis=None) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 9ae0aa930779b..931653b63af36 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -96,7 +96,7 @@ def __str__(self) -> str: ) return f"{type(self).__name__} [{', '.join(attrs)}]" - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self._attributes: @@ -131,7 +131,7 @@ def ax(self): return self.groupby.ax @property - def _typ(self): + def _typ(self) -> str: """ Masquerade for compat as a Series or a DataFrame. """ @@ -140,7 +140,7 @@ def _typ(self): return "dataframe" @property - def _from_selection(self): + def _from_selection(self) -> bool: """ Is the resampling from a DataFrame column or MultiIndex level. """ @@ -316,7 +316,7 @@ def _downsample(self, f): def _upsample(self, f, limit=None, fill_value=None): raise AbstractMethodError(self) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim: int, subset=None): """ Sub-classes to define. Return a sliced object. @@ -1407,7 +1407,7 @@ def _get_resampler(self, obj, kind=None): f"but got an instance of '{type(ax).__name__}'" ) - def _get_grouper(self, obj, validate=True): + def _get_grouper(self, obj, validate: bool = True): # create the resampler and return our binner r = self._get_resampler(obj) r._set_binner() diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index cea70012b47ea..a3d9dbfba9e71 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -472,9 +472,9 @@ def _get_result_dim(self) -> int: else: return self.objs[0].ndim - def _get_new_axes(self): + def _get_new_axes(self) -> List[Index]: ndim = self._get_result_dim() - new_axes = [None] * ndim + new_axes: List = [None] * ndim for i in range(ndim): if i == self.axis: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 37ec05c40940e..6fe2287923fcb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -41,6 +41,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex +from pandas.core import groupby import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -68,7 +69,7 @@ def merge( copy: bool = True, indicator: bool = False, validate=None, -): +) -> "DataFrame": op = _MergeOperation( left, right, @@ -113,6 +114,7 @@ def _groupby_and_merge( by = [by] lby = left.groupby(by, sort=False) + rby: Optional[groupby.DataFrameGroupBy] = None # if we can groupby the rhs # then we can get vastly better perf @@ -132,7 +134,7 @@ def _groupby_and_merge( try: rby = right.groupby(by, sort=False) except KeyError: - rby = None + pass for key, lhs in lby: @@ -183,7 +185,7 @@ def merge_ordered( fill_method=None, suffixes=("_x", "_y"), how: str = "outer", -): +) -> "DataFrame": """ Perform merge with optional filling/interpolation. @@ -317,7 +319,7 @@ def merge_asof( tolerance=None, allow_exact_matches: bool = True, direction: str = "backward", -): +) -> "DataFrame": """ Perform an asof merge. 
This is similar to a left-join except that we match on nearest key rather than equal keys. diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 4b21045cd0217..2eb2990bd58c4 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -35,7 +35,7 @@ def pivot_table( dropna=True, margins_name="All", observed=False, -): +) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) @@ -148,7 +148,9 @@ def pivot_table( table = table.sort_index(axis=1) if fill_value is not None: - table = table.fillna(value=fill_value, downcast="infer") + filled = table.fillna(value=fill_value, downcast="infer") + assert filled is not None # needed for mypy + table = filled if margins: if dropna: @@ -426,7 +428,7 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) -def pivot(data: "DataFrame", index=None, columns=None, values=None): +def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": if values is None: cols = [columns] if index is None else [index, columns] append = index is None diff --git a/pandas/core/series.py b/pandas/core/series.py index 15fc712672717..aa5af9bb893fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -59,7 +59,7 @@ is_empty_data, sanitize_array, ) -from pandas.core.generic import _shared_docs +from pandas.core.groupby import generic as groupby_generic from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import ( @@ -1431,7 +1431,7 @@ def to_string( """ ) @Substitution(klass="Series") - @Appender(_shared_docs["to_markdown"]) + @Appender(generic._shared_docs["to_markdown"]) def to_markdown( self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs, ) -> Optional[str]: @@ -1568,6 +1568,89 @@ def _set_name(self, name, inplace=False): ser.name = name return ser + @Appender( + """ +Examples +-------- +>>> ser = pd.Series([390., 350., 30., 20.], +... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed") +>>> ser +Falcon 390.0 +Falcon 350.0 +Parrot 30.0 +Parrot 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(["a", "b", "a", "b"]).mean() +a 210.0 +b 185.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(ser > 100).mean() +Max Speed +False 25.0 +True 370.0 +Name: Max Speed, dtype: float64 + +**Grouping by Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... 
['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") +>>> ser +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Animal +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level="Type").mean() +Type +Captive 210.0 +Wild 185.0 +Name: Max Speed, dtype: float64 +""" + ) + @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "groupby_generic.SeriesGroupBy": + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return groupby_generic.SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + # ---------------------------------------------------------------------- # Statistics, overridden ndarray methods @@ -4005,8 +4088,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs, - ): + ) -> Optional["Series"]: return super().fillna( value=value, method=method, @@ -4014,7 +4096,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs, ) @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index fa3582755a202..43655fa3ea913 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -85,11 +85,12 @@ def hash_pandas_object( if isinstance(obj, ABCMultiIndex): return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) - if isinstance(obj, ABCIndexClass): + elif isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False ) h = Series(h, index=obj, dtype="uint64", copy=False) + elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5b0fbbb3518d2..176406f953f67 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -184,7 +184,7 @@ def _gotitem(self, key, ndim, subset=None): self._selection = key return self - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 553334407d12e..fe13fce83161d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -297,9 +297,7 @@ def read_excel( for arg in ("sheet", "sheetname", "parse_cols"): if arg in kwds: - raise TypeError( - "read_excel() got an unexpected keyword argument `{}`".format(arg) - ) + raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`") if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -429,7 +427,7 @@ def parse( for asheetname in sheets: if verbose: - print("Reading sheet {sheet}".format(sheet=asheetname)) + print(f"Reading sheet {asheetname}") if isinstance(asheetname, str): sheet = self.get_sheet_by_name(asheetname) @@ -622,11 +620,11 @@ def __new__(cls, path, engine=None, **kwargs): ext 
= "xlsx" try: - engine = config.get_option("io.excel.{ext}.writer".format(ext=ext)) + engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": engine = _get_default_writer(ext) except KeyError: - raise ValueError("No engine for filetype: '{ext}'".format(ext=ext)) + raise ValueError(f"No engine for filetype: '{ext}'") cls = get_writer(engine) return object.__new__(cls) @@ -757,9 +755,8 @@ def check_extension(cls, ext): if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = "Invalid extension for engine '{engine}': '{ext}'".format( - engine=pprint_thing(cls.engine), ext=pprint_thing(ext) - ) + msg = "Invalid extension for engine" + f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'" raise ValueError(msg) else: return True @@ -802,7 +799,7 @@ def __init__(self, io, engine=None): if engine is None: engine = "xlrd" if engine not in self._engines: - raise ValueError("Unknown engine: {engine}".format(engine=engine)) + raise ValueError(f"Unknown engine: {engine}") self.engine = engine # could be a str, ExcelFile, Book, etc. diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 6b9943136664a..2afb41e7bdc7e 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -178,4 +178,4 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: elif cell_type == "time": return pd.to_datetime(str(cell)).time() else: - raise ValueError("Unrecognized type {}".format(cell_type)) + raise ValueError(f"Unrecognized type {cell_type}") diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7a264ed2b0850..be52523e486af 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -99,7 +99,7 @@ def _convert_to_style_kwargs(cls, style_dict): for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] - _conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None) + _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None) new_v = _conv_to_x(v) if new_v: style_kwargs[k] = new_v diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index ee617d2013136..8cd4b2012cb42 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -48,7 +48,7 @@ def get_writer(engine_name): try: return _writers[engine_name] except KeyError: - raise ValueError("No Excel writer '{engine}'".format(engine=engine_name)) + raise ValueError(f"No Excel writer '{engine_name}'") def _excel2num(x): @@ -76,7 +76,7 @@ def _excel2num(x): cp = ord(c) if cp < ord("A") or cp > ord("Z"): - raise ValueError("Invalid column name: {x}".format(x=x)) + raise ValueError(f"Invalid column name: {x}") index = index * 26 + cp - ord("A") + 1 diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 996ae1caa14c8..d102a885cef0a 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -97,20 +97,20 @@ def _style_to_xlwt( if hasattr(item, "items"): if firstlevel: it = [ - "{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key}: {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(line_sep).join(it)) + out = f"{(line_sep).join(it)} " return out else: it = [ - "{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key} {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(field_sep).join(it)) + out = f"{(field_sep).join(it)} " return out else: - item = "{item}".format(item=item) + item 
= f"{item}" item = item.replace("True", "on") item = item.replace("False", "off") return item diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index aa14c3f3a63f3..c0596c984575a 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -3,13 +3,14 @@ from collections import defaultdict import copy -from typing import DefaultDict, Dict, List, Optional, Union +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union import numpy as np from pandas._libs.writers import convert_json_to_lines from pandas.util._decorators import deprecate +import pandas as pd from pandas import DataFrame @@ -112,13 +113,13 @@ def nested_to_record( def _json_normalize( data: Union[Dict, List[Dict]], record_path: Optional[Union[str, List]] = None, - meta: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List[Union[str, List[str]]]]] = None, meta_prefix: Optional[str] = None, record_prefix: Optional[str] = None, errors: Optional[str] = "raise", sep: str = ".", max_level: Optional[int] = None, -): +) -> "DataFrame": """ Normalize semi-structured JSON data into a flat table. @@ -229,14 +230,23 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. """ - def _pull_field(js, spec): - result = js + def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + result = js # type: ignore if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] + if not isinstance(result, Iterable): + if pd.isnull(result): + result = [] # type: ignore + else: + raise TypeError( + f"{js} has non iterable value {result} for path {spec}. " + "Must be iterable or null." + ) + return result if isinstance(data, list) and not data: @@ -265,21 +275,21 @@ def _pull_field(js, spec): elif not isinstance(meta, list): meta = [meta] - meta = [m if isinstance(m, list) else [m] for m in meta] + _meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records: List = [] lengths = [] meta_vals: DefaultDict = defaultdict(list) - meta_keys = [sep.join(val) for val in meta] + meta_keys = [sep.join(val) for val in _meta] def _recursive_extract(data, path, seen_meta, level=0): if isinstance(data, dict): data = [data] if len(path) > 1: for obj in data: - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) @@ -296,7 +306,7 @@ def _recursive_extract(data, path, seen_meta, level=0): # For repeating the metadata later lengths.append(len(recs)) - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index db8d9eb669c20..4f1541e8d127e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1459,7 +1459,7 @@ def copy( data = self.select(k) if isinstance(s, Table): - index: Union[bool, list] = False + index: Union[bool, List[str]] = False if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b619ea93b981d..47805207862f0 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -241,7 +241,7 @@ def read_sql_table( try: meta.reflect(only=[table_name], views=True) except sqlalchemy.exc.InvalidRequestError: - raise ValueError("Table {name} not found".format(name=table_name)) + raise ValueError(f"Table {table_name} not found") pandas_sql = 
SQLDatabase(con, meta=meta) table = pandas_sql.read_table( @@ -256,7 +256,7 @@ def read_sql_table( if table is not None: return table else: - raise ValueError("Table {name} not found".format(name=table_name), con) + raise ValueError(f"Table {table_name} not found", con) def read_sql_query( @@ -498,7 +498,7 @@ def to_sql( .. versionadded:: 0.24.0 """ if if_exists not in ("fail", "replace", "append"): - raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) + raise ValueError(f"'{if_exists}' is not valid for if_exists") pandas_sql = pandasSQL_builder(con, schema=schema) @@ -625,7 +625,7 @@ def __init__( self.table = self.pd_sql.get_table(self.name, self.schema) if self.table is None: - raise ValueError("Could not init table '{name}'".format(name=name)) + raise ValueError(f"Could not init table '{name}'") def exists(self): return self.pd_sql.has_table(self.name, self.schema) @@ -643,18 +643,14 @@ def _execute_create(self): def create(self): if self.exists(): if self.if_exists == "fail": - raise ValueError( - "Table '{name}' already exists.".format(name=self.name) - ) + raise ValueError(f"Table '{self.name}' already exists.") elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() elif self.if_exists == "append": pass else: - raise ValueError( - "'{0}' is not valid for if_exists".format(self.if_exists) - ) + raise ValueError(f"'{self.if_exists}' is not valid for if_exists") else: self._execute_create() @@ -689,7 +685,7 @@ def insert_data(self): try: temp.reset_index(inplace=True) except ValueError as err: - raise ValueError("duplicate name in index/columns: {0}".format(err)) + raise ValueError(f"duplicate name in index/columns: {err}") else: temp = self.frame @@ -732,7 +728,7 @@ def insert(self, chunksize=None, method=None): elif callable(method): exec_insert = partial(method, self) else: - raise ValueError("Invalid parameter `method`: {}".format(method)) + raise ValueError(f"Invalid parameter `method`: {method}") keys, data_list = self.insert_data() @@ -826,7 +822,7 @@ def _index_name(self, index, index_label): if len(index_label) != nlevels: raise ValueError( "Length of 'index_label' should match number of " - "levels, which is {0}".format(nlevels) + f"levels, which is {nlevels}" ) else: return index_label @@ -839,7 +835,7 @@ def _index_name(self, index, index_label): return ["index"] else: return [ - l if l is not None else "level_{0}".format(i) + l if l is not None else f"level_{i}" for i, l in enumerate(self.frame.index.names) ] @@ -1304,10 +1300,7 @@ def to_sql( for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): - raise ValueError( - "The type of {column} is not a " - "SQLAlchemy type ".format(column=col) - ) + raise ValueError(f"The type of {col} is not a SQLAlchemy type") table = SQLTable( name, @@ -1331,11 +1324,11 @@ def to_sql( ) if name not in table_names: msg = ( - "The provided table name '{0}' is not found exactly as " + f"The provided table name '{name}' is not found exactly as " "such in the database after writing the table, possibly " "due to case sensitivity issues. Consider using lower " "case table names." 
- ).format(name) + ) warnings.warn(msg, UserWarning) @property @@ -1395,9 +1388,7 @@ def _get_unicode_name(name): try: uname = str(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError: - raise ValueError( - "Cannot convert identifier to UTF-8: '{name}'".format(name=name) - ) + raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") return uname @@ -1461,8 +1452,8 @@ def insert_statement(self): bracketed_names = [escape(column) for column in names] col_names = ",".join(bracketed_names) wildcards = ",".join([wld] * len(names)) - insert_statement = "INSERT INTO {table} ({columns}) VALUES ({wld})".format( - table=escape(self.name), columns=col_names, wld=wildcards + insert_statement = ( + f"INSERT INTO {escape(self.name)} ({col_names}) VALUES ({wildcards})" ) return insert_statement @@ -1496,9 +1487,7 @@ def _create_table_setup(self): keys = self.keys cnames_br = ", ".join(escape(c) for c in keys) create_tbl_stmts.append( - "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( - tbl=self.name, cnames_br=cnames_br - ) + f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})" ) create_stmts = [ @@ -1599,14 +1588,11 @@ def execute(self, *args, **kwargs): self.con.rollback() except Exception as inner_exc: # pragma: no cover ex = DatabaseError( - "Execution failed on sql: {sql}\n{exc}\nunable " - "to rollback".format(sql=args[0], exc=exc) + f"Execution failed on sql: {args[0]}\n{exc}\nunable to rollback" ) raise ex from inner_exc - ex = DatabaseError( - "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc) - ) + ex = DatabaseError(f"Execution failed on sql '{args[0]}': {exc}") raise ex from exc @staticmethod @@ -1731,11 +1717,7 @@ def to_sql( if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): - raise ValueError( - "{column} ({type!s}) not a string".format( - column=col, type=my_type - ) - ) + raise ValueError(f"{col} ({my_type}) not a string") table = SQLiteTable( name, @@ -1755,9 +1737,7 @@ def has_table(self, name, schema=None): # esc_name = escape(name) wld = "?" - query = ( - "SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" - ).format(wld=wld) + query = f"SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" return len(self.execute(query, [name]).fetchall()) > 0 @@ -1765,7 +1745,7 @@ def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - drop_sql = "DROP TABLE {name}".format(name=_get_valid_sqlite_name(name)) + drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 1f8c6968359c1..b216ee80c3940 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -85,7 +85,7 @@ iterator : bool, default False Return StataReader object.""" -_read_stata_doc = """ +_read_stata_doc = f""" Read Stata file into DataFrame. Parameters @@ -100,10 +100,10 @@ By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. -%s -%s -%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +{_iterator_params} Returns ------- @@ -125,33 +125,24 @@ >>> itr = pd.read_stata('filename.dta', chunksize=10000) >>> for chunk in itr: ... 
do_something(chunk) -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, - _chunksize_params, - _iterator_params, -) +""" -_read_method_doc = """\ +_read_method_doc = f"""\ Reads observations from Stata file, converting them into a dataframe Parameters ---------- nrows : int Number of lines to read from data file, if None read whole file. -%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} Returns ------- DataFrame -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, -) - +""" -_stata_reader_doc = """\ +_stata_reader_doc = f"""\ Class for reading Stata dta files. Parameters @@ -161,14 +152,10 @@ implementing a binary read() functions. .. versionadded:: 0.23.0 support for pathlib, py.path. -%s -%s -%s -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, - _chunksize_params, -) +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +""" @Appender(_read_stata_doc) @@ -370,7 +357,7 @@ def convert_delta_safe(base, deltas, unit): month = np.ones_like(dates) conv_dates = convert_year_month_safe(year, month) else: - raise ValueError("Date fmt {fmt} not understood".format(fmt=fmt)) + raise ValueError(f"Date fmt {fmt} not understood") if has_bad_values: # Restore NaT for bad values conv_dates[bad_locs] = NaT @@ -465,9 +452,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d = parse_dates_safe(dates, year=True) conv_dates = d.year else: - raise ValueError( - "Format {fmt} is not a known Stata date format".format(fmt=fmt) - ) + raise ValueError(f"Format {fmt} is not a known Stata date format") conv_dates = Series(conv_dates, dtype=np.float64) missing_value = struct.unpack("= 2 ** 53: - ws = precision_loss_doc % ("uint64", "float64") + ws = precision_loss_doc.format("uint64", "float64") data[col] = data[col].astype(dtype) @@ -585,25 +570,21 @@ def _cast_to_stata_types(data): else: data[col] = data[col].astype(np.float64) if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53): - ws = precision_loss_doc % ("int64", "float64") + ws = precision_loss_doc.format("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() if np.isinf(value): raise ValueError( - "Column {col} has a maximum value of " - "infinity which is outside the range " - "supported by Stata.".format(col=col) + f"Column {col} has a maximum value of infinity which is outside " + "the range supported by Stata." ) if dtype == np.float32 and value > float32_max: data[col] = data[col].astype(np.float64) elif dtype == np.float64: if value > float64_max: raise ValueError( - "Column {col} has a maximum value " - "({val}) outside the range supported by " - "Stata ({float64_max})".format( - col=col, val=value, float64_max=float64_max - ) + f"Column {col} has a maximum value ({value}) outside the range " + f"supported by Stata ({float64_max})" ) if ws: @@ -618,26 +599,18 @@ class StataValueLabel: Parameters ---------- - value : int8, int16, int32, float32 or float64 - The Stata missing value code - - Attributes - ---------- - string : string - String representation of the Stata missing value - value : int8, int16, int32, float32 or float64 - The original encoded missing value - - Methods - ------- - generate_value_label - + catarray : Categorical + Categorical Series to encode + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. 
""" - def __init__(self, catarray): + def __init__(self, catarray, encoding="latin-1"): + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") self.labname = catarray.name - + self._encoding = encoding categories = catarray.cat.categories self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) @@ -656,7 +629,7 @@ def __init__(self, catarray): value_label_mismatch_doc.format(catarray.name), ValueLabelTypeMismatch, ) - + category = category.encode(encoding) self.off.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding self.val.append(vl[0]) @@ -683,31 +656,31 @@ def _encode(self, s): """ return s.encode(self._encoding) - def generate_value_label(self, byteorder, encoding): + def generate_value_label(self, byteorder): """ + Generate the binary representation of the value labals. + Parameters ---------- byteorder : str Byte order of the output - encoding : str - File encoding Returns ------- value_label : bytes Bytes containing the formatted value label """ - - self._encoding = encoding + encoding = self._encoding bio = BytesIO() - null_string = "\x00" null_byte = b"\x00" # len bio.write(struct.pack(byteorder + "i", self.len)) # labname - labname = self._encode(_pad_bytes(self.labname[:32], 33)) + labname = self.labname[:32].encode(encoding) + lab_len = 32 if encoding not in ("utf-8", "utf8") else 128 + labname = _pad_bytes(labname, lab_len + 1) bio.write(labname) # padding - 3 bytes @@ -731,7 +704,7 @@ def generate_value_label(self, byteorder, encoding): # txt - Text labels, null terminated for text in self.txt: - bio.write(self._encode(text + null_string)) + bio.write(text + null_byte) bio.seek(0) return bio.read() @@ -1007,6 +980,22 @@ def __init__(self): "typedef", "typename", "virtual", + "_all", + "_N", + "_skip", + "_b", + "_pi", + "str#", + "in", + "_pred", + "strL", + "_coef", + "_rc", + "using", + "_cons", + "_se", + "with", + "_n", ) @@ -1192,7 +1181,7 @@ def f(typ): try: return self.TYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata types [{0}]".format(typ)) + raise ValueError(f"cannot convert stata types [{typ}]") typlist = [f(x) for x in raw_typlist] @@ -1202,7 +1191,7 @@ def f(typ): try: return self.DTYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata dtype [{0}]".format(typ)) + raise ValueError(f"cannot convert stata dtype [{typ}]") dtyplist = [f(x) for x in raw_typlist] @@ -1330,19 +1319,13 @@ def _read_old_header(self, first_char): try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata types [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_types = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata types [{invalid_types}]") try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata dtypes [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_dtypes = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") if self.format_version > 108: self.varlist = [ @@ -1415,12 +1398,13 @@ def _decode(self, s): except UnicodeDecodeError: # GH 25960, fallback to handle incorrect format produced when 117 # files are converted to 118 files in Stata - msg = """ + encoding = self._encoding + msg = f""" One or more strings in the dta file could not be decoded using {encoding}, and so 
the fallback encoding of latin-1 is being used. This can happen when a file has been incorrectly encoded by Stata or some other software. You should verify the string values returned are correct.""" - warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) + warnings.warn(msg, UnicodeWarning) return s.decode("latin-1") def _read_value_labels(self): @@ -1794,7 +1778,7 @@ def _do_convert_categoricals( repeats = list(vc.index[vc > 1]) repeats = "-" * 80 + "\n" + "\n".join(repeats) # GH 25772 - msg = """ + msg = f""" Value labels for column {col} are not unique. These cannot be converted to pandas categoricals. @@ -1805,7 +1789,7 @@ def _do_convert_categoricals( The repeated labels are: {repeats} """ - raise ValueError(msg.format(col=col, repeats=repeats)) + raise ValueError(msg) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) @@ -1874,13 +1858,15 @@ def _set_endianness(endianness): elif endianness.lower() in [">", "big"]: return ">" else: # pragma : no cover - raise ValueError("Endianness {endian} not understood".format(endian=endianness)) + raise ValueError(f"Endianness {endianness} not understood") def _pad_bytes(name, length): """ Take a char string and pads it with null bytes until it's length chars. """ + if isinstance(name, bytes): + return name + b"\x00" * (length - len(name)) return name + "\x00" * (length - len(name)) @@ -1906,7 +1892,7 @@ def _convert_datetime_to_stata_type(fmt): ]: return np.float64 # Stata expects doubles for SIFs else: - raise NotImplementedError("Format {fmt} not implemented".format(fmt=fmt)) + raise NotImplementedError(f"Format {fmt} not implemented") def _maybe_convert_to_int_keys(convert_dates, varlist): @@ -1956,9 +1942,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype == np.int8: return 251 else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): @@ -1985,24 +1969,12 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False if force_strl: return "%9s" if dtype.type == np.object_: - inferred_dtype = infer_dtype(column, skipna=True) - if not (inferred_dtype in ("string", "unicode") or len(column) == 0): - raise ValueError( - "Column `{col}` cannot be exported.\n\nOnly " - "string-like object arrays containing all " - "strings or a mix of strings and None can be " - "exported. Object arrays containing only null " - "values are prohibited. 
Other object types" - "cannot be exported and must first be converted " - "to one of the supported " - "types.".format(col=column.name) - ) itemsize = max_len_string_array(ensure_object(column.values)) if itemsize > max_str_len: if dta_version >= 117: return "%9s" else: - raise ValueError(excessive_string_length_error % column.name) + raise ValueError(excessive_string_length_error.format(column.name)) return "%" + str(max(itemsize, 1)) + "s" elif dtype == np.float64: return "%10.0g" @@ -2013,9 +1985,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False elif dtype == np.int8 or dtype == np.int16: return "%8.0g" else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") class StataWriter(StataParser): @@ -2043,8 +2013,6 @@ class StataWriter(StataParser): timezone information write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Only latin-1 and ascii are supported. byteorder : str Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime @@ -2086,6 +2054,7 @@ class StataWriter(StataParser): """ _max_string_length = 244 + _encoding = "latin-1" def __init__( self, @@ -2101,7 +2070,6 @@ def __init__( super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - self._encoding = "latin-1" self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2136,7 +2104,8 @@ def _prepare_categoricals(self, data): data_formatted = [] for col, col_is_cat in zip(data, is_cat): if col_is_cat: - self._value_labels.append(StataValueLabel(data[col])) + svl = StataValueLabel(data[col], encoding=self._encoding) + self._value_labels.append(svl) dtype = data[col].cat.codes.dtype if dtype == np.int64: raise ValueError( @@ -2181,6 +2150,36 @@ def _update_strl_names(self): """No-op, forward compatibility""" pass + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9 + and _. + """ + for c in name: + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") + return name + def _check_column_names(self, data): """ Checks column names to ensure that they are valid Stata column names. 
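`_check_column_names` (next hunk) now delegates the per-character sanitization to this new `_validate_variable_name` hook, which `StataWriter118` later overrides to keep high unicode code points. A compact standalone sketch of the 114/117 rule, where anything outside `a-z`, `A-Z`, `0-9` and `_` becomes an underscore (toy function with the same logic):

    def validate_variable_name(name: str) -> str:
        # Stata 114/117 variable names: ascii letters, digits and underscore.
        return "".join(
            c if ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9" or c == "_")
            else "_"
            for c in name
        )

    # validate_variable_name("max speed (m/s)") == "max_speed__m_s_"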
@@ -2204,14 +2203,7 @@ def _check_column_names(self, data): if not isinstance(name, str): name = str(name) - for c in name: - if ( - (c < "A" or c > "Z") - and (c < "a" or c > "z") - and (c < "0" or c > "9") - and c != "_" - ): - name = name.replace(c, "_") + name = self._validate_variable_name(name) # Variable name must not be a reserved word if name in self.RESERVED_WORDS: @@ -2251,7 +2243,7 @@ def _check_column_names(self, data): orig_name = orig_name.encode("utf-8") except (UnicodeDecodeError, AttributeError): pass - msg = "{0} -> {1}".format(orig_name, name) + msg = f"{orig_name} -> {name}" conversion_warning.append(msg) ws = invalid_name_doc.format("\n ".join(conversion_warning)) @@ -2262,12 +2254,12 @@ def _check_column_names(self, data): return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): - self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col])) - self.typlist.append(_dtype_to_stata_type(dtype, data[col])) + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) + self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) def _prepare_pandas(self, data): # NOTE: we might need a different API / class for pandas objects so @@ -2311,17 +2303,57 @@ def _prepare_pandas(self, data): new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) dtypes[key] = np.dtype(new_type) - self._set_formats_and_types(data, dtypes) + # Verify object arrays are strings and encode to bytes + self._encode_strings() + + self._set_formats_and_types(dtypes) # set the given format for the datetime cols if self._convert_dates is not None: for key in self._convert_dates: self.fmtlist[key] = self._convert_dates[key] + def _encode_strings(self): + """ + Encode strings in dta-specific encoding + + Do not encode columns marked for date conversion or for strL + conversion. The strL converter independently handles conversion and + also accepts empty string arrays. + """ + convert_dates = self._convert_dates + # _convert_strl is not available in dta 114 + convert_strl = getattr(self, "_convert_strl", []) + for i, col in enumerate(self.data): + # Skip columns marked for date conversion or strl conversion + if i in convert_dates or col in convert_strl: + continue + column = self.data[col] + dtype = column.dtype + if dtype.type == np.object_: + inferred_dtype = infer_dtype(column, skipna=True) + if not ((inferred_dtype in ("string", "unicode")) or len(column) == 0): + col = column.name + raise ValueError( + f"""\ +Column `{col}` cannot be exported.\n\nOnly string-like object arrays +containing all strings or a mix of strings and None can be exported. +Object arrays containing only null values are prohibited. 
Other object +types cannot be exported and must first be converted to one of the +supported types.""" + ) + encoded = self.data[col].str.encode(self._encoding) + # If larger than _max_string_length do nothing + if ( + max_len_string_array(ensure_object(encoded.values)) + <= self._max_string_length + ): + self.data[col] = encoded + def write_file(self): self._file, self._own_file = _open_file_binary_write(self._fname) try: - self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) + self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) self._write_map() self._write_variable_types() self._write_varnames() @@ -2344,9 +2376,8 @@ def write_file(self): os.unlink(self._fname) except OSError: warnings.warn( - "This save was not successful but {0} could not " - "be deleted. This file is not " - "valid.".format(self._fname), + f"This save was not successful but {self._fname} could not " + "be deleted. This file is not valid.", ResourceWarning, ) raise exc @@ -2392,7 +2423,7 @@ def _write_expansion_fields(self): def _write_value_labels(self): for vl in self._value_labels: - self._file.write(vl.generate_value_label(self._byteorder, self._encoding)) + self._file.write(vl.generate_value_label(self._byteorder)) def _write_header(self, data_label=None, time_stamp=None): byteorder = self._byteorder @@ -2494,9 +2525,8 @@ def _write_variable_labels(self): is_latin1 = all(ord(c) < 256 for c in label) if not is_latin1: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + "can be encoded in Latin-1" ) self._write(_pad_bytes(label, 81)) else: @@ -2527,9 +2557,9 @@ def _prepare_data(self): typ = typlist[i] if typ <= self._max_string_length: data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) - stype = "S{type}".format(type=typ) + stype = f"S{typ}" dtypes[col] = stype - data[col] = data[col].str.encode(self._encoding).astype(stype) + data[col] = data[col].astype(stype) else: dtype = data[col].dtype if not native_byteorder: @@ -2715,12 +2745,6 @@ def generate_table(self): return gso_table, gso_df - def _encode(self, s): - """ - Python 3 compatibility shim - """ - return s.encode(self._encoding) - def generate_blob(self, gso_table): """ Generates the binary blob of GSOs that is written to the dta file. 
@@ -2860,6 +2884,7 @@ class StataWriter117(StataWriter): """ _max_string_length = 2045 + _dta_version = 117 def __init__( self, @@ -2906,18 +2931,21 @@ def _write_header(self, data_label=None, time_stamp=None): self._file.write(bytes("", "utf-8")) bio = BytesIO() # ds_format - 117 - bio.write(self._tag(bytes("117", "utf-8"), "release")) + bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) # number of vars, 2 bytes assert self.nvar < 2 ** 16 bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K")) - # number of obs, 4 bytes - bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), "N")) + # 117 uses 4 bytes, 118 uses 8 + nobs_size = "I" if self._dta_version == 117 else "Q" + bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) # data label 81 bytes, char, null terminated label = data_label[:80] if data_label is not None else "" - label_len = struct.pack(byteorder + "B", len(label)) - label = label_len + bytes(label, "utf-8") + label = label.encode(self._encoding) + label_size = "B" if self._dta_version == 117 else "H" + label_len = struct.pack(byteorder + label_size, len(label)) + label = label_len + label bio.write(self._tag(label, "label")) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm @@ -2947,7 +2975,7 @@ def _write_header(self, data_label=None, time_stamp=None): + time_stamp.strftime(" %Y %H:%M") ) # '\x11' added due to inspection of Stata file - ts = b"\x11" + bytes(ts, "utf8") + ts = b"\x11" + bytes(ts, "utf-8") bio.write(self._tag(ts, "timestamp")) bio.seek(0) self._file.write(self._tag(bio.read(), "header")) @@ -2994,9 +3022,11 @@ def _write_variable_types(self): def _write_varnames(self): self._update_map("varnames") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vn_len = 32 if self._dta_version == 117 else 128 for name in self.varlist: name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "varnames")) @@ -3008,21 +3038,24 @@ def _write_sortlist(self): def _write_formats(self): self._update_map("formats") bio = BytesIO() + fmt_len = 49 if self._dta_version == 117 else 57 for fmt in self.fmtlist: - bio.write(_pad_bytes_new(fmt, 49)) + bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) bio.seek(0) self._file.write(self._tag(bio.read(), "formats")) def _write_value_label_names(self): self._update_map("value_label_names") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 32 if self._dta_version == 117 else 128 for i in range(self.nvar): # Use variable name when categorical name = "" # default name if self._is_col_cat[i]: name = self.varlist[i] name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "value_label_names")) @@ -3031,7 +3064,9 @@ def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination self._update_map("variable_labels") bio = BytesIO() - blank = _pad_bytes_new("", 81) + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 80 if self._dta_version == 117 else 320 + blank = _pad_bytes_new("", vl_len + 1) if self._variable_labels is None: for _ 
in range(self.nvar): @@ -3045,14 +3080,15 @@ def _write_variable_labels(self): label = self._variable_labels[col] if len(label) > 80: raise ValueError("Variable labels must be 80 characters or fewer") - is_latin1 = all(ord(c) < 256 for c in label) - if not is_latin1: + try: + encoded = label.encode(self._encoding) + except UnicodeEncodeError: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + f"can be encoded in {self._encoding}" ) - bio.write(_pad_bytes_new(label, 81)) + + bio.write(_pad_bytes_new(encoded, vl_len + 1)) else: bio.write(blank) bio.seek(0) @@ -3084,7 +3120,7 @@ def _write_value_labels(self): self._update_map("value_labels") bio = BytesIO() for vl in self._value_labels: - lab = vl.generate_value_label(self._byteorder, self._encoding) + lab = vl.generate_value_label(self._byteorder) lab = self._tag(lab, "lbl") bio.write(lab) bio.seek(0) @@ -3114,19 +3150,140 @@ def _convert_strls(self, data): ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols) + ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): force_strl = col in self._convert_strl fmt = _dtype_to_default_stata_fmt( - dtype, data[col], dta_version=117, force_strl=force_strl + dtype, + self.data[col], + dta_version=self._dta_version, + force_strl=force_strl, ) self.fmtlist.append(fmt) - self.typlist.append(_dtype_to_stata_type_117(dtype, data[col], force_strl)) + self.typlist.append( + _dtype_to_stata_type_117(dtype, self.data[col], force_strl) + ) + + +class StataWriter118(StataWriter117): + """ + A class for writing Stata binary dta files in Stata 15 format (118) + + DTA 118 format files support unicode string data (both fixed and strL) + format. Unicode is also supported in value labels, variable labels and + the dataset label. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current time + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + convert_strl : list + List of columns names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL. 
+ Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + + Returns + ------- + StataWriter118 + The instance has a write_file method, which will write the file to the + given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + Using Unicode data and column names + + >>> from pandas.io.stata import StataWriter118 + >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) + >>> writer = StataWriter118('./data_file.dta', data) + >>> writer.write_file() + + Or with long strings stored in strl format + + >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], + ... columns=['strls']) + >>> writer = StataWriter118('./data_file_with_long_strings.dta', data, + ... convert_strl=['strls']) + >>> writer.write_file() + """ + + _encoding = "utf-8" + _dta_version = 118 + + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 118 support most unicode characters. The only limatation is in + the ascii range where the characters supported are a-z, A-Z, 0-9 and _. + """ + # High code points appear to be acceptable + for c in name: + if ( + ord(c) < 128 + and (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ) or 128 <= ord(c) < 256: + name = name.replace(c, "_") + + return name diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6da13f188357c..609da140a3f0b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -395,6 +395,10 @@ def _compute_plot_data(self): include_type = [np.number] exclude_type = ["timedelta"] + # GH 18755, include object and category type for scatter plot + if self._kind == "scatter": + include_type.extend(["object", "category"]) + numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) try: @@ -866,10 +870,13 @@ def __init__(self, data, x, y, **kwargs): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] - if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires x column to be numeric") - if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires y column to be numeric") + + # Scatter plot allows to plot objects data + if self._kind == "hexbin": + if len(self.data[x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") self.x = x self.y = y diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index afce374aebe05..57368a799138a 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -2307,6 +2307,32 @@ def test_dti_addsub_offset_arraylike( expected = 
tm.box_expected(expected, xbox) tm.assert_equal(res, expected) + @pytest.mark.parametrize("other_box", [pd.Index, np.array]) + def test_dti_addsub_object_arraylike( + self, tz_naive_fixture, box_with_array, other_box + ): + tz = tz_naive_fixture + + dti = pd.date_range("2017-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + other = other_box([pd.offsets.MonthEnd(), pd.Timedelta(days=4)]) + xbox = get_upcast_box(box_with_array, other) + + expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): + result = dtarr + other + tm.assert_equal(result, expected) + + expected = pd.DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + with tm.assert_produces_warning(warn): + result = dtarr - other + tm.assert_equal(result, expected) + @pytest.mark.parametrize("years", [-1, 0, 1]) @pytest.mark.parametrize("months", [-2, 0, 2]) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index f0edcd11567d2..8bc952e85bb5d 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1036,6 +1036,26 @@ def test_parr_add_sub_index(self): expected = pi - pi tm.assert_index_equal(result, expected) + def test_parr_add_sub_object_array(self): + pi = pd.period_range("2000-12-31", periods=3, freq="D") + parr = pi.array + + other = np.array([pd.Timedelta(days=1), pd.offsets.Day(2), 3]) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr + other + + expected = pd.PeriodIndex( + ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D" + ).array + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr - other + + expected = pd.PeriodIndex(["2000-12-30"] * 3, freq="D").array + tm.assert_equal(result, expected) + class TestPeriodSeriesArithmetic: def test_ops_series_timedelta(self): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index cc337f8fdd7ce..d61adf5ef2e7b 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1469,6 +1469,40 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): with tm.assert_produces_warning(PerformanceWarning): anchored - tdi + # ------------------------------------------------------------------ + # Unsorted + + def test_td64arr_add_sub_object_array(self, box_with_array): + tdi = pd.timedelta_range("1 day", periods=3, freq="D") + tdarr = tm.box_expected(tdi, box_with_array) + + other = np.array( + [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] + ) + + warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + result = tdarr + other + + expected = pd.Index( + [pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + with tm.assert_produces_warning(warn): + tdarr - other + + with tm.assert_produces_warning(warn): + result = other - tdarr + + expected = pd.Index( + [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + class 
TestTimedeltaArraylikeMulDivOps: # Tests for timedelta64[ns] diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index e534c93c69f68..f9b002d4409ce 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "\n[1, NaN, 3]\nLength: 3, dtype: Int64" + expected = "\n[1, NA, 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "\n" - "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n" + "[ 1, 2, NA, 1, 2, NA, 1, 2, NA, 1,\n" " ...\n" - " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n" + " NA, 1, 2, NA, 1, 2, NA, 1, 2, NA]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -108,13 +108,17 @@ def test_repr_array_long(): class TestConstructors: + def test_uses_pandas_na(self): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + def test_from_dtype_from_float(self, data): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(np.array(data).astype("float"), dtype=str(dtype)) + result = pd.Series(np.array(data, dtype="float"), dtype=str(dtype)) tm.assert_series_equal(result, expected) # from int / list @@ -156,10 +160,13 @@ def _check_op(self, s, op_name, other, exc=None): # 1 ** na is na, so need to unmask those if op_name == "__pow__": - mask = np.where(s == 1, False, mask) + mask = np.where(~s.isna() & (s == 1), False, mask) elif op_name == "__rpow__": - mask = np.where(other == 1, False, mask) + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) # float result type or float op if ( @@ -208,20 +215,27 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): else: expected = expected.fillna(0) else: - expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 + expected[ + (s.values == 0).fillna(False) + & ((expected == 0).fillna(False) | expected.isna()) + ] = 0 try: - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) except ValueError: expected = expected.astype(float) - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) - expected[mask] = np.nan + expected[mask] = pd.NA # assert that the expected astype is ok # (skip for unsigned as they have wrap around) @@ -255,21 +269,18 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators - s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators - s = pd.Series(data) other = np.ones(len(s), dtype=s.dtype.type) self._check_op(s, op, other, exc=TypeError) @@ -359,9 +370,9 @@ def 
test_pow_scalar(self): expected = pd.array([0, 1, None, 2], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = a ** pd.NA - # expected = pd.array([None, 1, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = a ** pd.NA + expected = pd.array([None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = a ** np.nan expected = np.array([np.nan, 1, np.nan, np.nan], dtype="float64") @@ -376,9 +387,9 @@ def test_pow_scalar(self): expected = pd.array([1, 1, 1, 1], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = pd.NA ** a - # expected = pd.array([1, None, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = np.nan ** a expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") @@ -406,10 +417,10 @@ def _compare_other(self, data, op_name, other): # array result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -417,22 +428,61 @@ def _compare_other(self, data, op_name, other): s = pd.Series(data) result = op(s, other) - expected = pd.Series(data._data) - expected = op(expected, other) + expected = op(pd.Series(data._data), other) # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA + expected = expected.astype("boolean") tm.assert_series_equal(result, expected) - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, 0) + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.Series([0] * len(data)) - self._compare_other(data, op_name, other) + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = 
pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64")
+        other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean")
+        expected = op(a, other)
+        result = op(a, b)
+        tm.assert_extension_array_equal(result, expected)
+
     def test_no_shared_mask(self, data):
         result = data + 1
@@ -442,20 +492,21 @@ def test_compare_to_string(self, any_nullable_int_dtype):
         # GH 28930
         s = pd.Series([1, None], dtype=any_nullable_int_dtype)
         result = s == "a"
-        expected = pd.Series([False, False])
+        expected = pd.Series([False, pd.NA], dtype="boolean")

         self.assert_series_equal(result, expected)

     def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators):
         # GH 28930
-        s1 = pd.Series([1, 2, 3], dtype=any_nullable_int_dtype)
-        s2 = pd.Series([1, 2, 3], dtype="int")
+        s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype)
+        s2 = pd.Series([1, None, 3], dtype="float")

         method = getattr(s1, all_compare_operators)
         result = method(2)

         method = getattr(s2, all_compare_operators)
-        expected = method(2)
+        expected = method(2).astype("boolean")
+        expected[s2.isna()] = pd.NA

         self.assert_series_equal(result, expected)

@@ -543,6 +594,17 @@ def test_astype(self, all_data):
         expected = pd.Series(np.asarray(mixed))
         tm.assert_series_equal(result, expected)

+    def test_astype_to_larger_numpy(self):
+        a = pd.array([1, 2], dtype="Int32")
+        result = a.astype("int64")
+        expected = np.array([1, 2], dtype="int64")
+        tm.assert_numpy_array_equal(result, expected)
+
+        a = pd.array([1, 2], dtype="UInt32")
+        result = a.astype("uint64")
+        expected = np.array([1, 2], dtype="uint64")
+        tm.assert_numpy_array_equal(result, expected)
+
     @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"])
     def test_astype_specific_casting(self, dtype):
         s = pd.Series([1, 2, 3], dtype="Int64")
@@ -572,12 +634,17 @@ def test_construct_cast_invalid(self, dtype):
         with pytest.raises(TypeError, match=msg):
             pd.Series(arr).astype(dtype)

+    def test_coerce_to_ndarray_float_NA_raises(self):
+        a = pd.array([0, 1, 2], dtype="Int64")
+        with pytest.raises(TypeError, match="NAType"):
+            a._coerce_to_ndarray(dtype="float", na_value=pd.NA)
+

 def test_frame_repr(data_missing):

     df = pd.DataFrame({"A": data_missing})
     result = repr(df)
-    expected = " A\n0 NaN\n1 1"
+    expected = " A\n0 NA\n1 1"
     assert result == expected

@@ -593,7 +660,7 @@ def test_conversions(data_missing):
     # we assert that we are exactly equal
     # including type conversions of scalars
     result = df["A"].astype("object").values
-    expected = np.array([np.nan, 1], dtype=object)
+    expected = np.array([pd.NA, 1], dtype=object)
     tm.assert_numpy_array_equal(result, expected)

     for r, e in zip(result, expected):
@@ -756,7 +823,7 @@ def test_cross_type_arithmetic():
     tm.assert_series_equal(result, expected)

     result = (df.A + df.C) * 3 == 12
-    expected = pd.Series([False, True, False])
+    expected = pd.Series([False, True, None], dtype="boolean")
     tm.assert_series_equal(result, expected)

     result = df.A + df.B
@@ -820,7 +887,7 @@ def test_reduce_to_float(op):

 def test_astype_nansafe():
     # see gh-22343
     arr = integer_array([np.nan, 1, 2], dtype="Int8")
-    msg = "cannot convert float NaN to integer"
+    msg = "cannot convert to integer NumPy array with missing values"

     with pytest.raises(ValueError, match=msg):
         arr.astype("uint32")
@@ -895,7 +962,9 @@ def test_arrow_array(data):
     import pyarrow as pa

     arr = pa.array(data)
-    expected = pa.array(list(data), type=data.dtype.name.lower(), from_pandas=True)
+    expected = np.array(data, dtype=object)
+    expected[data.isna()] = None
+    expected = 
pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) assert arr.equals(expected) diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 7a150c35fea09..8828a013aeea1 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -226,3 +226,25 @@ def test_setitem_no_coercion(): arr = PandasArray(np.array([1, 2, 3])) with pytest.raises(ValueError, match="int"): arr[0] = "a" + + # With a value that we do coerce, check that we coerce the value + # and not the underlying array. + arr[0] = 2.5 + assert isinstance(arr[0], (int, np.integer)), type(arr[0]) + + +def test_setitem_preserves_views(): + # GH#28150, see also extension test of the same name + arr = PandasArray(np.array([1, 2, 3])) + view1 = arr.view() + view2 = arr[:] + view3 = np.asarray(arr) + + arr[0] = 9 + assert view1[0] == 9 + assert view2[0] == 9 + assert view3[0] == 9 + + arr[-1] = 2.5 + view1[-1] = 5 + assert arr[-1] == 5 diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 8fa52af832907..4b6349a505509 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -315,7 +315,7 @@ def test_array_multiindex_raises(): ), ( pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object), + np.array([0, pd.NA], dtype=object), ), ( pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index 20a5be0c8a289..e815a90207a08 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -8,6 +8,8 @@ from pandas.compat import is_platform_windows +import pandas as pd + _all_locales = get_locales() or [] _current_locale = locale.getlocale() @@ -56,21 +58,21 @@ def test_get_locales_prefix(): @_skip_if_only_one_locale -def test_set_locale(): +@pytest.mark.parametrize( + "lang,enc", + [ + ("it_CH", "UTF-8"), + ("en_US", "ascii"), + ("zh_CN", "GB2312"), + ("it_IT", "ISO-8859-1"), + ], +) +def test_set_locale(lang, enc): if all(x is None for x in _current_locale): # Not sure why, but on some Travis runs with pytest, # getlocale() returned (None, None). pytest.skip("Current locale is not set.") - locale_override = os.environ.get("LOCALE_OVERRIDE", None) - - if locale_override is None: - lang, enc = "it_CH", "UTF-8" - elif locale_override == "C": - lang, enc = "en_US", "ascii" - else: - lang, enc = locale_override.split(".") - enc = codecs.lookup(enc).name new_locale = lang, enc @@ -91,3 +93,13 @@ def test_set_locale(): # Once we exit the "with" statement, locale should be back to what it was. 
current_locale = locale.getlocale() assert current_locale == _current_locale + + +def test_encoding_detected(): + system_locale = os.environ.get("LC_ALL") + system_encoding = system_locale.split(".")[-1] if system_locale else "utf-8" + + assert ( + codecs.lookup(pd.options.display.encoding).name + == codecs.lookup(system_encoding).name + ) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 0939e35bd64fa..69f8f46356a4d 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -8,7 +8,6 @@ import pytest from pandas._libs.tslibs import NaT -from pandas.compat import is_platform_windows from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( @@ -406,7 +405,6 @@ def test_maybe_promote_any_with_datetime64( _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -@pytest.mark.xfail(reason="Fails to upcast to object") def test_maybe_promote_datetimetz_with_any_numpy_dtype( tz_aware_fixture, any_numpy_dtype_reduced ): @@ -427,11 +425,6 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fix dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) - from dateutil.tz import tzlocal - - if is_platform_windows() and tz_aware_fixture2 == tzlocal(): - pytest.xfail("Cannot process fill_value with this dtype, see GH 24310") - # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -441,7 +434,6 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fix expected_dtype = dtype else: expected_dtype = np.dtype(object) - pytest.xfail("fails to cast to object") _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 667ee467f2f29..b8f7e585d8a51 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -5,6 +5,7 @@ import pandas.util._test_decorators as td +from pandas.core.dtypes.cast import astype_nansafe import pandas.core.dtypes.common as com from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -13,6 +14,7 @@ IntervalDtype, PeriodDtype, ) +from pandas.core.dtypes.missing import isna import pandas as pd from pandas.conftest import ( @@ -625,18 +627,6 @@ def test_is_complex_dtype(): assert com.is_complex_dtype(np.array([1 + 1j, 5])) -def test_is_offsetlike(): - assert com.is_offsetlike(np.array([pd.DateOffset(month=3), pd.offsets.Nano()])) - assert com.is_offsetlike(pd.offsets.MonthEnd()) - assert com.is_offsetlike(pd.Index([pd.DateOffset(second=1)])) - - assert not com.is_offsetlike(pd.Timedelta(1)) - assert not com.is_offsetlike(np.array([1 + 1j, 5])) - - # mixed case - assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)])) - - @pytest.mark.parametrize( "input_param,result", [ @@ -721,3 +711,42 @@ def test__get_dtype_fails(input_param, expected_error_message): ) def test__is_dtype_type(input_param, result): assert com._is_dtype_type(input_param, lambda tipo: tipo == result) + + +@pytest.mark.parametrize("val", [np.datetime64("NaT"), np.timedelta64("NaT")]) +@pytest.mark.parametrize("typ", [np.int64]) +def test_astype_nansafe(val, typ): + arr = np.array([val]) + + msg = "Cannot convert NaT values to integer" + with pytest.raises(ValueError, match=msg): + astype_nansafe(arr, dtype=typ) + + +@pytest.mark.parametrize("from_type", [np.datetime64, 
np.timedelta64]) +@pytest.mark.parametrize( + "to_type", + [ + np.uint8, + np.uint16, + np.uint32, + np.int8, + np.int16, + np.int32, + np.float16, + np.float32, + ], +) +def test_astype_datetime64_bad_dtype_raises(from_type, to_type): + arr = np.array([from_type("2018")]) + + with pytest.raises(TypeError, match="cannot astype"): + astype_nansafe(arr, dtype=to_type) + + +@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64]) +def test_astype_object_preserves_datetime_na(from_type): + arr = np.array([from_type("NaT")]) + result = astype_nansafe(arr, dtype="object") + + assert isna(result)[0] diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 4dee6e3e92a7f..3a933a5ca8cdc 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -408,6 +408,9 @@ def test_construction_from_string(self): with pytest.raises(TypeError): PeriodDtype.construct_from_string("datetime64[ns, US/Eastern]") + with pytest.raises(TypeError, match="list"): + PeriodDtype.construct_from_string([1, 2, 3]) + def test_is_dtype(self): assert PeriodDtype.is_dtype(self.dtype) assert PeriodDtype.is_dtype("period[D]") @@ -685,6 +688,10 @@ def test_caching(self): tm.round_trip_pickle(dtype) assert len(IntervalDtype._cache) == 0 + def test_not_string(self): + # GH30568: though IntervalDtype has object kind, it cannot be string + assert not is_string_dtype(IntervalDtype()) + class TestCategoricalDtypeParametrized: @pytest.mark.parametrize( diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 9a442f346c19f..b6c12b5844086 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -16,8 +16,7 @@ def test_name(self, dtype): def test_kind(self, dtype): valid = set("biufcmMOSUV") - if dtype.kind is not None: - assert dtype.kind in valid + assert dtype.kind in valid def test_construct_from_string_own_name(self, dtype): result = dtype.construct_from_string(dtype.name) @@ -38,6 +37,9 @@ def test_is_dtype_from_self(self, dtype): result = type(dtype).is_dtype(dtype) assert result is True + def test_is_dtype_other_input(self, dtype): + assert dtype.is_dtype([1, 2, 3]) is False + def test_is_not_string_type(self, dtype): return not pd.api.types.is_string_dtype(dtype) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index bb6bb02b462e2..7d50f176edd67 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -186,3 +186,14 @@ def test_setitem_scalar_key_sequence_raise(self, data): arr = data[:5].copy() with pytest.raises(ValueError): arr[0] = arr[[0, 1]] + + def test_setitem_preserves_views(self, data): + # GH#28150 setitem shouldn't swap the underlying data + assert data[-1] != data[0] # otherwise test would not be meaningful + + view1 = data.view() + view2 = data[:] + + data[0] = data[-1] + assert view1[0] == data[-1] + assert view2[0] == data[-1] diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..8e54543e5437c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -34,7 +34,7 @@ def make_data(): - return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] @pytest.fixture( @@ -65,7 +65,7 @@ def data_for_twos(dtype): @pytest.fixture def data_missing(dtype): - return integer_array([np.nan, 1], 
dtype=dtype) + return integer_array([pd.NA, 1], dtype=dtype) @pytest.fixture @@ -75,18 +75,18 @@ def data_for_sorting(dtype): @pytest.fixture def data_missing_for_sorting(dtype): - return integer_array([1, np.nan, 0], dtype=dtype) + return integer_array([1, pd.NA, 0], dtype=dtype) @pytest.fixture def na_cmp(): - # we are np.nan - return lambda x, y: np.isnan(x) and np.isnan(y) + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture @@ -94,7 +94,7 @@ def data_for_grouping(dtype): b = 1 a = 0 c = 2 - na = np.nan + na = pd.NA return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) @@ -129,7 +129,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.astype(float) + expected = expected.fillna(np.nan).astype(float) if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) @@ -142,6 +142,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 @@ -162,6 +163,16 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + # Override to do the astype to boolean + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 4fdcf930d224f..2411f6cfbd936 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -147,7 +147,9 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - pass + @pytest.mark.xfail(reason="GH#27147 setitem changes underlying index") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) class TestPrinting(BaseInterval, base.BasePrintingTests): diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index fac6a9139462f..1d6935795b0e4 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -177,3 +177,19 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): result = df.append(df.iloc[0]).iloc[-1] expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, dtype", + [ + ([1], pd.Int64Dtype()), + ([1], pd.CategoricalDtype()), + ([pd.Interval(left=0, right=5)], pd.IntervalDtype()), + ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")), + ([1], pd.SparseDtype()), + ], + ) + def test_other_dtypes(self, data, dtype): + df = pd.DataFrame(data, dtype=dtype) + result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(data, name=0, dtype=dtype) + tm.assert_series_equal(result, expected) diff --git 
a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3b01ae0c3c2e8..b2720f9158c6b 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1,7 +1,7 @@ from datetime import datetime from io import StringIO import re -from typing import Dict +from typing import Dict, List, Union import numpy as np import pytest @@ -12,12 +12,12 @@ @pytest.fixture -def mix_ab() -> Dict[str, list]: +def mix_ab() -> Dict[str, List[Union[int, str]]]: return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture -def mix_abc() -> Dict[str, list]: +def mix_abc() -> Dict[str, List[Union[float, str]]]: return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index eb98bdc49f976..93e165ad3d71e 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1331,8 +1331,8 @@ def test_agg_cython_table(self, df, func, expected, axis): _get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ - ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])), - ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])), + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), ], ), ), @@ -1341,6 +1341,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): # GH 21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis == 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f3cc11cb7027d..ffdf1435f74e0 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2551,3 +2551,11 @@ def test_from_tzaware_mixed_object_array(self): "datetime64[ns, CET]", ] assert (res.dtypes == expected_dtypes).all() + + def test_from_2d_ndarray_with_dtype(self): + # GH#12513 + array_dim2 = np.arange(10).reshape((5, 2)) + df = pd.DataFrame(array_dim2, dtype="datetime64[ns, UTC]") + + expected = pd.DataFrame(array_dim2).astype("datetime64[ns, UTC]") + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py index ad2cbff888b2e..2deeeb95d057d 100644 --- a/pandas/tests/frame/test_cumulative.py +++ b/pandas/tests/frame/test_cumulative.py @@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame): # fix issue cummax_xs = datetime_frame.cummax(axis=1) assert np.shape(cummax_xs) == np.shape(datetime_frame) + + def test_cumulative_ops_preserve_dtypes(self): + # GH#19296 dont incorrectly upcast to object + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]}) + + result = df.cumsum() + + expected = DataFrame( + { + "A": Series([1, 3, 6], dtype=np.int64), + "B": Series([1, 3, 6], dtype=np.float64), + "C": df["C"].cumsum(), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 270a7c70a2e81..54c7e450c5cd6 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -196,7 +196,7 @@ def test_set_attribute(self): def test_to_xarray_index_types(self, index): from xarray import Dataset - index = getattr(tm, "make{}".format(index)) + index = getattr(tm, 
f"make{index}") df = DataFrame( { "a": list("abc"), diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 0ff9d7fcdb209..d0c9b3e7a8f76 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -125,7 +125,7 @@ def test_nonzero(self): # GH 4633 # look at the boolean/nonzero behavior for objects obj = self._construct(shape=4) - msg = "The truth value of a {} is ambiguous".format(self._typ.__name__) + msg = f"The truth value of a {self._typ.__name__} is ambiguous" with pytest.raises(ValueError, match=msg): bool(obj == 0) with pytest.raises(ValueError, match=msg): @@ -203,9 +203,9 @@ def test_constructor_compound_dtypes(self): def f(dtype): return self._construct(shape=3, value=1, dtype=dtype) - msg = "compound dtypes are not implemented in the {} constructor".format( - self._typ.__name__ - ) + msg = "compound dtypes are not implemented" + f"in the {self._typ.__name__} constructor" + with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index aaf523956aaed..601fc2aa64434 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -205,7 +205,7 @@ def finalize(self, other, method=None, **kwargs): def test_to_xarray_index_types(self, index): from xarray import DataArray - index = getattr(tm, "make{}".format(index)) + index = getattr(tm, f"make{index}") s = Series(range(6), index=index(6)) s.index.name = "foo" result = s.to_xarray() diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0e62569fffeb6..050b1e7c5d3b3 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -686,6 +686,17 @@ def test_apply_with_mixed_types(): tm.assert_frame_equal(result, expected) +def test_func_returns_object(): + # GH 28652 + df = DataFrame({"a": [1, 2]}, index=pd.Int64Index([1, 2])) + result = df.groupby("a").apply(lambda g: g.index) + expected = Series( + [pd.Int64Index([1]), pd.Int64Index([2])], index=pd.Int64Index([1, 2], name="a") + ) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8f88f68c69f2b..c7e76a4accee6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -588,6 +588,20 @@ def test_groupby_multiple_columns(df, op): tm.assert_series_equal(result, expected) +def test_as_index_select_column(): + # GH 5764 + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + result = df.groupby("A", as_index=False)["B"].get_group(1) + expected = pd.Series([2, 4], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) + expected = pd.Series( + [2, 6, 6], name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) + ) + tm.assert_series_equal(result, expected) + + def test_groupby_as_index_agg(df): grouped = df.groupby("A", as_index=False) @@ -771,7 +785,7 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = r"unsupported operand type\(s\) for \+: 'Timestamp'" + msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): 
grouped.agg(lambda x: x.sum(0, numeric_only=False)) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index c46180c1d11cd..2a82b39b646c0 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -765,9 +765,12 @@ def test_transform_with_non_scalar_group(): ], ) @pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) -def test_transform_numeric_ret(cols, exp, comp_func, agg_func): +def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request): if agg_func == "size" and isinstance(cols, list): - pytest.xfail("'size' transformation not supported with NDFrameGroupy") + # https://github.com/pytest-dev/pytest/issues/6300 + # workaround to xfail fixture/param permutations + reason = "'size' transformation not supported with NDFrameGroupy" + request.node.add_marker(pytest.mark.xfail(reason=reason)) # GH 19200 df = pd.DataFrame( @@ -874,27 +877,19 @@ def test_pad_stable_sorting(fill_method): ), ], ) -@pytest.mark.parametrize( - "periods,fill_method,limit", - [ - (1, "ffill", None), - (1, "ffill", 1), - (1, "bfill", None), - (1, "bfill", 1), - (-1, "ffill", None), - (-1, "ffill", 1), - (-1, "bfill", None), - (-1, "bfill", 1), - ], -) +@pytest.mark.parametrize("periods", [1, -1]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None]) +@pytest.mark.parametrize("limit", [None, 1]) def test_pct_change(test_series, freq, periods, fill_method, limit): - # GH 21200, 21621 + # GH 21200, 21621, 30463 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ["a", "b"] key_v = np.repeat(keys, len(vals)) df = DataFrame({"key": key_v, "vals": vals * 2}) - df_g = getattr(df.groupby("key"), fill_method)(limit=limit) + df_g = df + if fill_method is not None: + df_g = getattr(df.groupby("key"), fill_method)(limit=limit) grp = df_g.groupby(df.key) expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 @@ -1138,3 +1133,40 @@ def func(grp): expected = pd.DataFrame([2, -2, 2, 4], columns=["B"]) tm.assert_frame_equal(result, expected) + + +def test_transform_lambda_indexing(): + # GH 7883 + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"], + "B": ["one", "one", "two", "three", "two", "six", "five", "three"], + "C": range(8), + "D": range(8), + "E": range(8), + } + ) + df = df.set_index(["A", "B"]) + df = df.sort_index() + result = df.groupby(level="A").transform(lambda x: x.iloc[-1]) + expected = DataFrame( + { + "C": [3, 3, 7, 7, 4, 4, 4, 4], + "D": [3, 3, 7, 7, 4, 4, 4, 4], + "E": [3, 3, 7, 7, 4, 4, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bar", "one"), + ("bar", "three"), + ("flux", "six"), + ("flux", "three"), + ("foo", "five"), + ("foo", "one"), + ("foo", "two"), + ("foo", "two"), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 9a5f9e40374a3..306ac84ef1832 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -43,7 +43,7 @@ def test_can_hold_identifiers(self): (lambda idx: ["a", "b"] + idx, "__radd__"), ], ) - def test_disallow_set_ops(self, func, op_name): + def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py 
b/pandas/tests/indexes/datetimes/test_constructors.py index 58ab44fba08cf..2f1fa3ce627e6 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -711,7 +711,6 @@ def test_constructor_timestamp_near_dst(self): expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) tm.assert_index_equal(result, expected) - # TODO(GH-24559): Remove the xfail for the tz-aware case. @pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index f2fca34e083c2..0a92192ee6a0f 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -20,9 +20,7 @@ def skipif_32bit(param): return pytest.param(param, marks=marks) -@pytest.fixture( - scope="class", params=["int32", "int64", "float32", "float64", "uint64"] -) +@pytest.fixture(scope="class", params=["int64", "float64", "uint64"]) def dtype(request): return request.param @@ -39,12 +37,9 @@ def leaf_size(request): @pytest.fixture( params=[ np.arange(5, dtype="int64"), - np.arange(5, dtype="int32"), np.arange(5, dtype="uint64"), np.arange(5, dtype="float64"), - np.arange(5, dtype="float32"), np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"), - np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"), ] ) def tree(request, leaf_size): @@ -63,6 +58,18 @@ def test_get_indexer(self, tree): ): tree.get_indexer(np.array([3.0])) + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype) + tree = IntervalTree(left, right) + + result = tree.get_indexer(np.array([target_value], dtype=target_dtype)) + expected = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_non_unique(self, tree): indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5])) @@ -82,6 +89,22 @@ def test_get_indexer_non_unique(self, tree): expected = np.array([2], dtype="intp") tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype) + tree = IntervalTree(left, right) + target = np.array([target_value], dtype=target_dtype) + + result_indexer, result_missing = tree.get_indexer_non_unique(target) + expected_indexer = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + expected_missing = np.array([0], dtype="intp") + tm.assert_numpy_array_equal(result_missing, expected_missing) + def test_duplicates(self, dtype): left = np.array([0, 0, 0], dtype=dtype) tree = IntervalTree(left, left + 1) @@ -120,10 +143,10 @@ def test_get_indexer_closed(self, closed, leaf_size): @pytest.mark.parametrize( "left, right, expected", [ - (np.array([0, 1, 4]), np.array([2, 3, 5]), True), - (np.array([0, 1, 2]), np.array([5, 4, 3]), True), + (np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True), + (np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 
3]), True), (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), - (np.array([0, 2, 4]), np.array([1, 3, 5]), False), + (np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False), (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False), ], ) @@ -138,7 +161,7 @@ def test_is_overlapping(self, closed, order, left, right, expected): def test_is_overlapping_endpoints(self, closed, order): """shared endpoints are marked as overlapping""" # GH 23309 - left, right = np.arange(3), np.arange(1, 4) + left, right = np.arange(3, dtype="int64"), np.arange(1, 4) tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping expected = closed == "both" @@ -161,7 +184,7 @@ def test_is_overlapping_trivial(self, closed, left, right): @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440") def test_construction_overflow(self): # GH 25485 - left, right = np.arange(101), [np.iinfo(np.int64).max] * 101 + left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101 tree = IntervalTree(left, right) # pivot should be average of left/right medians diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_setops.py similarity index 100% rename from pandas/tests/indexes/multi/test_set_ops.py rename to pandas/tests/indexes/multi/test_setops.py diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index c11dda8f67620..da2f04d45fdac 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -105,25 +105,6 @@ def test_no_millisecond_field(self): with pytest.raises(AttributeError, match=msg): DatetimeIndex([]).millisecond - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_freq(self, sort): - # GH14323: difference of Period MUST preserve frequency - # but the ability to union results must be preserved - - index = period_range("20160920", "20160925", freq="D") - - other = period_range("20160921", "20160924", freq="D") - expected = PeriodIndex(["20160920", "20160925"], freq="D") - idx_diff = index.difference(other, sort) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = period_range("20160922", "20160925", freq="D") - idx_diff = index.difference(other, sort) - expected = PeriodIndex(["20160920", "20160921"], freq="D") - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - def test_hash_error(self): index = period_range("20010101", periods=10) msg = f"unhashable type: '{type(index).__name__}'" diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index 03e4bd5834166..173d61849b126 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -353,3 +353,22 @@ def test_difference(self, sort): if sort is None: expected = expected.sort_values() tm.assert_index_equal(result_difference, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_freq(self, sort): + # GH14323: difference of Period MUST preserve frequency + # but the ability to union results must be preserved + + index = period_range("20160920", "20160925", freq="D") + + other = period_range("20160921", "20160924", freq="D") + expected = PeriodIndex(["20160920", "20160925"], freq="D") + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = 
period_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other, sort) + expected = PeriodIndex(["20160920", "20160921"], freq="D") + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index db0cc9828e9e9..f7abdf53e0975 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -1,5 +1,3 @@ -from datetime import datetime, timedelta - import numpy as np import pytest @@ -464,176 +462,6 @@ def test_join_self(self, join_type): joined = index.join(index, how=join_type) assert index is joined - @pytest.mark.parametrize("sort", [None, False]) - def test_intersection(self, sort): - # intersect with Int64Index - index = self.create_index() - other = Index(np.arange(1, 6)) - result = index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) - tm.assert_index_equal(result, expected) - - result = other.intersection(index, sort=sort) - expected = Index( - np.sort(np.asarray(np.intersect1d(index.values, other.values))) - ) - tm.assert_index_equal(result, expected) - - # intersect with increasing RangeIndex - other = RangeIndex(1, 6) - result = index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) - tm.assert_index_equal(result, expected) - - # intersect with decreasing RangeIndex - other = RangeIndex(5, 0, -1) - result = index.intersection(other, sort=sort) - expected = Index(np.sort(np.intersect1d(index.values, other.values))) - tm.assert_index_equal(result, expected) - - # reversed (GH 17296) - result = other.intersection(index, sort=sort) - tm.assert_index_equal(result, expected) - - # GH 17296: intersect two decreasing RangeIndexes - first = RangeIndex(10, -2, -2) - other = RangeIndex(5, -4, -1) - expected = first.astype(int).intersection(other.astype(int), sort=sort) - result = first.intersection(other, sort=sort).astype(int) - tm.assert_index_equal(result, expected) - - # reversed - result = other.intersection(first, sort=sort).astype(int) - tm.assert_index_equal(result, expected) - - index = RangeIndex(5) - - # intersect of non-overlapping indices - other = RangeIndex(5, 10, 1) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - other = RangeIndex(-1, -5, -1) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - # intersection of empty indices - other = RangeIndex(0, 0, 1) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - result = other.intersection(index, sort=sort) - tm.assert_index_equal(result, expected) - - # intersection of non-overlapping values based on start value and gcd - index = RangeIndex(1, 10, 2) - other = RangeIndex(0, 10, 4) - result = index.intersection(other, sort=sort) - expected = RangeIndex(0, 0, 1) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("sort", [False, None]) - def test_union_noncomparable(self, sort): - # corner case, non-Int64Index - index = self.create_index() - other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object) - result = index.union(other, sort=sort) - expected = Index(np.concatenate((index, other))) - tm.assert_index_equal(result, expected) - - result = other.union(index, 
sort=sort) - expected = Index(np.concatenate((other, index))) - tm.assert_index_equal(result, expected) - - @pytest.fixture( - params=[ - (RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0, 10, 1), RI(5, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, 10, 1), RI(10, 20, 1), RI(0, 20, 1), I64(range(20))), - (RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1), RI(0, -10, -1)), - (RI(0, -10, -1), RI(-10, -20, -1), RI(-19, 1, 1), I64(range(0, -20, -1))), - ( - RI(0, 10, 2), - RI(1, 10, 2), - RI(0, 10, 1), - I64(list(range(0, 10, 2)) + list(range(1, 10, 2))), - ), - ( - RI(0, 11, 2), - RI(1, 12, 2), - RI(0, 12, 1), - I64(list(range(0, 11, 2)) + list(range(1, 12, 2))), - ), - ( - RI(0, 21, 4), - RI(-2, 24, 4), - RI(-2, 24, 2), - I64(list(range(0, 21, 4)) + list(range(-2, 24, 4))), - ), - ( - RI(0, -20, -2), - RI(-1, -21, -2), - RI(-19, 1, 1), - I64(list(range(0, -20, -2)) + list(range(-1, -21, -2))), - ), - (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5), I64(range(0, 100, 5))), - ( - RI(0, -100, -5), - RI(5, -100, -20), - RI(-95, 10, 5), - I64(list(range(0, -100, -5)) + [5]), - ), - ( - RI(0, -11, -1), - RI(1, -12, -4), - RI(-11, 2, 1), - I64(list(range(0, -11, -1)) + [1, -11]), - ), - (RI(0), RI(0), RI(0), RI(0)), - (RI(0, -10, -2), RI(0), RI(0, -10, -2), RI(0, -10, -2)), - (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2), I64(range(0, 102, 2))), - ( - RI(0, -100, -2), - RI(-100, 50, 102), - RI(-100, 4, 2), - I64(list(range(0, -100, -2)) + [-100, 2]), - ), - ( - RI(0, -100, -1), - RI(0, -50, -3), - RI(-99, 1, 1), - I64(list(range(0, -100, -1))), - ), - (RI(0, 1, 1), RI(5, 6, 10), RI(0, 6, 5), I64([0, 5])), - (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5), I64([0, 5, -5])), - (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4]), I64([0, 1, 2, 4])), - (RI(0, 10, 1), I64([]), RI(0, 10, 1), RI(0, 10, 1)), - (RI(0), I64([1, 5, 6]), I64([1, 5, 6]), I64([1, 5, 6])), - ] - ) - def unions(self, request): - """Inputs and expected outputs for RangeIndex.union tests""" - - return request.param - - def test_union_sorted(self, unions): - - idx1, idx2, expected_sorted, expected_notsorted = unions - - res1 = idx1.union(idx2, sort=None) - tm.assert_index_equal(res1, expected_sorted, exact=True) - - res1 = idx1.union(idx2, sort=False) - tm.assert_index_equal(res1, expected_notsorted, exact=True) - - res2 = idx2.union(idx1, sort=None) - res3 = idx1._int64index.union(idx2, sort=None) - tm.assert_index_equal(res2, expected_sorted, exact=True) - tm.assert_index_equal(res3, expected_sorted) - def test_nbytes(self): # memory savings vs int index diff --git a/pandas/tests/indexes/ranges/test_setops.py b/pandas/tests/indexes/ranges/test_setops.py new file mode 100644 index 0000000000000..5c1e461c9fcf0 --- /dev/null +++ b/pandas/tests/indexes/ranges/test_setops.py @@ -0,0 +1,244 @@ +from datetime import datetime, timedelta + +import numpy as np +import pytest + +from pandas import Index, Int64Index, RangeIndex +import pandas.util.testing as tm + + +class TestRangeIndexSetOps: + @pytest.mark.parametrize("sort", [None, False]) + def test_intersection(self, sort): + # intersect with Int64Index + index = RangeIndex(start=0, stop=20, step=2) + other = Index(np.arange(1, 6)) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + result = other.intersection(index, sort=sort) + expected = Index( + np.sort(np.asarray(np.intersect1d(index.values, other.values))) + ) + tm.assert_index_equal(result, expected) + + # 
intersect with increasing RangeIndex + other = RangeIndex(1, 6) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + # intersect with decreasing RangeIndex + other = RangeIndex(5, 0, -1) + result = index.intersection(other, sort=sort) + expected = Index(np.sort(np.intersect1d(index.values, other.values))) + tm.assert_index_equal(result, expected) + + # reversed (GH 17296) + result = other.intersection(index, sort=sort) + tm.assert_index_equal(result, expected) + + # GH 17296: intersect two decreasing RangeIndexes + first = RangeIndex(10, -2, -2) + other = RangeIndex(5, -4, -1) + expected = first.astype(int).intersection(other.astype(int), sort=sort) + result = first.intersection(other, sort=sort).astype(int) + tm.assert_index_equal(result, expected) + + # reversed + result = other.intersection(first, sort=sort).astype(int) + tm.assert_index_equal(result, expected) + + index = RangeIndex(5) + + # intersect of non-overlapping indices + other = RangeIndex(5, 10, 1) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + other = RangeIndex(-1, -5, -1) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + # intersection of empty indices + other = RangeIndex(0, 0, 1) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + result = other.intersection(index, sort=sort) + tm.assert_index_equal(result, expected) + + # intersection of non-overlapping values based on start value and gcd + index = RangeIndex(1, 10, 2) + other = RangeIndex(0, 10, 4) + result = index.intersection(other, sort=sort) + expected = RangeIndex(0, 0, 1) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("sort", [False, None]) + def test_union_noncomparable(self, sort): + # corner case, non-Int64Index + index = RangeIndex(start=0, stop=20, step=2) + other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object) + result = index.union(other, sort=sort) + expected = Index(np.concatenate((index, other))) + tm.assert_index_equal(result, expected) + + result = other.union(index, sort=sort) + expected = Index(np.concatenate((other, index))) + tm.assert_index_equal(result, expected) + + @pytest.fixture( + params=[ + ( + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + ), + ( + RangeIndex(0, 10, 1), + RangeIndex(5, 20, 1), + RangeIndex(0, 20, 1), + Int64Index(range(20)), + ), + ( + RangeIndex(0, 10, 1), + RangeIndex(10, 20, 1), + RangeIndex(0, 20, 1), + Int64Index(range(20)), + ), + ( + RangeIndex(0, -10, -1), + RangeIndex(0, -10, -1), + RangeIndex(0, -10, -1), + RangeIndex(0, -10, -1), + ), + ( + RangeIndex(0, -10, -1), + RangeIndex(-10, -20, -1), + RangeIndex(-19, 1, 1), + Int64Index(range(0, -20, -1)), + ), + ( + RangeIndex(0, 10, 2), + RangeIndex(1, 10, 2), + RangeIndex(0, 10, 1), + Int64Index(list(range(0, 10, 2)) + list(range(1, 10, 2))), + ), + ( + RangeIndex(0, 11, 2), + RangeIndex(1, 12, 2), + RangeIndex(0, 12, 1), + Int64Index(list(range(0, 11, 2)) + list(range(1, 12, 2))), + ), + ( + RangeIndex(0, 21, 4), + RangeIndex(-2, 24, 4), + RangeIndex(-2, 24, 2), + Int64Index(list(range(0, 21, 4)) + list(range(-2, 24, 4))), + ), + ( + RangeIndex(0, -20, -2), + RangeIndex(-1, -21, -2), + RangeIndex(-19, 1, 1), + 
Int64Index(list(range(0, -20, -2)) + list(range(-1, -21, -2))), + ), + ( + RangeIndex(0, 100, 5), + RangeIndex(0, 100, 20), + RangeIndex(0, 100, 5), + Int64Index(range(0, 100, 5)), + ), + ( + RangeIndex(0, -100, -5), + RangeIndex(5, -100, -20), + RangeIndex(-95, 10, 5), + Int64Index(list(range(0, -100, -5)) + [5]), + ), + ( + RangeIndex(0, -11, -1), + RangeIndex(1, -12, -4), + RangeIndex(-11, 2, 1), + Int64Index(list(range(0, -11, -1)) + [1, -11]), + ), + (RangeIndex(0), RangeIndex(0), RangeIndex(0), RangeIndex(0)), + ( + RangeIndex(0, -10, -2), + RangeIndex(0), + RangeIndex(0, -10, -2), + RangeIndex(0, -10, -2), + ), + ( + RangeIndex(0, 100, 2), + RangeIndex(100, 150, 200), + RangeIndex(0, 102, 2), + Int64Index(range(0, 102, 2)), + ), + ( + RangeIndex(0, -100, -2), + RangeIndex(-100, 50, 102), + RangeIndex(-100, 4, 2), + Int64Index(list(range(0, -100, -2)) + [-100, 2]), + ), + ( + RangeIndex(0, -100, -1), + RangeIndex(0, -50, -3), + RangeIndex(-99, 1, 1), + Int64Index(list(range(0, -100, -1))), + ), + ( + RangeIndex(0, 1, 1), + RangeIndex(5, 6, 10), + RangeIndex(0, 6, 5), + Int64Index([0, 5]), + ), + ( + RangeIndex(0, 10, 5), + RangeIndex(-5, -6, -20), + RangeIndex(-5, 10, 5), + Int64Index([0, 5, -5]), + ), + ( + RangeIndex(0, 3, 1), + RangeIndex(4, 5, 1), + Int64Index([0, 1, 2, 4]), + Int64Index([0, 1, 2, 4]), + ), + ( + RangeIndex(0, 10, 1), + Int64Index([]), + RangeIndex(0, 10, 1), + RangeIndex(0, 10, 1), + ), + ( + RangeIndex(0), + Int64Index([1, 5, 6]), + Int64Index([1, 5, 6]), + Int64Index([1, 5, 6]), + ), + ] + ) + def unions(self, request): + """Inputs and expected outputs for RangeIndex.union tests""" + + return request.param + + def test_union_sorted(self, unions): + + idx1, idx2, expected_sorted, expected_notsorted = unions + + res1 = idx1.union(idx2, sort=None) + tm.assert_index_equal(res1, expected_sorted, exact=True) + + res1 = idx1.union(idx2, sort=False) + tm.assert_index_equal(res1, expected_notsorted, exact=True) + + res2 = idx2.union(idx1, sort=None) + res3 = idx1._int64index.union(idx2, sort=None) + tm.assert_index_equal(res2, expected_sorted, exact=True) + tm.assert_index_equal(res3, expected_sorted) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 37976d89ecba4..7187733fc91c3 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -736,6 +736,12 @@ def test_get_indexer(self): expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) + def test_get_indexer_nan(self): + # GH 7820 + result = Index([1, 2, np.nan]).get_indexer([np.nan]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + def test_intersection(self): index = self.create_index() other = Index([1, 2, 3, 4, 5]) diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index bbdd6c8c7c017..34db7ed419ddb 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -179,3 +179,51 @@ def test_intersection_non_monotonic(self, rng, expected, sort): assert isinstance(result.freq, Hour) else: assert result.freq is None + + +class TestTimedeltaIndexDifference: + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_freq(self, sort): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + 
expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_sort(self, sort): + + index = pd.TimedeltaIndex( + ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] + ) + + other = timedelta_range("1 days", "4 days", freq="D") + idx_diff = index.difference(other, sort) + + expected = TimedeltaIndex(["5 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["1 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 35575f3349f83..fa74ff2d30368 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -57,52 +57,6 @@ def test_fillna_timedelta(self): ) tm.assert_index_equal(idx.fillna("x"), exp) - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_freq(self, sort): - # GH14323: Difference of TimedeltaIndex should not preserve frequency - - index = timedelta_range("0 days", "5 days", freq="D") - - other = timedelta_range("1 days", "4 days", freq="D") - expected = TimedeltaIndex(["0 days", "5 days"], freq=None) - idx_diff = index.difference(other, sort) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_sort(self, sort): - - index = pd.TimedeltaIndex( - ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] - ) - - other = timedelta_range("1 days", "4 days", freq="D") - idx_diff = index.difference(other, sort) - - expected = TimedeltaIndex(["5 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["1 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - def test_isin(self): index = tm.makeTimedeltaIndex(4) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 8b3620e8cd843..9119ca0a4511b 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -966,3 +966,17 @@ def test_loc_getitem_label_list_integer_labels( expected = df.iloc[:, 
expected_columns] result = df.loc[["A", "B", "C"], column_key] tm.assert_frame_equal(result, expected, check_column_type=check_column_type) + + +def test_loc_setitem_float_intindex(): + # GH 8720 + rand_data = np.random.randn(8, 4) + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + expected_data = np.hstack((rand_data, np.array([np.nan] * 8).reshape(8, 1))) + expected = pd.DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5]) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 24233a0ec84b1..469f8df0cbc39 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,4 @@ +import io import os import sys @@ -204,6 +205,14 @@ def test_to_csv_na_rep(self): assert df.set_index("a").to_csv(na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + + # GH 29975 + # Make sure full na_rep shows up when a dtype is provided + csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv + csv = pd.Series(["a", pd.NA, "c"], dtype="string").to_csv(na_rep="ZZZZZ") + assert expected == csv + def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) @@ -563,3 +572,17 @@ def test_to_csv_na_rep_long_string(self, df_new_type): result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") assert expected == result + + def test_to_csv_timedelta_precision(self): + # GH 6783 + s = pd.Series([1, 1]).astype("timedelta64[ns]") + buf = io.StringIO() + s.to_csv(buf) + result = buf.getvalue() + expected_rows = [ + ",0", + "0,0 days 00:00:00.000000001", + "1,0 days 00:00:00.000000001", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 038dd2df4d632..46dd1e94aa739 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -462,6 +462,30 @@ def test_nested_flattening_consistent(self): # They should be the same. tm.assert_frame_equal(df1, df2) + def test_nonetype_record_path(self, nulls_fixture): + # see gh-30148 + # should not raise TypeError + result = json_normalize( + [ + {"state": "Texas", "info": nulls_fixture}, + {"state": "Florida", "info": [{"i": 2}]}, + ], + record_path=["info"], + ) + expected = DataFrame({"i": 2}, index=[0]) + tm.assert_equal(result, expected) + + def test_non_iterable_record_path_errors(self): + # see gh-30148 + test_input = {"state": "Texas", "info": 1} + test_path = "info" + msg = ( + f"{test_input} has non iterable value 1 for path {test_path}. " + "Must be iterable or null." 
+ ) + with pytest.raises(TypeError, match=msg): + json_normalize([test_input], record_path=[test_path]) + class TestNestedToRecord: def test_flat_stays_flat(self): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 42a4a55988b0f..007a068125d43 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2207,3 +2207,13 @@ def test_first_row_bom(all_parsers): result = parser.read_csv(StringIO(data), delimiter="\t") expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # GH 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index cfcd2c9f2df95..59d7f6f904337 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -146,11 +146,15 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) @@ -177,17 +181,21 @@ def test_read_expands_user_home_dir( path = os.path.join("~", "does_not_exist." + fn_ext) monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) - msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? 
does not exist" msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist:" + fr" '.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index bc26615d1aad5..2bb412cf6eab5 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -383,7 +383,15 @@ def test_thousands_macau_stats(self, datapath): assert not any(s.isna().any() for _, s in df.items()) @pytest.mark.slow - def test_thousands_macau_index_col(self, datapath): + def test_thousands_macau_index_col(self, datapath, request): + # https://github.com/pandas-dev/pandas/issues/29622 + # This test fails for bs4 >= 4.8.0 - so handle xfail accordingly + if self.read_html.keywords.get("flavor") == "bs4" and td.safe_import( + "bs4", "4.8.0" + ): + reason = "fails for bs4 version >= 4.8.0" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + all_non_nan_table_index = -2 macau_data = datapath("io", "data", "html", "macau.html") dfs = self.read_html(macau_data, index_col=0, header=0) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index cbc5ebd986c15..e8bc7f480fb1d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -21,6 +21,7 @@ PossiblePrecisionLoss, StataMissingValue, StataReader, + StataWriter118, read_stata, ) @@ -1271,11 +1272,9 @@ def test_invalid_variable_labels(self, version): variable_labels["a"] = "invalid character Œ" with tm.ensure_clean() as path: - msg = ( - "Variable labels must contain only characters that can be" - " encoded in Latin-1" - ) - with pytest.raises(ValueError, match=msg): + with pytest.raises( + ValueError, match="Variable labels must contain only characters" + ): original.to_stata( path, variable_labels=variable_labels, version=version ) @@ -1425,8 +1424,8 @@ def test_out_of_range_double(self): } ) msg = ( - r"Column ColumnTooBig has a maximum value \(.+\)" - r" outside the range supported by Stata \(.+\)" + r"Column ColumnTooBig has a maximum value \(.+\) outside the range " + r"supported by Stata \(.+\)" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: df.loc[2, "ColumnTooBig"] = np.inf msg = ( - "Column ColumnTooBig has a maximum value of infinity which" - " is outside the range supported by Stata" + "Column ColumnTooBig has a maximum value of infinity which is outside " + "the range supported by Stata" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1706,15 +1705,7 @@ def test_all_none_exception(self, version): output = pd.DataFrame(output) output.loc[:, "none"] = None with tm.ensure_clean() as path: - msg = ( - r"Column `none` cannot be exported\.\n\n" - "Only string-like object arrays containing all strings or a" - r" mix of strings and None can be exported\. 
Object arrays" - r" containing only null values are prohibited\. Other" - " object typescannot be exported and must first be" - r" converted to one of the supported types\." - ) - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="Column `none` cannot be exported"): output.to_stata(path, version=version) @pytest.mark.parametrize("version", [114, 117]) @@ -1778,3 +1769,41 @@ def test_stata_119(self): assert df.iloc[0, 7] == 3.14 assert df.iloc[0, -1] == 1 assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) + + def test_118_writer(self): + cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) + data = pd.DataFrame( + [ + [1.0, 1, "ᴬ", "ᴀ relatively long ŝtring"], + [2.0, 2, "ᴮ", ""], + [3.0, 3, "ᴰ", None], + ], + columns=["a", "β", "ĉ", "strls"], + ) + data["ᴐᴬᵀ"] = cat + variable_labels = { + "a": "apple", + "β": "ᵈᵉᵊ", + "ĉ": "ᴎტჄႲႳႴႶႺ", + "strls": "Long Strings", + "ᴐᴬᵀ": "", + } + data_label = "ᴅaᵀa-label" + data["β"] = data["β"].astype(np.int32) + with tm.ensure_clean() as path: + writer = StataWriter118( + path, + data, + data_label=data_label, + convert_strl=["strls"], + variable_labels=variable_labels, + write_index=False, + ) + writer.write_file() + reread_encoded = read_stata(path) + # Missing is intentionally converted to empty strl + data["strls"] = data["strls"].fillna("") + tm.assert_frame_equal(data, reread_encoded) + reader = StataReader(path) + assert reader.data_label == data_label + assert reader.variable_labels() == variable_labels diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4fcdc350bc90a..c2a289b2772ba 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1162,6 +1162,36 @@ def test_plot_scatter(self): axes = df.plot(x="x", y="y", kind="scatter", subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + def test_raise_error_on_datetime_time_data(self): + # GH 8113, datetime.time type is not supported by matplotlib in scatter + df = pd.DataFrame(np.random.randn(10), columns=["a"]) + df["dtime"] = pd.date_range(start="2014-01-01", freq="h", periods=10).time + msg = "must be a string or a number, not 'datetime.time'" + + with pytest.raises(TypeError, match=msg): + df.plot(kind="scatter", x="dtime", y="a") + + def test_scatterplot_datetime_data(self): + # GH 30391 + dates = pd.date_range(start=date(2019, 1, 1), periods=12, freq="W") + vals = np.random.normal(0, 1, len(dates)) + df = pd.DataFrame({"dates": dates, "vals": vals}) + + _check_plot_works(df.plot.scatter, x="dates", y="vals") + _check_plot_works(df.plot.scatter, x=0, y=1) + + def test_scatterplot_object_data(self): + # GH 18755 + df = pd.DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + + df = pd.DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + @pytest.mark.slow def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # addressing issue #10611, to ensure colobar does not @@ -1216,24 +1246,15 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() + @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) @pytest.mark.slow - def test_plot_scatter_with_categorical_data(self): - # GH 16199 + def 
test_plot_scatter_with_categorical_data(self, x, y): + # after fixing GH 18755, should be able to plot categorical data df = pd.DataFrame( {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} ) - with pytest.raises(ValueError) as ve: - df.plot(x="x", y="y", kind="scatter") - ve.match("requires y column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="x", kind="scatter") - ve.match("requires x column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="y", kind="scatter") - ve.match("requires x column to be numeric") + _check_plot_works(df.plot.scatter, x=x, y=y) @pytest.mark.slow def test_plot_scatter_with_c(self): diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index ebbdbd6c29842..ee9c4ed12bd92 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -353,3 +353,24 @@ def test_hash_collisions(): result = hash_array(np.asarray(hashes, dtype=object), "utf8") tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) + + +def test_hash_with_tuple(): + # GH#28969 array containing a tuple raises on call to arr.astype(str) + # apparently a numpy bug github.com/numpy/numpy/issues/9441 + + df = pd.DataFrame({"data": [tuple("1"), tuple("2")]}) + result = hash_pandas_object(df) + expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]}) + result = hash_pandas_object(df2) + expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + # require that the elements of such tuples are themselves hashable + + df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) + with pytest.raises(TypeError, match="unhashable type: 'list'"): + hash_pandas_object(df3) diff --git a/pandas/tests/window/common.py b/pandas/tests/window/common.py index 1dfc0f34b2b8d..77f59bf919168 100644 --- a/pandas/tests/window/common.py +++ b/pandas/tests/window/common.py @@ -3,7 +3,8 @@ import numpy as np from numpy.random import randn -from pandas import DataFrame, Series, bdate_range +from pandas import DataFrame, Series, bdate_range, notna +import pandas.util.testing as tm N, K = 100, 10 @@ -21,3 +22,334 @@ def _create_data(self): self.rng = bdate_range(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) + + +# create the data only once as we are not setting it +def _create_consistency_data(): + def create_series(): + return [ + Series(dtype=object), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 
3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + Series(range(20, 0, -2)), + ] + + def create_dataframes(): + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] + + def is_constant(x): + values = x.values.ravel() + return len(set(values[notna(values)])) == 1 + + def no_nans(x): + return x.notna().all().all() + + # each entry of data is a tuple: (object, is_constant, no_nans) + data = create_series() + create_dataframes() + + return [(x, is_constant(x), no_nans(x)) for x in data] + + +_consistency_data = _create_consistency_data() + + +class ConsistencyBase(Base): + base_functions = [ + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), + # restore once GH 8086 is fixed + # lambda v: Series(v).skew(), 3, 'skew'), + # (lambda v: Series(v).kurt(), 4, 'kurt'), + # restore once GH 8084 is fixed + # lambda v: Series(v).quantile(0.3), None, 'quantile'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), + ] + no_nan_functions = [ + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), + ] + + def _create_data(self): + super()._create_data() + self.data = _consistency_data + + def _test_moments_consistency_mock_mean(self, mean, mock_mean): + for (x, is_constant, no_nans) in self.data: + mean_x = mean(x) + + if mock_mean: + # check that mean equals mock_mean + expected = mock_mean(x) + tm.assert_equal(mean_x, expected.astype("float64")) + + def _test_moments_consistency_is_constant(self, min_periods, count, mean, corr): + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = corr(x, x) + + if is_constant: + exp = x.max() if isinstance(x, Series) else x.max().max() + + # check mean of constant series + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, expected) + + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) + + def _test_moments_consistency_var_debiasing_factors( + self, var_biased=None, var_unbiased=None, var_debiasing_factors=None + ): + for (x, is_constant, no_nans) in self.data: + if var_unbiased and var_biased and var_debiasing_factors: + # check variance debiasing factors + var_unbiased_x = 
var_unbiased(x) + var_biased_x = var_biased(x) + var_debiasing_factors_x = var_debiasing_factors(x) + tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) + + def _test_moments_consistency( + self, + min_periods, + count, + mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + ): + + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: + + # check that var(x), std(x), and cov(x) are all >= 0 + var_x = var(x) + std_x = std(x) + assert not (var_x < 0).any().any() + assert not (std_x < 0).any().any() + if cov: + cov_x_x = cov(x, x) + assert not (cov_x_x < 0).any().any() + + # check that var(x) == cov(x, x) + tm.assert_equal(var_x, cov_x_x) + + # check that var(x) == std(x)^2 + tm.assert_equal(var_x, std_x * std_x) + + if var is var_biased: + # check that biased var(x) == mean(x^2) - mean(x)^2 + mean_x2 = mean(x * x) + tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + + if is_constant: + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if var is var_unbiased: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) + + if isinstance(x, Series): + for (y, is_constant, no_nans) in self.data: + if not x.isna().equals(y.isna()): + # can only easily test two Series with similar + # structure + continue + + # check that cor(x, y) is symmetric + corr_x_y = corr(x, y) + corr_y_x = corr(y, x) + tm.assert_equal(corr_x_y, corr_y_x) + + if cov: + # check that cov(x, y) is symmetric + cov_x_y = cov(x, y) + cov_y_x = cov(y, x) + tm.assert_equal(cov_x_y, cov_y_x) + + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + var_x_plus_y = var(x + y) + var_y = var(y) + tm.assert_equal( + cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) + ) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + std_y = std(y) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if cov is cov_biased: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_y = mean(y) + mean_x_times_y = mean(x * y) + tm.assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) + + def _check_pairwise_moment(self, dispatch, name, **kwargs): + def get_result(obj, obj2=None): + return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) + + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = get_result(self.frame[1], self.frame[5]) + tm.assert_series_equal(result, expected, check_names=False) diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py new file mode 100644 index 0000000000000..489c1ff14ecfd --- /dev/null +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -0,0 +1,457 @@ +import numpy as np +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, Series, concat +from pandas.tests.window.common import Base, ConsistencyBase +import pandas.util.testing as tm + + +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestMoments(Base): + def setup_method(self, method): + self._create_data() + + def test_ewma(self): + self._check_ew(name="mean") + + vals = pd.Series(np.zeros(1000)) + vals[5] = 1 + result = 
vals.ewm(span=100, adjust=False).mean().sum() + assert np.abs(result - 1) < 1e-2 + + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewma_cases(self, adjust, ignore_na): + # try adjust/ignore_na args matrix + + s = Series([1.0, 2.0, 4.0, 8.0]) + + if adjust: + expected = Series([1.0, 1.6, 2.736842, 4.923077]) + else: + expected = Series([1.0, 1.333333, 2.222222, 4.148148]) + + result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() + tm.assert_series_equal(result, expected) + + def test_ewma_nan_handling(self): + s = Series([1.0] + [np.nan] * 5 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([1.0] * len(s))) + + s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) + + # GH 7603 + s0 = Series([np.nan, 1.0, 101.0]) + s1 = Series([1.0, np.nan, 101.0]) + s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) + s3 = Series([1.0, np.nan, 101.0, 50.0]) + com = 2.0 + alpha = 1.0 / (1.0 + com) + + def simple_wma(s, w): + return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") + + for (s, adjust, ignore_na, w) in [ + (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), + (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), + (s0, False, False, [np.nan, (1.0 - alpha), alpha]), + (s0, False, True, [np.nan, (1.0 - alpha), alpha]), + (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), + (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), + (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), + (s1, False, True, [(1.0 - alpha), np.nan, alpha]), + ( + s2, + True, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + ( + s2, + False, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + ), + (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), + (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), + (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), + ( + s3, + False, + False, + [ + (1.0 - alpha) ** 3, + np.nan, + (1.0 - alpha) * alpha, + alpha * ((1.0 - alpha) ** 2 + alpha), + ], + ), + ( + s3, + False, + True, + [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], + ), + ]: + expected = simple_wma(s, Series(w)) + result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() + + tm.assert_series_equal(result, expected) + if ignore_na is False: + # check that ignore_na defaults to False + result = s.ewm(com=com, adjust=adjust).mean() + tm.assert_series_equal(result, expected) + + def test_ewmvar(self): + self._check_ew(name="var") + + def test_ewmvol(self): + self._check_ew(name="vol") + + def test_ewma_span_com_args(self): + A = self.series.ewm(com=9.5).mean() + B = self.series.ewm(span=20).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20) + with pytest.raises(ValueError): + self.series.ewm().mean() + + def test_ewma_halflife_arg(self): + A = self.series.ewm(com=13.932726172912965).mean() + B = self.series.ewm(halflife=10.0).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm() + + 
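# note: com, span, halflife and alpha all parameterize the same decay, + # related by alpha = 1/(1+com) = 2/(span+1) = 1 - 0.5**(1/halflife); + # the equivalent pairs above and in test_ewm_alpha follow from this +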
def test_ewm_alpha(self): + # GH 10789 + s = Series(self.arr) + a = s.ewm(alpha=0.61722699889169674).mean() + b = s.ewm(com=0.62014947789973052).mean() + c = s.ewm(span=2.240298955799461).mean() + d = s.ewm(halflife=0.721792864318).mean() + tm.assert_series_equal(a, b) + tm.assert_series_equal(a, c) + tm.assert_series_equal(a, d) + + def test_ewm_alpha_arg(self): + # GH 10789 + s = self.series + with pytest.raises(ValueError): + s.ewm() + with pytest.raises(ValueError): + s.ewm(com=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(span=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(halflife=10.0, alpha=0.5) + + def test_ewm_domain_checks(self): + # GH 12492 + s = Series(self.arr) + msg = "comass must satisfy: comass >= 0" + with pytest.raises(ValueError, match=msg): + s.ewm(com=-0.1) + s.ewm(com=0.0) + s.ewm(com=0.1) + + msg = "span must satisfy: span >= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(span=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.0) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.9) + s.ewm(span=1.0) + s.ewm(span=1.1) + + msg = "halflife must satisfy: halflife > 0" + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=0.0) + s.ewm(halflife=0.1) + + msg = "alpha must satisfy: 0 < alpha <= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=0.0) + s.ewm(alpha=0.1) + s.ewm(alpha=1.0) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=1.1) + + @pytest.mark.parametrize("method", ["mean", "vol", "var"]) + def test_ew_empty_series(self, method): + vals = pd.Series([], dtype=np.float64) + + ewm = vals.ewm(3) + result = getattr(ewm, method)() + tm.assert_almost_equal(result, vals) + + def _check_ew(self, name=None, preserve_nan=False): + series_result = getattr(self.series.ewm(com=10), name)() + assert isinstance(series_result, Series) + + frame_result = getattr(self.frame.ewm(com=10), name)() + assert type(frame_result) == DataFrame + + result = getattr(self.series.ewm(com=10), name)() + if preserve_nan: + assert result[self._nan_locs].isna().all() + + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + s = Series(arr) + + # check min_periods + # GH 7898 + result = getattr(s.ewm(com=50, min_periods=2), name)() + assert result[:11].isna().all() + assert not result[11:].isna().any() + + for min_periods in (0, 1): + result = getattr(s.ewm(com=50, min_periods=min_periods), name)() + if name == "mean": + assert result[:10].isna().all() + assert not result[10:].isna().any() + else: + # ewm.std, ewm.vol, ewm.var (with bias=False) require at least + # two values + assert result[:11].isna().all() + assert not result[11:].isna().any() + + # check series of length 0 + result = getattr( + Series(dtype=object).ewm(com=50, min_periods=min_periods), name + )() + tm.assert_series_equal(result, Series(dtype="float64")) + + # check series of length 1 + result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() + if name == "mean": + tm.assert_series_equal(result, Series([1.0])) + else: + # ewm.std, ewm.vol, ewm.var with bias=False require at least + # two values + tm.assert_series_equal(result, Series([np.NaN])) + + # pass in ints + result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() + assert result2.dtype == np.float_ + + +class TestEwmMomentsConsistency(ConsistencyBase): + def setup_method(self, method): + 
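# _create_data attaches the shared series/frame fixtures and the + # precomputed consistency cases from pandas/tests/window/common.py +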
self._create_data() + + def test_ewmcov(self): + self._check_binary_ew("cov") + + def test_ewmcov_pairwise(self): + self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) + + def test_ewmcorr(self): + self._check_binary_ew("corr") + + def test_ewmcorr_pairwise(self): + self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) + + def _check_binary_ew(self, name): + def func(A, B, com, **kwargs): + return getattr(A.ewm(com, **kwargs), name)(B) + + A = Series(randn(50), index=np.arange(50)) + B = A[2:] + randn(48) + + A[:10] = np.NaN + B[-10:] = np.NaN + + result = func(A, B, 20, min_periods=5) + assert np.isnan(result.values[:14]).all() + assert not np.isnan(result.values[14:]).any() + + # GH 7898 + for min_periods in (0, 1, 2): + result = func(A, B, 20, min_periods=min_periods) + # binary functions (ewmcov, ewmcorr) with bias=False require at + # least two values + assert np.isnan(result.values[:11]).all() + assert not np.isnan(result.values[11:]).any() + + # check series of length 0 + empty = Series([], dtype=np.float64) + result = func(empty, empty, 50, min_periods=min_periods) + tm.assert_series_equal(result, empty) + + # check series of length 1 + result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) + tm.assert_series_equal(result, Series([np.NaN])) + + msg = "Input arrays must be of the same type!" + # exception raised is Exception + with pytest.raises(Exception, match=msg): + func(A, randn(50), 20, min_periods=5) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewm_consistency(self, min_periods, adjust, ignore_na): + def _weights(s, com, adjust, ignore_na): + if isinstance(s, DataFrame): + if not len(s.columns): + return DataFrame(index=s.index, columns=s.columns) + w = concat( + [ + _weights( + s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na + ) + for i, _ in enumerate(s.columns) + ], + axis=1, + ) + w.index = s.index + w.columns = s.columns + return w + + w = Series(np.nan, index=s.index) + alpha = 1.0 / (1.0 + com) + if ignore_na: + w[s.notna()] = _weights( + s[s.notna()], com=com, adjust=adjust, ignore_na=False + ) + elif adjust: + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + w.iat[i] = pow(1.0 / (1.0 - alpha), i) + else: + sum_wts = 0.0 + prev_i = -1 + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + if prev_i == -1: + w.iat[i] = 1.0 + else: + w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) + sum_wts += w.iat[i] + prev_i = i + return w + + def _variance_debiasing_factors(s, com, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + cum_sum = weights.cumsum().fillna(method="ffill") + cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") + numerator = cum_sum * cum_sum + denominator = numerator - cum_sum_sq + denominator[denominator <= 0.0] = np.nan + return numerator / denominator + + def _ewma(s, com, min_periods, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + result = ( + s.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") + ) + result[ + s.expanding().count() < (max(min_periods, 1) if min_periods else 1) + ] = np.nan + return result + + com = 3.0 + self._test_moments_consistency_mock_mean( + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + mock_mean=lambda x: _ewma( + x, 
com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + var_debiasing_factors=lambda x: ( + _variance_debiasing_factors( + x, com=com, adjust=adjust, ignore_na=ignore_na + ) + ), + ) + # test consistency between different ewm* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), + var_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), + std_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=False) + ), + cov_unbiased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + std_biased=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=True), + cov_biased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=True) + ), + ) diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py new file mode 100644 index 0000000000000..507fd2e2fb3ba --- /dev/null +++ b/pandas/tests/window/moments/test_moments_expanding.py @@ -0,0 +1,409 @@ +import warnings + +import numpy as np +from numpy.random import randn +import pytest + +from pandas import DataFrame, Index, MultiIndex, Series, isna, notna +from pandas.tests.window.common import ConsistencyBase +import pandas.util.testing as tm + + +class TestExpandingMomentsConsistency(ConsistencyBase): + def setup_method(self, method): + self._create_data() + + def test_expanding_apply_args_kwargs(self, raw): + def mean_w_arg(x, const): + return np.mean(x) + const + + df = DataFrame(np.random.rand(20, 3)) + + expected = df.expanding().apply(np.mean, raw=raw) + 20.0 + + result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) + tm.assert_frame_equal(result, expected) + + result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) + tm.assert_frame_equal(result, expected) + + def test_expanding_corr(self): + A = self.series.dropna() + B = (A + randn(len(A)))[:-5] + + result = A.expanding().corr(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_count(self): + result = self.series.expanding().count() + tm.assert_almost_equal( + result, self.series.rolling(window=len(self.series)).count() + ) + + def 
test_expanding_quantile(self): + result = self.series.expanding().quantile(0.5) + + rolling_result = self.series.rolling( + window=len(self.series), min_periods=1 + ).quantile(0.5) + + tm.assert_almost_equal(result, rolling_result) + + def test_expanding_cov(self): + A = self.series + B = (A + randn(len(A)))[:-5] + + result = A.expanding().cov(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_cov_pairwise(self): + result = self.frame.expanding().cov() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).cov() + + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_corr_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_cov_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().cov(s2) + expected = Series([None, None, 2.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().cov(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().cov(s2) + expected = Series([None, None, None, 4.5]) + tm.assert_series_equal(result, expected) + + def test_expanding_corr_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().corr(s2) + expected = Series([None, None, 1.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().corr(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().corr(s2) + expected = Series([None, None, None, 1.0]) + tm.assert_series_equal(result, expected) + + def test_expanding_cov_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) + df1a = DataFrame( + [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") + ) + # TODO: xref gh-15826 + # .loc is not preserving the names + result1 = df1.expanding().cov(df2, pairwise=True).loc[2] + result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] + result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(["A", "B"], name="foo"), + index=Index(["X", "Y"], name="foo"), + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + def test_expanding_corr_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame( + [[1, 2], [3, 2], [3, 4]], + columns=["A", "B"], + index=Index(range(3), name="bar"), + ) + df1a = DataFrame( + [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], + columns=["X", "Y"], + index=Index(range(3), name="bar"), + ) + df2a = 
DataFrame( + [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] + ) + result1 = df1.expanding().corr(df2, pairwise=True).loc[2] + result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] + result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + @pytest.mark.parametrize("has_min_periods", [True, False]) + @pytest.mark.parametrize( + "func,static_comp", + [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + ids=["sum", "mean", "max", "min"], + ) + def test_expanding_func(self, func, static_comp, has_min_periods): + def expanding_func(x, min_periods=1, center=False, axis=0): + exp = x.expanding(min_periods=min_periods, center=center, axis=axis) + return getattr(exp, func)() + + self._check_expanding(expanding_func, static_comp, preserve_nan=False) + self._check_expanding_has_min_periods( + expanding_func, static_comp, has_min_periods + ) + + @pytest.mark.parametrize("has_min_periods", [True, False]) + def test_expanding_apply(self, raw, has_min_periods): + def expanding_mean(x, min_periods=1): + + exp = x.expanding(min_periods=min_periods) + result = exp.apply(lambda x: x.mean(), raw=raw) + return result + + # TODO(jreback), needed to add preserve_nan=False + # here to make this pass + self._check_expanding(expanding_mean, np.mean, preserve_nan=False) + self._check_expanding_has_min_periods(expanding_mean, np.mean, has_min_periods) + + def test_expanding_apply_empty_series(self, raw): + ser = Series([], dtype=np.float64) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) + + def test_expanding_apply_min_periods_0(self, raw): + # GH 8080 + s = Series([None, None, None]) + result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) + expected = Series([1.0, 2.0, 3.0]) + tm.assert_series_equal(result, expected) + + def _check_expanding(self, func, static_comp, preserve_nan=True): + + series_result = func(self.series) + assert isinstance(series_result, Series) + frame_result = func(self.frame) + assert isinstance(frame_result, DataFrame) + + result = func(self.series) + tm.assert_almost_equal(result[10], static_comp(self.series[:11])) + + if preserve_nan: + assert result.iloc[self._nan_locs].isna().all() + + def _check_expanding_has_min_periods(self, func, static_comp, has_min_periods): + ser = Series(randn(50)) + + if has_min_periods: + result = func(ser, min_periods=30) + assert result[:29].isna().all() + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + # min_periods is working correctly + result = func(ser, min_periods=15) + assert isna(result.iloc[13]) + assert notna(result.iloc[14]) + + ser2 = Series(randn(20)) + result = func(ser2, min_periods=5) + assert isna(result[3]) + assert notna(result[4]) + + # min_periods=0 + result0 = func(ser, min_periods=0) + result1 = func(ser, min_periods=1) + tm.assert_almost_equal(result0, result1) + else: + result = func(ser) + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().count(), + lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), + lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), + lambda x: 
x.expanding(min_periods=5).max(), + lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).sum(), + lambda x: x.expanding(min_periods=5).mean(), + lambda x: x.expanding(min_periods=5).std(), + lambda x: x.expanding(min_periods=5).var(), + lambda x: x.expanding(min_periods=5).skew(), + lambda x: x.expanding(min_periods=5).kurt(), + lambda x: x.expanding(min_periods=5).quantile(0.5), + lambda x: x.expanding(min_periods=5).median(), + lambda x: x.expanding(min_periods=5).apply(sum, raw=False), + lambda x: x.expanding(min_periods=5).apply(sum, raw=True), + ], + ) + def test_moment_functions_zero_length(self, f): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.parametrize( + "f", + [ + lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), + lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), + ], + ) + def test_moment_functions_zero_length_pairwise(self, f): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]) + ) + df2_expected = DataFrame( + index=MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + def test_expanding_consistency(self, min_periods): + + # suppress warnings about empty slices, as we are deliberately testing + # with empty/0-length Series/DataFrames + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + + # test consistency between different expanding_* moments + self._test_moments_consistency_mock_mean( + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() + / x.expanding().count(), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + var_debiasing_factors=lambda x: ( + x.expanding().count() + / (x.expanding().count() - 1.0).replace(0.0, np.nan) + ), + ) + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), + 
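# the *_biased variants pass ddof=0 (population form); the unbiased + # ones rely on the pandas default ddof=1 sample correction +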
var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( + y, ddof=0 + ), + ) + + # test consistency between expanding_xyz() and either (a) + # expanding_apply of Series.xyz(), or (b) expanding_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + expanding_f = getattr(x.expanding(min_periods=min_periods), name) + + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): + continue + + if name == "count": + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding(min_periods=0).apply( + func=f, raw=True + ) + else: + if name in ["cov", "corr"]: + expanding_f_result = expanding_f(pairwise=False) + else: + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding( + min_periods=min_periods + ).apply(func=f, raw=True) + + # GH 9422 + if name in ["sum", "prod"]: + tm.assert_equal(expanding_f_result, expanding_apply_f_result) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/moments/test_moments_rolling.py similarity index 55% rename from pandas/tests/window/test_moments.py rename to pandas/tests/window/moments/test_moments_rolling.py index b1c5fc429cc03..c110ed172ecb9 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -9,9 +9,9 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, concat, isna, notna +from pandas import DataFrame, Index, Series, isna, notna from pandas.core.window.common import _flex_binary_moment -from pandas.tests.window.common import Base +from pandas.tests.window.common import Base, ConsistencyBase import pandas.util.testing as tm import pandas.tseries.offsets as offsets @@ -915,400 +915,6 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_series_equal(series_xp, series_rs) tm.assert_frame_equal(frame_xp, frame_rs) - def test_ewma(self): - self._check_ew(name="mean") - - vals = pd.Series(np.zeros(1000)) - vals[5] = 1 - result = vals.ewm(span=100, adjust=False).mean().sum() - assert np.abs(result - 1) < 1e-2 - - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewma_cases(self, adjust, ignore_na): - # try adjust/ignore_na args matrix - - s = Series([1.0, 2.0, 4.0, 8.0]) - - if adjust: - expected = Series([1.0, 1.6, 2.736842, 4.923077]) - else: - expected = Series([1.0, 1.333333, 2.222222, 4.148148]) - - result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() - tm.assert_series_equal(result, expected) - - def test_ewma_nan_handling(self): - s = Series([1.0] + [np.nan] * 5 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([1.0] * len(s))) - - s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) - - # GH 7603 - s0 = Series([np.nan, 1.0, 101.0]) - s1 = Series([1.0, np.nan, 101.0]) - s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) - s3 = Series([1.0, np.nan, 101.0, 50.0]) - com = 2.0 - alpha = 1.0 / (1.0 + com) - - def simple_wma(s, w): - return (s.multiply(w).cumsum() / 
w.cumsum()).fillna(method="ffill") - - for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), - (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), - (s0, False, False, [np.nan, (1.0 - alpha), alpha]), - (s0, False, True, [np.nan, (1.0 - alpha), alpha]), - (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), - (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), - (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - ( - s2, - True, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], - ), - (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), - ( - s2, - False, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], - ), - (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), - (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), - ( - s3, - False, - False, - [ - (1.0 - alpha) ** 3, - np.nan, - (1.0 - alpha) * alpha, - alpha * ((1.0 - alpha) ** 2 + alpha), - ], - ), - ( - s3, - False, - True, - [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], - ), - ]: - expected = simple_wma(s, Series(w)) - result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() - - tm.assert_series_equal(result, expected) - if ignore_na is False: - # check that ignore_na defaults to False - result = s.ewm(com=com, adjust=adjust).mean() - tm.assert_series_equal(result, expected) - - def test_ewmvar(self): - self._check_ew(name="var") - - def test_ewmvol(self): - self._check_ew(name="vol") - - def test_ewma_span_com_args(self): - A = self.series.ewm(com=9.5).mean() - B = self.series.ewm(span=20).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20) - with pytest.raises(ValueError): - self.series.ewm().mean() - - def test_ewma_halflife_arg(self): - A = self.series.ewm(com=13.932726172912965).mean() - B = self.series.ewm(halflife=10.0).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm() - - def test_ewm_alpha(self): - # GH 10789 - s = Series(self.arr) - a = s.ewm(alpha=0.61722699889169674).mean() - b = s.ewm(com=0.62014947789973052).mean() - c = s.ewm(span=2.240298955799461).mean() - d = s.ewm(halflife=0.721792864318).mean() - tm.assert_series_equal(a, b) - tm.assert_series_equal(a, c) - tm.assert_series_equal(a, d) - - def test_ewm_alpha_arg(self): - # GH 10789 - s = self.series - with pytest.raises(ValueError): - s.ewm() - with pytest.raises(ValueError): - s.ewm(com=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(span=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(halflife=10.0, alpha=0.5) - - def test_ewm_domain_checks(self): - # GH 12492 - s = Series(self.arr) - msg = "comass must satisfy: comass >= 0" - with pytest.raises(ValueError, match=msg): - s.ewm(com=-0.1) - s.ewm(com=0.0) - s.ewm(com=0.1) - - msg = "span must satisfy: span >= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(span=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.0) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.9) - s.ewm(span=1.0) - s.ewm(span=1.1) - - msg = "halflife must satisfy: halflife > 0" - with 
pytest.raises(ValueError, match=msg): - s.ewm(halflife=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=0.0) - s.ewm(halflife=0.1) - - msg = "alpha must satisfy: 0 < alpha <= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=0.0) - s.ewm(alpha=0.1) - s.ewm(alpha=1.0) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=1.1) - - @pytest.mark.parametrize("method", ["mean", "vol", "var"]) - def test_ew_empty_series(self, method): - vals = pd.Series([], dtype=np.float64) - - ewm = vals.ewm(3) - result = getattr(ewm, method)() - tm.assert_almost_equal(result, vals) - - def _check_ew(self, name=None, preserve_nan=False): - series_result = getattr(self.series.ewm(com=10), name)() - assert isinstance(series_result, Series) - - frame_result = getattr(self.frame.ewm(com=10), name)() - assert type(frame_result) == DataFrame - - result = getattr(self.series.ewm(com=10), name)() - if preserve_nan: - assert result[self._nan_locs].isna().all() - - # excluding NaNs correctly - arr = randn(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN - s = Series(arr) - - # check min_periods - # GH 7898 - result = getattr(s.ewm(com=50, min_periods=2), name)() - assert result[:11].isna().all() - assert not result[11:].isna().any() - - for min_periods in (0, 1): - result = getattr(s.ewm(com=50, min_periods=min_periods), name)() - if name == "mean": - assert result[:10].isna().all() - assert not result[10:].isna().any() - else: - # ewm.std, ewm.vol, ewm.var (with bias=False) require at least - # two values - assert result[:11].isna().all() - assert not result[11:].isna().any() - - # check series of length 0 - result = getattr( - Series(dtype=object).ewm(com=50, min_periods=min_periods), name - )() - tm.assert_series_equal(result, Series(dtype="float64")) - - # check series of length 1 - result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() - if name == "mean": - tm.assert_series_equal(result, Series([1.0])) - else: - # ewm.std, ewm.vol, ewm.var with bias=False require at least - # two values - tm.assert_series_equal(result, Series([np.NaN])) - - # pass in ints - result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() - assert result2.dtype == np.float_ - - -# create the data only once as we are not setting it -def _create_consistency_data(): - def create_series(): - return [ - Series(dtype=object), - Series([np.nan]), - Series([np.nan, np.nan]), - Series([3.0]), - Series([np.nan, 3.0]), - Series([3.0, np.nan]), - Series([1.0, 3.0]), - Series([2.0, 2.0]), - Series([3.0, 1.0]), - Series( - [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] - ), - Series( - [ - np.nan, - 5.0, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - np.nan, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series(range(10)), - Series(range(20, 0, -2)), - ] - - def 
create_dataframes(): - return [ - DataFrame(), - DataFrame(columns=["a"]), - DataFrame(columns=["a", "a"]), - DataFrame(columns=["a", "b"]), - DataFrame(np.arange(10).reshape((5, 2))), - DataFrame(np.arange(25).reshape((5, 5))), - DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), - ] + [DataFrame(s) for s in create_series()] - - def is_constant(x): - values = x.values.ravel() - return len(set(values[notna(values)])) == 1 - - def no_nans(x): - return x.notna().all().all() - - # data is a tuple(object, is_constant, no_nans) - data = create_series() + create_dataframes() - - return [(x, is_constant(x), no_nans(x)) for x in data] - - -_consistency_data = _create_consistency_data() - def _rolling_consistency_cases(): for window in [1, 2, 3, 10, 20]: @@ -1319,363 +925,10 @@ def _rolling_consistency_cases(): yield window, min_periods, center -class TestMomentsConsistency(Base): - base_functions = [ - (lambda v: Series(v).count(), None, "count"), - (lambda v: Series(v).max(), None, "max"), - (lambda v: Series(v).min(), None, "min"), - (lambda v: Series(v).sum(), None, "sum"), - (lambda v: Series(v).mean(), None, "mean"), - (lambda v: Series(v).std(), 1, "std"), - (lambda v: Series(v).cov(Series(v)), None, "cov"), - (lambda v: Series(v).corr(Series(v)), None, "corr"), - (lambda v: Series(v).var(), 1, "var"), - # restore once GH 8086 is fixed - # lambda v: Series(v).skew(), 3, 'skew'), - # (lambda v: Series(v).kurt(), 4, 'kurt'), - # restore once GH 8084 is fixed - # lambda v: Series(v).quantile(0.3), None, 'quantile'), - (lambda v: Series(v).median(), None, "median"), - (np.nanmax, 1, "max"), - (np.nanmin, 1, "min"), - (np.nansum, 1, "sum"), - (np.nanmean, 1, "mean"), - (lambda v: np.nanstd(v, ddof=1), 1, "std"), - (lambda v: np.nanvar(v, ddof=1), 1, "var"), - (np.nanmedian, 1, "median"), - ] - no_nan_functions = [ - (np.max, None, "max"), - (np.min, None, "min"), - (np.sum, None, "sum"), - (np.mean, None, "mean"), - (lambda v: np.std(v, ddof=1), 1, "std"), - (lambda v: np.var(v, ddof=1), 1, "var"), - (np.median, None, "median"), - ] - - def _create_data(self): - super()._create_data() - self.data = _consistency_data - +class TestRollingMomentsConsistency(ConsistencyBase): def setup_method(self, method): self._create_data() - def _test_moments_consistency( - self, - min_periods, - count, - mean, - mock_mean, - corr, - var_unbiased=None, - std_unbiased=None, - cov_unbiased=None, - var_biased=None, - std_biased=None, - cov_biased=None, - var_debiasing_factors=None, - ): - def _non_null_values(x): - values = x.values.ravel() - return set(values[notna(values)].tolist()) - - for (x, is_constant, no_nans) in self.data: - count_x = count(x) - mean_x = mean(x) - - if mock_mean: - # check that mean equals mock_mean - expected = mock_mean(x) - tm.assert_equal(mean_x, expected.astype("float64")) - - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = corr(x, x) - - # assert _non_null_values(corr_x_x).issubset(set([1.])) - # restore once rolling_cov(x, x) is identically equal to var(x) - - if is_constant: - exp = x.max() if isinstance(x, Series) else x.max().max() - - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) - - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) - - if var_unbiased and var_biased and var_debiasing_factors: - # check variance debiasing factors - var_unbiased_x = 
var_unbiased(x) - var_biased_x = var_biased(x) - var_debiasing_factors_x = var_debiasing_factors(x) - tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) - - for (std, var, cov) in [ - (std_biased, var_biased, cov_biased), - (std_unbiased, var_unbiased, cov_unbiased), - ]: - - # check that var(x), std(x), and cov(x) are all >= 0 - var_x = var(x) - std_x = std(x) - assert not (var_x < 0).any().any() - assert not (std_x < 0).any().any() - if cov: - cov_x_x = cov(x, x) - assert not (cov_x_x < 0).any().any() - - # check that var(x) == cov(x, x) - tm.assert_equal(var_x, cov_x_x) - - # check that var(x) == std(x)^2 - tm.assert_equal(var_x, std_x * std_x) - - if var is var_biased: - # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x2 = mean(x * x) - tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) - - if is_constant: - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if var is var_unbiased: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) - - if isinstance(x, Series): - for (y, is_constant, no_nans) in self.data: - if not x.isna().equals(y.isna()): - # can only easily test two Series with similar - # structure - continue - - # check that cor(x, y) is symmetric - corr_x_y = corr(x, y) - corr_y_x = corr(y, x) - tm.assert_equal(corr_x_y, corr_y_x) - - if cov: - # check that cov(x, y) is symmetric - cov_x_y = cov(x, y) - cov_y_x = cov(y, x) - tm.assert_equal(cov_x_y, cov_y_x) - - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - var_x_plus_y = var(x + y) - var_y = var(y) - tm.assert_equal( - cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) - ) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - std_y = std(y) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if cov is cov_biased: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_y = mean(y) - mean_x_times_y = mean(x * y) - tm.assert_equal( - cov_x_y, mean_x_times_y - (mean_x * mean_y) - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewm_consistency(self, min_periods, adjust, ignore_na): - def _weights(s, com, adjust, ignore_na): - if isinstance(s, DataFrame): - if not len(s.columns): - return DataFrame(index=s.index, columns=s.columns) - w = concat( - [ - _weights( - s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na - ) - for i, _ in enumerate(s.columns) - ], - axis=1, - ) - w.index = s.index - w.columns = s.columns - return w - - w = Series(np.nan, index=s.index) - alpha = 1.0 / (1.0 + com) - if ignore_na: - w[s.notna()] = _weights( - s[s.notna()], com=com, adjust=adjust, ignore_na=False - ) - elif adjust: - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - w.iat[i] = pow(1.0 / (1.0 - alpha), i) - else: - sum_wts = 0.0 - prev_i = -1 - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - if prev_i == -1: - w.iat[i] = 1.0 - else: - w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) - sum_wts += w.iat[i] - prev_i = i - return w - - def _variance_debiasing_factors(s, com, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - cum_sum = weights.cumsum().fillna(method="ffill") - cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") - numerator = cum_sum * cum_sum - denominator = numerator - cum_sum_sq - 
denominator[denominator <= 0.0] = np.nan - return numerator / denominator - - def _ewma(s, com, min_periods, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - result = ( - s.multiply(weights) - .cumsum() - .divide(weights.cumsum()) - .fillna(method="ffill") - ) - result[ - s.expanding().count() < (max(min_periods, 1) if min_periods else 1) - ] = np.nan - return result - - com = 3.0 - # test consistency between different ewm* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean(), - mock_mean=lambda x: _ewma( - x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ), - corr=lambda x, y: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(y), - var_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=False) - ), - std_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=False) - ), - cov_unbiased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=False) - ), - var_biased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=True) - ), - std_biased=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=True), - cov_biased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=True) - ), - var_debiasing_factors=lambda x: ( - _variance_debiasing_factors( - x, com=com, adjust=adjust, ignore_na=ignore_na - ) - ), - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - def test_expanding_consistency(self, min_periods): - - # suppress warnings about empty slices, as we are deliberately testing - # with empty/0-length Series/DataFrames - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - - # test consistency between different expanding_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding(min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() - / x.expanding().count(), - corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), - cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( - y, ddof=0 - ), - var_debiasing_factors=lambda x: ( - x.expanding().count() - / (x.expanding().count() - 1.0).replace(0.0, np.nan) - ), - ) - - # test consistency between expanding_xyz() and either (a) - # expanding_apply of Series.xyz(), or (b) expanding_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - expanding_f = 
getattr(x.expanding(min_periods=min_periods), name) - - if ( - require_min_periods - and (min_periods is not None) - and (min_periods < require_min_periods) - ): - continue - - if name == "count": - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding(min_periods=0).apply( - func=f, raw=True - ) - else: - if name in ["cov", "corr"]: - expanding_f_result = expanding_f(pairwise=False) - else: - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=min_periods - ).apply(func=f, raw=True) - - # GH 9422 - if name in ["sum", "prod"]: - tm.assert_equal(expanding_f_result, expanding_apply_f_result) - @pytest.mark.slow @pytest.mark.parametrize( "window,min_periods,center", list(_rolling_consistency_cases()) @@ -1692,9 +945,7 @@ def test_rolling_consistency(self, window, min_periods, center): ) # test consistency between different rolling_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: (x.rolling(window=window, center=center).count()), + self._test_moments_consistency_mock_mean( mean=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1709,6 +960,53 @@ def test_rolling_consistency(self, window, min_periods, center): ).count() ) ), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), + corr=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).corr(y) + ), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var() + ), + var_biased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) + ), + var_debiasing_factors=lambda x: ( + x.rolling(window=window, center=center) + .count() + .divide( + (x.rolling(window=window, center=center).count() - 1.0).replace( + 0.0, np.nan + ) + ) + ), + ) + + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), corr=lambda x, y: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1744,15 +1042,6 @@ def test_rolling_consistency(self, window, min_periods, center): window=window, min_periods=min_periods, center=center ).cov(y, ddof=0) ), - var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center) - .count() - .divide( - (x.rolling(window=window, center=center).count() - 1.0).replace( - 0.0, np.nan - ) - ) - ), ) # test consistency between rolling_xyz() and either (a) @@ -1835,16 +1124,6 @@ def test_rolling_corr_with_zero_variance(self, window): assert s.rolling(window=window).corr(other=other).isna().all() - def _check_pairwise_moment(self, dispatch, name, **kwargs): - def get_result(obj, obj2=None): - return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - - result = get_result(self.frame) - result = result.loc[(slice(None), 1), 5] - result.index = result.index.droplevel(1) - expected = get_result(self.frame[1], self.frame[5]) - tm.assert_series_equal(result, expected, check_names=False) - def test_flex_binary_moment(self): # GH3155 # don't blow the stack @@ -1905,156 +1184,6 @@ def test_flex_binary_frame(self, method): ) tm.assert_frame_equal(res3, 
exp) - def test_ewmcov(self): - self._check_binary_ew("cov") - - def test_ewmcov_pairwise(self): - self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) - - def test_ewmcorr(self): - self._check_binary_ew("corr") - - def test_ewmcorr_pairwise(self): - self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) - - def _check_binary_ew(self, name): - def func(A, B, com, **kwargs): - return getattr(A.ewm(com, **kwargs), name)(B) - - A = Series(randn(50), index=np.arange(50)) - B = A[2:] + randn(48) - - A[:10] = np.NaN - B[-10:] = np.NaN - - result = func(A, B, 20, min_periods=5) - assert np.isnan(result.values[:14]).all() - assert not np.isnan(result.values[14:]).any() - - # GH 7898 - for min_periods in (0, 1, 2): - result = func(A, B, 20, min_periods=min_periods) - # binary functions (ewmcov, ewmcorr) with bias=False require at - # least two values - assert np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() - - # check series of length 0 - empty = Series([], dtype=np.float64) - result = func(empty, empty, 50, min_periods=min_periods) - tm.assert_series_equal(result, empty) - - # check series of length 1 - result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([np.NaN])) - - msg = "Input arrays must be of the same type!" - # exception raised is Exception - with pytest.raises(Exception, match=msg): - func(A, randn(50), 20, min_periods=5) - - def test_expanding_apply_args_kwargs(self, raw): - def mean_w_arg(x, const): - return np.mean(x) + const - - df = DataFrame(np.random.rand(20, 3)) - - expected = df.expanding().apply(np.mean, raw=raw) + 20.0 - - result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) - tm.assert_frame_equal(result, expected) - - result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) - tm.assert_frame_equal(result, expected) - - def test_expanding_corr(self): - A = self.series.dropna() - B = (A + randn(len(A)))[:-5] - - result = A.expanding().corr(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_count(self): - result = self.series.expanding().count() - tm.assert_almost_equal( - result, self.series.rolling(window=len(self.series)).count() - ) - - def test_expanding_quantile(self): - result = self.series.expanding().quantile(0.5) - - rolling_result = self.series.rolling( - window=len(self.series), min_periods=1 - ).quantile(0.5) - - tm.assert_almost_equal(result, rolling_result) - - def test_expanding_cov(self): - A = self.series - B = (A + randn(len(A)))[:-5] - - result = A.expanding().cov(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_cov_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_corr_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_cov_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().cov(s2) - expected = Series([None, None, 2.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - 
result = s1.expanding().cov(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().cov(s2) - expected = Series([None, None, None, 4.5]) - tm.assert_series_equal(result, expected) - - def test_expanding_corr_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().corr(s2) - expected = Series([None, None, 1.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().corr(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) - tm.assert_series_equal(result, expected) - def test_rolling_cov_diff_length(self): # GH 7512 s1 = Series([1, 2, 3], index=[0, 1, 2]) @@ -2082,8 +1211,8 @@ def test_rolling_corr_diff_length(self): @pytest.mark.parametrize( "f", [ - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), lambda x: x.rolling(window=10, min_periods=5).max(), lambda x: x.rolling(window=10, min_periods=5).min(), lambda x: x.rolling(window=10, min_periods=5).sum(), @@ -2136,154 +1265,6 @@ def test_rolling_functions_window_non_shrinkage_binary(self): df_result = f(df) tm.assert_frame_equal(df_result, df_expected) - def test_moment_functions_zero_length(self): - # GH 8056 - s = Series(dtype=np.float64) - s_expected = s - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=["a"]) - df2["a"] = df2["a"].astype("float64") - df2_expected = df2 - - functions = [ - lambda x: x.expanding().count(), - lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), - lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), - lambda x: x.expanding(min_periods=5).max(), - lambda x: x.expanding(min_periods=5).min(), - lambda x: x.expanding(min_periods=5).sum(), - lambda x: x.expanding(min_periods=5).mean(), - lambda x: x.expanding(min_periods=5).std(), - lambda x: x.expanding(min_periods=5).var(), - lambda x: x.expanding(min_periods=5).skew(), - lambda x: x.expanding(min_periods=5).kurt(), - lambda x: x.expanding(min_periods=5).quantile(0.5), - lambda x: x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply(sum, raw=False), - lambda x: x.expanding(min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(window=10).count(), - lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), - lambda x: x.rolling(window=10, 
min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - ] - for f in functions: - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) - - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - except (ImportError): - - # scipy needed for rolling_window - continue - - def test_moment_functions_zero_length_pairwise(self): - - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) - df2["a"] = df2["a"].astype("float64") - - df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.index, df1.columns]), - columns=Index([]), - ) - df2_expected = DataFrame( - index=pd.MultiIndex.from_product( - [df2.index, df2.columns], names=["bar", "foo"] - ), - columns=Index(["a"], name="foo"), - dtype="float64", - ) - - functions = [ - lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), - lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), - ] - for f in functions: - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - - def test_expanding_cov_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) - df1a = DataFrame( - [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") - ) - # TODO: xref gh-15826 - # .loc is not preserving the names - result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] - result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-3.0, -6.0], [-5.0, -10.0]], - columns=Index(["A", "B"], name="foo"), - index=Index(["X", "Y"], name="foo"), - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - - def test_expanding_corr_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame( - [[1, 2], [3, 2], [3, 4]], - columns=["A", "B"], - index=Index(range(3), name="bar"), - ) - df1a = DataFrame( - [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], - columns=["X", "Y"], - index=Index(range(3), name="bar"), - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] - ) - result1 = df1.expanding().corr(df2, pairwise=True).loc[2] - result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] - result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - def test_rolling_skew_edge_cases(self): all_nan = Series([np.NaN] * 5) @@ 
-2334,83 +1315,6 @@ def test_rolling_kurt_eq_value_fperr(self): a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() - @pytest.mark.parametrize( - "func,static_comp", - [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], - ids=["sum", "mean", "max", "min"], - ) - def test_expanding_func(self, func, static_comp): - def expanding_func(x, min_periods=1, center=False, axis=0): - exp = x.expanding(min_periods=min_periods, center=center, axis=axis) - return getattr(exp, func)() - - self._check_expanding(expanding_func, static_comp, preserve_nan=False) - - def test_expanding_apply(self, raw): - def expanding_mean(x, min_periods=1): - - exp = x.expanding(min_periods=min_periods) - result = exp.apply(lambda x: x.mean(), raw=raw) - return result - - # TODO(jreback), needed to add preserve_nan=False - # here to make this pass - self._check_expanding(expanding_mean, np.mean, preserve_nan=False) - - ser = Series([], dtype=np.float64) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) - - # GH 8080 - s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 3.0]) - tm.assert_series_equal(result, expected) - - def _check_expanding( - self, - func, - static_comp, - has_min_periods=True, - has_time_rule=True, - preserve_nan=True, - ): - - series_result = func(self.series) - assert isinstance(series_result, Series) - frame_result = func(self.frame) - assert isinstance(frame_result, DataFrame) - - result = func(self.series) - tm.assert_almost_equal(result[10], static_comp(self.series[:11])) - - if preserve_nan: - assert result.iloc[self._nan_locs].isna().all() - - ser = Series(randn(50)) - - if has_min_periods: - result = func(ser, min_periods=30) - assert result[:29].isna().all() - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - - # min_periods is working correctly - result = func(ser, min_periods=15) - assert isna(result.iloc[13]) - assert notna(result.iloc[14]) - - ser2 = Series(randn(20)) - result = func(ser2, min_periods=5) - assert isna(result[3]) - assert notna(result[4]) - - # min_periods=0 - result0 = func(ser, min_periods=0) - result1 = func(ser, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = func(ser) - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - def test_rolling_max_gh6297(self): """Replicate result expected in GH #6297""" @@ -2532,3 +1436,76 @@ def test_rolling_min_max_numeric_types(self): assert result.dtypes[0] == np.dtype("f8") result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() assert result.dtypes[0] == np.dtype("f8") + + def test_moment_functions_zero_length(self): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + functions = [ + lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: 
x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ] + for f in functions: + try: + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + except (ImportError): + + # scipy needed for rolling_window + continue + + def test_moment_functions_zero_length_pairwise(self): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=pd.MultiIndex.from_product([df1.index, df1.columns]), + columns=Index([]), + ) + df2_expected = DataFrame( + index=pd.MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + functions = [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] + + for f in functions: + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) diff --git a/pandas/util/_depr_module.py b/pandas/util/_depr_module.py index 5733663dd7ab3..5694ca24aab57 100644 --- a/pandas/util/_depr_module.py +++ b/pandas/util/_depr_module.py @@ -46,7 +46,7 @@ def __repr__(self) -> str: __str__ = __repr__ - def __getattr__(self, name): + def __getattr__(self, name: str): if name in self.self_dir: return object.__getattribute__(self, name) diff --git a/requirements-dev.txt b/requirements-dev.txt index 4df0946ac0078..9f18bf767ae56 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -45,7 +45,7 @@ pip blosc bottleneck>=1.2.1 ipykernel -ipython>=5.6.0 +ipython>=5.6.0,<=7.10.1 jinja2 matplotlib>=2.2.2 numexpr>=2.6.8 diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index b0eeb7b96e0eb..850217c8a7803 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -286,7 +286,7 @@ def _load_obj(name): continue if "obj" not in locals(): - raise ImportError("No module can be imported " 'from "{}"'.format(name)) + raise ImportError(f'No module can be imported from "{name}"') for part in func_parts: obj = getattr(obj, part) diff --git a/setup.cfg b/setup.cfg index 8fb602188dad5..96af78c77feb8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -151,15 +151,9 @@ ignore_errors=True [mypy-pandas._version] check_untyped_defs=False -[mypy-pandas.core.arrays.boolean] -check_untyped_defs=False - [mypy-pandas.core.arrays.categorical] check_untyped_defs=False -[mypy-pandas.core.arrays.integer] -check_untyped_defs=False - [mypy-pandas.core.arrays.interval] check_untyped_defs=False @@ -169,12 +163,6 @@ check_untyped_defs=False [mypy-pandas.core.base] check_untyped_defs=False -[mypy-pandas.core.computation.align] -check_untyped_defs=False - -[mypy-pandas.core.computation.eval] -check_untyped_defs=False - [mypy-pandas.core.computation.expr] check_untyped_defs=False @@ -190,15 +178,9 @@ check_untyped_defs=False [mypy-pandas.core.computation.scope] check_untyped_defs=False -[mypy-pandas.core.config_init] 
-check_untyped_defs=False - [mypy-pandas.core.dtypes.cast] check_untyped_defs=False -[mypy-pandas.core.dtypes.generic] -check_untyped_defs=False - [mypy-pandas.core.frame] check_untyped_defs=False @@ -217,9 +199,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.base] check_untyped_defs=False -[mypy-pandas.core.indexes.category] -check_untyped_defs=False - [mypy-pandas.core.indexes.datetimelike] check_untyped_defs=False @@ -232,9 +211,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False -[mypy-pandas.core.indexes.timedeltas] -check_untyped_defs=False - [mypy-pandas.core.indexing] check_untyped_defs=False @@ -268,9 +244,6 @@ check_untyped_defs=False [mypy-pandas.core.reshape.reshape] check_untyped_defs=False -[mypy-pandas.core.series] -check_untyped_defs=False - [mypy-pandas.core.strings] check_untyped_defs=False @@ -325,9 +298,6 @@ check_untyped_defs=False [mypy-pandas.io.json._json] check_untyped_defs=False -[mypy-pandas.io.json._normalize] -check_untyped_defs=False - [mypy-pandas.io.json._table_schema] check_untyped_defs=False diff --git a/setup.py b/setup.py index af70ee3b30095..489a9602511e8 100755 --- a/setup.py +++ b/setup.py @@ -526,6 +526,11 @@ def maybe_cythonize(extensions, *args, **kwargs): elif parsed.j: nthreads = parsed.j + # GH#30356 Cythonize doesn't support parallel on Windows + if is_platform_windows() and nthreads > 0: + print("Parallel build for cythonize ignored on Windows") + nthreads = 0 + kwargs["nthreads"] = nthreads build_ext.render_templates(_pxifiles) return cythonize(extensions, *args, **kwargs)
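
As a reference for the tests relocated out of this file: the expected values hard-coded in the removed test_ewma_cases follow directly from the two EWMA definitions pandas implements — a normalized weighted sum for adjust=True and a recursion for adjust=False. Below is a minimal standalone sketch of both, not part of the patch; the series, com, and the era-appropriate pandas.util.testing import are illustrative assumptions.

# Sketch only (assumed names/values): reproduces the removed
# test_ewma_cases expectations from first principles.
import pandas as pd
import pandas.util.testing as tm

s = pd.Series([1.0, 2.0, 4.0, 8.0])
com = 2.0
alpha = 1.0 / (1.0 + com)
n = len(s)

# adjust=True: y_t = sum_i (1-alpha)^i * x_{t-i} / sum_i (1-alpha)^i
expected = pd.Series(
    [
        sum((1 - alpha) ** i * s.iloc[t - i] for i in range(t + 1))
        / sum((1 - alpha) ** i for i in range(t + 1))
        for t in range(n)
    ]
)
tm.assert_series_equal(s.ewm(com=com, adjust=True).mean(), expected)

# adjust=False: recursive y_t = (1-alpha) * y_{t-1} + alpha * x_t, y_0 = x_0
rec = [s.iloc[0]]
for t in range(1, n):
    rec.append((1 - alpha) * rec[-1] + alpha * s.iloc[t])
tm.assert_series_equal(s.ewm(com=com, adjust=False).mean(), pd.Series(rec))

NaN handling under ignore_na is the same idea with per-observation weights, which is exactly what the removed simple_wma helper reconstructs.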
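
Likewise, the _test_moments_consistency machinery this file now inherits (via ConsistencyBase) reduces to a handful of algebraic identities that any correct windowed implementation must satisfy. A minimal sketch of those identities under the same assumptions — random data and the window/min_periods spec are illustrative, not from the patch:

# Sketch only: the core identities the consistency tests assert,
# written against a single rolling spec so everything lines up.
import numpy as np
import pandas as pd
import pandas.util.testing as tm

s = pd.Series(np.random.randn(50))
y = pd.Series(np.random.randn(50))

def roll(obj):
    # one window spec for all moments, as the tests require
    return obj.rolling(window=10, min_periods=5)

var_x = roll(s).var()   # unbiased, ddof=1
std_x = roll(s).std()

# var(x) == cov(x, x) and var(x) == std(x)^2
tm.assert_series_equal(var_x, roll(s).cov(s, pairwise=False))
tm.assert_series_equal(var_x, std_x * std_x)

# biased var(x) == mean(x^2) - mean(x)^2
mean_x = roll(s).mean()
mean_x2 = roll(s * s).mean()
tm.assert_series_equal(roll(s).var(ddof=0), mean_x2 - mean_x * mean_x)

# unbiased var == biased var * n/(n-1): the "variance debiasing factors"
count_x = s.rolling(window=10).count()
factors = count_x / (count_x - 1.0).replace(0.0, np.nan)
tm.assert_series_equal(var_x, roll(s).var(ddof=0) * factors)

# cov(x, y) == (var(x + y) - var(x) - var(y)) / 2
# corr(x, y) == cov(x, y) / (std(x) * std(y))
cov_x_y = roll(s).cov(y)
tm.assert_series_equal(cov_x_y, 0.5 * (roll(s + y).var() - var_x - roll(y).var()))
tm.assert_series_equal(roll(s).corr(y), cov_x_y / (std_x * roll(y).std()))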