diff --git a/ci/docs.yml b/ci/docs.yml
index 0d08956d9..e9c3f56ae 100644
--- a/ci/docs.yml
+++ b/ci/docs.yml
@@ -8,8 +8,12 @@ dependencies:
   - numpydoc
   - numpy_groupies
   - toolz
+  - matplotlib-base
   - myst-parser
+  - myst-nb
   - sphinx
   - furo
+  - ipykernel
+  - jupyter
   - pip:
     - git+https://github.com/dcherian/flox
diff --git a/docs/source/_static/style.css b/docs/source/_static/style.css
new file mode 100644
index 000000000..5d9d45bb7
--- /dev/null
+++ b/docs/source/_static/style.css
@@ -0,0 +1,12 @@
+.xr-wrap {
+  font-size: 0.85em;
+  margin-left: 1.25em;
+  padding-left: 1.25em;
+  border-left: thin var(--color-foreground-muted) solid;
+}
+.xr-array-wrap, .xr-var-data, .xr-var-preview {
+  font-size: 0.9em;
+}
+.gp {
+  color: darkorange;
+}
diff --git a/docs/source/api.rst b/docs/source/api.rst
index fb5bd9b98..28e640922 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -40,5 +40,5 @@ Aggregation Objects
    :toctree: generated/
 
    aggregations.Aggregation
-   aggregations.sum
+   aggregations.sum_
    aggregations.nansum
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6b68a0a99..9c00d8c83 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -16,6 +16,9 @@
 import sys
 
 import flox
+import flox.aggregations
+import flox.visualize
+import flox.xarray
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
@@ -29,7 +32,6 @@
 # -- General configuration -----------------------------------------------------
 
 extensions = [
-    "myst_parser",
     "sphinx.ext.autodoc",
     "sphinx.ext.viewcode",
    "sphinx.ext.autosummary",
@@ -37,6 +39,7 @@
     "sphinx.ext.extlinks",
     "numpydoc",
     "sphinx.ext.napoleon",
+    "myst_nb",
 ]
 
 extlinks = {
@@ -45,7 +48,7 @@
 }
 
 templates_path = ["_templates"]
-source_suffix = [".rst", ".md"]
+source_suffix = [".rst"]
 master_doc = "index"
 
 language = "en"
@@ -117,7 +120,8 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ["_static"]
+html_static_path = ["_static"]
+html_css_files = ["style.css"]
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
diff --git a/docs/source/index.md b/docs/source/index.md
index 350d25231..6149998b9 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -50,4 +50,5 @@ It was motivated by many discussions in the [Pangeo](https://pangeo.io) community
 implementation.md
 custom.md
 api.rst
+user-stories.md
 ```
diff --git a/docs/source/user-stories.md b/docs/source/user-stories.md
new file mode 100644
index 000000000..29e357fd0
--- /dev/null
+++ b/docs/source/user-stories.md
@@ -0,0 +1,8 @@
+# User Stories
+
+```{eval-rst}
+.. toctree::
+   :maxdepth: 1
+
+   user-stories/climatology.ipynb
+```
diff --git a/docs/source/user-stories/climatology.ipynb b/docs/source/user-stories/climatology.ipynb
new file mode 100644
index 000000000..3fd7ae55d
--- /dev/null
+++ b/docs/source/user-stories/climatology.ipynb
@@ -0,0 +1,392 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4e9bf3f9-0952-493c-a8df-4a1d851c37a9",
+   "metadata": {},
+   "source": [
+    "# Strategies for climatology calculations\n",
+    "\n",
+    "This notebook is motivated by\n",
+    "[this post](https://discourse.pangeo.io/t/understanding-optimal-zarr-chunking-scheme-for-a-climatology/2335)\n",
+    "on the Pangeo discourse forum.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85ac0588-ff00-43cc-b952-7ab775b24e4a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import dask.array\n",
+    "import flox\n",
+    "import flox.xarray\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import xarray as xr"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "82f46621-1b6c-4a14-ac0f-3aa5121dad54",
+   "metadata": {},
+   "source": [
+    "Let's first create an example Xarray DataArray representing the OISST dataset,\n",
+    "with chunk sizes matching those in the post.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a91d2e2-bd6d-4b35-8002-5fac76c4c5b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "oisst = xr.DataArray(\n",
+    "    dask.array.ones((14532, 720, 1440), chunks=(20, -1, -1)),\n",
+    "    dims=(\"time\", \"lat\", \"lon\"),\n",
+    "    coords={\n",
+    "        \"time\": pd.date_range(\"1981-09-01 12:00\", \"2021-06-14 12:00\", freq=\"D\")\n",
+    "    },\n",
+    "    name=\"sst\",\n",
+    ")\n",
+    "oisst"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6d913e7f-25bd-43c4-98b6-93bcb420c524",
+   "metadata": {},
+   "source": [
+    "## map-reduce\n",
+    "\n",
+    "The default\n",
+    "[method=\"map-reduce\"](https://flox.readthedocs.io/en/latest/implementation.html#method-map-reduce)\n",
+    "doesn't work so well here: all 366 \"day of year\" groups end up in a single\n",
+    "output chunk.\n",
+    "\n",
+    "For this to work well, we'd want smaller chunks in space and bigger chunks in\n",
+    "time.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef2a14de-7526-40e3-8a97-28e84d6d6f20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flox.xarray.xarray_reduce(\n",
+    "    oisst,\n",
+    "    oisst.time.dt.dayofyear,\n",
+    "    func=\"mean\",\n",
+    "    method=\"map-reduce\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "442ad701-ea45-4555-9550-ec9daecfbea3",
+   "metadata": {},
+   "source": [
+    "## Rechunking for map-reduce\n",
+    "\n",
+    "We can split each chunk along the `lat`, `lon` dimensions to make sure the\n",
+    "output chunk sizes are more reasonable.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "322c7776-9a21-4115-8ac9-9c7c6c6e2c91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flox.xarray.xarray_reduce(\n",
+    "    oisst.chunk({\"lat\": -1, \"lon\": 120}),\n",
+    "    oisst.time.dt.dayofyear,\n",
+    "    func=\"mean\",\n",
+    "    method=\"map-reduce\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "833f72eb-1501-4362-ae55-ec419c9f0ac1",
+   "metadata": {},
+   "source": [
+    "But what if we didn't want to rechunk the dataset so drastically (note the 10x\n",
+    "increase in tasks)? For that, let's try `method=\"cohorts\"`.\n",
+    "\n",
+    "## method=cohorts\n",
+    "\n",
+    "We can take advantage of patterns in the groups, which here are \"day of year\".\n",
+    "Specifically:\n",
+    "\n",
+    "1. The groups occur at an approximately periodic interval of 365 or 366 days,\n",
+    "   as the quick check below confirms.\n",
+    "2. The chunk size 20 is smaller than the period of 365 or 366. This means that,\n",
+    "   to construct the mean for days 1-20, we just need to use the chunks that\n",
+    "   contain days 1-20.\n",
+    "\n",
+    "This strategy is implemented as\n",
+    "[method=\"cohorts\"](https://flox.readthedocs.io/en/latest/implementation.html#method-cohorts).\n"
+   ]
+  },
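+  {
+   "cell_type": "markdown",
+   "id": "7f3c1d2e-1111-4a2b-9c3d-5e6f7a8b9c0d",
+   "metadata": {},
+   "source": [
+    "A quick check of the first claim (a sketch added for illustration, not part of\n",
+    "the original analysis): the gaps between successive occurrences of day-of-year\n",
+    "1 should all be 365 or 366.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a4d2e3f-2222-4b3c-8d4e-6f7a8b9c0d1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: indices where dayofyear == 1 mark the start of each calendar year,\n",
+    "# so consecutive differences should be 365 or 366\n",
+    "labels = oisst.time.dt.dayofyear.data\n",
+    "np.unique(np.diff(np.flatnonzero(labels == 1)))"
+   ]
+  },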
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3bafc32-7e13-41b8-90eb-b27955393392",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flox.xarray.xarray_reduce(\n",
+    "    oisst,\n",
+    "    oisst.time.dt.dayofyear,\n",
+    "    func=\"mean\",\n",
+    "    method=\"cohorts\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b4e1ba0b-20e5-466a-9199-38b47029a0ed",
+   "metadata": {},
+   "source": [
+    "By default, cohorts doesn't work so well for this problem because the period\n",
+    "isn't regular (365 vs 366) and the period isn't divisible by the chunk size, so\n",
+    "the groups end up being \"out of phase\" (for a visual illustration\n",
+    "[click here](https://flox.readthedocs.io/en/latest/implementation.html#method-cohorts)).\n",
+    "Now we have the opposite problem: the chunk sizes on the output are too small.\n",
+    "\n",
+    "Looking more closely, we can see that the cohorts `flox` has detected are not\n",
+    "really cohorts: each cohort is a single group label. We've replicated Xarray's\n",
+    "current strategy, what `flox` calls\n",
+    "[\"split-reduce\"](https://flox.readthedocs.io/en/latest/implementation.html#method-split-reduce-xarray-s-current-groupby-strategy).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13ce5531-0d6c-4c89-bc44-dc2c24fa4e47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flox.core.find_group_cohorts(\n",
+    "    labels=oisst.time.dt.dayofyear.data,\n",
+    "    chunks=(oisst.chunksizes[\"time\"],),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bcbdbb3b-2aed-4f3f-ad20-efabb52b5e68",
+   "metadata": {},
+   "source": [
+    "## Rechunking data for cohorts\n",
+    "\n",
+    "Can we fix the \"out of phase\" problem by rechunking along time?\n",
+    "\n",
+    "First, let's see where the current chunk boundaries are.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90a884bc-1b71-4874-8143-73b3b5c41458",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "array = oisst.data\n",
+    "labels = oisst.time.dt.dayofyear.data\n",
+    "axis = oisst.get_axis_num(\"time\")\n",
+    "oldchunks = array.chunks[axis]\n",
+    "oldbreaks = np.insert(np.cumsum(oldchunks), 0, 0)\n",
+    "labels_at_breaks = labels[oldbreaks[:-1]]\n",
+    "labels_at_breaks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b2573e5-0d30-4cb8-b5af-751b824f0689",
+   "metadata": {},
+   "source": [
+    "Now we'll use a convenient function `rechunk_for_cohorts` to rechunk the `oisst`\n",
+    "dataset along time. We'll ask it to rechunk so that a new chunk starts at each\n",
+    "of the elements\n",
+    "\n",
+    "```\n",
+    "[244, 264, 284, 304, 324, 344, 364, 19, 39, 59, 79, 99, 119,\n",
+    " 139, 159, 179, 199, 219, 239]\n",
+    "```\n",
+    "\n",
+    "These are the labels at the chunk boundaries in the first year of data, as the\n",
+    "quick check below shows. We are forcing that chunking pattern to repeat as much\n",
+    "as possible. We also tell the function to ignore any existing chunk boundaries.\n"
+   ]
+  },
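+  {
+   "cell_type": "markdown",
+   "id": "9b5e3f4a-3333-4c4d-9e5f-7a8b9c0d1e2f",
+   "metadata": {},
+   "source": [
+    "A quick check (again, a sketch added for illustration): this list is just the\n",
+    "first year's worth of `labels_at_breaks` from the previous cell.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c6f4a5b-4444-4d5e-8f6a-8b9c0d1e2f3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: 19 chunks of size 20 span roughly one year, so the first 19\n",
+    "# chunk-boundary labels are the values passed to force_new_chunk_at below\n",
+    "labels_at_breaks[:19]"
+   ]
+  },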
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9ab6382-e93b-49e9-8e2e-1ba526046aea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rechunked = flox.xarray.rechunk_for_cohorts(\n",
+    "    oisst,\n",
+    "    dim=\"time\",\n",
+    "    labels=oisst.time.dt.dayofyear,\n",
+    "    force_new_chunk_at=[\n",
+    "        244,\n",
+    "        264,\n",
+    "        284,\n",
+    "        304,\n",
+    "        324,\n",
+    "        344,\n",
+    "        364,\n",
+    "        19,\n",
+    "        39,\n",
+    "        59,\n",
+    "        79,\n",
+    "        99,\n",
+    "        119,\n",
+    "        139,\n",
+    "        159,\n",
+    "        179,\n",
+    "        199,\n",
+    "        219,\n",
+    "        239,\n",
+    "    ],\n",
+    "    ignore_old_chunks=True,\n",
+    ")\n",
+    "rechunked"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "570d869b-9612-4de9-83ee-336a35c1fdad",
+   "metadata": {},
+   "source": [
+    "We see that the chunks are mostly 20 elements long in time, with some variation.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86bb4461-d921-40f8-9ff7-8d6e7e8c7e4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.plot(rechunked.chunksizes[\"time\"], marker=\"x\", ls=\"none\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12b7a27f-ebab-4673-bb9f-80620389994b",
+   "metadata": {},
+   "source": [
+    "And now our cohorts contain more than one group.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f522fb82-764d-4e4e-8337-a5123e3088f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flox.core.find_group_cohorts(\n",
+    "    labels=rechunked.time.dt.dayofyear.data,\n",
+    "    chunks=(rechunked.chunksizes[\"time\"],),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "949ac39c-dd84-4375-a884-0c1c3c382a8f",
+   "metadata": {},
+   "source": [
+    "Now the groupby reduction **looks OK** in terms of the number of tasks, but\n",
+    "remember that rechunking to get to this point involves some communication\n",
+    "overhead.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f1e45f9-5b18-482a-8c76-66f81ff5710f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flox.xarray.xarray_reduce(\n",
+    "    rechunked, rechunked.time.dt.dayofyear, func=\"mean\", method=\"cohorts\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "93c58969-5c99-4bc0-90ee-9cef468bf78b",
+   "metadata": {},
+   "source": [
+    "## How about other climatologies?\n",
+    "\n",
+    "Let's try monthly.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e559ea33-5499-48ff-9a2e-5141c3a69fea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "flox.xarray.xarray_reduce(oisst, oisst.time.dt.month, func=\"mean\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a00de8eb-e414-4920-8dcd-b64afbf91b62",
+   "metadata": {},
+   "source": [
+    "This looks great. Why?\n",
+    "\n",
+    "It's because each chunk (size 20) is smaller than the number of days in a\n",
+    "typical month. `flox` initially applies the groupby-reduction blockwise. For the\n",
+    "chunk size of 20, we will have at most 2 groups in each chunk (verified below),\n",
+    "so the initial blockwise reduction is quite effective: at least a 10x reduction\n",
+    "in size, from 20 elements in time to at most 2 elements in time.\n",
+    "\n",
+    "For this kind of problem, `\"map-reduce\"` works quite well.\n"
+   ]
+  },
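+  {
+   "cell_type": "markdown",
+   "id": "1d7a5b6c-5555-4e6f-9a7b-9c0d1e2f3a4b",
+   "metadata": {},
+   "source": [
+    "As a final sanity check (a sketch added for illustration), we can verify the\n",
+    "\"at most 2 groups per chunk\" claim by counting the unique months within each\n",
+    "20-element time chunk.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e8b6c7d-6666-4f7a-8b8c-0d1e2f3a4b5c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: a 20-day window can straddle at most one month boundary,\n",
+    "# so every chunk should contain at most 2 unique months\n",
+    "months = oisst.time.dt.month.data\n",
+    "max(np.unique(months[i : i + 20]).size for i in range(0, months.size, 20))"
+   ]
+  }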
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.1"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}