From 7c57913cae9d3d70ee86f645045327bd8e2a8e74 Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Fri, 8 Oct 2021 16:53:31 -0400 Subject: [PATCH 1/2] update house dataset in tabular --- CHANGELOG.rst | 5 +- ci/docs_notebooks.yml | 1 + docs/source/tabular-data.ipynb | 1700 +++++++++++++++++++++++++++----- 3 files changed, 1474 insertions(+), 232 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4b3b3dd5..3f81d623 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,8 +2,11 @@ Changelog History ================= -xskillscore v0.0.24 (2021-XX-XX) +xskillscore v0.0.24 (2021-10-08) -------------------------------- +Documentation +~~~~~~~~~~~~~ +- Replaced Boston house data in ``tabular.ipynb`` (:pr:`XX`) `Ray Bell`_. xskillscore v0.0.23 (2021-08-09) diff --git a/ci/docs_notebooks.yml b/ci/docs_notebooks.yml index 09313670..d4d7236e 100644 --- a/ci/docs_notebooks.yml +++ b/ci/docs_notebooks.yml @@ -16,6 +16,7 @@ dependencies: - importlib_metadata - ipykernel - jupyterlab + - jupyterlab_code_formatter - matplotlib-base - nbsphinx - nbstripout diff --git a/docs/source/tabular-data.ipynb b/docs/source/tabular-data.ipynb index e4babdca..56452780 100644 --- a/docs/source/tabular-data.ipynb +++ b/docs/source/tabular-data.ipynb @@ -1,248 +1,915 @@ { - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.4" - }, - "orig_nbformat": 4, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.9.4 64-bit ('xskillscore-docs-notebooks': conda)" - }, - "interpreter": { - "hash": "e5607b67897ceeb4cb8d1a6f5e8f77cf995244d75ab9ff3b133e23bb37c07f75" - } - }, - "nbformat": 4, - "nbformat_minor": 2, "cells": [ { + "cell_type": "markdown", + "metadata": {}, "source": [ "# Tabular Data\n", "\n", "`xskillscore` can be used on tabular data such as that stored in a `pandas.DataFrame`.\n", "\n", "It can be used most effectively when evaluating predictions over different fields." 
- ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:08.435286Z", + "iopub.status.busy": "2021-10-08T20:48:08.434791Z", + "iopub.status.idle": "2021-10-08T20:48:11.106270Z", + "shell.execute_reply": "2021-10-08T20:48:11.105553Z", + "shell.execute_reply.started": "2021-10-08T20:48:08.435216Z" + } + }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", - "import xskillscore as xs\n", - "from sklearn.datasets import load_boston\n", + "from sklearn.datasets import fetch_california_housing\n", "from sklearn.metrics import mean_squared_error\n", + "\n", + "import xskillscore as xs\n", + "\n", "np.random.seed(seed=42)" ] }, { - "source": [ - "## Boston house prices dataset" - ], "cell_type": "markdown", - "metadata": {} + "metadata": {}, + "source": [ + "## California house prices dataset" + ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "A small example is to take a dataset and evaluate the model according to a field (column).\n", "\n", - "Load the Boston house prices dataset:" - ], - "cell_type": "markdown", - "metadata": {} + "Load the California house prices dataset:" + ] }, { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.107574Z", + "iopub.status.busy": "2021-10-08T20:48:11.107202Z", + "iopub.status.idle": "2021-10-08T20:48:11.148574Z", + "shell.execute_reply": "2021-10-08T20:48:11.147838Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.107553Z" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", - "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", - "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", - "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", - "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", - "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", - ".. ... ... ... ... ... ... ... ... ... ... \n", - "501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 \n", - "502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 \n", - "503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 \n", - "504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 \n", - "505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 \n", - "\n", - " PTRATIO B LSTAT y \n", - "0 15.3 396.90 4.98 24.0 \n", - "1 17.8 396.90 9.14 21.6 \n", - "2 17.8 392.83 4.03 34.7 \n", - "3 18.7 394.63 2.94 33.4 \n", - "4 18.7 396.90 5.33 36.2 \n", - ".. ... ... ... ... \n", - "501 21.0 391.99 9.67 22.4 \n", - "502 21.0 396.90 9.08 20.6 \n", - "503 21.0 396.90 5.64 23.9 \n", - "504 21.0 393.45 6.48 22.0 \n", - "505 21.0 396.90 7.88 11.9 \n", - "\n", - "[506 rows x 14 columns]" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudey
08.325241.07.01.023810322.02.55555637.88-122.234.526
18.301421.06.00.9718802401.02.10984237.86-122.223.585
27.257452.08.01.073446496.02.80226037.85-122.243.521
35.643152.06.01.073059558.02.54794537.85-122.253.413
43.846252.06.01.081081565.02.18146737.85-122.253.422
..............................
206351.560325.05.01.133333845.02.56060639.48-121.090.781
206362.556818.06.01.315789356.03.12280739.49-121.210.771
206371.700017.05.01.1200921007.02.32563539.43-121.220.923
206381.867218.05.01.171920741.02.12320939.43-121.320.847
206392.388616.05.01.1622641387.02.61698139.37-121.240.894
\n", + "

20640 rows × 9 columns

\n", + "
" ], - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTATy
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
.............................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.6722.4
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.0820.6
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.6423.9
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.4822.0
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.8811.9
\n

506 rows × 14 columns

\n
" + "text/plain": [ + " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", + "0 8.3252 41.0 7.0 1.023810 322.0 2.555556 37.88 \n", + "1 8.3014 21.0 6.0 0.971880 2401.0 2.109842 37.86 \n", + "2 7.2574 52.0 8.0 1.073446 496.0 2.802260 37.85 \n", + "3 5.6431 52.0 6.0 1.073059 558.0 2.547945 37.85 \n", + "4 3.8462 52.0 6.0 1.081081 565.0 2.181467 37.85 \n", + "... ... ... ... ... ... ... ... \n", + "20635 1.5603 25.0 5.0 1.133333 845.0 2.560606 39.48 \n", + "20636 2.5568 18.0 6.0 1.315789 356.0 3.122807 39.49 \n", + "20637 1.7000 17.0 5.0 1.120092 1007.0 2.325635 39.43 \n", + "20638 1.8672 18.0 5.0 1.171920 741.0 2.123209 39.43 \n", + "20639 2.3886 16.0 5.0 1.162264 1387.0 2.616981 39.37 \n", + "\n", + " Longitude y \n", + "0 -122.23 4.526 \n", + "1 -122.22 3.585 \n", + "2 -122.24 3.521 \n", + "3 -122.25 3.413 \n", + "4 -122.25 3.422 \n", + "... ... ... \n", + "20635 -121.09 0.781 \n", + "20636 -121.21 0.771 \n", + "20637 -121.22 0.923 \n", + "20638 -121.32 0.847 \n", + "20639 -121.24 0.894 \n", + "\n", + "[20640 rows x 9 columns]" + ] }, + "execution_count": 2, "metadata": {}, - "execution_count": 2 + "output_type": "execute_result" } ], "source": [ - "data = load_boston()\n", - "df = pd.DataFrame(data.data, columns=data.feature_names)\n", - "df['y'] = pd.Series(data.target)\n", + "housing = fetch_california_housing(as_frame=True)\n", + "df = housing.frame\n", + "df[\"AveRooms\"] = df[\"AveRooms\"].round()\n", + "df = df.rename(columns={\"MedHouseVal\": \"y\"})\n", "df" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "Create a dummy prediction column by adding noise to `y`:" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.150235Z", + "iopub.status.busy": "2021-10-08T20:48:11.149886Z", + "iopub.status.idle": "2021-10-08T20:48:11.158588Z", + "shell.execute_reply": "2021-10-08T20:48:11.157830Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.150199Z" + } + }, "outputs": [], "source": [ - "noise = np.random.uniform(-1, 1, size=len(df['y']))\n", - "df['yhat'] = (df['y'] + (df['y'] * noise)).clip(lower=df[\"y\"].min())" + "noise = np.random.uniform(-1, 1, size=len(df[\"y\"]))\n", + "df[\"yhat\"] = (df[\"y\"] + (df[\"y\"] * noise)).clip(lower=df[\"y\"].min())" ] }, { - "source": [ - "Evaluate the model over the field `RAD` using `pandas.groupby.apply` with `mean_squared_error` from `scikit-learn`:" - ], "cell_type": "markdown", - "metadata": {} + "metadata": {}, + "source": [ + "Evaluate the model over the field `AveRooms` using `pandas.groupby.apply` with `mean_squared_error` from `scikit-learn`:" + ] }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.161964Z", + "iopub.status.busy": "2021-10-08T20:48:11.161638Z", + "iopub.status.idle": "2021-10-08T20:48:11.190810Z", + "shell.execute_reply": "2021-10-08T20:48:11.190152Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.161926Z" + }, + "tags": [] + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "RAD\n", - "1.0 161.237554\n", - "2.0 313.855750\n", - "3.0 307.220760\n", - "4.0 162.634430\n", - "5.0 221.852969\n", - "6.0 155.612978\n", - "7.0 214.375240\n", - "8.0 278.092560\n", - "24.0 148.840507\n", + "AveRooms\n", + "1.0 1.789466\n", + "2.0 1.827004\n", + "3.0 1.492455\n", + "4.0 1.352848\n", + "5.0 1.384756\n", "dtype: float64" 
] }, + "execution_count": 4, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], "source": [ - "df.groupby('RAD').apply(lambda x: mean_squared_error(x[\"y\"], x[\"yhat\"]))" + "df.groupby(\"AveRooms\").apply(lambda x: mean_squared_error(x[\"y\"], x[\"yhat\"])).head()" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "You could also do the following using `xskillscore`.\n", "\n", "First, structure the `pandas.DataFrame` to keep the core fields when converting to an `xarray` object:" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.192324Z", + "iopub.status.busy": "2021-10-08T20:48:11.191807Z", + "iopub.status.idle": "2021-10-08T20:48:11.217701Z", + "shell.execute_reply": "2021-10-08T20:48:11.216703Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.192292Z" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " y yhat\n", - "index RAD \n", - "0 1.0 24.0 17.977926\n", - "1 2.0 21.6 41.070858\n", - "2 2.0 34.7 50.800380\n", - "3 3.0 33.4 39.990387\n", - "4 3.0 36.2 11.295750\n", - "... ... ...\n", - "501 1.0 22.4 24.017117\n", - "502 1.0 20.6 12.752538\n", - "503 1.0 23.9 38.899402\n", - "504 1.0 22.0 30.128172\n", - "505 1.0 11.9 5.000000\n", - "\n", - "[506 rows x 2 columns]" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yyhat
indexAveRooms
07.04.5263.390337
16.03.5856.816622
28.03.5215.154701
36.03.4134.086443
46.03.4221.067792
............
206355.00.7810.611083
206366.00.7711.497737
206375.00.9230.648200
206385.00.8471.470100
206395.00.8940.166662
\n", + "

20640 rows × 2 columns

\n", + "
" ], - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
yyhat
indexRAD
01.024.017.977926
12.021.641.070858
22.034.750.800380
33.033.439.990387
43.036.211.295750
............
5011.022.424.017117
5021.020.612.752538
5031.023.938.899402
5041.022.030.128172
5051.011.95.000000
\n

506 rows × 2 columns

\n
" + "text/plain": [ + " y yhat\n", + "index AveRooms \n", + "0 7.0 4.526 3.390337\n", + "1 6.0 3.585 6.816622\n", + "2 8.0 3.521 5.154701\n", + "3 6.0 3.413 4.086443\n", + "4 6.0 3.422 1.067792\n", + "... ... ...\n", + "20635 5.0 0.781 0.611083\n", + "20636 6.0 0.771 1.497737\n", + "20637 5.0 0.923 0.648200\n", + "20638 5.0 0.847 1.470100\n", + "20639 5.0 0.894 0.166662\n", + "\n", + "[20640 rows x 2 columns]" + ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], "source": [ - "min_df = df.reset_index().set_index([\"index\", \"RAD\"])[[\"y\", \"yhat\"]]\n", + "min_df = df.reset_index().set_index([\"index\", \"AveRooms\"])[[\"y\", \"yhat\"]]\n", "min_df" ] }, { - "source": [ - "Convert it to an `xarray.Dataset` using `pandas.DataFrame.to_xarray`. Note: This will create an array of `index` by `RAD` and pad the values that do not exist with `nan`." - ], "cell_type": "markdown", - "metadata": {} + "metadata": {}, + "source": [ + "Convert it to an `xarray.Dataset` using `pandas.DataFrame.to_xarray`. Note: This will create an array of `index` by `AveRooms` and pad the values that do not exist with `nan`." + ] }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.219754Z", + "iopub.status.busy": "2021-10-08T20:48:11.219360Z", + "iopub.status.idle": "2021-10-08T20:48:11.248632Z", + "shell.execute_reply": "2021-10-08T20:48:11.247861Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.219713Z" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:   (index: 20640, AveRooms: 47)\n",
+       "Coordinates:\n",
+       "  * index     (index) int64 0 1 2 3 4 5 ... 20634 20635 20636 20637 20638 20639\n",
+       "  * AveRooms  (AveRooms) float64 1.0 2.0 3.0 4.0 5.0 ... 60.0 62.0 133.0 142.0\n",
+       "Data variables:\n",
+       "    y         (index, AveRooms) float64 nan nan nan nan nan ... nan nan nan nan\n",
+       "    yhat      (index, AveRooms) float64 nan nan nan nan nan ... nan nan nan nan
" + ], "text/plain": [ "\n", - "Dimensions: (RAD: 9, index: 506)\n", + "Dimensions: (index: 20640, AveRooms: 47)\n", "Coordinates:\n", - " * index (index) int64 0 1 2 3 4 5 6 7 8 ... 498 499 500 501 502 503 504 505\n", - " * RAD (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0\n", + " * index (index) int64 0 1 2 3 4 5 ... 20634 20635 20636 20637 20638 20639\n", + " * AveRooms (AveRooms) float64 1.0 2.0 3.0 4.0 5.0 ... 60.0 62.0 133.0 142.0\n", "Data variables:\n", - " y (index, RAD) float64 24.0 nan nan nan nan ... nan nan nan nan nan\n", - " yhat (index, RAD) float64 17.98 nan nan nan nan ... nan nan nan nan nan" - ], - "text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.Dataset>\nDimensions:  (RAD: 9, index: 506)\nCoordinates:\n  * index    (index) int64 0 1 2 3 4 5 6 7 8 ... 498 499 500 501 502 503 504 505\n  * RAD      (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0\nData variables:\n    y        (index, RAD) float64 24.0 nan nan nan nan ... nan nan nan nan nan\n    yhat     (index, RAD) float64 17.98 nan nan nan nan ... nan nan nan nan nan
" + " y (index, AveRooms) float64 nan nan nan nan nan ... nan nan nan nan\n", + " yhat (index, AveRooms) float64 nan nan nan nan nan ... nan nan nan nan" + ] }, + "execution_count": 6, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], "source": [ @@ -251,120 +918,649 @@ ] }, { - "source": [ - "You call now apply any metric from `xskillscore` using the accessor method. The input for the `dim` argument is `index` as we want to reduce this dimension and apply the metric over `RAD`. In addition, there are `nan`'s in the `xarray.Dataset` so you should use `skipna=True`:" - ], "cell_type": "markdown", - "metadata": {} + "metadata": {}, + "source": [ + "You call now apply any metric from `xskillscore` using the accessor method. The input for the `dim` argument is `index` as we want to reduce this dimension and apply the metric over `AveRooms`. In addition, there are `nan`'s in the `xarray.Dataset` so you should use `skipna=True`:" + ] }, { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.250257Z", + "iopub.status.busy": "2021-10-08T20:48:11.249828Z", + "iopub.status.idle": "2021-10-08T20:48:11.306232Z", + "shell.execute_reply": "2021-10-08T20:48:11.305395Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.250218Z" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - "\n", - "array([161.23755363, 313.85575025, 307.22076012, 162.63442999,\n", - " 221.85296903, 155.6129776 , 214.37524005, 278.09256049,\n", - " 148.84050691])\n", + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray (AveRooms: 47)>\n",
+       "array([1.78946554e+00, 1.82700352e+00, 1.49245536e+00, 1.35284843e+00,\n",
+       "       1.38475581e+00, 1.78975720e+00, 3.26525395e+00, 4.80747797e+00,\n",
+       "       5.16524336e+00, 3.13496890e+00, 1.23401728e+00, 1.12562885e+00,\n",
+       "       5.54888374e-01, 2.61824323e+00, 8.84529997e-01, 1.17865387e+00,\n",
+       "       8.96786588e-01, 6.93484341e-01, 8.44837355e-01, 9.50615751e-01,\n",
+       "       2.55912220e+00, 4.16548298e-01, 3.07284580e-01, 8.31537279e-01,\n",
+       "       4.06466713e+00, 8.79983025e-01, 1.09491040e-02, 1.12379707e+00,\n",
+       "       1.50188148e+00, 1.56069394e+00, 2.73330025e-02, 2.68438951e-01,\n",
+       "       4.63967683e-01, 1.47081770e+00, 3.28568563e+00, 4.86835859e-01,\n",
+       "       5.48064237e-04, 1.40563208e+00, 9.04093610e-01, 3.26459003e-01,\n",
+       "       1.48460982e-01, 3.39427104e+00, 4.19379397e+00, 1.74130396e-01,\n",
+       "       1.04411235e+00, 1.23495233e+00, 2.64087781e-01])\n",
        "Coordinates:\n",
-       "  * RAD      (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0"
+       "  * AveRooms  (AveRooms) float64 1.0 2.0 3.0 4.0 5.0 ... 60.0 62.0 133.0 142.0
" ], - "text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.DataArray (RAD: 9)>\narray([161.23755363, 313.85575025, 307.22076012, 162.63442999,\n       221.85296903, 155.6129776 , 214.37524005, 278.09256049,\n       148.84050691])\nCoordinates:\n  * RAD      (RAD) float64 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 24.0
" + "text/plain": [ + "\n", + "array([1.78946554e+00, 1.82700352e+00, 1.49245536e+00, 1.35284843e+00,\n", + " 1.38475581e+00, 1.78975720e+00, 3.26525395e+00, 4.80747797e+00,\n", + " 5.16524336e+00, 3.13496890e+00, 1.23401728e+00, 1.12562885e+00,\n", + " 5.54888374e-01, 2.61824323e+00, 8.84529997e-01, 1.17865387e+00,\n", + " 8.96786588e-01, 6.93484341e-01, 8.44837355e-01, 9.50615751e-01,\n", + " 2.55912220e+00, 4.16548298e-01, 3.07284580e-01, 8.31537279e-01,\n", + " 4.06466713e+00, 8.79983025e-01, 1.09491040e-02, 1.12379707e+00,\n", + " 1.50188148e+00, 1.56069394e+00, 2.73330025e-02, 2.68438951e-01,\n", + " 4.63967683e-01, 1.47081770e+00, 3.28568563e+00, 4.86835859e-01,\n", + " 5.48064237e-04, 1.40563208e+00, 9.04093610e-01, 3.26459003e-01,\n", + " 1.48460982e-01, 3.39427104e+00, 4.19379397e+00, 1.74130396e-01,\n", + " 1.04411235e+00, 1.23495233e+00, 2.64087781e-01])\n", + "Coordinates:\n", + " * AveRooms (AveRooms) float64 1.0 2.0 3.0 4.0 5.0 ... 60.0 62.0 133.0 142.0" + ] }, + "execution_count": 7, "metadata": {}, - "execution_count": 7 + "output_type": "execute_result" } ], "source": [ - "out = ds.xs.mse('y', 'yhat', dim=\"index\", skipna=True)\n", + "out = ds.xs.mse(\"y\", \"yhat\", dim=\"index\", skipna=True)\n", "out" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "It makes sense to return the data in tabular form hence you can call `xarray.DataArray.to_series` to convert it to a `pandas.Series`:" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.308026Z", + "iopub.status.busy": "2021-10-08T20:48:11.307711Z", + "iopub.status.idle": "2021-10-08T20:48:11.314966Z", + "shell.execute_reply": "2021-10-08T20:48:11.313984Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.307992Z" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ - "RAD\n", - "1.0 161.237554\n", - "2.0 313.855750\n", - "3.0 307.220760\n", - "4.0 162.634430\n", - "5.0 221.852969\n", - "6.0 155.612978\n", - "7.0 214.375240\n", - "8.0 278.092560\n", - "24.0 148.840507\n", + "AveRooms\n", + "1.0 1.789466\n", + "2.0 1.827004\n", + "3.0 1.492455\n", + "4.0 1.352848\n", + "5.0 1.384756\n", "dtype: float64" ] }, + "execution_count": 8, "metadata": {}, - "execution_count": 8 + "output_type": "execute_result" } ], "source": [ - "out.to_series()" + "out.to_series().head()" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "## Evaluating predictions over many columns" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "`xskillscore` is built upon `xarray.apply_ufunc` which offers speed-up by vectorizing operations. As a result `xskillscore` can be faster than `pandas.groupby.apply`. 
This is especially true if there are many samples in the dataset and if the predictions have to be evaluated over many fields.\n",
    "\n",
    "For this exercise we will create fake data for which the predictions have to be evaluated over three fields:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-10-08T20:48:11.316699Z",
     "iopub.status.busy": "2021-10-08T20:48:11.316373Z",
     "iopub.status.idle": "2021-10-08T20:48:11.952125Z",
     "shell.execute_reply": "2021-10-08T20:48:11.951413Z",
     "shell.execute_reply.started": "2021-10-08T20:48:11.316667Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DATESTORESKUyyhat
02020-01-010063.874272
12020-01-0101913.551266
22020-01-010283.979884
32020-01-010333.222543
42020-01-010461.647346
..................
999952020-01-10999511.000000
999962020-01-10999642.770135
999972020-01-10999775.820397
999982020-01-10999821.000000
999992020-01-10999921.000000
\n", + "

100000 rows × 5 columns

\n", + "
" + ], "text/plain": [ " DATE STORE SKU y yhat\n", - "0 2020-01-01 0 0 3 4.617306\n", - "1 2020-01-01 0 1 6 1.000000\n", - "2 2020-01-01 0 2 2 3.039347\n", - "3 2020-01-01 0 3 3 5.102145\n", - "4 2020-01-01 0 4 5 3.563087\n", + "0 2020-01-01 0 0 6 3.874272\n", + "1 2020-01-01 0 1 9 13.551266\n", + "2 2020-01-01 0 2 8 3.979884\n", + "3 2020-01-01 0 3 3 3.222543\n", + "4 2020-01-01 0 4 6 1.647346\n", "... ... ... ... .. ...\n", - "99995 2020-01-10 99 95 9 15.836256\n", - "99996 2020-01-10 99 96 5 7.515791\n", - "99997 2020-01-10 99 97 1 1.000000\n", - "99998 2020-01-10 99 98 6 6.676512\n", - "99999 2020-01-10 99 99 5 4.600985\n", + "99995 2020-01-10 99 95 1 1.000000\n", + "99996 2020-01-10 99 96 4 2.770135\n", + "99997 2020-01-10 99 97 7 5.820397\n", + "99998 2020-01-10 99 98 2 1.000000\n", + "99999 2020-01-10 99 99 2 1.000000\n", "\n", "[100000 rows x 5 columns]" - ], - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
DATESTORESKUyyhat
02020-01-010034.617306
12020-01-010161.000000
22020-01-010223.039347
32020-01-010335.102145
42020-01-010453.563087
..................
999952020-01-109995915.836256
999962020-01-10999657.515791
999972020-01-10999711.000000
999982020-01-10999866.676512
999992020-01-10999954.600985
\n

100000 rows × 5 columns

\n
" + ] }, + "execution_count": 9, "metadata": {}, - "execution_count": 9 + "output_type": "execute_result" } ], "source": [ @@ -388,98 +1584,116 @@ " )\n", "df = pd.DataFrame(rows)\n", "\n", - "noise = np.random.uniform(-1, 1, size=len(df['y']))\n", - "df['yhat'] = (df['y'] + (df['y'] * noise)).clip(lower=df[\"y\"].min())\n", + "noise = np.random.uniform(-1, 1, size=len(df[\"y\"]))\n", + "df[\"yhat\"] = (df[\"y\"] + (df[\"y\"] * noise)).clip(lower=df[\"y\"].min())\n", "df" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "Time the `pandas.groupby.apply` method:" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:11.953540Z", + "iopub.status.busy": "2021-10-08T20:48:11.953024Z", + "iopub.status.idle": "2021-10-08T20:48:15.357312Z", + "shell.execute_reply": "2021-10-08T20:48:15.356680Z", + "shell.execute_reply.started": "2021-10-08T20:48:11.953508Z" + } + }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "CPU times: user 2.57 s, sys: 10.3 ms, total: 2.58 s\nWall time: 2.58 s\n" + "CPU times: user 3.38 s, sys: 14.6 ms, total: 3.39 s\n", + "Wall time: 3.4 s\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "STORE SKU\n", - "0 0 8.384360\n", - " 1 7.071648\n", - " 2 14.677462\n", - " 3 13.391239\n", - " 4 12.131033\n", + "0 0 10.968313\n", + " 1 5.465377\n", + " 2 2.546790\n", + " 3 4.274809\n", + " 4 8.443736\n", " ... \n", - "99 95 18.473114\n", - " 96 10.154608\n", - " 97 11.743513\n", - " 98 8.406069\n", - " 99 7.098808\n", + "99 95 6.832711\n", + " 96 4.262613\n", + " 97 11.533266\n", + " 98 14.450065\n", + " 99 2.820765\n", "Length: 10000, dtype: float64" ] }, + "execution_count": 10, "metadata": {}, - "execution_count": 10 + "output_type": "execute_result" } ], "source": [ "%%time\n", - "df.groupby(['STORE', 'SKU']).apply(lambda x: mean_squared_error(x[\"y\"], x[\"yhat\"]))" + "df.groupby([\"STORE\", \"SKU\"]).apply(lambda x: mean_squared_error(x[\"y\"], x[\"yhat\"]))" ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "Time it using `xskillscore`:" - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2021-10-08T20:48:15.358729Z", + "iopub.status.busy": "2021-10-08T20:48:15.358406Z", + "iopub.status.idle": "2021-10-08T20:48:15.387412Z", + "shell.execute_reply": "2021-10-08T20:48:15.386799Z", + "shell.execute_reply.started": "2021-10-08T20:48:15.358695Z" + } + }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "CPU times: user 21.8 ms, sys: 4.02 ms, total: 25.8 ms\nWall time: 24.4 ms\n" + "CPU times: user 20.2 ms, sys: 3.89 ms, total: 24.1 ms\n", + "Wall time: 22.2 ms\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [ "STORE SKU\n", - "0 0 8.384360\n", - " 1 7.071648\n", - " 2 14.677462\n", - " 3 13.391239\n", - " 4 12.131033\n", + "0 0 10.968313\n", + " 1 5.465377\n", + " 2 2.546790\n", + " 3 4.274809\n", + " 4 8.443736\n", " ... 
\n", - "99 95 18.473114\n", - " 96 10.154608\n", - " 97 11.743513\n", - " 98 8.406069\n", - " 99 7.098808\n", + "99 95 6.832711\n", + " 96 4.262613\n", + " 97 11.533266\n", + " 98 14.450065\n", + " 99 2.820765\n", "Length: 10000, dtype: float64" ] }, + "execution_count": 11, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } ], "source": [ @@ -490,11 +1704,11 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, "source": [ "See [xskillscore-tutorial](https://github.com/raybellwaves/xskillscore-tutorial) for further reading." - ], - "cell_type": "markdown", - "metadata": {} + ] }, { "cell_type": "code", @@ -503,5 +1717,29 @@ "outputs": [], "source": [] } - ] + ], + "metadata": { + "interpreter": { + "hash": "e5607b67897ceeb4cb8d1a6f5e8f77cf995244d75ab9ff3b133e23bb37c07f75" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From b52cc3f66c4b74fdacee54ee2d9a7541dfd6263c Mon Sep 17 00:00:00 2001 From: Ray Bell Date: Fri, 8 Oct 2021 16:59:04 -0400 Subject: [PATCH 2/2] add PR num --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3f81d623..c1817bc9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ xskillscore v0.0.24 (2021-10-08) -------------------------------- Documentation ~~~~~~~~~~~~~ -- Replaced Boston house data in ``tabular.ipynb`` (:pr:`XX`) `Ray Bell`_. +- Replaced Boston house data in ``tabular.ipynb`` (:pr:`352`) `Ray Bell`_. xskillscore v0.0.23 (2021-08-09)