{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "nbsphinx": "hidden"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "pd.set_option(\"display.max_rows\", 5)\n",
    "\n",
    "import tabulate\n",
    "\n",
    "table = tabulate.tabulate([\n",
    "    [\"Column operations are pandas Series methods\", \"✅\", \"✅\", \"✅\"],\n",
    "    [\"Table verbs supports user defined functions\", \"✅\", \"✅\", \"✅\"],\n",
    "    [\"pipe syntax (`>>`) \", \"✅\", \"✅\", \"❌\"],\n",
    "    [\"concise, <b>lazy expressions</b> (`_.a + _.b`)\", \"✅\", \"✅\", \"❌\"],\n",
    "    [\"No more reset_index\", \"✅\", \"✅\", \"❌\"],\n",
    "    [\"<b>unified API</b> over (un)grouped data\", \"✅\", \"✅\", \"❌\"],\n",
    "    [\"generate fast grouped operations\", \"✅\", \"❌\", \"✅\"],\n",
    "    [\"<br>generate SQL queries</b> \", \"✅\", \"❌\", \"❌\"],\n",
    "    [\"Abstract syntax trees for <br>transforming operations<b>\", \"✅\", \"❌\", \"❌\"],\n",
    "    [\"handles nested data\", \"✅\", \"❌\", \"⚠️\"]\n",
    "\n",
    "\n",
    "], tablefmt = \"html\", headers = [\"siuba\", \"dplython\", \"pandas\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Key features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "jupyter": {
     "source_hidden": true
    },
    "nbsphinx": "hidden"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "<div class=\"output_area rendered_html docutils container\">\n",
      "  <table><thead><tr><th>                                                        </th><th>siuba  </th><th>dplython  </th><th>pandas  </th></tr></thead><tbody><tr><td>Column operations are pandas Series methods             </td><td>✅     </td><td>✅        </td><td>✅      </td></tr><tr><td>Table verbs supports user defined functions             </td><td>✅     </td><td>✅        </td><td>✅      </td></tr><tr><td>pipe syntax (`>>`)                                      </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td>concise, <b>lazy expressions</b> (`_.a + _.b`)          </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td>No more reset_index                                     </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td><b>unified API</b> over (un)grouped data                </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td>generate fast grouped operations                        </td><td>✅     </td><td>❌        </td><td>✅      </td></tr><tr><td><br>generate SQL queries</b>                            </td><td>✅     </td><td>❌        </td><td>❌      </td></tr><tr><td>Abstract syntax trees for <br>transforming operations<b></td><td>✅     </td><td>❌        </td><td>❌      </td></tr><tr><td>handles nested data                                     </td><td>✅     </td><td>❌        </td><td>⚠️       </td></tr></tbody></table>\n",
      "</div>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# this is a hidden cell\n",
    "\n",
    "print(\"\"\"\n",
    "<div class=\"output_area rendered_html docutils container\">\n",
    "  {table}\n",
    "</div>\n",
    "\"\"\".format(table = table.replace('\\n', \"\")))"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "raw_mimetype": "text/html"
   },
   "source": [
    "<div class=\"output_area rendered_html docutils container\">\n",
    "  <table><thead><tr><th>                                                        </th><th>siuba  </th><th>dplython  </th><th>pandas  </th></tr></thead><tbody><tr><td>Column operations are pandas Series methods             </td><td>✅     </td><td>✅        </td><td>✅      </td></tr><tr><td>Table verbs supports user defined functions             </td><td>✅     </td><td>✅        </td><td>✅      </td></tr><tr><td>pipe syntax (`>>`)                                      </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td>concise, <b>lazy expressions</b> (`_.a + _.b`)          </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td>No more reset_index                                     </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td><b>unified API</b> over (un)grouped data                </td><td>✅     </td><td>✅        </td><td>❌      </td></tr><tr><td>generate fast grouped operations                        </td><td>✅     </td><td>❌        </td><td>✅      </td></tr><tr><td><br>generate SQL queries</b>                            </td><td>✅     </td><td>❌        </td><td>❌      </td></tr><tr><td>Abstract syntax trees for <br>transforming operations<b></td><td>✅     </td><td>❌        </td><td>❌      </td></tr><tr><td>handles nested data                                     </td><td>✅     </td><td>❌        </td><td>⚠️       </td></tr></tbody></table>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Built on pandas\n",
    "\n",
    "pandas is everywhere in python data analysis. The siuba library builds on this incredible work by using pandas Series methods as its reference implementation. This means that you use the pandas methods you've already learned!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>g</th>\n",
       "      <th>x</th>\n",
       "      <th>avg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a</td>\n",
       "      <td>2</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>b</td>\n",
       "      <td>3</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   g  x  avg\n",
       "0  a  1  2.0\n",
       "1  a  2  2.0\n",
       "2  b  3  2.0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from siuba import _, mutate\n",
    "\n",
    "my_data = pd.DataFrame({\n",
    "    'g': ['a', 'a', 'b'],\n",
    "    'x': [1,2,3],\n",
    "})\n",
    "\n",
    "# pandas\n",
    "my_data.assign(avg = lambda d: d.x.mean())\n",
    "\n",
    "# siuba\n",
    "mutate(my_data, avg = _.x.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note how you can debug both pieces of code by running and inspecting `df.a.mean()`.\n",
    "\n",
    "While pandas is an incredibly powerful API, its syntax can get quite cumbersome."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>g</th>\n",
       "      <th>x</th>\n",
       "      <th>avg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a</td>\n",
       "      <td>2</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   g  x  avg\n",
       "0  a  1  2.0\n",
       "1  a  2  2.0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(my_data\n",
    "  .assign(avg = lambda d: d.x.mean())    # create new column\n",
    "  .loc[lambda d: d.x != 3]               # filter out some rows\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice how much of this code is writing the word lambda."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspired by dplython\n",
    "\n",
    "Like other ports of the popular R library, dplyr--such as dplython--siuba offers a simple, flexible way to work on tables of data."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pipe syntax\n",
    "\n",
    "The pipe syntax allows you to import table functions (verbs), rather than having 300+ methods on your DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>g</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>b</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   g  x  y\n",
       "1  a  2  3\n",
       "0  a  1  2\n",
       "2  b  3  4"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# actions can be imported individually\n",
    "from siuba import mutate, arrange\n",
    "\n",
    "# they can be combined using a pipe\n",
    "my_data >> mutate(y = _.x + 1) >> arrange(_.g, -_.x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lazy expressions\n",
    "\n",
    "Using lazy expressions saves you from repeating the name of your DataFrame over and over."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b  c\n",
       "0  1  2  4\n",
       "1  2  3  5\n",
       "2  3  4  6"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rather than repeat the name of your data, you can use lazy expressions ---\n",
    "my_data_frame = pd.DataFrame({'a': [1,2,3]})\n",
    "\n",
    "\n",
    "# bad\n",
    "my_data_frame[\"b\"] = my_data_frame[\"a\"] + 1\n",
    "my_data_frame[\"c\"] = my_data_frame[\"b\"] + 2\n",
    "\n",
    "# good\n",
    "my_data_frame >> mutate(b = _.a + 1, c = _.b + 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### No reset_index\n",
    "\n",
    "Notice how siuba mutate can take a DataFrame, and return a DataFrame.\n",
    "Moreover, it doesn't stick columns onto the index.\n",
    "This means you don't need to call `reset_index` all the time.\n",
    "\n",
    "A common place where `reset_index` is called is after a pandas grouped aggregation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hp</th>\n",
       "      <th>mpg</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>cyl</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>82.636364</td>\n",
       "      <td>26.663636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>122.285714</td>\n",
       "      <td>19.742857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>209.214286</td>\n",
       "      <td>15.100000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             hp        mpg\n",
       "cyl                       \n",
       "4     82.636364  26.663636\n",
       "6    122.285714  19.742857\n",
       "8    209.214286  15.100000"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from siuba.data import mtcars\n",
    "from siuba import summarize\n",
    "\n",
    "g_cyl = mtcars.groupby(\"cyl\")\n",
    "\n",
    "agg_res = g_cyl[[\"hp\", \"mpg\"]].agg(\"mean\")\n",
    "agg_res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cyl</th>\n",
       "      <th>hp</th>\n",
       "      <th>mpg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>82.636364</td>\n",
       "      <td>26.663636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6</td>\n",
       "      <td>122.285714</td>\n",
       "      <td>19.742857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>209.214286</td>\n",
       "      <td>15.100000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   cyl          hp        mpg\n",
       "0    4   82.636364  26.663636\n",
       "1    6  122.285714  19.742857\n",
       "2    8  209.214286  15.100000"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# bad\n",
    "agg_res.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cyl</th>\n",
       "      <th>hp</th>\n",
       "      <th>mpg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4</td>\n",
       "      <td>82.636364</td>\n",
       "      <td>26.663636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6</td>\n",
       "      <td>122.285714</td>\n",
       "      <td>19.742857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>209.214286</td>\n",
       "      <td>15.100000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   cyl          hp        mpg\n",
       "0    4   82.636364  26.663636\n",
       "1    6  122.285714  19.742857\n",
       "2    8  209.214286  15.100000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# good\n",
    "summarize(g_cyl, hp = _.hp.mean(), mpg = _.mpg.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unified (un)grouped API\n",
    "\n",
    "In **siuba** it doesn't matter whether your data is grouped or not."
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    },
    "nbsphinx": "hidden",
    "raw_mimetype": "text/html"
   },
   "source": [
    "<table class=\"table-align-left docutils align-default\">\n",
    "  <tr>\n",
    "    <th>grouped?</th>\n",
    "    <th>siuba</th>\n",
    "    <th>pandas</th>\n",
    "  </tr>\n",
    "\n",
    "  <tr>\n",
    "    <td>\n",
    "      <span>yes</span>\n",
    "    </td>\n",
    "    <td>\n",
    "      <pre lang=\"python\">\n",
    "mutate(mtcars,\n",
    "  res = _.hp - _.hp.mean()\n",
    ")</pre>\n",
    "    </td>\n",
    "    <td>\n",
    "      <pre lang=\"pyton\">\n",
    "mtcars.assign(\n",
    "  res = lambda d: d.hp - d.hp.mean()\n",
    ")</pre>\n",
    "    </td>\n",
    "  </tr>\n",
    "\n",
    "  <tr>\n",
    "    <td>\n",
    "      <span>no</span>\n",
    "    </td>\n",
    "    <td>\n",
    "      <pre lang=\"python\">\n",
    "mutate(g_cyl,\n",
    "  res = _.hp - _.hp.mean()\n",
    ")</pre>\n",
    "    </td>\n",
    "    <td>\n",
    "      <pre lang=\"pyton\">\n",
    "mtcars.assign(\n",
    "  res = mtcars.hp - g_cyl.hp.transform(\"mean\")\n",
    ")</pre>\n",
    "    </td>\n",
    "  </tr>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><p>(grouped data frame)</p><div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mpg</th>\n",
       "      <th>cyl</th>\n",
       "      <th>disp</th>\n",
       "      <th>hp</th>\n",
       "      <th>drat</th>\n",
       "      <th>wt</th>\n",
       "      <th>qsec</th>\n",
       "      <th>vs</th>\n",
       "      <th>am</th>\n",
       "      <th>gear</th>\n",
       "      <th>carb</th>\n",
       "      <th>demeaned</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>21.0</td>\n",
       "      <td>6</td>\n",
       "      <td>160.0</td>\n",
       "      <td>110</td>\n",
       "      <td>3.90</td>\n",
       "      <td>2.620</td>\n",
       "      <td>16.46</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>-12.285714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>21.0</td>\n",
       "      <td>6</td>\n",
       "      <td>160.0</td>\n",
       "      <td>110</td>\n",
       "      <td>3.90</td>\n",
       "      <td>2.875</td>\n",
       "      <td>17.02</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>-12.285714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>15.0</td>\n",
       "      <td>8</td>\n",
       "      <td>301.0</td>\n",
       "      <td>335</td>\n",
       "      <td>3.54</td>\n",
       "      <td>3.570</td>\n",
       "      <td>14.60</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>125.785714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>21.4</td>\n",
       "      <td>4</td>\n",
       "      <td>121.0</td>\n",
       "      <td>109</td>\n",
       "      <td>4.11</td>\n",
       "      <td>2.780</td>\n",
       "      <td>18.60</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>26.363636</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>32 rows × 12 columns</p>\n",
       "</div></div>"
      ],
      "text/plain": [
       "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1181aac88>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g_cyl = mtcars.groupby(\"cyl\")\n",
    "\n",
    "mtcars >> mutate(demeaned = _.hp - _.hp.mean())    # uses ungrouped mean\n",
    "g_cyl  >> mutate(demeaned = _.hp - _.hp.mean())    # uses grouped mean"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In **pandas** you have to change your code for grouped data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mpg</th>\n",
       "      <th>cyl</th>\n",
       "      <th>disp</th>\n",
       "      <th>hp</th>\n",
       "      <th>drat</th>\n",
       "      <th>wt</th>\n",
       "      <th>qsec</th>\n",
       "      <th>vs</th>\n",
       "      <th>am</th>\n",
       "      <th>gear</th>\n",
       "      <th>carb</th>\n",
       "      <th>demeaned</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>21.0</td>\n",
       "      <td>6</td>\n",
       "      <td>160.0</td>\n",
       "      <td>110</td>\n",
       "      <td>3.90</td>\n",
       "      <td>2.620</td>\n",
       "      <td>16.46</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>-12.285714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>21.0</td>\n",
       "      <td>6</td>\n",
       "      <td>160.0</td>\n",
       "      <td>110</td>\n",
       "      <td>3.90</td>\n",
       "      <td>2.875</td>\n",
       "      <td>17.02</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>-12.285714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>15.0</td>\n",
       "      <td>8</td>\n",
       "      <td>301.0</td>\n",
       "      <td>335</td>\n",
       "      <td>3.54</td>\n",
       "      <td>3.570</td>\n",
       "      <td>14.60</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>125.785714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>21.4</td>\n",
       "      <td>4</td>\n",
       "      <td>121.0</td>\n",
       "      <td>109</td>\n",
       "      <td>4.11</td>\n",
       "      <td>2.780</td>\n",
       "      <td>18.60</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>26.363636</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>32 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     mpg  cyl   disp   hp  drat     wt   qsec  vs  am  gear  carb    demeaned\n",
       "0   21.0    6  160.0  110  3.90  2.620  16.46   0   1     4     4  -12.285714\n",
       "1   21.0    6  160.0  110  3.90  2.875  17.02   0   1     4     4  -12.285714\n",
       "..   ...  ...    ...  ...   ...    ...    ...  ..  ..   ...   ...         ...\n",
       "30  15.0    8  301.0  335  3.54  3.570  14.60   0   1     5     8  125.785714\n",
       "31  21.4    4  121.0  109  4.11  2.780  18.60   1   1     4     2   26.363636\n",
       "\n",
       "[32 rows x 12 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g_cyl = mtcars.groupby(\"cyl\")\n",
    "\n",
    "# ungrouped vs grouped mean\n",
    "mtcars.assign(demeaned = lambda d: d.hp - d.hp.mean())\n",
    "mtcars.assign(demeaned = g_cyl.obj.hp - g_cyl.hp.transform(\"mean\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that `g_cyl` does not have an `assign` method, and requires passing what operation you want to do (`\"mean\"`) as a string to `.transform()`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": false
   },
   "source": [
    "## Going beyond"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fast grouped operations\n",
    "\n",
    "Consider some data (`students`) where 2,000 students have each completed 10 courses, and received a score on each course."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "nbsphinx": "hidden"
   },
   "outputs": [],
   "source": [
    "# fast grouped operations (pull from dev docs)\n",
    "# PLOT of timing\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "np.random.seed(123)\n",
    "students = pd.DataFrame({\n",
    "    'student_id': np.repeat(np.arange(2000), 10),\n",
    "    'course_id': np.random.randint(1, 20, 20000),\n",
    "    'score': np.random.randint(1, 100, 20000)\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><p>(grouped data frame)</p><div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>course_id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19998</th>\n",
       "      <td>1999</td>\n",
       "      <td>11</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19999</th>\n",
       "      <td>1999</td>\n",
       "      <td>17</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>20000 rows × 3 columns</p>\n",
       "</div></div>"
      ],
      "text/plain": [
       "<pandas.core.groupby.generic.DataFrameGroupBy object at 0x10f2152e8>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g_students = students.groupby('student_id')\n",
    "g_students"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Suppose that we want to get the courses each student scored lowest on.\n",
    "\n",
    "In pandas we could use some complex, but fast code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6.21 ms, sys: 1.9 ms, total: 8.11 ms\n",
      "Wall time: 8.67 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# pandas\n",
    "is_student_min = g_students.obj.score == g_students.score.transform('min')\n",
    "low_scores = students[is_student_min]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In siuba it is simpler, and comparable in speed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from siuba.experimental.pd_groups import fast_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.96 ms, sys: 1.5 ms, total: 11.5 ms\n",
      "Wall time: 18.5 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# siuba\n",
    "low_scores = fast_filter(g_students, _.score == _.score.min())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is because siuba's lazy expressions let it optimize grouped operations.\n",
    "\n",
    "However, **dplython is over 100x slower** in this case, because it uses the slower pandas `DataFrame.apply()` method under the hood."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up code for timing\n",
    "from dplython import X, DplyFrame, sift, group_by as dply_group_by\n",
    "\n",
    "g_students2 = DplyFrame(students) >> dply_group_by(X.student_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.67 s, sys: 26 ms, total: 1.7 s\n",
      "Wall time: 1.82 s\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>course_id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19987</th>\n",
       "      <td>1998</td>\n",
       "      <td>17</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19997</th>\n",
       "      <td>1999</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2117 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       student_id  course_id  score\n",
       "2               0          3     17\n",
       "10              1         10      1\n",
       "...           ...        ...    ...\n",
       "19987        1998         17     31\n",
       "19997        1999          3      1\n",
       "\n",
       "[2117 rows x 3 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "g_students2 >> sift(X.score == X.score.min())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### SQL queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SELECT cars.cyl, cars.mpg, cars.hp, cars.hp - avg(cars.hp) OVER (PARTITION BY cars.cyl) AS demeaned \n",
      "FROM cars\n"
     ]
    }
   ],
   "source": [
    "# generate SQL queries\n",
    "from siuba.data import cars_sql\n",
    "from siuba import group_by, mutate, show_query\n",
    "\n",
    "q = (cars_sql\n",
    "  >> group_by(\"cyl\")\n",
    "  >> mutate(demeaned = _.hp - _.hp.mean())\n",
    "  >> show_query()\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Abstract syntax trees"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is made possible because siuba represents lazy expressions with abstract syntax trees.\n",
    "Fast grouped operations and SQL queries are just the beginning. This allows people to produce a whole range of interesting tools!\n",
    "\n",
    "Siuba's lazy expressions consist of a Symbolic and Call class.\n",
    "\n",
    "Symbolic is used to quickly create lazy expressions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "█─+\n",
       "├─█─'__call__'\n",
       "│ └─█─.\n",
       "│   ├─█─.\n",
       "│   │ ├─_\n",
       "│   │ └─'a'\n",
       "│   └─'mean'\n",
       "└─█─[\n",
       "  ├─_\n",
       "  └─'b'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ASTs for transforming\n",
    "from siuba.siu import Symbolic, Call, strip_symbolic\n",
    "\n",
    "_ = Symbolic()\n",
    "\n",
    "sym = _.a.mean() + _[\"b\"]\n",
    "sym"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each black box in the printout above is a Call. Calls are the pieces that represent the underlying operations. They have methods to inspect and transform them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'a', 'b'}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "call = strip_symbolic(sym)\n",
    "\n",
    "# get columns names used in lazy expression\n",
    "call.op_vars(attr_calls = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Nested data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>tags</th>\n",
       "      <th>split_tags</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>a,b,c</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>a,b,c</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>d,e</td>\n",
       "      <td>e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3</td>\n",
       "      <td>f</td>\n",
       "      <td>f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    id   tags split_tags\n",
       "0    1  a,b,c          a\n",
       "1    1  a,b,c          b\n",
       "..  ..    ...        ...\n",
       "4    2    d,e          e\n",
       "5    3      f          f\n",
       "\n",
       "[6 rows x 3 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from siuba import _, mutate, unnest\n",
    "\n",
    "tagged = pd.DataFrame({\n",
    "    'id': [1,2,3],\n",
    "    'tags': ['a,b,c', 'd,e', 'f']\n",
    "})\n",
    "\n",
    "(tagged\n",
    "    >> mutate(split_tags = _.tags.str.split(','))\n",
    "    >> unnest(\"split_tags\")\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}