[1]:

import pandas as pd
# Cap DataFrame reprs at 20 rows; longer frames are shown truncated.
pd.options.display.max_rows = 20

# Nest

[2]:

from siuba import _, nest, unnest, group_by
from gapminder import gapminder

## Specifying columns to exclude

[3]:

# Nest every column except `country` (note the unary minus): the result has
# one row per country, with that country's remaining columns held in `data`.
gap_country = gapminder >> nest(-_.country)
gap_country

[3]:

	country	data
0	Afghanistan	continent year lifeExp pop gdpPer...
1	Albania	continent year lifeExp pop gdpPer...
2	Algeria	continent year lifeExp pop gdpPe...
3	Angola	continent year lifeExp pop gdpPe...
4	Argentina	continent year lifeExp pop gdpP...
...	...	...
137	Vietnam	continent year lifeExp pop gdp...
138	West Bank and Gaza	continent year lifeExp pop gdpP...
139	Yemen, Rep.	continent year lifeExp pop gdp...
140	Zambia	continent year lifeExp pop gdp...
141	Zimbabwe	continent year lifeExp pop gdpP...

142 rows × 2 columns

[4]:

# unnest is the inverse of nest: it expands the frames stored in the "data"
# column back into ordinary rows and columns (except for some sorting!).
unnest(gap_country, "data")

[4]:

	country	continent	year	lifeExp	pop	gdpPercap
0	Afghanistan	Asia	1952	28.801	8425333	779.445314
1	Afghanistan	Asia	1957	30.332	9240934	820.853030
2	Afghanistan	Asia	1962	31.997	10267083	853.100710
3	Afghanistan	Asia	1967	34.020	11537966	836.197138
4	Afghanistan	Asia	1972	36.088	13079460	739.981106
...	...	...	...	...	...	...
1699	Zimbabwe	Africa	1987	62.351	9216418	706.157306
1700	Zimbabwe	Africa	1992	60.377	10704340	693.420786
1701	Zimbabwe	Africa	1997	46.809	11404948	792.449960
1702	Zimbabwe	Africa	2002	39.989	11926563	672.038623
1703	Zimbabwe	Africa	2007	43.487	12311143	469.709298

1704 rows × 6 columns

[5]:

# Name the columns to nest directly: only `value` goes into the nested
# frames, and `group` is left as the identifying column.
df = pd.DataFrame({'group': ['a', 'a', 'b', 'b'], 'value': [1, 2, 3, 4]})

nest(df, _.value)

[5]:

	group	data
0	a	value 0 1 1 2
1	b	value 2 3 3 4

[6]:

# Calling nest() with no column arguments on grouped data keeps the grouping
# columns (country, continent) and nests the rest. It is equivalent to
# gapminder >> nest(-_.country, -_.continent)

(gapminder
  >> group_by(_.country, _.continent)
  >> nest()
  )

[6]:

142 rows × 3 columns

[7]:

# `mutate` is the only new name here; `_` and `unnest` were already imported
# earlier in the notebook.
# NOTE(review): the re-import presumably lets this section run on its own — confirm.
from siuba import _, unnest, mutate

# Two space-separated sentences keyed by a string id; they are split into
# word lists and unnested in the following cells.
sent = pd.DataFrame({
    'id': ['1', '2'],
    'sentence': ['a b c d e', 'x y z']
})

sent

[7]:

	id	sentence
0	1	a b c d e
1	2	x y z

[8]:

# Split each sentence on single spaces; the new `data` column holds one
# Python list of words per row.
split_sent = sent >> mutate(data = _.sentence.str.split(" "))

split_sent

[8]:

	id	sentence	data
0	1	a b c d e	[a, b, c, d, e]
1	2	x y z	[x, y, z]

[9]:

# unnest() is called without naming a column.
# NOTE(review): presumably it defaults to the `data` column built from the
# split sentences, giving one row per word — confirm against the siuba docs.
split_sent >> unnest()

[9]:

Edit page on GitHub here. Interactive version: