[1]:
import pandas as pd
# Cap rendered DataFrames at 20 rows so the nested-frame outputs stay short.
pd.options.display.max_rows = 20

Nest

[2]:
from siuba import _, nest, unnest, group_by
from gapminder import gapminder

Specifying column to exclude

[3]:
# Keep `country` as the key and fold every remaining column into a
# per-country DataFrame stored in the `data` column.
gap_country = gapminder >> nest(-_.country)
gap_country
[3]:
country data
0 Afghanistan continent year lifeExp pop gdpPer...
1 Albania continent year lifeExp pop gdpPer...
2 Algeria continent year lifeExp pop gdpPe...
3 Angola continent year lifeExp pop gdpPe...
4 Argentina continent year lifeExp pop gdpP...
... ... ...
137 Vietnam continent year lifeExp pop gdp...
138 West Bank and Gaza continent year lifeExp pop gdpP...
139 Yemen, Rep. continent year lifeExp pop gdp...
140 Zambia continent year lifeExp pop gdp...
141 Zimbabwe continent year lifeExp pop gdpP...

142 rows × 2 columns

[4]:
# Expanding the nested `data` column undoes nest(); only the row order may differ.
unnest(gap_country, key="data")
[4]:
country continent year lifeExp pop gdpPercap
0 Afghanistan Asia 1952 28.801 8425333 779.445314
1 Afghanistan Asia 1957 30.332 9240934 820.853030
2 Afghanistan Asia 1962 31.997 10267083 853.100710
3 Afghanistan Asia 1967 34.020 11537966 836.197138
4 Afghanistan Asia 1972 36.088 13079460 739.981106
... ... ... ... ... ... ...
1699 Zimbabwe Africa 1987 62.351 9216418 706.157306
1700 Zimbabwe Africa 1992 60.377 10704340 693.420786
1701 Zimbabwe Africa 1997 46.809 11404948 792.449960
1702 Zimbabwe Africa 2002 39.989 11926563 672.038623
1703 Zimbabwe Africa 2007 43.487 12311143 469.709298

1704 rows × 6 columns

Specifying column to include

[5]:
# specifying columns to nest directly
# Toy frame: two groups ('a' and 'b') with two values each.
df = pd.DataFrame(
    [("a", 1), ("a", 2), ("b", 3), ("b", 4)],
    columns=["group", "value"],
)

# Only `value` is nested; the unselected `group` column becomes the key.
nest(df, _.value)
[5]:
group data
0 a value 0 1 1 2
1 b value 2 3 3 4

Group by and nesting

[6]:
# Grouping first and then calling nest() with no selection keeps the grouping
# columns as keys and nests everything else. Same result as:
# gapminder >> nest(-_.country, -_.continent)

nest(group_by(gapminder, _.country, _.continent))
[6]:
country continent data
0 Afghanistan Asia year lifeExp pop gdpPercap 0 19...
1 Albania Europe year lifeExp pop gdpPercap 12 19...
2 Algeria Africa year lifeExp pop gdpPercap 24 1...
3 Angola Africa year lifeExp pop gdpPercap 36 1...
4 Argentina Americas year lifeExp pop gdpPercap 48 ...
... ... ... ...
137 Vietnam Asia year lifeExp pop gdpPercap 164...
138 West Bank and Gaza Asia year lifeExp pop gdpPercap 1656...
139 Yemen, Rep. Asia year lifeExp pop gdpPercap 166...
140 Zambia Africa year lifeExp pop gdpPercap 168...
141 Zimbabwe Africa year lifeExp pop gdpPercap 1692...

142 rows × 3 columns

Unnesting lists

For context, see this Stack Overflow post.

[7]:
from siuba import _, unnest, mutate

# Two example sentences, one per id; each row is (id, sentence).
sent = pd.DataFrame(
    [("1", "a b c d e"), ("2", "x y z")],
    columns=["id", "sentence"],
)

sent
[7]:
id sentence
0 1 a b c d e
1 2 x y z
[8]:
# Break each sentence on single spaces; `data` then holds a list of words per row.
split_sent = mutate(sent, data=_.sentence.str.split(" "))

split_sent
[8]:
id sentence data
0 1 a b c d e [a, b, c, d, e]
1 2 x y z [x, y, z]
[9]:
# With no key given, the `data` column of lists is expanded into one row per element.
unnest(split_sent)
[9]:
id sentence data
0 1 a b c d e a
1 1 a b c d e b
2 1 a b c d e c
3 1 a b c d e d
4 1 a b c d e e
5 2 x y z x
6 2 x y z y
7 2 x y z z

Edit this page on GitHub here. Interactive version: Binder badge