[1]:
import pandas as pd
# Cap how many rows pandas displays so the example outputs below are
# truncated (shown with "..." rows) instead of printing all 32 rows.
pd.set_option("display.max_rows", 5)

Mutate

[2]:
from siuba import _, group_by, mutate, select
from siuba.data import mtcars

# Three-column subset of mtcars used by every example below.
small_cars = mtcars[["mpg", "cyl", "hp"]]

Assign new column

[3]:
# Add one new column: cyl2 is the existing cyl column multiplied by two.
# The keyword name becomes the column name; `_` stands for the data frame.
mutate(small_cars, cyl2 = _.cyl * 2)
[3]:
mpg cyl hp cyl2
0 21.0 6 110 12
1 21.0 6 110 12
... ... ... ... ...
30 15.0 8 335 16
31 21.4 4 109 8

32 rows × 4 columns

[4]:
# Several columns can be created in one call, and a later argument may refer
# to a column created by an earlier one: cyl4 is computed from the new cyl2.
mutate(small_cars, cyl2 = _.cyl * 2, cyl4 = _.cyl2 * 2)
[4]:
mpg cyl hp cyl2 cyl4
0 21.0 6 110 12 24
1 21.0 6 110 12 24
... ... ... ... ... ...
30 15.0 8 335 16 32
31 21.4 4 109 8 16

32 rows × 5 columns

[5]:
# A plain value (not a `_` expression) is repeated on every row.
mutate(small_cars, misc = "hey")
[5]:
mpg cyl hp misc
0 21.0 6 110 hey
1 21.0 6 110 hey
... ... ... ... ...
30 15.0 8 335 hey
31 21.4 4 109 hey

32 rows × 4 columns

Used with group_by

[6]:
# On grouped data, expressions are evaluated per group: hp_mean is the mean
# hp of each car's cyl group (not of the whole table), and demeaned_hp is the
# car's hp minus that group mean. The result is still a grouped data frame.
(small_cars
  >> group_by(_.cyl)
  >> mutate(
       hp_mean = _.hp.mean(),
       demeaned_hp = _.hp - _.hp_mean
     )
  )
[6]:

(grouped data frame)

mpg cyl hp hp_mean demeaned_hp
0 21.0 6 110 122.285714 -12.285714
1 21.0 6 110 122.285714 -12.285714
... ... ... ... ... ...
30 15.0 8 335 209.214286 125.785714
31 21.4 4 109 82.636364 26.363636

32 rows × 5 columns

[7]:
# shift(1) is also applied within each cyl group, so diff compares a car's
# hp_per_cyl with that of the previous car in the same group. Rows with no
# earlier car in their group (such as row 0) get NaN.
(small_cars
  >> group_by(_.cyl)
  >> mutate(
       hp_per_cyl = _.hp / _.cyl,
       diff = _.hp_per_cyl - _.hp_per_cyl.shift(1)
     )
  )
[7]:

(grouped data frame)

mpg cyl hp hp_per_cyl diff
0 21.0 6 110 18.333333 NaN
1 21.0 6 110 18.333333 0.000
... ... ... ... ... ...
30 15.0 8 335 41.875000 8.875
31 21.4 4 109 27.250000 -1.000

32 rows × 5 columns

With if_else and case_when

TODO: add examples of using if_else and case_when inside mutate.

[ ]:

Edit this page on GitHub here. Interactive version: Binder badge