### ROWS
# Row selection examples.
#   df["key1", "key2"]
# In this frame the row keys are simply the row numbers.
# .loc and .at are the optimized accessors.
#   df[0:3]
print("\n")

# Because the keys are the row numbers, positional and label lookups agree, like:
#   assert df.iloc[2] == df.loc[2]

# A single row comes back as a Series.
row_2 = df.iloc[2]
display(row_2)

# A slice of several rows comes back as a DataFrame.
display(df.iloc[2:5])
-- Unique (species, island) combinations present in the penguins table.
SELECT DISTINCT
    species,
    island
FROM penguins;
# Keep all rows but only the two columns of interest, then drop repeated
# (species, island) pairs.
df.loc[:, ["species", "island"]].drop_duplicates()
# Same thing with plain bracket selection:
# df[["species", "island"]].drop_duplicates()
# Grouping the frame by species gives a DataFrameGroupBy:
# conceptually one data frame per species.
species = df.groupby("species")
# species.apply(display)

# Picking a single column out of the grouped frame gives a SeriesGroupBy.
groupby_column = species["bill_length_mm"]
# groupby_column.apply(display)

# mean() on the SeriesGroupBy aggregates only that one series.
display(groupby_column.mean())

# mean() on the DataFrameGroupBy is applied to every column. The frame also
# holds text columns (e.g. island), and pandas >= 2.0 raises TypeError when
# asked to average those, so restrict the aggregation to numeric columns.
display(species.mean(numeric_only=True))
species = df.groupby("species")

# size() is the special-purpose call: it returns just the count for each species.
display(species.size())

# count() reports a count for every column, which is mostly redundant
# next to the single size() result.
display(species.count())
species
Adelie 152
Chinstrap 68
Gentoo 124
dtype: int64
# One group per species; after aggregating, the species name is the row key.
species = df.groupby("species")

# Average bill length and number of rows per species.
# The aggregations are named by string ("mean", "size") rather than passed as
# np.mean / np.size: handing NumPy callables to agg() is deprecated in recent
# pandas and emits a FutureWarning, while the string names give the same result.
aggregated = species.agg({"bill_length_mm": "mean", "species": "size"})

# Give the aggregated columns descriptive names.
renamed = aggregated.rename(
    columns={"bill_length_mm": "avg_bill_length", "species": "count"}
)

# Largest average bill length first.
renamed.sort_values("avg_bill_length", ascending=False)
# DataFrameGroupBy with one group for every unique (species, island) combination.
species_island = df.groupby(["species", "island"])
# aggregated = species_island.apply(display)

# Two aggregations on one column produce a DataFrame with multi-level columns:
# top level "bill_length_mm", second level "size" and "mean".
# String names replace np.size / np.mean because passing NumPy callables to
# agg() is deprecated in recent pandas; the resulting column labels are unchanged.
species_island_bill_length = species_island.agg(
    {"bill_length_mm": ["size", "mean"]}
)
display(species_island_bill_length)

# To reach an individual column, select the top-level name first.
display(species_island_bill_length["bill_length_mm"]["mean"])
# Or positionally: all rows, the mean column.
# species_island_bill_length.iloc[:, 1]
species island
Adelie Biscoe 38.975000
Dream 38.501786
Torgersen 38.950980
Chinstrap Dream 48.833824
Gentoo Biscoe 47.504878
Name: mean, dtype: float64