Skip to content

Why Pandas

Data structures for data analysis

  • series
  • data frames
  • stack

Pandas always returns a copy (never mutates the OG)

Numpy vs Pandas

  • pandas is built with numpy arrays

https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html

https://pandas.pydata.org/docs/user_guide/10min.html

https://towardsdatascience.com/pandas-equivalent-of-10-useful-sql-queries-f79428e60bd9

import numpy as np
import pandas as pd
df = pd.read_csv("penguins.csv")

Viewing data

display(df.head())
display(df.tail(2))
display(df.describe())
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 FEMALE
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 FEMALE
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 MALE
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
count 342.000000 342.000000 342.000000 342.000000
mean 43.921930 17.151170 200.915205 4201.754386
std 5.459584 1.974793 14.061714 801.954536
min 32.100000 13.100000 172.000000 2700.000000
25% 39.225000 15.600000 190.000000 3550.000000
50% 44.450000 17.300000 197.000000 4050.000000
75% 48.500000 18.700000 213.000000 4750.000000
max 59.600000 21.500000 231.000000 6300.000000

Getting stats on data

# get a single column
# df.bill_length_mm
bill_length = df["bill_length_mm"]

# Series
type(bill_length)

bill_length.mean()
43.9219298245614

Get a single value: at

# `at` can only get one value, can't return a series
assert df.iat[0, 2] == df.at[0, "bill_length_mm"]

Selecting rows

### ROWS

# df["key1", "key2"]
# in this case, the keys are the row numbers

# loc and at are optimized
# df[0:3]

print("\n")

# like
# assert df.iloc[2] == df.loc[2]

# Rows are also series
row_2 = df.iloc[2]
display(row_2)

# multiple rows is a data fraame
display(df.iloc[2:5])
SELECT DISTINCT species, island
FROM penguins;
df[["species", "island"]].drop_duplicates()

# same thing, all rows, 2 columns
# df.loc[:, ["species", "island"]].drop_duplicates()
species island
0 Adelie Torgersen
20 Adelie Biscoe
30 Adelie Dream
152 Chinstrap Dream
220 Gentoo Biscoe

New column

SELECT *, bill_length_mm + bill_depth_mm AS beak_to_eye
FROM penguins;

this returns a new data frame, doesn't mutate the existing df

df.assign(beak_to_eye=df.bill_length_mm + df.bill_depth_mm).head()
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex beak_to_eye
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE 57.8
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE 56.9
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE 58.3
3 Adelie Torgersen NaN NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 FEMALE 56.0

WHERE (filtering)

boolean indexing

SELECT species, body_mass_g
FROM penguins
WHERE body_mass_g > 6000
  AND species = 'Gentoo'
  AND bill_length_mm IS NOT NULL;
# Boolean series
is_chonky = df.body_mass_g > 6000
is_gentoo = df["species"] == "Gentoo"
are_looong = df.bill_length_mm.notna()

all_columns = df[is_chonky & is_gentoo & are_looong]

display(all_columns)

# loc: 1st param: filter on rows
# 2nd param: column filter
df.loc[is_chonky & is_gentoo & has_length, ["species", "body_mass_g"]]
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
237 Gentoo Biscoe 49.2 15.2 221.0 6300.0 MALE
253 Gentoo Biscoe 59.6 17.0 230.0 6050.0 MALE
species body_mass_g
237 Gentoo 6300.0
253 Gentoo 6050.0

Get the row with the max

longest_bill_index = df.bill_length_mm.idxmax()

df.iloc[longest_bill_index]
species              Gentoo
island               Biscoe
bill_length_mm         59.6
bill_depth_mm          17.0
flipper_length_mm     230.0
body_mass_g          6050.0
sex                    MALE
Name: 253, dtype: object

Group by

SELECT AVG(bill_length_mm), COUNT(*)
FROM penguins
GROUP BY species
# DataFrameGroupBy
species = df.groupby("species")

# a data frame for each species
# species.apply(display)

# SeriesGroupBy
groupby_column = species["bill_length_mm"]
# groupby_column.apply(display)

# mean() on the SeriesGroupBy only does it that series
display(groupby_column.mean())

# mean() on the DataFrameGroupBy applies it to every column
display(species.mean())
species
Adelie       38.791391
Chinstrap    48.833824
Gentoo       47.504878
Name: bill_length_mm, dtype: float64
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
species
Adelie 38.791391 18.346358 189.953642 3700.662252
Chinstrap 48.833824 18.420588 195.823529 3733.088235
Gentoo 47.504878 14.982114 217.186992 5076.016260
SELECT COUNT(*)
FROM penguins
GROUP BY species
species = df.groupby("species")
# size(): special function that only returns the count of every species
display(species.size())
display(species.count())  # redundant information for every column
species
Adelie       152
Chinstrap     68
Gentoo       124
dtype: int64
island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
species
Adelie 152 151 151 151 151 146
Chinstrap 68 68 68 68 68 68
Gentoo 124 123 123 123 123 119
SELECT species, COUNT(*) AS 'count', AVG(bill_length_mm) AS avg_bill_length
FROM penguins
GROUP BY species
ORDER BY avg_bill_length DESC
species = df.groupby("species")

# DataFrame, key is the species name
aggregated = species.agg({"bill_length_mm": np.mean, "species": np.size})

renamed = aggregated.rename(columns={"bill_length_mm":"avg_bill_length", "species" : "count"})

renamed.sort_values("avg_bill_length", ascending=False)
avg_bill_length count
species
Chinstrap 48.833824 68
Gentoo 47.504878 124
Adelie 38.791391 152

Group by multiple

SELECT species, COUNT(*) AS 'count', AVG(bill_length_mm) AS avg_bill_length
FROM penguins
GROUP BY species, island
species_island = df.groupby(["species", "island"])

# DataFrameGroupBy
# data frame for every unique combo of (species, island)
# aggregated = species_island.apply(display)

# DataFrame with multi-level columns
species_island_bill_length = species_island.agg({"bill_length_mm": [np.size, np.mean]})

display(species_island_bill_length)

# to access the individual columns, you have to get the top name first
display(species_island_bill_length["bill_length_mm"]["mean"])

# or iloc, all rows, the mean column
# species_island_bill_length.iloc[:, 1]
bill_length_mm
size mean
species island
Adelie Biscoe 44 38.975000
Dream 56 38.501786
Torgersen 52 38.950980
Chinstrap Dream 68 48.833824
Gentoo Biscoe 124 47.504878
species    island
Adelie     Biscoe       38.975000
           Dream        38.501786
           Torgersen    38.950980
Chinstrap  Dream        48.833824
Gentoo     Biscoe       47.504878
Name: mean, dtype: float64

Stack

image

Move the multi-column into the index

display(species_island_bill_length)
species_island_bill_length.stack()
bill_length_mm
size mean
species island
Adelie Biscoe 44 38.975000
Dream 56 38.501786
Torgersen 52 38.950980
Chinstrap Dream 68 48.833824
Gentoo Biscoe 124 47.504878
bill_length_mm
species island
Adelie Biscoe size 44.000000
mean 38.975000
Dream size 56.000000
mean 38.501786
Torgersen size 52.000000
mean 38.950980
Chinstrap Dream size 68.000000
mean 48.833824
Gentoo Biscoe size 124.000000
mean 47.504878

HAVING

SELECT island
FROM penguins
GROUP BY island
HAVING COUNT(species) > 1
islands = df.groupby(["island"])
islands = islands.filter(lambda x: x["species"].drop_duplicates().count() > 1)

islands
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
20 Adelie Biscoe 37.8 18.3 174.0 3400.0 FEMALE
21 Adelie Biscoe 37.7 18.7 180.0 3600.0 MALE
22 Adelie Biscoe 35.9 19.2 189.0 3800.0 FEMALE
23 Adelie Biscoe 38.2 18.1 185.0 3950.0 MALE
24 Adelie Biscoe 38.8 17.2 180.0 3800.0 MALE
... ... ... ... ... ... ... ...
339 Gentoo Biscoe NaN NaN NaN NaN NaN
340 Gentoo Biscoe 46.8 14.3 215.0 4850.0 FEMALE
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 MALE
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 FEMALE
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 MALE

292 rows × 7 columns

SQL in Pandas

import sqlite3
conn = sqlite3.connect("penguins.db")
peaks = pd.read_sql("""
    SELECT MIN(body_mass_g)
    FROM penguins
    WHERE species = 'Gentoo'""",
conn)

peaks
MIN(body_mass_g)
0 3950

Django

import pandas as pd
import sqlalchemy as sql

engine = sql.create_engine(os.environ["DATABASE_URL"])

df = pd.read_sql_query(
        sql.text(
            """
        SELECT *
        FROM penguins
        """
        ),
        engine,
    )

Last update: 2022-11-04