Foundations in Data Science and Machine Learning¶

Module 4: Data management with pandas¶

Malka Guillot¶

HSG Logo

What is (modern) pandas?¶

In [2]:
import pandas as pd

What is pandas?¶

  • Industry standard DataFrame library in Python
  • Covers all you need for data management
    • Loading datasets in many formats
    • Cleaning data
    • Generating variables
    • Reshaping datasets
  • Compatible with all plotting and statistics libraries

Modern pandas?¶

  • Pandas was created in 2008 and has some baggage
  • With version 3.0 many things will improve
  • Those features can already be enabled now:
    • More speed and less memory usage through better dtypes
    • Less confusion through copy-on-write
    • Better handling of missing values
    • Removal of the inplace argument

How to use modern pandas¶

  • Install version 2.1 or higher of pandas
  • Install version 13.0 or higher of pyarrow
  • Set some options after import
  • When loading datasets, use engine="pyarrow" if available
import pandas as pd
pd.options.mode.copy_on_write = True    # copy-on-write behaviour (planned default in pandas 3.0)
pd.options.future.infer_string = True   # infer Arrow-backed string dtype instead of object

What is a DataFrame?¶

  • Tabular data format, typically loaded from a file
  • Two mental models:
    1. Matrix/Array with labels
    2. Dictionary of columns
  • Variables are columns
  • Observations are rows
  • Can be manipulated in Python

DataFrame and Series¶

Creating a DataFrame¶

  • Data for a DataFrame can be nested lists or other array-like structures
  • Columns and index can be ints, strings and tuples
  • Powerful strategy: Whenever you learn pandas or debug problems, create tiny DataFrames and Series to gain better understanding
In [3]:
df = pd.DataFrame(
    data=[[1, "bla"], [3, "blubb"]],
    columns=["a", "b"],
    index=["c", "d"]
)
df
Out[3]:
a b
c 1 bla
d 3 blubb

Creating a DataFrame from a dictionary¶

In [4]:
df = pd.DataFrame(
    data={
        "a": [1, 3],
        "b": ["bla", "blubb"]
    },
    index=["c", "d"]
)
df
Out[4]:
a b
c 1 bla
d 3 blubb

What is a Series?¶

  • Each column of a DataFrame is a Series
  • Mental model: Vector with an index
  • All entries in a Series have the same dtype

Creating a Series¶

In [5]:
pd.Series(
    [3.0, 4.5], index=["x", "y"],
)
Out[5]:
x    3.0
y    4.5
dtype: float64

Assignment is index aligned!¶

  • New columns can be assigned with square brackets
  • Index is automatically aligned!
    • Makes many things safer!
    • Can make pandas slow
In [6]:
sr = pd.Series(
    [3.0, 4.5], index=["c", "d"],
)
df["new_col"] = sr
df
Out[6]:
a b new_col
c 1 bla 3.0
d 3 blubb 4.5

Data types¶

In [7]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/HSG-8276-Spring-2025/class-ressources/refs/heads/main/data/winemag-data-130k-v3.csv", index_col=0,
    engine="pyarrow"
)
df.head()
Out[7]:
Unnamed: 0 serial country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 0 Italy Aromas include tropical fruit, broom, brimston... Vulkà Bianco 87 NaN Sicily & Sardinia Etna None Kerin O’Keefe @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) White Blend Nicosia
1 1 1 Portugal This is ripe and fruity, a wine that is smooth... Avidagos 87 15.0 Douro None None Roger Voss @vossroger Quinta dos Avidagos 2011 Avidagos Red (Douro) Portuguese Red Quinta dos Avidagos
2 2 2 US Tart and snappy, the flavors of lime flesh and... None 87 14.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Rainstorm 2013 Pinot Gris (Willamette Valley) Pinot Gris Rainstorm
3 3 3 US Pineapple rind, lemon pith and orange blossom ... Reserve Late Harvest 87 13.0 Michigan Lake Michigan Shore None Alexander Peartree None St. Julian 2013 Reserve Late Harvest Riesling ... Riesling St. Julian
4 4 4 US Much like the regular bottling from 2012, this... Vintner's Reserve Wild Child Block 87 65.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Sweet Cheeks 2012 Vintner's Reserve Wild Child... Pinot Noir Sweet Cheeks

The need for different data types¶

  • Each column has a dtype
  • Enables efficient storage and fast computation
  • Dtypes are not always set optimally after loading data
In [8]:
df.dtypes
Out[8]:
Unnamed: 0                 int64
serial                     int64
country                   object
description               object
designation               object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object

Benefits of good type representation¶

  • Fast calculations in a low level language
  • Access to operations that are only relevant for some types
  • Memory efficiency
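A minimal sketch of the memory argument, using .memory_usage on a toy column (exact byte counts will differ on your machine):

import pandas as pd

raw = pd.Series(range(1000), dtype="int64")   # 8 bytes per value
compact = raw.astype(pd.UInt16Dtype())        # 2 bytes per value plus a validity mask

raw.memory_usage(deep=True)       # includes the index
compact.memory_usage(deep=True)   # noticeably smaller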

Converting to efficient dtypes¶

In [9]:
better_dtypes = {
 "country": pd.CategoricalDtype(),
 "points": pd.UInt8Dtype(),
 "price": pd.Float32Dtype(),
}

df = df.astype(better_dtypes)
df.dtypes
Out[9]:
Unnamed: 0                  int64
serial                      int64
country                  category
description                object
designation                object
points                      UInt8
price                     Float32
province                   object
region_1                   object
region_2                   object
taster_name                object
taster_twitter_handle      object
title                      object
variety                    object
winery                     object
dtype: object

Overview of numeric dtypes¶

Type Properties
pd.Int8Dtype() Byte (-128 to 127)
pd.Int16Dtype() Integer (-32768 to 32767)
pd.Int32Dtype() Integer (-2147483648 to 2147483647)
pd.Int64Dtype() Integer (-9223372036854775808 to 9223372036854775807)
pd.UInt8Dtype() Unsigned Integer (0 to 255)
pd.UInt16Dtype() Unsigned Integer (0 to 65535)
pd.UInt32Dtype() Unsigned Integer (0 to 4294967295)
pd.UInt64Dtype() Unsigned Integer (0 to 18446744073709551615)
pd.Float32Dtype() Float (single precision, up to ±3.4028235e+38)
pd.Float64Dtype() Float (double precision)
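These nullable dtypes can also hold missing values without falling back to float64. A small illustration:

import pandas as pd

sr = pd.Series([1, 2, pd.NA], dtype=pd.UInt8Dtype())
sr.dtype    # UInt8
sr.isna()   # False, False, True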

String vs. Categorical¶

  • pd.CategoricalDtype() is for data that takes on a limited number of values

    • Internally stored as integers
    • Very fast relabeling or reordering of categories
  • pd.StringDtype() is for actual text data

    • Internally stored as pyarrow array
    • Fast string functions similar to str in Python
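A small sketch of converting a toy column either way (the colors Series is made up for illustration):

import pandas as pd

colors = pd.Series(["red", "blue", "red", "red"])

colors.astype("category")        # few distinct values -> categorical
colors.astype(pd.StringDtype())  # free-form text -> dedicated string dtype (pass storage="pyarrow" for the Arrow-backed version)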

Working with Strings¶

  • The .str accessor provides access to the string methods
  • Vectorized and fast implementations!
In [10]:
sr = pd.Series(["Guido", "Tim", "Raymond"])
sr.str.lower()
Out[10]:
0      guido
1        tim
2    raymond
dtype: object
In [11]:
sr.str.replace("i", "iii")
Out[11]:
0    Guiiido
1      Tiiim
2    Raymond
dtype: object
  • Other examples:

    • sr.str.len
    • sr.str.contains
  • See this tutorial for more string methods
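For instance, with the same sr as above:

sr.str.len()          # 5, 3, 7
sr.str.contains("i")  # True, True, False ("Raymond" has no "i")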

Working with categoricals¶

  • Categories are defined independent of data
    • Protection against invalid categories
    • Good for visualization!
  • The .cat accessor provides access to the categorical methods
  • See this tutorial for more methods
In [12]:
cat_type = pd.CategoricalDtype(
    categories=["low", "middle", "high"],
    ordered=True,
)
sr = pd.Series(
    ["low", "high", "high"],
    dtype=cat_type,
)
sr
Out[12]:
0     low
1    high
2    high
dtype: category
Categories (3, object): ['low' < 'middle' < 'high']
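A small sketch of the .cat accessor, continuing with sr from above:

sr.cat.categories                          # Index(['low', 'middle', 'high'], dtype='object')
sr.cat.codes                               # 0, 2, 2 -- the integer codes behind the data
sr.cat.rename_categories(["L", "M", "H"])  # relabels without touching the codes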

Loading and saving data¶


Example: Loading a csv file¶

  • first argument is the file name (with relative or absolute path)
  • engine="pyarrow" ensures we are getting modern pandas dtypes
  • many other optional arguments
In [13]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/HSG-8276-Spring-2025/class-ressources/refs/heads/main/data/winemag-data-130k-v3.csv", index_col=0, 
    engine="pyarrow"
)
df.head()
Out[13]:
Unnamed: 0 serial country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 0 Italy Aromas include tropical fruit, broom, brimston... Vulkà Bianco 87 NaN Sicily & Sardinia Etna None Kerin O’Keefe @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) White Blend Nicosia
1 1 1 Portugal This is ripe and fruity, a wine that is smooth... Avidagos 87 15.0 Douro None None Roger Voss @vossroger Quinta dos Avidagos 2011 Avidagos Red (Douro) Portuguese Red Quinta dos Avidagos
2 2 2 US Tart and snappy, the flavors of lime flesh and... None 87 14.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Rainstorm 2013 Pinot Gris (Willamette Valley) Pinot Gris Rainstorm
3 3 3 US Pineapple rind, lemon pith and orange blossom ... Reserve Late Harvest 87 13.0 Michigan Lake Michigan Shore None Alexander Peartree None St. Julian 2013 Reserve Late Harvest Riesling ... Riesling St. Julian
4 4 4 US Much like the regular bottling from 2012, this... Vintner's Reserve Wild Child Block 87 65.0 Oregon Willamette Valley Willamette Valley Paul Gregutt @paulgwine Sweet Cheeks 2012 Vintner's Reserve Wild Child... Pinot Noir Sweet Cheeks

Other read functions¶

Reader Extension Comment
pd.read_csv .csv Often need to use optional arguments to make it work
pd.read_pickle .pkl Good for intermediate files; Python specific.
pd.read_feather .arrow Very modern and powerful file format.
pd.read_stata .dta Stata’s proprietary format. Avoid if you can.

File format recommendations¶

  • Use .pkl format for processed datasets that you do not share with others
    • Very fast to read and write
    • Preserves every aspect of your DataFrame (e.g. dtypes)
  • Use .parquet to save files you want to share with others
    • Can be read by many languages and programs
    • Efficient compression
  • Use .dta iff sharing with Stata users
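A minimal round-trip sketch under these recommendations (file names are placeholders; to_parquet needs pyarrow installed):

import pandas as pd

df = pd.DataFrame({"a": [1, 3], "b": ["bla", "blubb"]})

df.to_pickle("intermediate.pkl")         # Python-only, preserves dtypes exactly
df = pd.read_pickle("intermediate.pkl")

df.to_parquet("shared.parquet")          # shareable, compressed, language-agnostic
df = pd.read_parquet("shared.parquet")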

Setting and renaming columns and indices¶

Why the Index is important¶

  • We have seen that pandas aligns new columns in a DataFrame by index
  • Many other operations are aligned by index
  • Using a meaningful index makes this even safer
  • Index should be unique and not contain floats

Setting and resetting the index¶

  • set_index and reset_index are inverse functions
  • set_index can take any column or list of columns
  • Optional argument drop=True or drop=False determines what happens with the old index in set_index
In [14]:
df.index
Out[14]:
Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,
       ...
       129961, 129962, 129963, 129964, 129965, 129966, 129967, 129968, 129969,
       129970],
      dtype='int64', name='', length=129971)
In [15]:
df = df.set_index(["country", "winery"])
df.index
Out[15]:
MultiIndex([(   'Italy',                                  'Nicosia'),
            ('Portugal',                      'Quinta dos Avidagos'),
            (      'US',                                'Rainstorm'),
            (      'US',                               'St. Julian'),
            (      'US',                             'Sweet Cheeks'),
            (   'Spain',                                   'Tandem'),
            (   'Italy',                          'Terre di Giurfo'),
            (  'France',                                 'Trimbach'),
            ( 'Germany',                              'Heinz Eifel'),
            (  'France',                       'Jean-Baptiste Adam'),
            ...
            (   'Italy',                                      'COS'),
            (   'Italy',                                 'Cusumano'),
            (  'Israel',                                   'Dalton'),
            (  'France',                          'Domaine Ehrhart'),
            (  'France',                  'Domaine Rieflé-Landmann'),
            ( 'Germany', 'Dr. H. Thanisch (Erben Müller-Burggraef)'),
            (      'US',                                 'Citation'),
            (  'France',                          'Domaine Gresser'),
            (  'France',                     'Domaine Marcel Deiss'),
            (  'France',                         'Domaine Schoffit')],
           names=['country', 'winery'], length=129971)
In [16]:
df = df.reset_index()
df.index
Out[16]:
RangeIndex(start=0, stop=129971, step=1)

Renaming columns¶

  • Dict can contain only the subset of variables that is actually renamed
  • Renaming the index works the same way but is rarely needed
In [17]:
df.columns
Out[17]:
Index(['country', 'winery', 'Unnamed: 0', 'serial', 'description',
       'designation', 'points', 'price', 'province', 'region_1', 'region_2',
       'taster_name', 'taster_twitter_handle', 'title', 'variety'],
      dtype='object')
In [18]:
new_names = {
    "country": "country name",
    "taster_name": "taster name",
}
df = df.rename(columns=new_names)
df.columns
Out[18]:
Index(['country name', 'winery', 'Unnamed: 0', 'serial', 'description',
       'designation', 'points', 'price', 'province', 'region_1', 'region_2',
       'taster name', 'taster_twitter_handle', 'title', 'variety'],
      dtype='object')
  • Instead of a dict, you can provide a function that converts old names to new names!
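For example, a minimal sketch passing str.lower (df_small is made up for illustration; any callable mapping old names to new names works):

import pandas as pd

df_small = pd.DataFrame({"Price": [15.0], "Points": [87]})
df_small.rename(columns=str.lower).columns
# Index(['price', 'points'], dtype='object')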

Selecting rows and columns¶

Selecting columns¶

  • Use square brackets to select columns
In [19]:
df['country name'].head(6)
Out[19]:
0       Italy
1    Portugal
2          US
3          US
4          US
5       Spain
Name: country name, dtype: object
  • For multiple columns you need double brackets:
    • Outer: selecting columns
    • Inner: defining a list of variables
In [20]:
df[['country name', 'taster name']]
Out[20]:
country name taster name
0 Italy Kerin O’Keefe
1 Portugal Roger Voss
2 US Paul Gregutt
3 US Alexander Peartree
4 US Paul Gregutt
... ... ...
129966 Germany Anna Lee C. Iijima
129967 US Paul Gregutt
129968 France Roger Voss
129969 France Roger Voss
129970 France Roger Voss

129971 rows × 2 columns

Selecting individual rows¶

  • Selection of rows needs .loc[]
  • Selection is label based!
In [21]:
df.loc[1]
Out[21]:
country name                                                      Portugal
winery                                                 Quinta dos Avidagos
Unnamed: 0                                                               1
serial                                                                   1
description              This is ripe and fruity, a wine that is smooth...
designation                                                       Avidagos
points                                                                  87
price                                                                 15.0
province                                                             Douro
region_1                                                              None
region_2                                                              None
taster name                                                     Roger Voss
taster_twitter_handle                                           @vossroger
title                        Quinta dos Avidagos 2011 Avidagos Red (Douro)
variety                                                     Portuguese Red
Name: 1, dtype: object
  • For a MultiIndex you can specify some or all levels
In [22]:
df = df.set_index(["country name", "winery"])
df.loc["Italy"]
Out[22]:
Unnamed: 0 serial description designation points price province region_1 region_2 taster name taster_twitter_handle title variety
winery
Nicosia 0 0 Aromas include tropical fruit, broom, brimston... Vulkà Bianco 87 NaN Sicily & Sardinia Etna None Kerin O’Keefe @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) White Blend
Terre di Giurfo 6 6 Here's a bright, informal red that opens with ... Belsito 87 16.0 Sicily & Sardinia Vittoria None Kerin O’Keefe @kerinokeefe Terre di Giurfo 2013 Belsito Frappato (Vittoria) Frappato
Masseria Setteporte 13 13 This is dominated by oak and oak-driven aromas... Rosso 87 NaN Sicily & Sardinia Etna None Kerin O’Keefe @kerinokeefe Masseria Setteporte 2012 Rosso (Etna) Nerello Mascalese
Baglio di Pianetto 22 22 Delicate aromas recall white flower and citrus... Ficiligno 87 19.0 Sicily & Sardinia Sicilia None Kerin O’Keefe @kerinokeefe Baglio di Pianetto 2007 Ficiligno White (Sicilia) White Blend
Canicattì 24 24 Aromas of prune, blackcurrant, toast and oak c... Aynat 87 35.0 Sicily & Sardinia Sicilia None Kerin O’Keefe @kerinokeefe Canicattì 2009 Aynat Nero d'Avola (Sicilia) Nero d'Avola
... ... ... ... ... ... ... ... ... ... ... ... ... ...
Col Vetoraz Spumanti 129929 129929 This luminous sparkler has a sweet, fruit-forw... None 91 38.0 Veneto Prosecco Superiore di Cartizze None None None Col Vetoraz Spumanti NV Prosecco Superiore di... Prosecco
Baglio del Cristo di Campobello 129943 129943 A blend of Nero d'Avola and Syrah, this convey... Adènzia 90 29.0 Sicily & Sardinia Sicilia None Kerin O’Keefe @kerinokeefe Baglio del Cristo di Campobello 2012 Adènzia R... Red Blend
Feudo Principi di Butera 129947 129947 A blend of 65% Cabernet Sauvignon, 30% Merlot ... Symposio 90 20.0 Sicily & Sardinia Terre Siciliane None Kerin O’Keefe @kerinokeefe Feudo Principi di Butera 2012 Symposio Red (Te... Red Blend
COS 129961 129961 Intense aromas of wild cherry, baking spice, t... None 90 30.0 Sicily & Sardinia Sicilia None Kerin O’Keefe @kerinokeefe COS 2013 Frappato (Sicilia) Frappato
Cusumano 129962 129962 Blackberry, cassis, grilled herb and toasted a... Sàgana Tenuta San Giacomo 90 40.0 Sicily & Sardinia Sicilia None Kerin O’Keefe @kerinokeefe Cusumano 2012 Sàgana Tenuta San Giacomo Nero d... Nero d'Avola

19540 rows × 13 columns

Selecting rows and columns¶

  • Use .loc[rows, columns] to select rows and columns
  • Can use everything seen before
In [23]:
df.loc["Italy", "taster name"]
/var/folders/cg/tgk7cwd906x_71j8jdd3gzc00000gn/T/ipykernel_90395/898296892.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
  df.loc["Italy", "taster name"]
Out[23]:
winery
Nicosia                            Kerin O’Keefe
Terre di Giurfo                    Kerin O’Keefe
Masseria Setteporte                Kerin O’Keefe
Baglio di Pianetto                 Kerin O’Keefe
Canicattì                          Kerin O’Keefe
                                       ...      
Col Vetoraz Spumanti                        None
Baglio del Cristo di Campobello    Kerin O’Keefe
Feudo Principi di Butera           Kerin O’Keefe
COS                                Kerin O’Keefe
Cusumano                           Kerin O’Keefe
Name: taster name, Length: 19540, dtype: object
In [24]:
df.loc[["Italy", "Australia"], ["taster name", "points"]]
Out[24]:
taster name points
country name winery
Italy Nicosia Kerin O’Keefe 87
Terre di Giurfo Kerin O’Keefe 87
Masseria Setteporte Kerin O’Keefe 87
Baglio di Pianetto Kerin O’Keefe 87
Canicattì Kerin O’Keefe 87
... ... ... ...
Australia Kilikanoon None 89
Atze's Corner None 89
Moorooroo None 87
Mr. Riggs None 87
Henschke Joe Czerwinski 90

21869 rows × 2 columns

In [25]:
df = df.reset_index()

Selecting rows using Boolean Series¶

  • Comparisons of Series produce Boolean Series!
  • Complex conditions can be built with | and & (combined example below)
  • Boolean Series can be used for selecting rows
  • Works also inside .loc
In [26]:
df['points']>88
Out[26]:
0         False
1         False
2         False
3         False
4         False
          ...  
129966     True
129967     True
129968     True
129969     True
129970     True
Name: points, Length: 129971, dtype: bool

Just selecting a narrower set of columns for better display.

In [27]:
df = df[['country name', 'points', 'price', "taster name"]] 

Using the Boolean Series

In [28]:
df[df['points']>88]
Out[28]:
country name points price taster name
119 France 92 80.0 None
120 Italy 92 70.0 None
121 US 92 36.0 None
122 US 92 39.0 None
123 Australia 92 40.0 Joe Czerwinski
... ... ... ... ...
129966 Germany 90 28.0 Anna Lee C. Iijima
129967 US 90 75.0 Paul Gregutt
129968 France 90 30.0 Roger Voss
129969 France 90 32.0 Roger Voss
129970 France 90 21.0 Roger Voss

61271 rows × 4 columns
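A sketch combining two conditions on the df from above; each comparison needs its own parentheses because & and | bind more tightly than comparisons:

cheap_and_good = df[(df['points'] > 88) & (df['price'] < 20)]

# The same selection inside .loc, restricted to two columns
df.loc[(df['points'] > 88) & (df['price'] < 20), ['country name', 'price']]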

Inspecting and summarizing data¶

  • Real datasets don’t fit on a screen
  • Need quick ways to:
    • Look at subsets
    • Calculate summary statistics
    • Plot distributions
In [29]:
df
Out[29]:
country name points price taster name
0 Italy 87 NaN Kerin O’Keefe
1 Portugal 87 15.0 Roger Voss
2 US 87 14.0 Paul Gregutt
3 US 87 13.0 Alexander Peartree
4 US 87 65.0 Paul Gregutt
... ... ... ... ...
129966 Germany 90 28.0 Anna Lee C. Iijima
129967 US 90 75.0 Paul Gregutt
129968 France 90 30.0 Roger Voss
129969 France 90 32.0 Roger Voss
129970 France 90 21.0 Roger Voss

129971 rows × 4 columns

Summarize an entire DataFrame¶

  • .describe can summarize entire DataFrames
  • Result is again a DataFrame
  • Often a good idea to select a subset of columns
In [30]:
relevant_columns = ['points', 'price']
df[relevant_columns].describe()
Out[30]:
points price
count 129971.000000 120975.000000
mean 88.447138 35.363389
std 3.039730 41.022218
min 80.000000 4.000000
25% 86.000000 17.000000
50% 88.000000 25.000000
75% 91.000000 42.000000
max 100.000000 3300.000000

Calculate specific statistics¶

  • Standard summary statistics are implemented and named as expected:
    • std
    • min and max
    • median and quantile
  • Vectorized and really fast implementations
In [31]:
df['points'].mean()
Out[31]:
np.float64(88.44713820775404)
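The other statistics listed above work the same way, for example:

df['price'].median()        # 25.0, matching the describe() output above
df['price'].quantile(0.75)  # 42.0
df['points'].max()          # 100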

Grouping and aggregating¶

In [32]:
df.groupby('country name')['points'].mean()[:5]
Out[32]:
country name
Argentina                 86.710263
Armenia                   87.500000
Australia                 88.580507
Austria                   90.101345
Bosnia and Herzegovina    86.500000
Name: points, dtype: float64

Quick plotting: Series¶

  • Any Series has a .plot method
  • Any Series has a .hist method
  • Summary statistics based on groupby return Series which can again be plotted
In [33]:
pd.options.plotting.backend = "plotly"
import plotly.io as pio
pio.renderers.default = "notebook"  # For Jupyter compatibility
In [34]:
df.loc[df['country name'] == 'Bosnia and Herzegovina', 'country name'] = 'BH'
In [35]:
fig = df.groupby("country name")["points"].mean().plot(width=700, height=400) 
fig.update_layout(xaxis=dict(tickfont=dict(size=10)))  # Adjust the font size as needed
fig.show() 

Quick plotting: DataFrames¶

  • Any DataFrame has a .plot method
  • Defaults to a line plot; .plot.scatter and many others are available
  • The notebook gives you interactive plots
In [36]:
df.plot.scatter(x="points", y="price", width=700, height=400)

Statistics for categorical data¶

In [37]:
df['country name'].unique()[:3]
Out[37]:
array(['Italy', 'Portugal', 'US'], dtype=object)
In [38]:
df['country name'].value_counts().sort_index()[:3]
Out[38]:
country name
Argentina    3800
Armenia         2
Australia    2329
Name: count, dtype: int64

Creating new variables¶

Using numpy math functions¶

  • All functions you’ll ever need are implemented:
    • np.log
    • np.exp
    • np.sqrt
    • np.power
  • See docs for details
  • Index is preserved
  • Very fast, vectorized implementation
In [39]:
import numpy as np
df['log_price'] = np.log(df['price'])
df['log_price'].head()
Out[39]:
0         NaN
1    2.708050
2    2.639057
3    2.564949
4    4.174387
Name: log_price, dtype: float64

Arithmetic with Series¶

  • * , + , - , / , … work as expected
  • All calculations are aligned by index
  • Not all Series have to come from the same DataFrame or be assigned to a DataFrame
In [40]:
df['price_thousand'] = df['price'] / 1000
df['price_thousand'].head()
Out[40]:
0      NaN
1    0.015
2    0.014
3    0.013
4    0.065
Name: price_thousand, dtype: float64
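A minimal sketch of index alignment with two standalone Series: matching labels are combined, and labels present on only one side produce a missing value.

import pandas as pd

a = pd.Series([1.0, 2.0], index=["c", "d"])
b = pd.Series([10.0, 20.0], index=["d", "e"])

a + b   # c -> NaN, d -> 12.0, e -> NaN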

Recoding values¶

  • Can be useful to create new variables or fix typos in string variables
  • Not super fast, but faster than any looping approach
In [41]:
df['country code'] = df['country name'].replace(
    {
        "Italy": "IT",
        "France": "FR",
        "Spain": "ES",
        "USA": "US",
    }
) 
df[['country name', 'country code']].head()
Out[41]:
country name country code
0 Italy IT
1 Portugal Portugal
2 US US
3 US US
4 US US

Merging datasets¶

Why merge datasets?¶

  • Often when you download data, it comes in several files
  • While you might not like it, this is often because the data providers respected the normal forms!
  • Or it comes from very different sources
  • In this screencast we show you how to merge or concatenate DataFrames

Concatenating DataFrames vertically¶


result = pd.concat([df1, df2], axis=0) # Concatenating row-wise
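A runnable sketch with two toy frames (df1 and df2 are made up for illustration):

import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
df2 = pd.DataFrame({"a": [3, 4], "b": ["z", "w"]})

pd.concat([df1, df2], axis=0, ignore_index=True)  # stacks the rows, builds a fresh 0..3 index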

Concatenating DataFrames horizontally¶


result = pd.concat([df1, df3], axis=1) # Concatenating column-wise
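And column-wise, where rows are matched on the index (df3 is again a toy frame):

import pandas as pd

df1 = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
df3 = pd.DataFrame({"c": [0.1, 0.2]})

pd.concat([df1, df3], axis=1)  # the column of df3 appears next to those of df1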

Merge¶


pd.merge(left, right, how='outer', left_on="key", right_on="key", suffixes=('_left', '_right'), indicator=False)
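A runnable sketch of the call above with two toy frames (left and right are made up):

import pandas as pd

left = pd.DataFrame({"key": ["a", "b"], "val": [1, 2]})
right = pd.DataFrame({"key": ["b", "c"], "val": [3, 4]})

# how="outer" keeps keys from both sides; unmatched cells become missing,
# suffixes disambiguate the two "val" columns, indicator adds a "_merge" column
pd.merge(
    left, right,
    how="outer", left_on="key", right_on="key",
    suffixes=("_left", "_right"), indicator=True,
)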

Rules for data management¶

1. Never ever change source data¶

  • Source data: Original dataset as downloaded or collected
  • Commit the source data to git and never change it
  • All modified datasets should be stored under different names
  • Modified datasets should not be under version control!

2. Separate data management and analysis¶

  • Data management: Converting source data to formats your analysis programs need
  • Separate data management code from analysis code
  • Never modify the content of a variable outside the data management code!

3. Values have no internal structure¶

  • a.k.a. the first normal form
  • I.e., no need for parsing values before using them
  • E.g. store first names and last names separately
  • Not too often a problem in economic data
    • X-digit industrial or educational classifiers
    • Store each digit level you need in a separate variable

4. No redundant information in tables¶

  • a.k.a. the second normal form
  • In a panel structure: Store time-constant characteristics in a separate table
  • Violations make things much harder and error-prone:
    • Changes to data
    • Consistency checks
    • Selecting observations

5. No structure in variable names (and use meaningful names)¶

  • a.k.a. use long format if you can
  • There should not be different variables with similar content referring to different time periods etc.
  • If you need wide format for regressions, still do your data management in long format