import pandas as pdimport numpy as npimport seaborn as snsimport matplotlib.pyplot as pltfrom matplotlib.dates import DateFormatterimport matplotlib.dates as mdatesfrom janitor import clean_names# Set the theme for seabornsns.set_theme(style="white", palette="colorblind")# Set figure parametersplt.rcParams['figure.figsize'] = [8, 8*0.618]plt.rcParams['figure.autolayout'] =True
Working with dates
Air Quality Index
The AQI is the Environmental Protection Agency’s index for reporting air quality
Higher values of AQI indicate worse air quality
AQI levels
The previous graphic in tabular form, to be used later…
"." likely indicates NA, and it’s causing the entire column to be read in as characters
Rewind, and start over
# Reload data with correct NA valuestuc_2022 = pd.read_csv("https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/tucson/ad_aqi_tracker_data-2022.csv", na_values=[".", ""])# Clean and transform data againtuc_2022 = clean_names(tuc_2022).rename(columns={'_aqi_value': 'aqi_value'})tuc_2022['date'] = pd.to_datetime(tuc_2022['date'], format='%m/%d/%Y')# Check the structure of the dataprint(tuc_2022.info())
sns.lineplot(data=tuc_2022, x='date', y='cumsum_good_aqi')plt.gca().xaxis.set_major_formatter(DateFormatter("%b %Y"))plt.xlabel(None)plt.ylabel("Number of days")plt.title("Cumulative number of good AQI days (AQI < 50)\nTucson, AZ")plt.figtext(0.5, -0.1, 'Source: EPA Daily Air Quality Tracker', ha='center', size=10)plt.show()
Detrending
Detrending
Detrending is removing prominent long-term trend in time series to specifically highlight any notable deviations
Let’s demonstrate using multiple years of AQI data
Multiple years of Tucson, AZ data
Reading multiple files
# Define the list of URLstuc_files = ["https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/tucson/ad_aqi_tracker_data-2022.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/tucson/ad_aqi_tracker_data-2021.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/tucson/ad_aqi_tracker_data-2020.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/tucson/ad_aqi_tracker_data-2019.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/tucson/ad_aqi_tracker_data-2018.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/tucson/ad_aqi_tracker_data-2017.csv"]# Initialize an empty dataframetuc = pd.DataFrame()# Read and concatenate all data filesforfilein tuc_files: data = pd.read_csv(file, na_values=[".", ""]) tuc = pd.concat([tuc, data], ignore_index=True)# Clean column names using the clean_names function from the skimpy packagetuc = clean_names(tuc).rename(columns={'_aqi_value': 'aqi_value'})# Clean and transform datatuc['date'] = pd.to_datetime(tuc['date'], format='%m/%d/%Y')tuc = tuc.dropna(subset=['aqi_value'])tuc['good_aqi'] = np.where(tuc['aqi_value'] <=50, 1, 0)tuc = tuc.sort_values('date')tuc['cumsum_good_aqi'] = tuc['good_aqi'].cumsum()# Convert date to ordinal for regressiontuc['date_ordinal'] = tuc['date'].apply(lambda x: x.toordinal())print(tuc.head())
from sklearn.linear_model import LinearRegression# Fit linear regression for the trend linemodel = LinearRegression()model.fit(tuc[['date_ordinal']], tuc['cumsum_good_aqi'])tuc['fitted'] = model.predict(tuc[['date_ordinal']])
sns.lineplot(data=tuc, x='date', y='cumsum_good_aqi', color ='black')sns.lineplot(data=tuc, x='date', y='fitted', color='pink', label='Trend Line')plt.gca().xaxis.set_major_formatter(DateFormatter("%Y"))plt.xlabel(None)plt.ylabel("Number of days")plt.title("Cumulative number of good AQI days (AQI < 50)\nTucson, AZ")plt.figtext(0.5, -0.1, 'Source: EPA Daily Air Quality Tracker', ha='center', size=10)plt.show()
Detrend
Step 1. Fit a simple linear regression
# Convert dates to ordinal for regressiontuc['date_ordinal'] = tuc['date'].apply(lambda x: x.toordinal())# Fit linear regressionmodel = LinearRegression()model.fit(tuc[['date_ordinal']], tuc['cumsum_good_aqi'])# Get fitted valuestuc['fitted'] = model.predict(tuc[['date_ordinal']])
Detrend
Step 2. Divide the observed value of cumsum_good_aqi by the respective value in the long-term trend (i.e., fitted)
plt.axhline(y=1, color='gray')sns.lineplot(data=tuc, x='date', y='ratio', color='black')plt.gca().xaxis.set_major_formatter(DateFormatter("%Y"))plt.ylim([0, 20])plt.xlabel(None)plt.ylabel("Number of days\n(detrended)")plt.title("Cumulative number of good AQI days (AQI < 50)\nTucson, AZ (2016-2022)")plt.figtext(0.5, -0.1, 'Source: EPA Daily Air Quality Tracker', ha='center', size=10)plt.show()
Air Quality in Tucson
barely anything interesting happening!
let’s look at data from somewhere with a bit more “interesting” air quality data…
Read in multiple years of SF data
Code
# Define the list of URLssf_files = ["https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/san-francisco/ad_aqi_tracker_data-2022.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/san-francisco/ad_aqi_tracker_data-2021.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/san-francisco/ad_aqi_tracker_data-2020.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/san-francisco/ad_aqi_tracker_data-2019.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/san-francisco/ad_aqi_tracker_data-2018.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/san-francisco/ad_aqi_tracker_data-2017.csv","https://raw.githubusercontent.com/Gchism94/GL-dataviz-lectures/main/slides/data/san-francisco/ad_aqi_tracker_data-2016.csv"]# Initialize an empty dataframesf = pd.DataFrame()# Read and concatenate all data filesforfilein sf_files: data = pd.read_csv(file, na_values=[".", ""]) sf = pd.concat([sf, data], ignore_index=True)# Clean column names using the clean_names function from the skimpy packagesf = clean_names(sf).rename(columns={'_aqi_value': 'aqi_value'})# Clean and transform datasf['date'] = pd.to_datetime(sf['date'], format='%m/%d/%Y')sf = sf.dropna(subset=['aqi_value'])sf['good_aqi'] = np.where(sf['aqi_value'] <=50, 1, 0)sf = sf.sort_values('date')sf['cumsum_good_aqi'] = sf['good_aqi'].cumsum()# Convert date to ordinal for regressionsf['date_ordinal'] = sf['date'].apply(lambda x: x.toordinal())print(sf.head())
# Fit linear regression for the trend linemodel = LinearRegression()model.fit(sf[['date_ordinal']], sf['cumsum_good_aqi'])sf['fitted'] = model.predict(sf[['date_ordinal']])
sns.lineplot(data=sf, x='date', y='cumsum_good_aqi', color ='black')sns.lineplot(data=sf, x='date', y='fitted', color='pink', label='Trend Line')plt.gca().xaxis.set_major_formatter(DateFormatter("%Y"))plt.xlabel(None)plt.ylabel("Number of days")plt.title("Cumulative number of good AQI days (AQI < 50)\nSan Francisco, CA")plt.figtext(0.5, -0.1, 'Source: EPA Daily Air Quality Tracker', ha='center', size=10)plt.show()
Detrend
Step 1. Fit a simple linear regression
# Convert dates to ordinal for regressionsf['date_ordinal'] = sf['date'].apply(lambda x: x.toordinal())# Fit linear regressionmodel = LinearRegression()model.fit(sf[['date_ordinal']], sf['cumsum_good_aqi'])# Get fitted valuessf['fitted'] = model.predict(sf[['date_ordinal']])
Detrend
Step 2. Divide the observed value of cumsum_good_aqi by the respective value in the long-term trend (i.e., fitted)
plt.axhline(y=1, color='gray')sns.lineplot(data=sf, x='date', y='ratio', color='black')plt.gca().xaxis.set_major_formatter(DateFormatter("%Y"))plt.xlabel(None)plt.ylabel("Number of days\n(detrended)")plt.title("Cumulative number of good AQI days (AQI < 50)\nSan Francisco, CA (2016-2022)")plt.figtext(0.5, -0.1, 'Source: EPA Daily Air Quality Tracker', ha='center', size=10)plt.show()
Detrending
In step 2 we fit a very simple model
Depending on the complexity you’re trying to capture you might choose to fit a much more complex model
You can also decompose the trend into multiple trends, e.g. monthly, long-term, seasonal, etc.
Highlighting
Data prep
from datetime import datetimesf['year'] = sf['date'].dt.yearsf['day_of_year'] = sf['date'].dt.dayofyear
sns.lineplot(data=sf, x='day_of_year', y='aqi_value', hue='year', palette='tab10', legend=False)plt.xlabel('Day of year')plt.ylabel('AQI value')plt.title('AQI levels in San Francisco (2016 - 2022)')plt.show()
Highlight specific year (2016)
Code
# Highlight the year 2016sns.lineplot(data=sf, x='day_of_year', y='aqi_value', color='gray')sns.lineplot(data=sf[sf['year'] ==2016], x='day_of_year', y='aqi_value', color='red')plt.xlabel('Day of year')plt.ylabel('AQI value')plt.title('AQI levels in SF in 2016\nVersus all years 2016 - 2022')plt.show()
Highlight specific year (2017)
Code
# Highlight the year 2017sns.lineplot(data=sf, x='day_of_year', y='aqi_value', color='gray')sns.lineplot(data=sf[sf['year'] ==2017], x='day_of_year', y='aqi_value', color='red')plt.xlabel('Day of year')plt.ylabel('AQI value')plt.title('AQI levels in SF in 2017\nVersus all years 2016 - 2022')plt.show()
Highlight specific year (2018)
Code
# Highlight the year 2018sns.lineplot(data=sf, x='day_of_year', y='aqi_value', color='gray')sns.lineplot(data=sf[sf['year'] ==2018], x='day_of_year', y='aqi_value', color='red')plt.xlabel('Day of year')plt.ylabel('AQI value')plt.title('AQI levels in SF in 2018\nVersus all years 2016 - 2022')plt.show()
Highlight any year
Code
# Function to highlight a specific yeardef highlight_year(year_to_highlight): sns.lineplot(data=sf, x='day_of_year', y='aqi_value', color='gray') sns.lineplot(data=sf[sf['year'] == year_to_highlight], x='day_of_year', y='aqi_value', color='red') plt.xlabel('Day of year') plt.ylabel('AQI value') plt.title(f'AQI levels in SF in {year_to_highlight}\nVersus all years 2016 - 2022') plt.show()# Highlight any yearhighlight_year(2018)