# Data Manipulation and Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Statistical Analysis
import statsmodels.api as sm
import scipy.stats as stats

# Increase font size of all Seaborn plot elements
sns.set(font_scale=1.25)

# Set Seaborn theme
sns.set_theme(style="whitegrid", palette="colorblind")
Data Preprocessing
Data preprocessing refers to the manipulation, filtering, or augmentation of data before it is analyzed, and is often an important step in the data analysis process.
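As a small illustration (the toy DataFrame below is hypothetical, not the dataset used later), each of these operations can be expressed in a line or two of pandas:

# Toy example of common preprocessing operations (hypothetical data)
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "country": ["A", "B", "B", "C"],
    "score": [7.2, np.nan, np.nan, 6.5],
})

# Filtering: drop exact duplicate rows
df = df.drop_duplicates()

# Manipulation: fill missing values with the column mean
df["score"] = df["score"].fillna(df["score"].mean())

# Augmentation: derive a new feature from an existing one
df["score_z"] = (df["score"] - df["score"].mean()) / df["score"].std()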
Datasets
Human Freedom Index
The Human Freedom Index is a report that attempts to summarize the idea of “freedom” through variables for many countries around the globe.
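The dtypes listing below is the output of hfi.dtypes; a minimal, hypothetical loading step (the file path and the year conversion are assumptions, not from the original code) would look like:

# Hypothetical loading step -- the file path is an assumption
hfi = pd.read_csv("data/hfi.csv")

# Parse year into the datetime64 dtype shown below (assumed conversion)
hfi["year"] = pd.to_datetime(hfi["year"], format="%Y")

hfi.dtypes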
year datetime64[ns]
ISO_code object
countries object
region object
pf_rol_procedural float64
...
hf_score float64
hf_rank float64
hf_quartile float64
mean_pf_score float64
median_pf_score float64
Length: 125, dtype: object
Removing duplicates
hfi.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Columns: 125 entries, year to median_pf_score
dtypes: datetime64[ns](1), float64(121), object(3)
memory usage: 1.4+ MB
hfi.drop_duplicates(inplace=True)
hfi.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458 entries, 0 to 1457
Columns: 125 entries, year to median_pf_score
dtypes: datetime64[ns](1), float64(121), object(3)
memory usage: 1.4+ MB
No duplicates! 😊
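A direct way to confirm this (a quick check, not in the original code) is to count duplicated rows before calling drop_duplicates():

# Count fully duplicated rows; 0 means drop_duplicates() has nothing to remove
hfi.duplicated().sum()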
Dimensional reduction
Dimension reduction is the transformation of data from a high-dimensional space into a low-dimensional space, so that the low-dimensional representation retains some meaningful properties of the original data, ideally close to its intrinsic dimension.
Principal component analysis (PCA) - Unsupervised
Maximizes variance in the dataset.
Finds orthogonal principal components.
Useful for feature extraction and data visualization.
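The plot below references principalDf and pca_variance_explained, which are not constructed in the code shown here; a minimal sketch of how they might be built from hfi (the mean-imputation step is an assumption):

# Sketch of the objects used in the next plot (assumed, not from the original code)
numerical_cols = hfi.select_dtypes(include=['int64', 'float64']).columns

# Impute missing values (assumed mean imputation) and standardize -- PCA is scale-sensitive
imputed = SimpleImputer(strategy='mean').fit_transform(hfi[numerical_cols])
scaled = StandardScaler().fit_transform(imputed)

# Keep the first two components for visualization
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(scaled)
principalDf = pd.DataFrame(principalComponents,
                           columns=['principal component 1', 'principal component 2'])
pca_variance_explained = pca.explained_variance_ratio_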
# Combining the scatterplot of principal components with the scree plot using the correct column names
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Scatterplot of Principal Components
axes[0].scatter(principalDf['principal component 1'], principalDf['principal component 2'])
for i in range(len(pca.components_)):
    axes[0].arrow(0, 0, pca.components_[i, 0], pca.components_[i, 1],
                  head_width=0.1, head_length=0.15, fc='r', ec='r', linewidth=2)
    axes[0].text(pca.components_[i, 0] * 1.2, pca.components_[i, 1] * 1.2,
                 f'Eigenvector {i+1}', color='r', fontsize=12)
axes[0].set_xlabel('Principal Component 1')
axes[0].set_ylabel('Principal Component 2')
axes[0].set_title('Scatterplot of Principal Components with Eigenvectors')
axes[0].grid()

# Scree Plot for PCA
axes[1].bar(range(1, len(pca_variance_explained) + 1), pca_variance_explained,
            alpha=0.6, color='g', label='Individual Explained Variance')
axes[1].set_ylabel('Explained variance ratio')
axes[1].set_xlabel('Principal components')
axes[1].set_title('Scree Plot for PCA')
axes[1].legend(loc='best')

plt.tight_layout()
plt.show()
So, that’s it?
…Not really
Find the optimal number of components.
# Assuming hfi DataFrame is already defined and loaded
# Select numerical columns
numerical_cols = hfi.select_dtypes(include=['int64', 'float64']).columns

# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(hfi[numerical_cols])

# Apply PCA
pca = PCA().fit(scaled_data)

# Get explained variance ratio and cumulative explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()

# Decide number of components to retain 75% variance
threshold = 0.75
num_components = next(i for i, cumulative_var in enumerate(cumulative_explained_variance) if cumulative_var >= threshold) + 1

# Plot the explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o', linestyle='--')
plt.axhline(y=threshold, color='r', linestyle='-')
plt.axvline(x=num_components, color='r', linestyle='-')
plt.annotate(f'{num_components} components', xy=(num_components, threshold),
             xytext=(num_components + 5, threshold - 0.05),
             arrowprops=dict(color='r', arrowstyle='->'), fontsize=12, color='r')
plt.title('Cumulative Explained Variance by Principal Components')
plt.xlabel('Principal Component')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

print(f"Number of components to retain 75% variance: {num_components}")

# Apply PCA with the chosen number of components
pca = PCA(n_components=num_components)
reduced_data = pca.fit_transform(scaled_data)
Number of components to retain 75% variance: 19
Dimensional reduction: what now?
Feature Selection: Choose the most informative components.
Visualization: Graph the reduced dimensions to identify patterns (see the sketch after this list).
Clustering: Group similar data points using clustering algorithms.
Classification: Predict categories using classifiers on reduced features.
Model Evaluation: Assess model performance with metrics like accuracy.
Cross-Validation: Validate model stability with cross-validation.
Hyperparameter Tuning: Optimize model settings for better performance.
Model Interpretation: Understand feature influence in the models.
Ensemble Methods: Improve predictions by combining multiple models.
Deployment: Deploy the model for real-world predictions.
Iterative Refinement: Refine analysis based on initial results.
Reporting: Summarize findings for stakeholders.
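As a brief example of the visualization step (the choice of region as the color variable is illustrative, not from the original analysis), the retained components in reduced_data can be plotted directly:

# Plot the first two retained components, colored by region (illustrative choice)
plot_df = pd.DataFrame(reduced_data[:, :2], columns=['PC1', 'PC2'])
plot_df['region'] = hfi['region'].values

plt.figure(figsize=(8, 6))
sns.scatterplot(data=plot_df, x='PC1', y='PC2', hue='region', alpha=0.7)
plt.title('Observations in the Reduced Space')
plt.show()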
Clustering
Setup
# Data Handling and Manipulation
import pandas as pd
import numpy as np

# Data Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture

# Clustering Models
from sklearn.cluster import KMeans

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set the default style for visualization
sns.set_theme(style="white", palette="colorblind")

# Increase font size of all Seaborn plot elements
sns.set(font_scale=1.25)
The goal of K-Means is to minimize the variance within each cluster. The variance is measured as the sum of squared distances between each point and its corresponding cluster centroid. The objective function, which K-Means aims to minimize, can be defined as

\[ J = \sum_{i=1}^{k} \sum_{x \in C_i} ||x - \mu_i||^2 \]

where \(k\) is the number of clusters and:
\(C_i\) is the set of points belonging to cluster \(i\).
\(x\) is a point in cluster \(C_i\).
\(||x - \mu_i||^2\) is the squared Euclidean distance between a point \(x\) and the centroid \(\mu_i\), which measures the dissimilarity between them.
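As a quick sanity check of this definition (a standalone sketch on synthetic data, not part of the original analysis), scikit-learn exposes the value of this objective for a fitted model as KMeans.inertia_, and it matches the sum of squared distances computed by hand:

# Verify that KMeans.inertia_ equals the objective J above (synthetic data)
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))

kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

# Sum of squared distances from each point to its assigned centroid
J = sum(np.sum((X[kmeans.labels_ == i] - center) ** 2)
        for i, center in enumerate(kmeans.cluster_centers_))

print(np.isclose(J, kmeans.inertia_))  # True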
# Finding the optimal number of clusters using Calinski-Harabasz Index
calinski_harabasz_scores = []
cluster_range = range(2, 11)  # Define the range for number of clusters

for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(data)
    labels = kmeans.labels_
    score = calinski_harabasz_score(data, labels)
    calinski_harabasz_scores.append(score)

# Plotting the Calinski-Harabasz scores
plt.plot(cluster_range, calinski_harabasz_scores, marker='o')
plt.title('Calinski-Harabasz Index for Different Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Calinski-Harabasz Index')
plt.grid(True)
plt.show()

# Finding the number of clusters that maximizes the Calinski-Harabasz Index
optimal_n_clusters = cluster_range[calinski_harabasz_scores.index(max(calinski_harabasz_scores))]
print(f"The optimal number of clusters is: {optimal_n_clusters}")
The optimal number of clusters is: 2
# K-Means Clustering with the optimal number of clusters
kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=0)
kmeans.fit(data)
clusters = kmeans.predict(data)

# Adding cluster labels to the DataFrame
data['Cluster'] = clusters

# Model Summary
print("Cluster Centers:\n", kmeans.cluster_centers_)

# Evaluate clustering performance using the Calinski-Harabasz Index
calinski_harabasz_score_final = calinski_harabasz_score(data.drop(columns='Cluster'), clusters)
print(f"For n_clusters = {optimal_n_clusters}, the Calinski-Harabasz Index is: {calinski_harabasz_score_final:.3f}")
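The setup cell also imports silhouette_score and davies_bouldin_score; a brief sketch (not part of the original output) of how the same clustering could be evaluated with these complementary internal metrics:

# Complementary internal validation metrics for the same clustering
features = data.drop(columns='Cluster')

sil = silhouette_score(features, clusters)      # higher is better, range [-1, 1]
dbi = davies_bouldin_score(features, clusters)  # lower is better, >= 0

print(f"Silhouette Score: {sil:.3f}")
print(f"Davies-Bouldin Index: {dbi:.3f}")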