{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d02d522a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Year | \n",
" Status | \n",
" Life expectancy | \n",
" Adult Mortality | \n",
" infant deaths | \n",
" Alcohol | \n",
" percentage expenditure | \n",
" Hepatitis B | \n",
" Measles | \n",
" ... | \n",
" Polio | \n",
" Total expenditure | \n",
" Diphtheria | \n",
" HIV/AIDS | \n",
" GDP | \n",
" Population | \n",
" thinness 1-19 years | \n",
" thinness 5-9 years | \n",
" Income composition of resources | \n",
" Schooling | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Afghanistan | \n",
" 2015 | \n",
" Developing | \n",
" 65.0 | \n",
" 263.0 | \n",
" 62 | \n",
" 0.01 | \n",
" 71.279624 | \n",
" 65.0 | \n",
" 1154 | \n",
" ... | \n",
" 6.0 | \n",
" 8.16 | \n",
" 65.0 | \n",
" 0.1 | \n",
" 584.259210 | \n",
" 33736494.0 | \n",
" 17.2 | \n",
" 17.3 | \n",
" 0.479 | \n",
" 10.1 | \n",
"
\n",
" \n",
" 1 | \n",
" Afghanistan | \n",
" 2014 | \n",
" Developing | \n",
" 59.9 | \n",
" 271.0 | \n",
" 64 | \n",
" 0.01 | \n",
" 73.523582 | \n",
" 62.0 | \n",
" 492 | \n",
" ... | \n",
" 58.0 | \n",
" 8.18 | \n",
" 62.0 | \n",
" 0.1 | \n",
" 612.696514 | \n",
" 327582.0 | \n",
" 17.5 | \n",
" 17.5 | \n",
" 0.476 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 2 | \n",
" Afghanistan | \n",
" 2013 | \n",
" Developing | \n",
" 59.9 | \n",
" 268.0 | \n",
" 66 | \n",
" 0.01 | \n",
" 73.219243 | \n",
" 64.0 | \n",
" 430 | \n",
" ... | \n",
" 62.0 | \n",
" 8.13 | \n",
" 64.0 | \n",
" 0.1 | \n",
" 631.744976 | \n",
" 31731688.0 | \n",
" 17.7 | \n",
" 17.7 | \n",
" 0.470 | \n",
" 9.9 | \n",
"
\n",
" \n",
" 3 | \n",
" Afghanistan | \n",
" 2012 | \n",
" Developing | \n",
" 59.5 | \n",
" 272.0 | \n",
" 69 | \n",
" 0.01 | \n",
" 78.184215 | \n",
" 67.0 | \n",
" 2787 | \n",
" ... | \n",
" 67.0 | \n",
" 8.52 | \n",
" 67.0 | \n",
" 0.1 | \n",
" 669.959000 | \n",
" 3696958.0 | \n",
" 17.9 | \n",
" 18.0 | \n",
" 0.463 | \n",
" 9.8 | \n",
"
\n",
" \n",
" 4 | \n",
" Afghanistan | \n",
" 2011 | \n",
" Developing | \n",
" 59.2 | \n",
" 275.0 | \n",
" 71 | \n",
" 0.01 | \n",
" 7.097109 | \n",
" 68.0 | \n",
" 3013 | \n",
" ... | \n",
" 68.0 | \n",
" 7.87 | \n",
" 68.0 | \n",
" 0.1 | \n",
" 63.537231 | \n",
" 2978599.0 | \n",
" 18.2 | \n",
" 18.2 | \n",
" 0.454 | \n",
" 9.5 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 22 columns
\n",
"
"
],
"text/plain": [
" Country Year Status Life expectancy Adult Mortality \\\n",
"0 Afghanistan 2015 Developing 65.0 263.0 \n",
"1 Afghanistan 2014 Developing 59.9 271.0 \n",
"2 Afghanistan 2013 Developing 59.9 268.0 \n",
"3 Afghanistan 2012 Developing 59.5 272.0 \n",
"4 Afghanistan 2011 Developing 59.2 275.0 \n",
"\n",
" infant deaths Alcohol percentage expenditure Hepatitis B Measles ... \\\n",
"0 62 0.01 71.279624 65.0 1154 ... \n",
"1 64 0.01 73.523582 62.0 492 ... \n",
"2 66 0.01 73.219243 64.0 430 ... \n",
"3 69 0.01 78.184215 67.0 2787 ... \n",
"4 71 0.01 7.097109 68.0 3013 ... \n",
"\n",
" Polio Total expenditure Diphtheria HIV/AIDS GDP Population \\\n",
"0 6.0 8.16 65.0 0.1 584.259210 33736494.0 \n",
"1 58.0 8.18 62.0 0.1 612.696514 327582.0 \n",
"2 62.0 8.13 64.0 0.1 631.744976 31731688.0 \n",
"3 67.0 8.52 67.0 0.1 669.959000 3696958.0 \n",
"4 68.0 7.87 68.0 0.1 63.537231 2978599.0 \n",
"\n",
" thinness 1-19 years thinness 5-9 years \\\n",
"0 17.2 17.3 \n",
"1 17.5 17.5 \n",
"2 17.7 17.7 \n",
"3 17.9 18.0 \n",
"4 18.2 18.2 \n",
"\n",
" Income composition of resources Schooling \n",
"0 0.479 10.1 \n",
"1 0.476 10.0 \n",
"2 0.470 9.9 \n",
"3 0.463 9.8 \n",
"4 0.454 9.5 \n",
"\n",
"[5 rows x 22 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Life expectancy data from Kaggle: https://www.kaggle.com/kumarajarshi/life-expectancy-who\n",
"# Prepreprocessing of the data is based on code from: https://www.kaggle.com/manishsharma448/life-expectancy-linearreg\n",
"import pandas as pd\n",
"df=pd.read_csv(\"Life Expectancy Data.csv\")\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "39220b20",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Country 0\n",
"Year 0\n",
"Status 0\n",
"Life expectancy 0\n",
"Adult Mortality 0\n",
"infant deaths 0\n",
"Alcohol 0\n",
"percentage expenditure 0\n",
"Hepatitis B 0\n",
"Measles 0\n",
" BMI 0\n",
"under-five deaths 0\n",
"Polio 0\n",
"Total expenditure 0\n",
"Diphtheria 0\n",
" HIV/AIDS 0\n",
"GDP 0\n",
"Population 0\n",
" thinness 1-19 years 0\n",
" thinness 5-9 years 0\n",
"Income composition of resources 0\n",
"Schooling 0\n",
"dtype: int64"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Remove rows with missing values\n",
"df=df.dropna(axis=0)\n",
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ef3b3955",
"metadata": {},
"outputs": [],
"source": [
"# Select only a single year (2014) per country. Why?\n",
"df=df.loc[df['Year'] == 2014]\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5ec55bbc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Status | \n",
" Adult Mortality | \n",
" infant deaths | \n",
" Alcohol | \n",
" percentage expenditure | \n",
" Hepatitis B | \n",
" Measles | \n",
" BMI | \n",
" under-five deaths | \n",
" Polio | \n",
" Total expenditure | \n",
" Diphtheria | \n",
" HIV/AIDS | \n",
" GDP | \n",
" Population | \n",
" thinness 1-19 years | \n",
" thinness 5-9 years | \n",
" Income composition of resources | \n",
" Schooling | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Developing | \n",
" 271.0 | \n",
" 64 | \n",
" 0.01 | \n",
" 73.523582 | \n",
" 62.0 | \n",
" 492 | \n",
" 18.6 | \n",
" 86 | \n",
" 58.0 | \n",
" 8.18 | \n",
" 62.0 | \n",
" 0.1 | \n",
" 612.696514 | \n",
" 327582.0 | \n",
" 17.5 | \n",
" 17.5 | \n",
" 0.476 | \n",
" 10.0 | \n",
"
\n",
" \n",
" 17 | \n",
" Developing | \n",
" 8.0 | \n",
" 0 | \n",
" 4.51 | \n",
" 428.749067 | \n",
" 98.0 | \n",
" 0 | \n",
" 57.2 | \n",
" 1 | \n",
" 98.0 | \n",
" 5.88 | \n",
" 98.0 | \n",
" 0.1 | \n",
" 4575.763787 | \n",
" 288914.0 | \n",
" 1.2 | \n",
" 1.3 | \n",
" 0.761 | \n",
" 14.2 | \n",
"
\n",
" \n",
" 33 | \n",
" Developing | \n",
" 11.0 | \n",
" 21 | \n",
" 0.01 | \n",
" 54.237318 | \n",
" 95.0 | \n",
" 0 | \n",
" 58.4 | \n",
" 24 | \n",
" 95.0 | \n",
" 7.21 | \n",
" 95.0 | \n",
" 0.1 | \n",
" 547.851700 | \n",
" 39113313.0 | \n",
" 6.0 | \n",
" 5.8 | \n",
" 0.741 | \n",
" 14.4 | \n",
"
\n",
" \n",
" 49 | \n",
" Developing | \n",
" 348.0 | \n",
" 67 | \n",
" 8.33 | \n",
" 23.965612 | \n",
" 64.0 | \n",
" 11699 | \n",
" 22.7 | \n",
" 101 | \n",
" 68.0 | \n",
" 3.31 | \n",
" 64.0 | \n",
" 2.0 | \n",
" 479.312240 | \n",
" 2692466.0 | \n",
" 8.5 | \n",
" 8.3 | \n",
" 0.527 | \n",
" 11.4 | \n",
"
\n",
" \n",
" 81 | \n",
" Developing | \n",
" 118.0 | \n",
" 8 | \n",
" 7.93 | \n",
" 847.371746 | \n",
" 94.0 | \n",
" 1 | \n",
" 62.2 | \n",
" 9 | \n",
" 92.0 | \n",
" 4.79 | \n",
" 94.0 | \n",
" 0.1 | \n",
" 12245.256450 | \n",
" 42981515.0 | \n",
" 1.0 | \n",
" 0.9 | \n",
" 0.825 | \n",
" 17.3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Status Adult Mortality infant deaths Alcohol \\\n",
"1 Developing 271.0 64 0.01 \n",
"17 Developing 8.0 0 4.51 \n",
"33 Developing 11.0 21 0.01 \n",
"49 Developing 348.0 67 8.33 \n",
"81 Developing 118.0 8 7.93 \n",
"\n",
" percentage expenditure Hepatitis B Measles BMI under-five deaths \\\n",
"1 73.523582 62.0 492 18.6 86 \n",
"17 428.749067 98.0 0 57.2 1 \n",
"33 54.237318 95.0 0 58.4 24 \n",
"49 23.965612 64.0 11699 22.7 101 \n",
"81 847.371746 94.0 1 62.2 9 \n",
"\n",
" Polio Total expenditure Diphtheria HIV/AIDS GDP \\\n",
"1 58.0 8.18 62.0 0.1 612.696514 \n",
"17 98.0 5.88 98.0 0.1 4575.763787 \n",
"33 95.0 7.21 95.0 0.1 547.851700 \n",
"49 68.0 3.31 64.0 2.0 479.312240 \n",
"81 92.0 4.79 94.0 0.1 12245.256450 \n",
"\n",
" Population thinness 1-19 years thinness 5-9 years \\\n",
"1 327582.0 17.5 17.5 \n",
"17 288914.0 1.2 1.3 \n",
"33 39113313.0 6.0 5.8 \n",
"49 2692466.0 8.5 8.3 \n",
"81 42981515.0 1.0 0.9 \n",
"\n",
" Income composition of resources Schooling \n",
"1 0.476 10.0 \n",
"17 0.761 14.2 \n",
"33 0.741 14.4 \n",
"49 0.527 11.4 \n",
"81 0.825 17.3 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We will predict life expectancy based on the other attributes, without using the name of the country or the year\n",
"y=df['Life expectancy ']\n",
"X=df.drop([\"Life expectancy \",\"Country\",\"Year\"],axis=1)\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5a5301e2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Adult Mortality | \n",
" infant deaths | \n",
" Alcohol | \n",
" percentage expenditure | \n",
" Hepatitis B | \n",
" Measles | \n",
" BMI | \n",
" under-five deaths | \n",
" Polio | \n",
" Total expenditure | \n",
" Diphtheria | \n",
" HIV/AIDS | \n",
" GDP | \n",
" Population | \n",
" thinness 1-19 years | \n",
" thinness 5-9 years | \n",
" Income composition of resources | \n",
" Schooling | \n",
" Status_Developing | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 271.0 | \n",
" 64 | \n",
" 0.01 | \n",
" 73.523582 | \n",
" 62.0 | \n",
" 492 | \n",
" 18.6 | \n",
" 86 | \n",
" 58.0 | \n",
" 8.18 | \n",
" 62.0 | \n",
" 0.1 | \n",
" 612.696514 | \n",
" 327582.0 | \n",
" 17.5 | \n",
" 17.5 | \n",
" 0.476 | \n",
" 10.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 17 | \n",
" 8.0 | \n",
" 0 | \n",
" 4.51 | \n",
" 428.749067 | \n",
" 98.0 | \n",
" 0 | \n",
" 57.2 | \n",
" 1 | \n",
" 98.0 | \n",
" 5.88 | \n",
" 98.0 | \n",
" 0.1 | \n",
" 4575.763787 | \n",
" 288914.0 | \n",
" 1.2 | \n",
" 1.3 | \n",
" 0.761 | \n",
" 14.2 | \n",
" 1 | \n",
"
\n",
" \n",
" 33 | \n",
" 11.0 | \n",
" 21 | \n",
" 0.01 | \n",
" 54.237318 | \n",
" 95.0 | \n",
" 0 | \n",
" 58.4 | \n",
" 24 | \n",
" 95.0 | \n",
" 7.21 | \n",
" 95.0 | \n",
" 0.1 | \n",
" 547.851700 | \n",
" 39113313.0 | \n",
" 6.0 | \n",
" 5.8 | \n",
" 0.741 | \n",
" 14.4 | \n",
" 1 | \n",
"
\n",
" \n",
" 49 | \n",
" 348.0 | \n",
" 67 | \n",
" 8.33 | \n",
" 23.965612 | \n",
" 64.0 | \n",
" 11699 | \n",
" 22.7 | \n",
" 101 | \n",
" 68.0 | \n",
" 3.31 | \n",
" 64.0 | \n",
" 2.0 | \n",
" 479.312240 | \n",
" 2692466.0 | \n",
" 8.5 | \n",
" 8.3 | \n",
" 0.527 | \n",
" 11.4 | \n",
" 1 | \n",
"
\n",
" \n",
" 81 | \n",
" 118.0 | \n",
" 8 | \n",
" 7.93 | \n",
" 847.371746 | \n",
" 94.0 | \n",
" 1 | \n",
" 62.2 | \n",
" 9 | \n",
" 92.0 | \n",
" 4.79 | \n",
" 94.0 | \n",
" 0.1 | \n",
" 12245.256450 | \n",
" 42981515.0 | \n",
" 1.0 | \n",
" 0.9 | \n",
" 0.825 | \n",
" 17.3 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Adult Mortality infant deaths Alcohol percentage expenditure \\\n",
"1 271.0 64 0.01 73.523582 \n",
"17 8.0 0 4.51 428.749067 \n",
"33 11.0 21 0.01 54.237318 \n",
"49 348.0 67 8.33 23.965612 \n",
"81 118.0 8 7.93 847.371746 \n",
"\n",
" Hepatitis B Measles BMI under-five deaths Polio \\\n",
"1 62.0 492 18.6 86 58.0 \n",
"17 98.0 0 57.2 1 98.0 \n",
"33 95.0 0 58.4 24 95.0 \n",
"49 64.0 11699 22.7 101 68.0 \n",
"81 94.0 1 62.2 9 92.0 \n",
"\n",
" Total expenditure Diphtheria HIV/AIDS GDP Population \\\n",
"1 8.18 62.0 0.1 612.696514 327582.0 \n",
"17 5.88 98.0 0.1 4575.763787 288914.0 \n",
"33 7.21 95.0 0.1 547.851700 39113313.0 \n",
"49 3.31 64.0 2.0 479.312240 2692466.0 \n",
"81 4.79 94.0 0.1 12245.256450 42981515.0 \n",
"\n",
" thinness 1-19 years thinness 5-9 years \\\n",
"1 17.5 17.5 \n",
"17 1.2 1.3 \n",
"33 6.0 5.8 \n",
"49 8.5 8.3 \n",
"81 1.0 0.9 \n",
"\n",
" Income composition of resources Schooling Status_Developing \n",
"1 0.476 10.0 1 \n",
"17 0.761 14.2 1 \n",
"33 0.741 14.4 1 \n",
"49 0.527 11.4 1 \n",
"81 0.825 17.3 1 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use dummy coding to turn \"Status\" into a numeric column \"Status_Developing\"\n",
"X=pd.get_dummies(X,columns=[\"Status\"],drop_first=True)\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "67579b80",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of train set: 65\n",
"Size of test set: 66\n"
]
}
],
"source": [
"# Split into train and test set\n",
"from sklearn.model_selection import train_test_split\n",
"Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.5,random_state=1)\n",
"print(\"Size of train set:\", len(ytrain))\n",
"print(\"Size of test set:\",len(ytest))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a693c91e",
"metadata": {},
"outputs": [],
"source": [
"# Standardize features to be between 0 and 1\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"scaling=MinMaxScaler()\n",
"Xtrain=scaling.fit_transform(Xtrain)\n",
"Xtest=scaling.transform(Xtest)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5fd998d5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit least squares model\n",
"from sklearn.linear_model import LinearRegression\n",
"lsmodel=LinearRegression()\n",
"lsmodel.fit(Xtrain,ytrain)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "aeea000e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train error for least squares: 8.870248223284364\n",
"Test error for least squares: 41.05146734248825\n"
]
}
],
"source": [
"# Compute average residual sum of squares for least squares = average loss \n",
"# We see that the average loss on the test set is significantly higher than the average loss on the train set\n",
"import numpy as np\n",
"def RSS(predictions,yy):\n",
" return(np.sum(np.power(yy - predictions,2))/len(yy))\n",
"\n",
"lsRSStrain = RSS(lsmodel.predict(X=Xtrain), ytrain)\n",
"lsRSStest = RSS(lsmodel.predict(X=Xtest), ytest)\n",
"\n",
"print(\"Train error for least squares:\", lsRSStrain)\n",
"print(\"Test error for least squares:\", lsRSStest)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a6896756",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Lasso(alpha=1)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit Lasso with nonsense regularization parameter alpha=1\n",
"from sklearn.linear_model import Lasso\n",
"lassomodel = Lasso(alpha=1)\n",
"lassomodel.fit(Xtrain,ytrain)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "da3bcbbb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train error for Lasso: 28.89264019963598\n",
"Test error for Lasso: 26.65570335122436\n"
]
}
],
"source": [
"# Compute average train and test loss for Lasso\n",
"lassoRSStrain = RSS(lassomodel.predict(X=Xtrain),ytrain)\n",
"lassoRSStest = RSS(lassomodel.predict(X=Xtest), ytest)\n",
"\n",
"print(\"Train error for Lasso:\", lassoRSStrain)\n",
"print(\"Test error for Lasso:\", lassoRSStest)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe9439a1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}