import os
import pandas as pd
ML Training
Test a machine learning model
PROTIP - type a function name followed by empty parentheses, then press Shift+Tab with the cursor inside the parentheses to see its documentation
# Load the Titanic training CSV and preview five random rows.
data_dir = "./data/titanic"
df = pd.read_csv(os.path.join(data_dir, "train.csv"))
df.sample(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
301 | 302 | 1 | 3 | McCoy, Mr. Bernard | male | NaN | 2 | 0 | 367226 | 23.25 | NaN | Q |
179 | 180 | 0 | 3 | Leonard, Mr. Lionel | male | 36.0 | 0 | 0 | LINE | 0.00 | NaN | S |
865 | 866 | 1 | 2 | Bystrom, Mrs. (Karolina) | female | 42.0 | 0 | 0 | 236852 | 13.00 | NaN | S |
112 | 113 | 0 | 3 | Barton, Mr. David John | male | 22.0 | 0 | 0 | 324669 | 8.05 | NaN | S |
530 | 531 | 1 | 2 | Quick, Miss. Phyllis May | female | 2.0 | 1 | 1 | 26360 | 26.00 | NaN | S |
y = dependent variable (the value being predicted; here the Survived column)
x = independent variables (the remaining columns used as inputs)
# y is the Survived column; x is df without the Survived and PassengerId columns.
y = df["Survived"]
x = df.drop(["Survived", "PassengerId"], axis=1)
x.sample()
Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|
732 | 2 | Knight, Mr. Robert J | male | NaN | 0 | 0 | 239855 | 0.0 | NaN | S |
from sklearn.model_selection import train_test_split

# Order matters: the call returns the train part, then the test part, for
# each input in turn (x_train, x_test, y_train, y_test).
# test_size = fraction of the rows held out for the test split (0.1 = 10%).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
x_train.shape
(801, 10)
x_test.shape
(90, 10)
Decision Trees - nodes, branches, leaves
Decision trees are prone to overfitting. Overcome this by using random forests, which train multiple trees and combine their results.
First ML Model
Getting started
import os
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the training data and preview five random rows.
data_dir = "./data/titanic"
df = pd.read_csv(os.path.join(data_dir, "train.csv"))
df.sample(5)

# y is the Survived column; x is df without the Survived and PassengerId columns.
y = df["Survived"]
x = df.drop(["Survived", "PassengerId"], axis=1)

# Hold out 10% of the rows as the test split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
# info() prints column dtypes and non-null counts; describe() summarises the
# numeric columns. They are separate statements and need separate lines.
df.info()
df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Count number of null values per column
df.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# List all of the people's titles
def get_title(name: str) -> str:
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is taken as the text between the first comma and the first
    following period, with surrounding whitespace removed, so
    "Braund, Mr. Owen Harris" yields "Mr".

    Args:
        name: Full passenger name.

    Returns:
        The extracted title, or "Unknown" when the name contains no period
        or no comma.
    """
    if "." not in name:
        return "Unknown"
    segments = name.split(",")
    # A period without any comma would otherwise raise IndexError below.
    if len(segments) < 2:
        return "Unknown"
    return segments[1].split(".")[0].strip()
# Collect the distinct titles found in the Name column, sorted alphabetically.
titles = sorted(set(df["Name"].map(get_title)))
print("Different titles found in the dataset: ")
print(len(titles), ":", titles)
Different titles found in the dataset:
17 : ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess']
# Normalize the titles
def replace_titles(x):
    """Map a row's raw title onto a smaller set of title groups.

    Args:
        x: A mapping-like row (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) that has a "Title" entry.

    Returns:
        "Officer", "Royalty", "Mrs" or "Miss" for the titles listed below;
        any other title is returned unchanged.
    """
    title = x["Title"]
    if title in ("Capt", "Col", "Major"):
        return "Officer"
    if title in ("Jonkheer", "Don", "the Countess", "Dona", "Lady", "Sir"):
        return "Royalty"
    if title == "Mme":
        return "Mrs"
    if title in ("Mlle", "Ms"):
        return "Miss"
    return title
# Extract the raw title from each name, then collapse it into a title group.
df["Title"] = df["Name"].map(get_title)
df["Title"] = df.apply(replace_titles, axis=1)
print(df.Title.value_counts())
Mr 517
Miss 185
Mrs 126
Master 40
Dr 7
Rev 6
Officer 5
Royalty 5
Name: Title, dtype: int64
# Fill missing values. Results are assigned back to the column because pandas
# documents that inplace calls on a selected column may not update the parent
# frame (chained assignment / Copy-on-Write).
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna("S")

# Drop the columns that are not used as features.
df = df.drop(columns=["Cabin", "Ticket", "Name"])

# Encode the categorical columns as integers.
df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
# The key must be "Officer" (the value replace_titles returns); the earlier
# spelling "Office" left those rows as strings.
df["Title"] = df["Title"].replace(
    {
        "Mr": 0,
        "Miss": 1,
        "Mrs": 2,
        "Master": 3,
        "Dr": 4,
        "Rev": 5,
        "Officer": 6,
        "Royalty": 7,
    }
)

print(df.isnull().sum())
print(df["Sex"].sample(5))
print(df.columns)
PassengerId 0
Survived 0
Pclass 0
Sex 0
Age 0
SibSp 0
Parch 0
Fare 0
Embarked 0
Title 0
dtype: int64
791 0
161 1
284 0
323 1
127 0
Name: Sex, dtype: int64
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
'Fare', 'Embarked', 'Title'],
dtype='object')
# Compute pairwise correlations between columns, then show each column's
# correlation with Survived.
corr = df.corr()
corr.Survived
PassengerId -0.005007
Survived 1.000000
Pclass -0.338481
Sex 0.543351
Age -0.064910
SibSp -0.035322
Parch 0.081629
Fare 0.257307
Embarked 0.106811
Name: Survived, dtype: float64
Machine Learning Model - Putting it all together
import os
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the Titanic training CSV and preview five random rows.
data_dir = "../10_Data Science/data/titanic"
df = pd.read_csv(os.path.join(data_dir, "train.csv"))
df.sample(5)
# List all of the people's titles
def get_title(name: str) -> str:
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is taken as the text between the first comma and the first
    following period, with surrounding whitespace removed, so
    "Braund, Mr. Owen Harris" yields "Mr".

    Args:
        name: Full passenger name.

    Returns:
        The extracted title, or "Unknown" when the name contains no period
        or no comma.
    """
    if "." not in name:
        return "Unknown"
    segments = name.split(",")
    # A period without any comma would otherwise raise IndexError below.
    if len(segments) < 2:
        return "Unknown"
    return segments[1].split(".")[0].strip()
# Collect the distinct titles found in the Name column, sorted alphabetically.
titles = sorted(set(df["Name"].map(get_title)))
# Normalize the titles
def replace_titles(x):
    """Map a row's raw title onto a smaller set of title groups.

    Args:
        x: A mapping-like row (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) that has a "Title" entry.

    Returns:
        "Officer", "Royalty", "Mrs" or "Miss" for the titles listed below;
        any other title is returned unchanged.
    """
    title = x["Title"]
    if title in ("Capt", "Col", "Major"):
        return "Officer"
    if title in ("Jonkheer", "Don", "the Countess", "Dona", "Lady", "Sir"):
        return "Royalty"
    if title == "Mme":
        return "Mrs"
    if title in ("Mlle", "Ms"):
        return "Miss"
    return title
# Extract the raw title from each name, then collapse it into a title group.
df["Title"] = df["Name"].map(get_title)
df["Title"] = df.apply(replace_titles, axis=1)
# Normalize data
# Results are assigned back to the column because pandas documents that
# inplace calls on a selected column may not update the parent frame
# (chained assignment / Copy-on-Write).
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna("S")

# Drop the columns that are not used as features.
df = df.drop(columns=["Cabin", "Ticket", "Name"])

# Encode the categorical columns as integers.
df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
df["Title"] = df["Title"].replace(
    {
        "Mr": 0,
        "Miss": 1,
        "Mrs": 2,
        "Master": 3,
        "Dr": 4,
        "Rev": 5,
        "Officer": 6,
        "Royalty": 7,
    }
)

# y is the Survived column; x is df without the Survived and PassengerId columns.
y = df["Survived"]
x = df.drop(["Survived", "PassengerId"], axis=1)

# print(x.sample())
# print(y.sample())

# Hold out 10% of the rows as a validation split.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)
# Train a random forest, report its validation accuracy, and save the model.
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

randomforest = RandomForestClassifier()  # initiate random forest classification
randomforest.fit(x_train, y_train)  # train the model

y_pred = randomforest.predict(x_val)  # make predictions on the validation features
# accuracy_score takes (y_true, y_pred); accuracy itself is unaffected by the order.
acc_randomforest = round(accuracy_score(y_val, y_pred) * 100, 2)
print(f"Accuracy: {acc_randomforest}")

# The context manager closes the file once the model has been written.
with open("titanic_model.save", "wb") as model_file:
    pickle.dump(randomforest, model_file)
Accuracy: 83.33
Make some ML predictions
# Load the test set and apply the same preparation steps as the training data.
df_test = pd.read_csv(os.path.join(data_dir, "test.csv"))

df_test["Title"] = df_test["Name"].map(get_title)
df_test["Title"] = df_test.apply(replace_titles, axis=1)

# Keep the passenger ids for the submission file before the column is dropped.
ids = df_test["PassengerId"]

# Missing Age/Fare values are filled with the medians of the training frame df.
# Results are assigned back because inplace calls on a selected column may not
# update the parent frame (chained assignment / Copy-on-Write).
df_test["Age"] = df_test["Age"].fillna(df["Age"].median())
df_test["Fare"] = df_test["Fare"].fillna(df["Fare"].median())
df_test["Embarked"] = df_test["Embarked"].fillna("S")

# Drop the columns that are not used as features.
df_test = df_test.drop(columns=["Cabin", "Ticket", "Name", "PassengerId"])

# Encode the categorical columns with the same integer codes as the training data.
df_test["Sex"] = df_test["Sex"].replace({"male": 0, "female": 1})
df_test["Embarked"] = df_test["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
df_test["Title"] = df_test["Title"].replace(
    {
        "Mr": 0,
        "Miss": 1,
        "Mrs": 2,
        "Master": 3,
        "Dr": 4,
        "Rev": 5,
        "Officer": 6,
        "Royalty": 7,
    }
)
df_test.sample()
Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | Title | |
---|---|---|---|---|---|---|---|---|
110 | 2 | 0 | 41.0 | 0 | 0 | 15.0458 | 1 | 0 |
# Predict on the prepared test frame and write the id/prediction pairs to CSV.
predictions = randomforest.predict(df_test)
output = pd.DataFrame({"PassengerId": ids, "Survived": predictions})
output.to_csv("submission.csv", index=False)
import numpy as np

# Fit a degree-1 polynomial (a straight line y = m*x + b) to five sample points.
# NOTE: this rebinds the names x and y used for the features/labels above.
x = [1, 2, 3, 4, 5]
y = [5, 7, 9, 13, 23]
m, b = np.polyfit(x, y, 1)
print(m, b)
4.2 -1.2000000000000026
ML Compiled Predictor
Compiled machine learning predictor.
import os
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Load the Titanic training CSV.
data_dir = "./data/titanic"
df = pd.read_csv(os.path.join(data_dir, "train.csv"))
# List all of the people's titles
def get_title(name: str) -> str:
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is taken as the text between the first comma and the first
    following period, with surrounding whitespace removed, so
    "Braund, Mr. Owen Harris" yields "Mr".

    Args:
        name: Full passenger name.

    Returns:
        The extracted title, or "Unknown" when the name contains no period
        or no comma.
    """
    if "." not in name:
        return "Unknown"
    segments = name.split(",")
    # A period without any comma would otherwise raise IndexError below.
    if len(segments) < 2:
        return "Unknown"
    return segments[1].split(".")[0].strip()
# Normalize the titles
def replace_titles(x):
    """Map a row's raw title onto a smaller set of title groups.

    Args:
        x: A mapping-like row (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) that has a "Title" entry.

    Returns:
        "Officer", "Royalty", "Mrs" or "Miss" for the titles listed below;
        any other title is returned unchanged.
    """
    title = x["Title"]
    if title in ("Capt", "Col", "Major"):
        return "Officer"
    if title in ("Jonkheer", "Don", "the Countess", "Dona", "Lady", "Sir"):
        return "Royalty"
    # "the Countess" and "Lady" used to be repeated in this branch, but they
    # are always handled by the Royalty branch above, so only "Mme" can
    # reach this point.
    if title == "Mme":
        return "Mrs"
    if title in ("Mlle", "Ms"):
        return "Miss"
    return title
# Extract the raw title from each name, then collapse it into a title group.
df["Title"] = df["Name"].map(get_title)
df["Title"] = df.apply(replace_titles, axis=1)
# Normalize data
# Results are assigned back to the column because pandas documents that
# inplace calls on a selected column may not update the parent frame
# (chained assignment / Copy-on-Write).
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna("S")

# Drop the columns that are not used as features.
df = df.drop(columns=["Cabin", "Ticket", "Name"])

# Encode the categorical columns as integers.
df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
df["Title"] = df["Title"].replace(
    {
        "Mr": 0,
        "Miss": 1,
        "Mrs": 2,
        "Master": 3,
        "Dr": 4,
        "Rev": 5,
        "Officer": 6,
        "Royalty": 7,
    }
)

# y is the Survived column; x is df without the Survived and PassengerId columns.
y = df["Survived"]
x = df.drop(["Survived", "PassengerId"], axis=1)

# Hold out 10% of the rows as a validation split.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)
randomforest = RandomForestClassifier()  # initiate random forest classification
randomforest.fit(x_train, y_train)  # train the model

# The context manager closes the file once the model has been written.
with open("titanic_model.save", "wb") as model_file:
    pickle.dump(randomforest, model_file)
def prediction_model(pclass, sex, age, sibsp, parch, fare, embarked, title):
    """Print the saved model's prediction for a single passenger.

    The eight arguments are placed, in this order, into one feature row that
    is passed to the ``predict`` method of the object unpickled from
    ``titanic_model.save`` in the current working directory.

    Presumably the values must use the same numeric encodings as the
    training columns (Sex 0/1, Embarked 0-2, Title 0-7) -- confirm against
    the training code.

    Returns:
        None. The prediction is printed, not returned.
    """
    import pickle

    x = [[pclass, sex, age, sibsp, parch, fare, embarked, title]]
    # NOTE: unpickling can execute arbitrary code; only load a model file
    # that you created yourself or otherwise trust.
    with open("titanic_model.save", "rb") as model_file:
        randomforest = pickle.load(model_file)
    predictions = randomforest.predict(x)
    print(predictions)
# Example call; arguments are pclass, sex, age, sibsp, parch, fare, embarked, title.
prediction_model(1, 1, 11, 1, 1, 19, 1, 1)