ML Training

Test a machine learning model

PROTIP - type a function name followed by empty parentheses, then press Shift+Tab with the cursor inside the parentheses to see its documentation

import os

import pandas as pd

# Directory (relative path) holding the Titanic CSV files.
data_dir = "./data/titanic"
train_csv_path = os.path.join(data_dir, "train.csv")
df = pd.read_csv(train_csv_path)
# Show five randomly chosen rows as a quick look at the data.
df.sample(n=5)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
301	302	1	3	McCoy, Mr. Bernard	male	NaN	2	0	367226	23.25	NaN	Q
179	180	0	3	Leonard, Mr. Lionel	male	36.0	0	0	LINE	0.00	NaN	S
865	866	1	2	Bystrom, Mrs. (Karolina)	female	42.0	0	0	236852	13.00	NaN	S
112	113	0	3	Barton, Mr. David John	male	22.0	0	0	324669	8.05	NaN	S
530	531	1	2	Quick, Miss. Phyllis May	female	2.0	1	1	26360	26.00	NaN	S

y = dependent variable (the target we want to predict)

x = independent variables (the features used to make the prediction)

# Target: the "Survived" column.
y = df["Survived"]
# Features: every column except the target and the PassengerId column.
x = df.drop(columns=["Survived", "PassengerId"])

# Peek at one randomly chosen feature row.
x.sample()

	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
732	2	Knight, Mr. Robert J	male	NaN	0	0	239855	0.0	NaN	S

from sklearn.model_selection import train_test_split

# train_test_split returns its pieces in a fixed order, so unpack them as
# (x_train, x_test, y_train, y_test).
# test_size is the fraction of the rows held out for the test set:
# 0.1 keeps 90% of the rows for training and 10% for testing.
split_parts = train_test_split(x, y, test_size=0.1)
x_train, x_test, y_train, y_test = split_parts

x_train.shape

(801, 10)

x_test.shape

(90, 10)

Decision Trees - nodes, branches, leaves

Decision trees are prone to overfitting. This can be overcome by using a random forest, which trains many trees (multiple iterations) and combines their predictions.

First ML Model

Getting started

import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Titanic training set.
data_dir = "./data/titanic"
train_csv_path = os.path.join(data_dir, "train.csv")
df = pd.read_csv(train_csv_path)
df.sample(n=5)

# Separate the target column from the feature columns.
y = df["Survived"]
x = df.drop(columns=["Survived", "PassengerId"])

# Hold out 10% of the rows for testing.
split_parts = train_test_split(x, y, test_size=0.1)
x_train, x_test, y_train, y_test = split_parts

# Column dtypes and non-null counts, followed by summary statistics.
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

# Number of missing values in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# List all of people's titles
def get_title(name):
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is read from the second comma-separated field of ``name``,
    up to that field's first period, as in "Braund, Mr. Owen Harris".

    Args:
        name: Passenger name string.

    Returns:
        The title with surrounding whitespace removed, or "Unknown" when
        the name contains no period or no comma.
    """
    if "." not in name:
        return "Unknown"
    fields = name.split(",")
    if len(fields) < 2:
        # A period but no comma (e.g. "Mr. Smith"): there is no second field
        # to read the title from, so report it as unknown instead of raising
        # IndexError.
        return "Unknown"
    return fields[1].split(".")[0].strip()


# Distinct titles occurring in the Name column, in sorted order.
titles = sorted(set(df["Name"].map(get_title)))
print("Different titles found in the dataset: ")
print(len(titles), ":", titles)

Different titles found in the dataset: 
17 : ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess']

# Normalize the titles
def replace_titles(x):
    """Map a row's raw title onto a normalized title group.

    Args:
        x: Row-like object supporting ``x["Title"]`` (e.g. a DataFrame row).

    Returns:
        "Officer", "Royalty", "Mrs" or "Miss" for the titles listed in the
        groups below; any other title is returned unchanged.
    """
    # (raw titles, normalized label) pairs, checked in this order.
    title_groups = (
        (("Capt", "Col", "Major"), "Officer"),
        (("Jonkheer", "Don", "the Countess", "Dona", "Lady", "Sir"), "Royalty"),
        (("Mme",), "Mrs"),
        (("Mlle", "Ms"), "Miss"),
    )
    raw_title = x["Title"]
    for members, label in title_groups:
        if raw_title in members:
            return label
    return raw_title


# Extract the raw title from each name, then collapse rare titles into groups.
df["Title"] = df["Name"].map(get_title)
df["Title"] = df.apply(replace_titles, axis=1)
print(df["Title"].value_counts())

Mr         517
Miss       185
Mrs        126
Master      40
Dr           7
Rev          6
Officer      5
Royalty      5
Name: Title, dtype: int64

# Fill missing values. The result is assigned back to the column rather than
# calling fillna(..., inplace=True) on df["col"]: that chained in-place form
# is deprecated in pandas 2.x and does not update df under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna("S")

# Drop the columns that are not used as features.
df.drop(columns=["Cabin", "Ticket", "Name"], inplace=True)

# Encode the categorical columns as integers.
df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
# The key must be "Officer" (it was misspelled "Office"): that is the label
# replace_titles produces, and a non-matching key leaves those rows as strings.
df["Title"] = df["Title"].replace(
    {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 4, "Rev": 5, "Officer": 6, "Royalty": 7}
)

print(df.isnull().sum())
print(df["Sex"].sample(5))
print(df.columns)

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Title          0
dtype: int64
791    0
161    1
284    0
323    1
127    0
Name: Sex, dtype: int64
Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'Title'],
      dtype='object')

# Pairwise correlation between the columns of df, then each column's
# correlation with "Survived".
corr = df.corr()
corr["Survived"]

PassengerId   -0.005007
Survived       1.000000
Pclass        -0.338481
Sex            0.543351
Age           -0.064910
SibSp         -0.035322
Parch          0.081629
Fare           0.257307
Embarked       0.106811
Name: Survived, dtype: float64

Machine Learning Model - Putting it all together

import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Directory (relative path) holding the Titanic CSV files.
data_dir = "../10_Data Science/data/titanic"
train_csv_path = os.path.join(data_dir, "train.csv")
df = pd.read_csv(train_csv_path)
# Show five randomly chosen rows as a quick look at the data.
df.sample(n=5)


# List all of people's titles
def get_title(name):
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is read from the second comma-separated field of ``name``,
    up to that field's first period, as in "Braund, Mr. Owen Harris".

    Args:
        name: Passenger name string.

    Returns:
        The title with surrounding whitespace removed, or "Unknown" when
        the name contains no period or no comma.
    """
    if "." not in name:
        return "Unknown"
    fields = name.split(",")
    if len(fields) < 2:
        # A period but no comma (e.g. "Mr. Smith"): there is no second field
        # to read the title from, so report it as unknown instead of raising
        # IndexError.
        return "Unknown"
    return fields[1].split(".")[0].strip()


# Distinct titles occurring in the Name column, in sorted order.
titles = sorted(set(df["Name"].map(get_title)))


# Normalize the titles
def replace_titles(x):
    """Return the normalized title group for the row ``x``.

    Args:
        x: Row-like object supporting ``x["Title"]`` (e.g. a DataFrame row).

    Returns:
        "Officer", "Royalty", "Mrs" or "Miss" when the row's title belongs
        to one of the groups below; otherwise the title itself.
    """
    current = x["Title"]
    groups = [
        ("Officer", ["Capt", "Col", "Major"]),
        ("Royalty", ["Jonkheer", "Don", "the Countess", "Dona", "Lady", "Sir"]),
        ("Mrs", ["Mme"]),
        ("Miss", ["Mlle", "Ms"]),
    ]
    # First group containing the title wins; fall back to the title itself.
    return next((label for label, members in groups if current in members), current)


# Extract the raw title from each name, then collapse rare titles into groups.
df["Title"] = df["Name"].map(get_title)
df["Title"] = df.apply(replace_titles, axis=1)

# Normalize data
# Results are assigned back to the column rather than using
# df["col"].fillna(..., inplace=True): that chained in-place form is
# deprecated in pandas 2.x and does not update df under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna("S")
df.drop(columns=["Cabin", "Ticket", "Name"], inplace=True)
# Encode the categorical columns as integers.
df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
df["Title"] = df["Title"].replace(
    {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 4, "Rev": 5, "Officer": 6, "Royalty": 7}
)

# Target column and feature columns.
y = df["Survived"]
x = df.drop(["Survived", "PassengerId"], axis=1)

# print(x.sample())
# print(y.sample())
# Hold out 10% of the rows as a validation set.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)

# Saving the model
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train a random forest classifier on the training split.
randomforest = RandomForestClassifier()
randomforest.fit(x_train, y_train)

# Predict on the validation features and score against the validation labels.
y_pred = randomforest.predict(x_val)
# accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value
# is the same as before, but the arguments now match the documented order.
acc_randomforest = round(accuracy_score(y_val, y_pred) * 100, 2)
print(f"Accuracy: {acc_randomforest}")

# Save the fitted model. The context manager closes the file even if
# pickling fails; the bare open() call left the handle open.
with open("titanic_model.save", "wb") as model_file:
    pickle.dump(randomforest, model_file)

Accuracy: 83.33

Make some ML predictions

df_test = pd.read_csv(os.path.join(data_dir, "test.csv"))

# Same title extraction and grouping as for the training frame.
df_test["Title"] = df_test["Name"].map(get_title)
df_test["Title"] = df_test.apply(replace_titles, axis=1)

# Keep the passenger ids before the column is dropped from the features.
ids = df_test["PassengerId"]

# Missing Age/Fare values are filled with the medians of the training frame
# (df). Results are assigned back to the column rather than using
# df_test["col"].fillna(..., inplace=True): that chained in-place form is
# deprecated in pandas 2.x and does not update df_test under Copy-on-Write.
df_test["Age"] = df_test["Age"].fillna(df["Age"].median())
df_test["Fare"] = df_test["Fare"].fillna(df["Fare"].median())
df_test["Embarked"] = df_test["Embarked"].fillna("S")
df_test.drop(columns=["Cabin", "Ticket", "Name", "PassengerId"], inplace=True)
# Encode the categorical columns with the same integer codes as in training.
df_test["Sex"] = df_test["Sex"].replace({"male": 0, "female": 1})
df_test["Embarked"] = df_test["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
df_test["Title"] = df_test["Title"].replace(
    {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 4, "Rev": 5, "Officer": 6, "Royalty": 7}
)

df_test.sample()

	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked	Title
110	2	0	41.0	0	0	15.0458	1	0

# Predict for every row of the prepared test frame and write the result,
# paired with the passenger ids, to submission.csv.
predictions = randomforest.predict(df_test)
submission_columns = {"PassengerId": ids, "Survived": predictions}
output = pd.DataFrame(submission_columns)
output.to_csv("submission.csv", index=False)

import numpy as np

# Fit a degree-1 polynomial (a straight line y = m*x + b) to the sample points.
# Note: this rebinds the names x and y.
x = [1, 2, 3, 4, 5]
y = [5, 7, 9, 13, 23]
line_coefficients = np.polyfit(x, y, deg=1)
m, b = line_coefficients
print(m, b)

4.2 -1.2000000000000026

ML Compiled Predictor

Compiled machine learning predictor.

import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the Titanic training set from the data directory (relative path).
data_dir = "./data/titanic"
train_csv_path = os.path.join(data_dir, "train.csv")
df = pd.read_csv(train_csv_path)


# List all of people's titles
def get_title(name):
    """Extract the title (e.g. "Mr", "Miss") from a passenger name.

    The title is read from the second comma-separated field of ``name``,
    up to that field's first period, as in "Braund, Mr. Owen Harris".

    Args:
        name: Passenger name string.

    Returns:
        The title with surrounding whitespace removed, or "Unknown" when
        the name contains no period or no comma.
    """
    if "." not in name:
        return "Unknown"
    fields = name.split(",")
    if len(fields) < 2:
        # A period but no comma (e.g. "Mr. Smith"): there is no second field
        # to read the title from, so report it as unknown instead of raising
        # IndexError.
        return "Unknown"
    return fields[1].split(".")[0].strip()


# Normalize the titles
def replace_titles(x):
    """Map a row's raw title onto a normalized title group.

    Args:
        x: Row-like object supporting ``x["Title"]`` (e.g. a DataFrame row).

    Returns:
        "Officer", "Royalty", "Mrs" or "Miss" for the titles listed below;
        any other title is returned unchanged.
    """
    title = x["Title"]
    if title in ("Capt", "Col", "Major"):
        return "Officer"
    if title in ("Jonkheer", "Don", "the Countess", "Dona", "Lady", "Sir"):
        return "Royalty"
    # "the Countess" and "Lady" were also listed in this group, but both are
    # already returned as "Royalty" above, so those entries could never match.
    if title in ("Mme",):
        return "Mrs"
    if title in ("Mlle", "Ms"):
        return "Miss"
    return title


# Extract the raw title from each name, then collapse rare titles into groups.
df["Title"] = df["Name"].map(get_title)
df["Title"] = df.apply(replace_titles, axis=1)

# Normalize data
# Results are assigned back to the column rather than using
# df["col"].fillna(..., inplace=True): that chained in-place form is
# deprecated in pandas 2.x and does not update df under Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna("S")
df.drop(columns=["Cabin", "Ticket", "Name"], inplace=True)
# Encode the categorical columns as integers.
df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
df["Title"] = df["Title"].replace(
    {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 4, "Rev": 5, "Officer": 6, "Royalty": 7}
)

# Target column and feature columns.
y = df["Survived"]
x = df.drop(["Survived", "PassengerId"], axis=1)

# Hold out 10% of the rows as a validation set.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)

randomforest = RandomForestClassifier()  # initiate random forest classification
randomforest.fit(x_train, y_train)  # train the model

# Save the fitted model. The context manager closes the file even if
# pickling fails; the bare open() call left the handle open.
with open("titanic_model.save", "wb") as model_file:
    pickle.dump(randomforest, model_file)

def prediction_model(pclass, sex, age, sibsp, parch, fare, embarked, title):
    """Predict with the saved model for a single passenger.

    Loads the pickled model from "titanic_model.save" in the current working
    directory, calls its ``predict`` on one row built from the arguments,
    prints the result and returns it.

    Args:
        pclass, sex, age, sibsp, parch, fare, embarked, title: Feature
            values, passed to the model as one row in exactly this order.
            NOTE(review): presumably these must use the same numeric
            encoding as the training columns - confirm against the
            training script.

    Returns:
        Whatever the loaded model's ``predict`` returns for the single row.
    """
    import pickle

    x = [[pclass, sex, age, sibsp, parch, fare, embarked, title]]
    # NOTE(security): unpickling can execute arbitrary code, so only load a
    # model file that you created yourself.
    # The context manager closes the file; the bare open() left it open.
    with open("titanic_model.save", "rb") as model_file:
        randomforest = pickle.load(model_file)
    predictions = randomforest.predict(x)
    print(predictions)
    return predictions


prediction_model(1, 1, 11, 1, 1, 19, 1, 1)