Kaggle_Titanic Project

Posted by Sun on February 4, 2020
# Display every expression's output in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last'

Overview

The challenge

There weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

In this challenge, we ask you to build a predictive model that answers the question: "what sorts of people were more likely to survive?" using passenger data (i.e. name, age, gender, socio-economic class, etc.).

Submission File Format:

You should submit a CSV file with exactly 418 entries plus a header row.

The file should have exactly 2 columns:

- PassengerId (sorted in any order)
- Survived (contains your binary predictions: 1 for survived, 0 for deceased)
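
For illustration (not from the original post), the first few rows of a valid submission might look like this; the test-set PassengerIds run from 892 to 1309, and the 0/1 values here are placeholders:

PassengerId,Survived
892,0
893,1
894,0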

Edition 1

Load the data

import numpy as np
import pandas as pd

train_data = pd.read_csv('//Volumes//windows//pythonstudy//titanic/train.csv')
test_data = pd.read_csv('//Volumes//windows//pythonstudy//titanic/test.csv')
train_data.head(5)
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S

List the files under the data directory

import os
for dirname, _, filenames in os.walk('//Volumes//windows//pythonstudy//titanic'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
//Volumes//windows//pythonstudy//titanic/gender_submission.csv
//Volumes//windows//pythonstudy//titanic/test.csv
//Volumes//windows//pythonstudy//titanic/train.csv

Get the current working directory

os.getcwd()
'/Volumes/windows/pythonstudy/BestProj/MyJupyter'

Explore a pattern

Show the survival rate of women

# Survival rate of women
women = train_data.loc[train_data.Sex == 'female']['Survived']  # df.loc with a boolean mask selects matching rows
rate_women = women.sum()/len(women)
rate_women
0.7420382165605095
# Survival rate of men
men = train_data.loc[train_data.Sex == 'male']['Survived']  # df.loc with a boolean mask selects matching rows
rate_men = men.sum()/len(men)
rate_men
0.18890814558058924
# Survival rate for each Pclass
per_pclass_rate = []
for i in range(1,4):
    per_pclass = train_data.loc[train_data.Pclass == i]['Survived']  # boolean-mask row selection
    per_pclass_rate_i = per_pclass.sum()/len(per_pclass)
    per_pclass_rate.append(per_pclass_rate_i)
per_pclass_rate
[0.6296296296296297, 0.47282608695652173, 0.24236252545824846]
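
As an aside (not in the original notebook), the same three per-class rates can be computed in one line with pandas' groupby:

train_data.groupby('Pclass')['Survived'].mean()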
import matplotlib.pyplot as plt
fig, axes = plt.subplots(1,2)

data = pd.Series([rate_women, rate_men], index=['women','men'])
data.plot.bar(ax=axes[0],color='g',alpha=0.7)

data = pd.Series(per_pclass_rate, index=[1,2,3])
data.plot.bar(ax=axes[1],color='g',alpha=0.7)


[Figure: two bar charts of survival rate, women vs. men (left) and Pclass 1-3 (right)]

Data processing

One-hot encoding: convert categorical features to dummy variables

# Pclass Sex SibSp Parch
y = train_data['Survived']
feature = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = train_data[feature]  # indexing with a list of column names returns a sub-DataFrame
X_test = test_data[feature]
X
X_test
     Pclass     Sex  SibSp  Parch
0         3    male      1      0
1         1  female      1      0
2         3  female      0      0
3         1  female      1      0
4         3    male      0      0
..      ...     ...    ...    ...
886       2    male      0      0
887       1  female      0      0
888       3  female      1      2
889       1    male      0      0
890       3    male      0      0

891 rows × 4 columns

     Pclass     Sex  SibSp  Parch
0         3    male      0      0
1         3  female      1      0
2         2    male      0      0
3         3    male      0      0
4         3  female      1      1
..      ...     ...    ...    ...
413       3    male      0      0
414       1  female      0      0
415       3    male      0      0
416       3    male      0      0
417       3    male      1      1

418 rows × 4 columns

X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)
X
X_test
     Pclass  SibSp  Parch  Sex_female  Sex_male
0         3      1      0           0         1
1         1      1      0           1         0
2         3      0      0           1         0
3         1      1      0           1         0
4         3      0      0           0         1
..      ...    ...    ...         ...       ...
886       2      0      0           0         1
887       1      0      0           1         0
888       3      1      2           1         0
889       1      0      0           0         1
890       3      0      0           0         1

891 rows × 5 columns

     Pclass  SibSp  Parch  Sex_female  Sex_male
0         3      0      0           0         1
1         3      1      0           1         0
2         2      0      0           0         1
3         3      0      0           0         1
4         3      1      1           1         0
..      ...    ...    ...         ...       ...
413       3      0      0           0         1
414       1      0      0           1         0
415       3      0      0           0         1
416       3      0      0           0         1
417       3      1      1           0         1

418 rows × 5 columns

get_dummies

Convert categorical variable into dummy/indicator variables.

pd.get_dummies??
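
A minimal illustration (my own toy example, not from the notebook) of what get_dummies does to a single categorical column; with the pandas version used here the dummy columns are 0/1 integers:

pd.get_dummies(pd.Series(['male', 'female', 'male']))
#    female  male
# 0       0     1
# 1       1     0
# 2       0     1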

Model selection

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X,y)
/Users/king/opt/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
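
The FutureWarning appears because n_estimators defaulted to 10 in scikit-learn 0.20 and changed to 100 in 0.22. Passing it explicitly (and fixing random_state for reproducibility) silences the warning; a minimal sketch:

model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X, y)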
predictions = model.predict(X_test)
predictions
array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])

Output

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('./results/Titanic.csv', index=False)
print('My submission was successfully saved.')
My submission was successfully saved.
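
One caveat not shown above: to_csv does not create missing directories, so if ./results does not already exist the call raises an error. It can be created first:

import os
os.makedirs('./results', exist_ok=True)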

Edition 2: Stacking / Ensembling Models

This notebook is a very basic and simple introductory primer to the method of ensembling (combining) base learning models, in particular the variant known as stacking. In a nutshell, stacking uses the predictions of a few basic classifiers as a first level (base), and then uses another model at the second level to predict the output from those first-level predictions.

A proper ensembling/stacking script to study is Stacking Starter, written by the great Faron for the Allstate Claims Severity competition.

There are other standalone Kaggle scripts that implement exactly the same ensembling steps.
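
To make the two-level idea concrete, here is a minimal out-of-fold (OOF) sketch in the spirit of those scripts; the helper name and the plain-array interface are my own simplification, not the notebook's actual code:

import numpy as np
from sklearn.model_selection import KFold

def get_oof_predictions(clf, x_train, y_train, x_test, n_splits=5):
    # First level: every training row is predicted by a model that never
    # saw it during fitting; test predictions are averaged over the folds.
    oof_train = np.zeros(x_train.shape[0])
    oof_test = np.zeros(x_test.shape[0])
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    for train_idx, val_idx in kf.split(x_train):
        clf.fit(x_train[train_idx], y_train[train_idx])
        oof_train[val_idx] = clf.predict(x_train[val_idx])
        oof_test += clf.predict(x_test) / n_splits
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Second level: concatenate the OOF columns of several base models and
# fit another classifier on them, e.g.
# x_second = np.concatenate([oof_rf, oof_ada, oof_gb], axis=1)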

# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold

Feature Exploration, Engineering and Cleaning

Now we will proceed much like how most kernels are generally structured: first explore the data at hand, identify possible feature-engineering opportunities, and numerically encode any categorical features.

# Load in the train and test datasets
train = pd.read_csv('//Volumes//windows//pythonstudy//titanic/train.csv')
test = pd.read_csv('//Volumes//windows//pythonstudy//titanic/test.csv')

# Store our passenger ID for easy access
PassengerId = test['PassengerId']
PassengerId
train.head(3)
0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S

Feature Engineering

Here, credit must go to Sina's very comprehensive and well-thought-out notebook for the feature-engineering ideas, so please check out his work:

Titanic Best Working Classifier, by Sina

full_data = [train, test]  # a list holding both train and test, so each transformation can be applied to both

# Some features of my own that I have added in
# Gives the length of the name
train['Name_length'] = train['Name'].apply(len)
test['Name_length'] = test['Name'].apply(len)

# Feature that tells whether a passenger had a cabin on the Titanic
# (a missing Cabin is NaN, which is a float; real cabin values are strings)
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
# Feature engineering steps taken from Sina
# Create new feature FamilySize as a combination of SibSp and Parch
# pandas creates a new column on direct assignment (the +1 counts the passenger themselves)
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

Remove NULLs by fillna()
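
As a toy illustration (values made up, not from the dataset), fillna replaces missing entries with a given value:

import numpy as np
import pandas as pd
pd.Series([1.0, np.nan, 3.0]).fillna(0)   # -> 1.0, 0.0, 3.0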

# Create new feature IsAlone from FamilySize
# row of FamilySize==1
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    
# Remove all NULLS in the Fare column and create a new feature CategoricalFare
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
# Create a New feature CategoricalAge
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list  # .loc assignment avoids the chained-indexing pitfall
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
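
# Note (added): pd.qcut above creates quantile-based fare bins with roughly
# equal counts per bin, whereas pd.cut creates five equal-width age bins.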
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)  # raw string avoids an invalid escape-sequence warning
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
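
# For example, with a name from the data shown earlier:
#   get_title('Braund, Mr. Owen Harris') -> 'Mr'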
# Create a new feature Title, containing the titles of passenger names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
0      0
1      1
2      0
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Has_Cabin, Length: 891, dtype: int64