```python
# Show every expression's output in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last'
```
Overview
The challenge
There weren't enough lifeboats for everyone on board, resulting in the deaths of 1502 of the 2224 passengers and crew.
In this challenge, we ask you to build a predictive model that answers the question "what sorts of people were more likely to survive?" using passenger data (i.e., name, age, gender, socio-economic class, etc.).
Submission File Format:
You should submit a csv file with exactly 418 entries plus a header row. The file should have exactly 2 columns:

- PassengerId (sorted in any order)
- Survived (contains your binary predictions: 1 for survived, 0 for deceased)
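For illustration, the first few lines of a valid submission would look like this (the PassengerId values come from the test set, which starts at 892; the Survived values here are made up):

```
PassengerId,Survived
892,0
893,1
894,0
```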
Edition 1
Load the data
```python
import numpy as np
import pandas as pd

train_data = pd.read_csv('//Volumes//windows//pythonstudy//titanic/train.csv')
test_data = pd.read_csv('//Volumes//windows//pythonstudy//titanic/test.csv')
train_data.head(5)
```
(index) | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
List the files under the data directory

```python
import os

for dirname, _, filenames in os.walk('//Volumes//windows//pythonstudy//titanic'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
```
```
//Volumes//windows//pythonstudy//titanic/gender_submission.csv
//Volumes//windows//pythonstudy//titanic/test.csv
//Volumes//windows//pythonstudy//titanic/train.csv
```
Get the current working directory

```python
os.getcwd()
```

```
'/Volumes/windows/pythonstudy/BestProj/MyJupyter'
```
Explore a pattern
Show the survival rate of women

```python
# Survival rate of female passengers
women = train_data.loc[train_data.Sex == 'female']['Survived']  # a boolean mask in .loc selects the matching rows
rate_women = women.sum() / len(women)
rate_women
```

```
0.7420382165605095
```
```python
# Survival rate of male passengers
men = train_data.loc[train_data.Sex == 'male']['Survived']
rate_men = men.sum() / len(men)
rate_men
```

```
0.18890814558058924
```
```python
# Survival rate within each passenger class (Pclass 1-3)
per_pclass_rate = []
for i in range(1, 4):
    per_pclass = train_data.loc[train_data.Pclass == i]['Survived']
    per_pclass_rate.append(per_pclass.sum() / len(per_pclass))
per_pclass_rate
```

```
[0.6296296296296297, 0.47282608695652173, 0.24236252545824846]
```
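The same rates can be computed more idiomatically with groupby; since Survived is a 0/1 column, its group means are exactly the survival rates (a minimal equivalent sketch):

```python
# Equivalent to the code above: mean of the 0/1 Survived column per group
train_data.groupby('Sex')['Survived'].mean()
train_data.groupby('Pclass')['Survived'].mean()
```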
```python
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2)

# Left panel: survival rate by sex
data = pd.Series([rate_women, rate_men], index=['women', 'men'])
data.plot.bar(ax=axes[0], color='g', alpha=0.7)

# Right panel: survival rate by Pclass
data = pd.Series(per_pclass_rate, index=[1, 2, 3])
data.plot.bar(ax=axes[1], color='g', alpha=0.7)
```

[Figure: two bar charts of survival rate, by sex (left) and by Pclass (right)]
Data processing
One-hot encoding: convert categorical features to dummy variables
```python
# Selected features: Pclass, Sex, SibSp, Parch
y = train_data['Survived']
feature = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = train_data[feature]       # pass a list to select multiple columns at once
X_test = test_data[feature]
X
X_test
```
(index) | Pclass | Sex | SibSp | Parch
---|---|---|---|---
0 | 3 | male | 1 | 0 |
1 | 1 | female | 1 | 0 |
2 | 3 | female | 0 | 0 |
3 | 1 | female | 1 | 0 |
4 | 3 | male | 0 | 0 |
... | ... | ... | ... | ... |
886 | 2 | male | 0 | 0 |
887 | 1 | female | 0 | 0 |
888 | 3 | female | 1 | 2 |
889 | 1 | male | 0 | 0 |
890 | 3 | male | 0 | 0 |
891 rows × 4 columns
(index) | Pclass | Sex | SibSp | Parch
---|---|---|---|---
0 | 3 | male | 0 | 0 |
1 | 3 | female | 1 | 0 |
2 | 2 | male | 0 | 0 |
3 | 3 | male | 0 | 0 |
4 | 3 | female | 1 | 1 |
... | ... | ... | ... | ... |
413 | 3 | male | 0 | 0 |
414 | 1 | female | 0 | 0 |
415 | 3 | male | 0 | 0 |
416 | 3 | male | 0 | 0 |
417 | 3 | male | 1 | 1 |
418 rows × 4 columns
```python
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)
X
X_test
```
(index) | Pclass | SibSp | Parch | Sex_female | Sex_male
---|---|---|---|---|---
0 | 3 | 1 | 0 | 0 | 1 |
1 | 1 | 1 | 0 | 1 | 0 |
2 | 3 | 0 | 0 | 1 | 0 |
3 | 1 | 1 | 0 | 1 | 0 |
4 | 3 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... |
886 | 2 | 0 | 0 | 0 | 1 |
887 | 1 | 0 | 0 | 1 | 0 |
888 | 3 | 1 | 2 | 1 | 0 |
889 | 1 | 0 | 0 | 0 | 1 |
890 | 3 | 0 | 0 | 0 | 1 |
891 rows × 5 columns
(index) | Pclass | SibSp | Parch | Sex_female | Sex_male
---|---|---|---|---|---
0 | 3 | 0 | 0 | 0 | 1 |
1 | 3 | 1 | 0 | 1 | 0 |
2 | 2 | 0 | 0 | 0 | 1 |
3 | 3 | 0 | 0 | 0 | 1 |
4 | 3 | 1 | 1 | 1 | 0 |
... | ... | ... | ... | ... | ... |
413 | 3 | 0 | 0 | 0 | 1 |
414 | 1 | 0 | 0 | 1 | 0 |
415 | 3 | 0 | 0 | 0 | 1 |
416 | 3 | 0 | 0 | 0 | 1 |
417 | 3 | 1 | 1 | 0 | 1 |
418 rows × 5 columns
get_dummies
Convert categorical variable into dummy/indicator variables.
```python
pd.get_dummies??
```
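A toy illustration of what get_dummies produces (values made up); note also that encoding train and test separately can yield mismatched columns if a category is absent from one of them, so aligning the test frame is a common safeguard:

```python
import pandas as pd

# One categorical column becomes one indicator column per category
toy = pd.DataFrame({'Sex': ['male', 'female', 'male']})
print(pd.get_dummies(toy))
#    Sex_female  Sex_male
# 0           0         1
# 1           1         0
# 2           0         1
# (0/1 in older pandas; recent versions emit booleans)

# Safeguard when train/test are encoded separately (not needed here, since
# Sex has the same two categories in both sets):
# X_test = X_test.reindex(columns=X.columns, fill_value=0)
```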
Model selection
```python
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X, y)
```
```
/Users/king/opt/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
```
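The FutureWarning above concerns the changing default of n_estimators (from 10 to 100). A small sketch that pins it explicitly and fixes random_state so the fit is reproducible (the exact values are a choice, not part of the original notebook):

```python
from sklearn.ensemble import RandomForestClassifier

# Pin the hyperparameter named in the warning and fix the seed for reproducibility
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X, y)
```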
```python
predictions = model.predict(X_test)
predictions
```
```
array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0])
```
Output
```python
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('./results/Titanic.csv', index=False)
print('My submission was successfully saved.')
```

```
My submission was successfully saved.
```
Edition 2: Stacking/Ensembling Model
This notebook is a very basic and simple introductory primer to the method of ensembling (combining) base learning models, in particular the variant of ensembling known as stacking. In a nutshell, stacking uses the predictions of a few basic classifiers as its first (base) level, and then uses another model at the second level to predict the output from those first-level predictions.
A proper ensembling/stacking script: Stacking Starter, written in the AllState Severity Claims competition by the great Faron.
There are other standalone Kaggle scripts which implement exactly the same ensembling steps.
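As a minimal sketch of the idea, reusing the X, y, and X_test frames prepared in Edition 1 (the two base learners, the fold count, and the logistic-regression meta-model here are arbitrary illustrative choices, not the pipeline this primer builds): each first-level model produces out-of-fold predictions on the train set, and those predictions become the features of the second-level model.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

def get_oof(clf, X_train, y_train, X_test, n_splits=5):
    """Out-of-fold predictions on the train set, plus fold-averaged test predictions."""
    oof_train = np.zeros(len(X_train))
    oof_test = np.zeros(len(X_test))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    for train_idx, valid_idx in kf.split(X_train):
        clf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        oof_train[valid_idx] = clf.predict(X_train.iloc[valid_idx])
        oof_test += clf.predict(X_test) / n_splits
    return oof_train, oof_test

# First level: each base learner's out-of-fold predictions
rf_oof_train, rf_oof_test = get_oof(RandomForestClassifier(n_estimators=100), X, y, X_test)
et_oof_train, et_oof_test = get_oof(ExtraTreesClassifier(n_estimators=100), X, y, X_test)

# Second level: a meta-model trained on the first-level predictions
meta_X = np.column_stack([rf_oof_train, et_oof_train])
meta_X_test = np.column_stack([rf_oof_test, et_oof_test])
stacked_predictions = LogisticRegression().fit(meta_X, y).predict(meta_X_test)
```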
```python
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold
```
Feature Exploration, Engineering and Cleaning
Now we will proceed much as most kernels are structured: first explore the data at hand, identify possible feature engineering opportunities, and numerically encode any categorical features.
```python
# Load in the train and test datasets
train = pd.read_csv('//Volumes//windows//pythonstudy//titanic/train.csv')
test = pd.read_csv('//Volumes//windows//pythonstudy//titanic/test.csv')

# Store our passenger IDs for easy access
PassengerId = test['PassengerId']
PassengerId
train.head(3)
```
```
0       892
1       893
2       894
3       895
4       896
       ...
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64
```
(index) | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
Feature Engineering
Here, credit must be extended to Sina's very comprehensive and well-thought-out notebook for the feature engineering ideas, so please check out his work: Titanic Best Working Classifier, by Sina.
```python
full_data = [train, test]  # a list holding both the train and test frames

# Some features of my own that I have added in
# Gives the length of the name
train['Name_length'] = train['Name'].apply(len)
test['Name_length'] = test['Name'].apply(len)

# Feature that tells whether a passenger had a cabin on the Titanic
# (a missing Cabin is NaN, which has type float; actual cabin values are str)
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
```
```python
# Feature engineering steps taken from Sina
# Create new feature FamilySize as a combination of SibSp and Parch
# (pandas creates a new column by direct assignment)
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
```
Remove NULLs by fillna()
```python
# Create new feature IsAlone from FamilySize
# (rows where FamilySize == 1)
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

# Remove all NULLs in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

# Remove all NULLs in the Fare column and create a new feature CategoricalFare
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

# Create a new feature CategoricalAge
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # assign through .loc to avoid pandas' chained-assignment warning
    dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)

# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Create a new feature Title, containing the titles of passenger names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Mapping Fare (bin edges taken from the CategoricalFare quartiles above)
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Mapping Age (bin edges taken from the CategoricalAge bins above)
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
```
```
0      0
1      1
2      0
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Has_Cabin, Length: 891, dtype: int64
```
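The block above uses two different binning functions: pd.qcut cuts Fare at quantiles (so each bin holds roughly the same number of rows), while pd.cut cuts Age into equal-width intervals. A tiny sketch on made-up values to show the difference:

```python
import pandas as pd

values = pd.Series([1, 2, 3, 4, 100])

# qcut: edges chosen at quantiles, so bins hold ~equal counts
pd.qcut(values, 2)   # splits at the median 3.0: (0.999, 3.0], (3.0, 100.0]

# cut: edges split the value range into equal widths
pd.cut(values, 2)    # splits at the range midpoint 50.5: (0.901, 50.5], (50.5, 100.0]
```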