Python: Kaggle Titanic数据集处理
阅读原文时间:2021年04月21日阅读:1
# list Titanic_all file

想法1:能否先用某个算法求出权值矩阵,再用它与数据相乘(加权),然后用k-means聚类

想法2 :直接先用逻辑回归,随机森林

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#% matplotlib inline


# Load the training and test splits from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


# Preview the first rows of the training set.
train_data.head()

PassengerId

Survived

Pclass

Name

Sex

Age

SibSp

Parch

Ticket

Fare

Cabin

Embarked

0

1

0

3

Braund, Mr. Owen Harris

male

22.0

1

0

A/5 21171

7.2500

NaN

S

1

2

1

1

Cumings, Mrs. John Bradley (Florence Briggs Th…

female

38.0

1

0

PC 17599

71.2833

C85

C

2

3

1

3

Heikkinen, Miss. Laina

female

26.0

0

0

STON/O2. 3101282

7.9250

NaN

S

3

4

1

1

Futrelle, Mrs. Jacques Heath (Lily May Peel)

female

35.0

1

0

113803

53.1000

C123

S

4

5

0

3

Allen, Mr. William Henry

male

35.0

0

0

373450

8.0500

NaN

S

# Print dtypes and non-null counts for both splits to see which columns have
# missing values.
train_data.info()
print("-" * 40)  # visual separator between the two reports
test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# Summary statistics (count, mean, std, quartiles) of the numeric columns.
train_data.describe()

PassengerId

Survived

Pclass

Age

SibSp

Parch

Fare

count

891.000000

891.000000

891.000000

714.000000

891.000000

891.000000

891.000000

mean

446.000000

0.383838

2.308642

29.699118

0.523008

0.381594

32.204208

std

257.353842

0.486592

0.836071

14.526497

1.102743

0.806057

49.693429

min

1.000000

0.000000

1.000000

0.420000

0.000000

0.000000

0.000000

25%

223.500000

0.000000

2.000000

20.125000

0.000000

0.000000

7.910400

50%

446.000000

0.000000

3.000000

28.000000

0.000000

0.000000

14.454200

75%

668.500000

1.000000

3.000000

38.000000

1.000000

0.000000

31.000000

max

891.000000

1.000000

3.000000

80.000000

8.000000

6.000000

512.329200

性别对存活率影响

# Pie chart of how many passengers fall into each Survived value, labelled
# with percentages.
train_data['Survived'].value_counts().plot(kind='pie', autopct='%1.2f%%')


<matplotlib.axes._subplots.AxesSubplot at 0x1acc7eab518>

男性人数是女性两倍,但女性存活率大很多

# Bar chart of passenger counts for every (Sex, Survived) combination.
train_data.groupby(['Sex', 'Survived']).size().plot(kind='bar')


<matplotlib.axes._subplots.AxesSubplot at 0x1acc9f482b0>

# Number of survivors per sex (sum of the 0/1 Survived column).
train_data.groupby('Sex')[['Survived']].sum().plot(kind='bar')


<matplotlib.axes._subplots.AxesSubplot at 0x1acc9fa5b70>

# Survival rate per sex (mean of the 0/1 Survived column).
train_data.groupby('Sex')[['Survived']].mean().plot(kind='bar')


<matplotlib.axes._subplots.AxesSubplot at 0x1acca00aa58>

船舱等级似乎影响更大:3等舱人数最多,存活率却最低

# Bar chart of passenger counts for every (Pclass, Sex) combination.
train_data.groupby(['Pclass', 'Sex']).size().plot(kind='bar')


<matplotlib.axes._subplots.AxesSubplot at 0x1acca081f28>

# Number of survivors per ticket class.
train_data.groupby('Pclass')[['Survived']].sum().plot(kind='bar')


<matplotlib.axes._subplots.AxesSubplot at 0x1acca0e9400>

# Survival rate per ticket class.
train_data.groupby('Pclass')[['Survived']].mean().plot(kind='bar')


<matplotlib.axes._subplots.AxesSubplot at 0x1acca162470>

不同船舱的女性存活率有所区别

# Survival rate for every (Pclass, Sex) combination.
train_data.groupby(['Pclass', 'Sex'])[['Survived']].mean().plot(kind='bar')


<matplotlib.axes._subplots.AxesSubplot at 0x1acca1bd1d0>

年龄与存活的关系

# Two side-by-side split violin plots of the Age distribution, each split by
# Survived: the left panel groups by ticket class, the right panel by sex.
fig, ax = plt.subplots(1, 2, figsize=(18, 8))

# x/y are passed by keyword: recent seaborn releases no longer accept them as
# positional arguments, while the keyword form works on old and new versions.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 100, 10))

sns.violinplot(x='Sex', y='Age', hue='Survived', data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 100, 10))

plt.show()

年龄分布特征分析,拼错单词难受呀

在15岁之前存活率较高,40以后没区别

# Overlay the kernel density estimates of Age for passengers with
# Survived == 0 (gray) and Survived == 1 (green) on one axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(train_data['Age'][train_data['Survived'] == 0],
            shade=True, color='gray', label='Not Survived')
sns.kdeplot(train_data['Age'][train_data['Survived'] == 1],
            shade=True, color='g', label='Survived')
plt.title('Age--Survived or NOt')
plt.xlabel('Age')


Text(0.5, 0, 'Age')

登船港口与存活与否的关系:S港口登船的人最多,获救率最低;C、Q港口登船的大多为女性?

# One panel per ticket class, coloured by sex, counting passengers per
# embarkation port.
grid = sns.FacetGrid(data=train_data, col='Pclass', hue='Sex')
# FacetGrid.map calls countplot once per (Pclass, Sex) subset. Without an
# explicit `order`, each subset orders the ports by its own data, so bars in
# different panels/hues may not refer to the same port. Pinning the order
# keeps every panel comparable.
grid.map(sns.countplot, 'Embarked', order=['S', 'C', 'Q'])
grid.add_legend()


<seaborn.axisgrid.FacetGrid at 0x1acca219668>

# Passenger counts per embarkation port, split by Survived.
# `x` is passed by keyword because recent seaborn releases reject it as a
# positional argument; the keyword form also works on older versions.
sns.countplot(x='Embarked', hue='Survived', data=train_data)
plt.title('Embarked and Survived')


Text(0.5, 1.0, 'Embarked and Survived')

# Bar chart of the mean of Survived (i.e. the survival rate) per embarkation
# port. `factorplot` was renamed to `catplot` and later removed from seaborn,
# and x/y must be passed by keyword in recent releases.
sns.catplot(x='Embarked', y='Survived', data=train_data, kind='bar')
plt.title('Embarked and Survived rate')


Text(0.5, 1.0, 'Embarked and Survived rate')

第一次简易处理数据,只填充年龄,选取几个简单特征

# Impute missing ages with the median age of the training set, then re-check
# the non-null counts.
age_median = train_data["Age"].median()
train_data["Age"] = train_data["Age"].fillna(age_median)
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


# 线性回归算法
from sklearn.linear_model import LinearRegression
# 这啥,留着百度
from sklearn.model_selection import KFold

# Baseline feature set: columns that are already numeric and need no encoding.
predictors = ["Pclass","Age","SibSp","Parch","Fare"]

# Ordinary least-squares regression on the 0/1 Survived target; its continuous
# output is turned into class labels by thresholding at 0.5 further below.
alg = LinearRegression()
# 3-fold cross-validation over contiguous blocks in row order.
# `random_state` is deliberately not passed: it has no effect when
# shuffle=False, and current scikit-learn raises a ValueError if it is set
# without shuffling. The folds are identical to the original ones.
kf = KFold(n_splits=3, shuffle=False)

# Out-of-fold predictions, one array per fold, in fold order.
predictions = []
for train,test in kf.split(train_data):
    # `train` / `test` are positional row indices, hence iloc.
    train_predictors = (train_data[predictors].iloc[train,:])
    train_target = (train_data["Survived"].iloc[train])
    alg.fit(train_predictors,train_target)  # fit on the other two folds
    test_predictions = alg.predict(train_data[predictors].iloc[test,:])
    predictions.append(test_predictions)

# Peek at a few raw predictions per fold, then the fold count and fold size.
print(predictions[0][1:10],predictions[1][1:10],predictions[2][1:10],'\n'*2,len(predictions),len(predictions[0]))  #len = 3


[0.64716068 0.22381187 0.65781892 0.15821019 0.20954606 0.54764008
 0.35828968 0.29636233 0.54633321] [0.62891234 0.77642663 0.22950758 0.13992775 0.29488562 0.42050708
 0.22989295 1.06210609 0.73216932] [0.22171068 0.46757728 0.11475416 0.26813918 0.47166107 0.45771509
 0.26832766 0.66241433 0.15305294] 

 3 297


# Stitch the per-fold arrays back into one vector. The folds are contiguous
# and unshuffled, so the result lines up with train_data row by row.
predictions = np.concatenate(predictions, axis=0)

# Threshold the regression output at 0.5 to obtain 0/1 class labels.
predictions = np.where(predictions > 0.5, 1.0, 0.0)

# Fraction of rows whose predicted label matches the true label.
accuracy = (predictions == train_data["Survived"]).mean()

print("准确率:",accuracy)


准确率: 0.7037037037037037

增加Sex、Embarked特征(Cabin只做了缺失值填充,并未加入模型),准确率提高约0.08

# Encode Sex numerically: male -> 0, female -> 1.
# replace() + astype(int) gives the column a real integer dtype (masked .loc
# assignment can leave it as `object` on current pandas) and is a no-op when
# the cell is re-run on already-encoded data.
train_data['Sex'] = train_data['Sex'].replace({'male': 0, 'female': 1}).astype(int)


# Embarked has two missing values in the training set; fill them with 'C'.
# NOTE(review): the original comment described this as "taking the mode", but
# 'S' is the most frequent port - confirm that 'C' is the intended fill value.
train_data['Embarked'] = train_data['Embarked'].fillna('C')


# Cabin is missing for most passengers; mark the gaps with the placeholder
# 'U0' so that "unknown cabin" can itself be used as a feature later.
train_data['Cabin'] = train_data.Cabin.fillna('U0')
# Encode the ports: S -> 1, C -> 2, Q -> 3.
# The original also tested Embarked == 'U0', but 'U0' is the Cabin placeholder
# and never occurs in Embarked, so that mapping was a dead no-op and is dropped.
train_data['Embarked'] = train_data['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3}).astype(int)


# Re-check dtypes and non-null counts after encoding Sex/Embarked and filling Cabin.
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          891 non-null object
Embarked       891 non-null int64
dtypes: float64(2), int64(7), object(3)
memory usage: 83.6+ KB


# Second feature set: the baseline columns plus the encoded Sex and Embarked.
predictors_2 = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
alg_2 = LinearRegression()
# 3-fold cross-validation over contiguous blocks in row order.
# `random_state` is deliberately not passed: it has no effect when
# shuffle=False, and current scikit-learn raises a ValueError if it is set
# without shuffling. The folds are identical to the original ones.
kf_2 = KFold(n_splits=3, shuffle=False)

# Out-of-fold predictions, one array per fold, in fold order.
predictions_2 = []
for train,test in kf_2.split(train_data):
    # `train` / `test` are positional row indices, hence iloc.
    train_predictors_2 = (train_data[predictors_2].iloc[train,:])
    train_target_2 = (train_data["Survived"].iloc[train])
    alg_2.fit(train_predictors_2,train_target_2)  # fit on the other two folds
    test_predictions_2 = alg_2.predict(train_data[predictors_2].iloc[test,:])
    predictions_2.append(test_predictions_2)


# Reassemble the folds (they are contiguous, so row order is preserved) and
# threshold the regression output at 0.5 to obtain 0/1 labels.
predictions_2 = np.concatenate(predictions_2,axis=0)
predictions_2[predictions_2 > 0.5] = 1
predictions_2[predictions_2 <= 0.5] = 0

# Fraction of rows whose predicted label matches the true label.
accuracy_2 = sum(predictions_2 == train_data["Survived"]) / len(predictions_2)

print("准确率:",accuracy_2)


准确率: 0.7833894500561167

测试集处理

# Inspect the test set's dtypes and non-null counts before preprocessing it.
test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# Age: fill gaps with the training-set median so both splits share one
# imputation value.
test_data["Age"] = test_data["Age"].fillna(train_data["Age"].median())
# Sex: male -> 0, female -> 1. replace() + astype(int) yields a real integer
# dtype and is a no-op on already-encoded data.
test_data['Sex'] = test_data['Sex'].replace({'male': 0, 'female': 1}).astype(int)
# Embarked: same fill value as the training set. The original line filled
# train_data here by mistake (copy-paste), leaving test_data untouched.
test_data['Embarked'] = test_data['Embarked'].fillna('C')
# Cabin: mark missing values with the placeholder 'U0'.
test_data['Cabin'] = test_data.Cabin.fillna('U0')
# Encode the ports: S -> 1, C -> 2, Q -> 3. The original also tested
# Embarked == 'U0', which never matches ('U0' is the Cabin placeholder), so
# that dead mapping is dropped.
test_data['Embarked'] = test_data['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3}).astype(int)
test_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          418 non-null object
Embarked       418 non-null int64
dtypes: float64(2), int64(6), object(3)
memory usage: 36.0+ KB


# Fare has a missing value only in the test set; fill it with the median fare
# of the training set, then re-check the non-null counts.
train_fare_median = train_data["Fare"].median()
test_data["Fare"] = test_data["Fare"].fillna(train_fare_median)
test_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null int64
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          418 non-null object
Embarked       418 non-null int64
dtypes: float64(2), int64(6), object(3)
memory usage: 36.0+ KB


# Same columns, in the same order, as the feature list alg_2 was fitted on.
test_features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
# Placeholder value; overwritten with the model output two lines below.
test_data['Survived'] = -1

test_predictors = test_data[test_features]
# NOTE(review): alg_2 was last fitted inside the cross-validation loop, i.e.
# on the training rows of the final fold only, not on the full training set.
# Consider refitting on all of train_data before predicting.
test_data['Survived'] = alg_2.predict(test_predictors)


# Survived holds the raw (continuous) regression output at this point.
test_data.head()

PassengerId

Pclass

Name

Sex

Age

SibSp

Parch

Ticket

Fare

Cabin

Embarked

Survived

0

892

3

Kelly, Mr. James

0

34.5

0

0

330911

7.8292

U0

3

0.158051

1

893

3

Wilkes, Mrs. James (Ellen Needs)

1

47.0

1

0

363272

7.0000

U0

1

0.480204

2

894

2

Myles, Mr. Thomas Francis

0

62.0

0

0

240276

9.6875

U0

3

0.177382

3

895

3

Wirz, Mr. Albert

0

27.0

0

0

315154

8.6625

U0

1

0.106463

4

896

3

Hirvonen, Mrs. Alexander (Helga E Lindqvist)

1

22.0

1

1

3101298

12.2875

U0

1

0.617975

# Turn the continuous regression output into labels: values above 0.5 become
# 1.0, everything else 0.0.
test_data['Survived'] = (test_data['Survived'] > 0.5).astype(float)
test_data.head()

PassengerId

Pclass

Name

Sex

Age

SibSp

Parch

Ticket

Fare

Cabin

Embarked

Survived

0

892

3

Kelly, Mr. James

0

34.5

0

0

330911

7.8292

U0

3

0.0

1

893

3

Wilkes, Mrs. James (Ellen Needs)

1

47.0

1

0

363272

7.0000

U0

1

0.0

2

894

2

Myles, Mr. Thomas Francis

0

62.0

0

0

240276

9.6875

U0

3

0.0

3

895

3

Wirz, Mr. Albert

0

27.0

0

0

315154

8.6625

U0

1

0.0

4

896

3

Hirvonen, Mrs. Alexander (Helga E Lindqvist)

1

22.0

1

1

3101298

12.2875

U0

1

1.0

# Two-column frame for the submission file: the passenger id and the
# predicted label.
submission = test_data[['PassengerId', 'Survived']].copy()
submission.head()

PassengerId

Survived

0

892

0.0

1

893

0.0

2

894

0.0

3

895

0.0

4

896

1.0

# Sanity check: row count and the share of predicted survivors (mean of Survived).
submission.describe()

PassengerId

Survived

count

418.000000

418.000000

mean

1100.500000

0.358852

std

120.810458

0.480238

min

892.000000

0.000000

25%

996.250000

0.000000

50%

1100.500000

0.000000

75%

1204.750000

1.000000

max

1309.000000

1.000000

# Write the submission file without the index column.
# Survived is float (0.0 / 1.0) after thresholding; cast it to int on the way
# out so the file contains plain 0/1 labels.
# NOTE(review): the competition's sample submission uses integer labels -
# confirm against the Kaggle submission format.
submission.astype({'Survived': int}).to_csv('titanic_submission.csv',index = False)