# list Titanic_all file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
#% matplotlib inline
# Read the training and test splits from CSV files in the working directory,
# then preview the first rows of the training frame.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)
train_data.head()
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
0
1
0
3
Braund, Mr. Owen Harris
male
22.0
1
0
A/5 21171
7.2500
NaN
S
1
2
1
1
Cumings, Mrs. John Bradley (Florence Briggs Th…
female
38.0
1
0
PC 17599
71.2833
C85
C
2
3
1
3
Heikkinen, Miss. Laina
female
26.0
0
0
STON/O2. 3101282
7.9250
NaN
S
3
4
1
1
Futrelle, Mrs. Jacques Heath (Lily May Peel)
female
35.0
1
0
113803
53.1000
C123
S
4
5
0
3
Allen, Mr. William Henry
male
35.0
0
0
373450
8.0500
NaN
S
# Print dtypes and non-null counts for both splits, separated by a dashed rule.
separator = "-" * 40
train_data.info()
print(separator)
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe()
PassengerId
Survived
Pclass
Age
SibSp
Parch
Fare
count
891.000000
891.000000
891.000000
714.000000
891.000000
891.000000
891.000000
mean
446.000000
0.383838
2.308642
29.699118
0.523008
0.381594
32.204208
std
257.353842
0.486592
0.836071
14.526497
1.102743
0.806057
49.693429
min
1.000000
0.000000
1.000000
0.420000
0.000000
0.000000
0.000000
25%
223.500000
0.000000
2.000000
20.125000
0.000000
0.000000
7.910400
50%
446.000000
0.000000
3.000000
28.000000
0.000000
0.000000
14.454200
75%
668.500000
1.000000
3.000000
38.000000
1.000000
0.000000
31.000000
max
891.000000
1.000000
3.000000
80.000000
8.000000
6.000000
512.329200
# Pie chart of how many passengers fall into each Survived class,
# labelled with percentages to two decimals.
(
    train_data['Survived']
    .value_counts()
    .plot.pie(autopct='%1.2f%%')
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acc7eab518>
# Bar chart of passenger counts for every (Sex, Survived) combination.
(
    train_data
    .groupby(['Sex', 'Survived'])
    .size()
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acc9f482b0>
# Bar chart of the Survived column summed per sex (Survived is 0/1, so this
# is the number of survivors in each group).
(
    train_data[['Sex', 'Survived']]
    .groupby(['Sex'])
    .sum()
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acc9fa5b70>
# Bar chart of the mean of Survived per sex, i.e. the survival rate.
(
    train_data[['Sex', 'Survived']]
    .groupby(['Sex'])
    .mean()
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acca00aa58>
# Bar chart of passenger counts for every (Pclass, Sex) combination.
(
    train_data
    .groupby(['Pclass', 'Sex'])
    .size()
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acca081f28>
# Bar chart of the Survived column summed per passenger class
# (number of survivors in each class).
(
    train_data[['Pclass', 'Survived']]
    .groupby(['Pclass'])
    .sum()
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acca0e9400>
# Bar chart of the mean of Survived per passenger class (survival rate).
(
    train_data[['Pclass', 'Survived']]
    .groupby(['Pclass'])
    .mean()
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acca162470>
# Bar chart of the survival rate for every (Pclass, Sex) combination.
(
    train_data[['Sex', 'Pclass', 'Survived']]
    .groupby(['Pclass', 'Sex'])
    .mean()
    .plot.bar()
)
<matplotlib.axes._subplots.AxesSubplot at 0x1acca1bd1d0>
# Split violin plots of the Age distribution by survival outcome, grouped by
# passenger class (left axes) and by sex (right axes).
fig, ax = plt.subplots(1, 2, figsize=(18, 8))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them as
# positional arguments, and the keyword form also works on older releases.
sns.violinplot(x='Pclass', y='Age', hue='Survived', data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 100, 10))
sns.violinplot(x='Sex', y='Age', hue='Survived', data=train_data, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 100, 10))
plt.show()
# Overlaid, shaded kernel density estimates of Age for passengers who did not
# survive (gray) and those who did (green).
fig, ax = plt.subplots(figsize=(10, 5))
for outcome, colour, caption in ((0, 'gray', 'Not Survived'), (1, 'g', 'Survived')):
    ages_for_outcome = train_data.loc[train_data['Survived'] == outcome, 'Age']
    sns.kdeplot(ages_for_outcome, shade=True, color=colour, label=caption)
plt.title('Age--Survived or NOt')
plt.xlabel('Age')
Text(0.5, 0, 'Age')
# One facet per passenger class, coloured by sex, counting passengers per
# embarkation port.
grid = sns.FacetGrid(data=train_data, col='Pclass', hue='Sex')
# countplot is called once per facet/hue subset; without a fixed `order` each
# call derives its own category order from the subset it sees, so bars can end
# up at inconsistent positions. Pin the port order for every call.
grid.map(sns.countplot, 'Embarked', order=['S', 'C', 'Q'])
grid.add_legend()
<seaborn.axisgrid.FacetGrid at 0x1acca219668>
# Passenger counts per embarkation port, split by survival outcome.
# x is passed by keyword: seaborn 0.12+ rejects it as a positional argument.
sns.countplot(x='Embarked', hue='Survived', data=train_data)
plt.title('Embarked and Survived')
Text(0.5, 1.0, 'Embarked and Survived')
# Mean of Survived (survival rate) per embarkation port as a bar chart.
# factorplot was renamed to catplot in seaborn 0.9 and later removed, and
# x/y must be keywords on seaborn 0.12+.
sns.catplot(x='Embarked', y='Survived', data=train_data, kind='bar')
plt.title('Embarked and Survived rate')
Text(0.5, 1.0, 'Embarked and Survived rate')
# Fill missing Age values with the median age of the training set.
median_age = train_data["Age"].median()
train_data["Age"] = train_data["Age"].fillna(median_age)
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
# Baseline model: linear regression on the numeric columns, evaluated with
# 3-fold cross-validation on the training set.
from sklearn.linear_model import LinearRegression
# KFold yields (train indices, test indices) pairs for k-fold cross-validation.
from sklearn.model_selection import KFold

# Features that are numeric already and need no encoding.
predictors = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
# Initialise the linear regression estimator.
alg = LinearRegression()
# Split the samples into three consecutive, unshuffled folds. random_state is
# deliberately omitted: it only has an effect when shuffle=True, and recent
# scikit-learn raises ValueError when it is set together with shuffle=False.
kf = KFold(n_splits=3, shuffle=False)
predictions = []
for train, test in kf.split(train_data):
    # Rows of this fold's training part (iloc selects by position).
    train_predictors = train_data[predictors].iloc[train, :]
    train_target = train_data["Survived"].iloc[train]
    # Fit on the training part, then predict the held-out part.
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(train_data[predictors].iloc[test, :])
    predictions.append(test_predictions)
# Show a few raw predictions per fold, the number of folds and the fold size.
print(predictions[0][1:10],predictions[1][1:10],predictions[2][1:10],'\n'*2,len(predictions),len(predictions[0]))  # len(predictions) == 3, one array per fold
[0.64716068 0.22381187 0.65781892 0.15821019 0.20954606 0.54764008
0.35828968 0.29636233 0.54633321] [0.62891234 0.77642663 0.22950758 0.13992775 0.29488562 0.42050708
0.22989295 1.06210609 0.73216932] [0.22171068 0.46757728 0.11475416 0.26813918 0.47166107 0.45771509
0.26832766 0.66241433 0.15305294]
3 297
# Join the per-fold predictions into one array (same row order as train_data,
# since the folds were taken consecutively), binarise at 0.5 and score.
predictions = np.concatenate(predictions, axis=0)
predicted_survived = predictions > 0.5
predicted_died = predictions <= 0.5
predictions[predicted_survived] = 1
predictions[predicted_died] = 0
is_correct = predictions == train_data["Survived"]
accuracy = is_correct.sum() / len(predictions)
print("准确率:",accuracy)
准确率: 0.7037037037037037
# Encode Sex numerically: male -> 0, female -> 1. An explicit mapping yields an
# integer column directly instead of relying on pandas re-inferring the dtype
# after element-wise .loc assignments into a string column.
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
# Embarked: per the analysis above, passengers boarding at C were almost all
# female (most visibly for Pclass = 2) while men almost all boarded at S.
# NOTE(review): the original comment mentions taking the mode, but the fill
# value is hard-coded to 'C' — confirm this is the intended choice.
train_data['Embarked'] = train_data['Embarked'].fillna('C')
# Cabin is missing for most rows; keep "missing" as its own category 'U0'.
train_data['Cabin'] = train_data['Cabin'].fillna('U0')
# Encode the port as S -> 1, C -> 2, Q -> 3. The former comparison of Embarked
# against 'U0' was dropped: 'U0' is the Cabin placeholder and never occurs in
# Embarked, so that line could not match anything.
train_data['Embarked'] = train_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null int64
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 891 non-null object
Embarked 891 non-null int64
dtypes: float64(2), int64(7), object(3)
memory usage: 83.6+ KB
# Second model: the baseline features plus the numerically encoded Sex and
# Embarked columns, again scored with 3-fold cross-validation.
predictors_2 = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg_2 = LinearRegression()
# Split the samples into three consecutive, unshuffled folds. random_state is
# deliberately omitted: it only has an effect when shuffle=True, and recent
# scikit-learn raises ValueError when it is set together with shuffle=False.
kf_2 = KFold(n_splits=3, shuffle=False)
predictions_2 = []
for train, test in kf_2.split(train_data):
    # Rows of this fold's training part (iloc selects by position).
    train_predictors_2 = train_data[predictors_2].iloc[train, :]
    train_target_2 = train_data["Survived"].iloc[train]
    # Fit on the training part, then predict the held-out part.
    alg_2.fit(train_predictors_2, train_target_2)
    test_predictions_2 = alg_2.predict(train_data[predictors_2].iloc[test, :])
    predictions_2.append(test_predictions_2)
# The folds are consecutive, so concatenating them restores train_data's row
# order and the result can be compared element-wise with the true labels.
predictions_2 = np.concatenate(predictions_2, axis=0)
# Turn the regression output into class labels with a 0.5 threshold.
predictions_2[predictions_2 > 0.5] = 1
predictions_2[predictions_2 <= 0.5] = 0
accuracy_2 = sum(predictions_2 == train_data["Survived"]) / len(predictions_2)
print("准确率:",accuracy_2)
准确率: 0.7833894500561167
# Inspect the test split's dtypes and non-null counts before preprocessing it.
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Age: fill gaps with the training-set median so both splits share one value.
test_data["Age"] = test_data["Age"].fillna(train_data["Age"].median())
# Sex: male -> 0, female -> 1, the same encoding used for the training set.
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})
# Embarked: fill gaps with 'C'. This used to be applied to train_data by
# mistake (copy-paste from the training cell); it must act on test_data.
test_data['Embarked'] = test_data['Embarked'].fillna('C')
# Cabin: keep "missing" as its own category 'U0'.
test_data['Cabin'] = test_data['Cabin'].fillna('U0')
# Encode the port as S -> 1, C -> 2, Q -> 3. The former comparison of Embarked
# against 'U0' was dropped: 'U0' is the Cabin placeholder and never occurs in
# Embarked, so that line could not match anything.
test_data['Embarked'] = test_data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null int64
Age 418 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 418 non-null object
Embarked 418 non-null int64
dtypes: float64(2), int64(6), object(3)
memory usage: 36.0+ KB
# Fare: the training set has no missing fares, but the test set does;
# fill the gap with the training-set median fare.
train_fare_median = train_data["Fare"].median()
test_data["Fare"] = test_data["Fare"].fillna(train_fare_median)
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null int64
Age 418 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 418 non-null float64
Cabin 418 non-null object
Embarked 418 non-null int64
dtypes: float64(2), int64(6), object(3)
memory usage: 36.0+ KB
# Predict survival scores for the test split with the second model.
test_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# After the cross-validation loop alg_2 only holds the fit from the last fold
# (two thirds of the rows). Refit on every labelled row before predicting so
# the final model uses all available training data.
alg_2.fit(train_data[test_features], train_data['Survived'])
test_predictors = test_data[test_features]
# Raw regression output; it is thresholded into 0/1 labels in a later step.
test_data['Survived'] = alg_2.predict(test_predictors)
test_data.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Survived
0
892
3
Kelly, Mr. James
0
34.5
0
0
330911
7.8292
U0
3
0.158051
1
893
3
Wilkes, Mrs. James (Ellen Needs)
1
47.0
1
0
363272
7.0000
U0
1
0.480204
2
894
2
Myles, Mr. Thomas Francis
0
62.0
0
0
240276
9.6875
U0
3
0.177382
3
895
3
Wirz, Mr. Albert
0
27.0
0
0
315154
8.6625
U0
1
0.106463
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
1
22.0
1
1
3101298
12.2875
U0
1
0.617975
# Turn the regression scores into class labels: above 0.5 -> 1, otherwise 0.
# Both masks are taken from the raw scores before anything is overwritten.
scored_above = test_data['Survived'] > 0.5
scored_at_or_below = test_data['Survived'] <= 0.5
test_data.loc[scored_above, 'Survived'] = 1
test_data.loc[scored_at_or_below, 'Survived'] = 0
test_data.head()
PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
Survived
0
892
3
Kelly, Mr. James
0
34.5
0
0
330911
7.8292
U0
3
0.0
1
893
3
Wilkes, Mrs. James (Ellen Needs)
1
47.0
1
0
363272
7.0000
U0
1
0.0
2
894
2
Myles, Mr. Thomas Francis
0
62.0
0
0
240276
9.6875
U0
3
0.0
3
895
3
Wirz, Mr. Albert
0
27.0
0
0
315154
8.6625
U0
1
0.0
4
896
3
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
1
22.0
1
1
3101298
12.2875
U0
1
1.0
# Assemble the submission frame: passenger id plus the predicted label.
# The thresholded predictions are stored as floats (0.0 / 1.0); cast them to
# int so the label column matches the integer Survived column of the training
# data and is written as 0 / 1.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_data['Survived'].astype(int),
})
submission.head()
PassengerId
Survived
0
892
0.0
1
893
0.0
2
894
0.0
3
895
0.0
4
896
1.0
# Summary statistics of the submission frame (row count, share of predicted survivors, id range).
submission.describe()
PassengerId
Survived
count
418.000000
418.000000
mean
1100.500000
0.358852
std
120.810458
0.480238
min
892.000000
0.000000
25%
996.250000
0.000000
50%
1100.500000
0.000000
75%
1204.750000
1.000000
max
1309.000000
1.000000
# Write the submission to titanic_submission.csv, leaving out the DataFrame index.
submission.to_csv('titanic_submission.csv',index = False)
手机扫一扫
移动阅读更方便
你可能感兴趣的文章