sklearn: Predicting Data with a Decision Tree

Imports

import numpy as np
import pandas as pd
import sklearn.tree as tree
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt

Read the Data

data = pd.read_csv(r"data.csv")

Explore the Data

data.info()
# output
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
data.head(10)

[Output: first 10 rows of the DataFrame]

# Move the label column to the end
tmp = data.pop('Survived')
data.insert(len(data.columns), 'Survived', tmp)
data.head()

[Output: DataFrame with 'Survived' moved to the last column]

Data Preprocessing

# Select features: drop columns with too many missing values, and columns that, judging by inspection, are unrelated to the target y
data.drop(["Cabin", "Name", "Ticket"], axis=1, inplace=True)
data.head()

[Output: DataFrame after dropping 'Cabin', 'Name', and 'Ticket']

# Handle missing values: fill 'Age' with the column mean
print("Before filling".center(50, '='))
print(data.info(), end='\n\n')
data['Age'] = data['Age'].fillna(data['Age'].mean())
print("After filling".center(50, '='))
print(data.info())
# output

==================Before filling==================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 889 non-null object
Survived 891 non-null int64
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB
None

==================After filling===================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 891 non-null int64
Pclass 891 non-null int64
Sex 891 non-null object
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 889 non-null object
Survived 891 non-null int64
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB
None
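The same fill can be expressed with sklearn's SimpleImputer, which is handy when the preprocessing needs to live inside a pipeline. A minimal sketch, assuming sklearn >= 0.20 (where sklearn.impute is available):

from sklearn.impute import SimpleImputer

# SimpleImputer works on 2-D input, hence the double brackets
imputer = SimpleImputer(strategy='mean')
data[['Age']] = imputer.fit_transform(data[['Age']])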
# Drop rows that still have missing values; drop directly only when few rows are affected, otherwise fill instead
data.dropna(inplace=True)
data.info() # 891 -> 889
# output

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 9 columns):
PassengerId 889 non-null int64
Pclass 889 non-null int64
Sex 889 non-null object
Age 889 non-null float64
SibSp 889 non-null int64
Parch 889 non-null int64
Fare 889 non-null float64
Embarked 889 non-null object
Survived 889 non-null int64
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB
# Convert string features to numeric; this only covers the simple case, more sophisticated encodings are worth studying
labels = data['Embarked'].unique().tolist()
data['Embarked'] = data['Embarked'].apply(lambda x: labels.index(x))

# 'Sex' has only two values, so a quick trick works
data['Sex'] = (data['Sex'] == 'male').astype("int")

data.head()

[Output: DataFrame with 'Embarked' and 'Sex' encoded as integers]
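Mapping 'Embarked' to list indices imposes an arbitrary order on the categories. Tree models tolerate that reasonably well, but a common alternative is one-hot encoding; a sketch using pandas get_dummies (data_onehot is a hypothetical name, not used later):

# One-hot encode 'Embarked' instead of mapping it to arbitrary integers
dummies = pd.get_dummies(data['Embarked'], prefix='Embarked')
data_onehot = pd.concat([data.drop('Embarked', axis=1), dummies], axis=1)
data_onehot.head()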

Split into Training and Test Sets

# Split into training and test sets
X = data.iloc[:, data.columns != 'Survived']
y = data.iloc[:, data.columns == 'Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train.shape, X_test.shape, y_train.shape, y_test.shape
# output

((622, 8), (267, 8), (622, 1), (267, 1))
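Note that train_test_split shuffles at random, so the shapes are stable but the exact split (and the scores below) change from run to run. Passing random_state pins the split down; a sketch, reusing the seed chosen for the classifier later:

# Reproducible split; the seed value itself is arbitrary
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)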
X_train.head()

[Output: first rows of X_train; note the shuffled row index]

# Reset the row indices after the shuffle
for x in [X_train, X_test, y_train, y_test]:
    x.index = range(x.shape[0])

X_train.head()

[Output: X_train with a clean 0-based index]

Build and Train the Model

clf = tree.DecisionTreeClassifier(random_state=25)
clf = clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
score # 0.719
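A fitted tree exposes feature_importances_, which is useful for a quick sanity check on which columns the split decisions rely on. A minimal sketch:

# Pair each feature name with its importance in the fitted tree
[*zip(X_train.columns, clf.feature_importances_)]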
# Evaluate with cross-validation
clf = tree.DecisionTreeClassifier(random_state=25)
score = cross_val_score(clf, X, y, cv=5).mean()
score # 0.749
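One detail worth knowing: y here is a one-column DataFrame, while sklearn generally expects a 1-D target (some versions emit a DataConversionWarning otherwise). Flattening it first avoids that; a sketch:

# Flatten y to the 1-D shape sklearn estimators expect
score = cross_val_score(clf, X, y.values.ravel(), cv=5).mean()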
# Loop over tree depths to find the best max_depth
tr = []
te = []
for i in range(10):
    clf = tree.DecisionTreeClassifier(random_state=25,
                                      max_depth=i+1,
                                      criterion='entropy')
    clf = clf.fit(X_train, y_train)
    score_tr = clf.score(X_train, y_train)
    score_te = cross_val_score(clf, X, y, cv=10).mean()
    tr.append(score_tr)
    te.append(score_te)

print(max(te), np.argmax(te) + 1, sep='\n') # 0.816 3

plt.plot(range(1, 11), tr, color="red", label="train")
plt.plot(range(1, 11), te, color="blue", label="test")
plt.xticks(range(1, 11))
plt.legend()
plt.show()

[Plot: training accuracy (red) and cross-validation accuracy (blue) versus tree depth]

Training accuracy keeps climbing as the tree gets deeper, while cross-validation accuracy peaks around depth 3 and then drops off, the classic signature of overfitting.

# Grid search over several hyperparameters at once
parameters = {
    "criterion": ('gini', 'entropy'),
    "splitter": ('best', 'random'),
    "max_depth": [*range(1, 10)],
    "min_samples_leaf": [*range(1, 50, 5)]
    # "min_impurity_decrease": [*np.linspace(0, 0.5, 50)]
}

clf = tree.DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=10)
GS.fit(X_train, y_train)
# Best parameter combination
GS.best_params_
# output

{'criterion': 'gini',
 'max_depth': 6,
 'min_samples_leaf': 1,
 'splitter': 'random'}
# Model score after grid search
GS.best_score_ # 0.82
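best_score_ is the mean cross-validation accuracy over the training folds. For a final sanity check, the refitted best model can also be scored on the held-out test set; a sketch:

# GridSearchCV refits the best model on the full training set by default (refit=True)
GS.best_estimator_.score(X_test, y_test)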

Note: grid search is slow, and it does not necessarily find the best result (it can only pick from the combinations you put in the grid).
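When the grid grows large, sklearn's RandomizedSearchCV samples a fixed number of candidate combinations instead of trying them all, trading exhaustiveness for speed. A minimal sketch reusing the parameters dict above; n_iter=50 is an arbitrary choice:

from sklearn.model_selection import RandomizedSearchCV

# Evaluate 50 randomly sampled parameter combinations instead of the full grid
RS = RandomizedSearchCV(tree.DecisionTreeClassifier(random_state=25),
                        parameters, n_iter=50, cv=10, random_state=25)
RS.fit(X_train, y_train)
RS.best_params_, RS.best_score_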
