DAY53. Python TreeModel (1) DecisionTree
decisionTree ์์ฌ๊ฒฐ์ ํธ๋ฆฌ
๋ชจ๋ธ์ ์๊ฐํ๊ฐ ์ฝ๊ณ , ๊ฐ๋ ์ฑ ๋์(ํด์ ์ฌ์)
ํน์ง(๋ณ์)์ ์ค์ผ์ผ(์ ๊ทํ๋ ํ์คํ)์กฐ์ ์ด ํ์ ์์
๋ ๋ฆฝ๋ณ์์ ์ด์ง๊ณผ ์ฐ์ ๋ณ์๊ฐ ํผํฉ๋์ด ์์ด๋ ์ ๋์
๋ง์ ํน์ง(์ ๋ ฅ๋ณ์)์ ๊ฐ๋ ๋ฐ์ดํฐ ์ ์ ๋ถ์ ํฉ
๋จ์ผ๊ฒฐ์ Tree ํ์ต์ผ๋ก ๊ณผ์ ํฉ ๋ฐ์ ์ฐ๋ ค (์ผ๋ฐํ ์ฑ๋ฅ ์ ํ)
과적합 해결방안 : 가지치기(Pruning) — rpart에서는 CP(Complexity Parameter)로 가지치기 수준을 제어
* ๊น์ ํธ๋ฆฌ(๋ณต์กํ ๋ชจ๋ธ) : ๊ณผ์ ํฉ(↑), ์ค๋ถ๋ฅ(↓)
์์ฌ๊ฒฐ์ ๋๋ฌด(Decision Tree) ์๊ณ ๋ฆฌ์ฆ
์๊ณ ๋ฆฌ์ฆ | ์ค์๋ณ์ ํ๊ฐ์ง์ | ๋น๊ณ |
CART(Classification And Regression Trees) | GINI Index | ๋ฒ์ฃผํ๊ณผ ์ซ์ํ ์ข ์๋ณ์ ํจํค์ง : rpart |
C5.0(C4.5) | Information Gain | ๋ฒ์ฃผํ๊ณผ ์ซ์ํ ์ข ์๋ณ์ ํจํค์ง : C50 |
Entropy & GINI
ํ๋ฅ ๋ณ์ ๊ฐ์ ๋ถํ์ค์ฑ์ ๋ํ๋ด๋ ์์น
Tree model์์ ์ค์ ๋ณ์(x) ์ ์ ์ ์ฌ์ฉ
1. C5.0(C4.5)์์๋ ์ ๋ณด์ด๋ ์ด์ฉ : ์ ๋ณด์ด๋์ด ํด ์๋ก ์ค์๋ณ์
2. CART์์๋ GINI index ์ด์ฉ : ์ง๋ ๊ณ์๊ฐ ํด ์๋ก ์ค์๋ณ์
์ค์ต
import pandas as pd #csv file read
from sklearn.tree import DecisionTreeClassifier #Tree model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
Tree ์๊ฐํ
from sklearn.tree import export_graphviz #dot file ๋ด๋ณด๋ด๊ธฐ
from graphviz import Source #dot file ์๊ฐํ (pip install graphviz)
1. dataset load
dataset = pd.read_csv(r'C:\ITWILL\4_Python-2\data\tree_data.csv')
dataset.info()
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 iq 6 non-null int64
1 age 6 non-null int64
2 income 6 non-null int64
3 owner 6 non-null int64
4 unidegree 6 non-null int64
5 smoking 6 non-null int64 -> y๋ณ์
dataset
iq age income owner unidegree smoking
0 90 42 40 0 0 1
1 110 20 20 1 1 0
2 100 50 46 0 0 0
3 140 40 28 1 1 1
4 110 70 100 0 0 1
5 100 50 20 0 0 0
* decision tree์ ์ฅ์ : x๊ฐ 2์ง์์ฌ๋ y์ ๋ถ๋ฅ๋ชจ๋ธ์ ๋ง๋ค ์ ์์
cols = list(dataset.columns)
cols #[iq age income owner unidegree smoking(y๋ณ์)]
X = dataset[cols[:-1]] #๋ณ์ 5๊ฐ : iq, age, income, owner, unidegree
y = dataset[cols[-1]]
X.shape #(6, 5)
y.shape #(6, )
2. tree model ์์ฑ
model = DecisionTreeClassifier(random_state=123).fit(X, y)
dir(model)
tree์ ๊น์ด
model.get_depth() #3 -> ์ต์๋จ X์ ์ธํ๊ณ ๊ฐ์ง๊ฐ 3๊ฐ
model ์์ธก์น
y_pred = model.predict(X=X)
print(y_pred) #[1 0 0 1 1 0]
print(y)
acc = accuracy_score(y, y_pred)
print('accuracy =', acc) #accuracy = 1.0
3. tree model ์๊ฐํ
feature_names = cols[:-1] #['iq', 'age', 'income', 'owner', 'unidegree']
class_names = ['No', 'Yes'] #y๋ณ์์ class๋ช
export_graphviz
function sklearn.tree._export.export_graphviz
(decision_tree, out_file=None, *, max_depth=None, feature_names=None,
class_names=None, label='all', filled=False, leaves_parallel=False,
impurity=True, node_ids=False, proportion=False, rotate=False,
rounded=False, special_characters=False, precision=3)
graph = export_graphviz(model, out_file = "tree_graph.dot", #๊ฒฝ๋ก ์ค์ ์ ํ๋ฉด ๊ธฐ๋ณธ ๊ฒฝ๋ก์ ํ์ผ ์ ์ฅ
feature_names = feature_names,
class_names = class_names,
filled = False,
impurity = True,
rounded = False)
filled = False : ์์ ์ ์ฑ์ฐ๊ฒ ๋ค
impurity = True : GINI ๊ณ์
rounded = False : ๋ชจ์๋ผ ๋ผ์ด๋ฉ ์ ํ๊ฒ ๋ค
dot file read
file = open("tree_graph.dot")
dot_graph = file.read()
dot file ์๊ฐํ
Source(dot_graph)
decisionTree_parameter
Dicision Tree Hyper parameter
๊ณผ์ ํฉ ๊ด๋ จ parameter
์ค์๋ณ์ ์ ์ ๊ด๋ จ parameter
from sklearn.datasets import load_iris #dataset
from sklearn.tree import DecisionTreeClassifier #Tree model
from sklearn.model_selection import train_test_split #ํ๋ จ/๊ฒ์ ์
from sklearn.metrics import accuracy_score #tree model ์๊ฐํ
from sklearn.tree import export_graphviz #dot file ๋ด๋ณด๋ด๊ธฐ
from graphviz import Source #dot file ์๊ฐํ(pip install graphviz)
1.dataset load
iris = load_iris()
feature_names = iris.feature_names #listํ์์ผ๋ก x๋ณ์๋ช
์ถ์ถ
#['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
class_names = iris.target_names
X = iris.data
y = iris.target
2. train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size = 0.3, random_state = 123)
3. tree model ์์ฑ
criterion : defalut = "gini" ์ค์๋ณ์ ์ ์ ๊ธฐ์ค
splitter : defalut = "best" ๊ฐ ๋
ธ๋์ ๋ถํ ๋ฐฉ๋ฒ
max_depth : ์ต๋ tree๊น์ด(max_depth = 3) -> ๊ณผ์ ํฉ ์ ์ด ์ญํ
min_sample_split : defalut = 2 ๋ด๋ถ๋
ธ๋ ๋ถํ ์ ํ์ํ ์ต์ ์ํ์
min_samples_leaf : defalut = 1 ์ข
๋จ๋
ธ๋ ๋ถํ ์ ํ์ํ ์ต์ ์ํ์
tree = DecisionTreeClassifier (criterion = 'gini',
splitter = 'best',
max_depth = None,
min_samples_split = 2,
random_state = 123)
model = tree.fit(X = X_train, y = y_train)
์ค์ ๋ณ์ ๋๋
model.feature_importances_ #array([0.01364196, 0.01435996, 0.5461181 , 0.42587999])
tree ๊น์ด
model.get_depth() #5
4. model ํ๊ฐ
train_score = model.score(X = X_train, y = y_train)
test_score = model.score(X = X_test, y = y_test)
print('train score :', train_score) #train score : 1.0
print('test_socre :', test_score) #test_socre : 0.9555555555555556
5. tree model ์๊ฐํ
graph = export_graphviz(model, out_file = "tree_graph.dot", #๊ฒฝ๋ก ์ค์ ์ ํ๋ฉด ๊ธฐ๋ณธ ๊ฒฝ๋ก์ ํ์ผ ์ ์ฅ
feature_names = feature_names,
class_names = class_names,
filled = True,
impurity = True,
rounded = True)
dot file read
file = open("tree_graph.dot")
dot_graph = file.read()
dot file ์๊ฐํ
Source(dot_graph)
์๋ก์ด ๋ชจ๋ธ ๋ง๋ค์ด๋ณด๊ธฐ
์กฐ๊ฑด : criterion = 'entropy', max_depth = 3
์ค์๋ณ์ ์ ์ ๊ธฐ์ค ์ง๋๊ณ์ -> ์ํธ๋กํผ
๊ณผ์ ํฉ ๋ฐ์์ ๊ฐ์ ํ๊ณ , ๋ ๊ฐ ๊ฐ์ง์น๊ธฐ
1. new model
tree2 = DecisionTreeClassifier (criterion = 'entropy',
splitter = 'best',
max_depth = 3,
min_samples_split = 2,
random_state = 123)
model2 = tree2.fit(X = X_train, y = y_train)
2. new model ํ๊ฐ
train_score = model2.score(X = X_train, y = y_train)
test_score = model2.score(X = X_test, y = y_test)
print('train score :', train_score) #train score : 0.9809523809523809
print('test_socre :', test_score) #test_socre : 0.9333333333333333
* ๋ ์ ์์ ์ฐจ๊ฐ ์์์ง๋ฉด ๊ณผ์ ํฉ์ด ํด์๋์๋ค๊ณ ๋ด -> ์ค๋ถ๋ฅ๊ฐ ์ ์ด์ก๊ณ , ์ ํ๋๊ฐ ๋ฎ์์ง๋ค
3. new tree model ์๊ฐํ
graph = export_graphviz(model, out_file = "tree_graph.dot", #๊ฒฝ๋ก ์ค์ ์ ํ๋ฉด ๊ธฐ๋ณธ ๊ฒฝ๋ก์ ํ์ผ ์ ์ฅ
feature_names = feature_names,
class_names = class_names,
filled = True,
impurity = True,
rounded = True)
dot file read
file = open("tree_graph.dot")
dot_graph = file.read()
dot file ์๊ฐํ
Source(dot_graph)
[ํด์] modelํ๋ : ์ ํ๋๋ ๋จ์ด์ก์ผ๋, ๊ณผ์ ํฉ์ ํด๊ฒฐ
์ค์๋ณ์ ์ ์ ๊ธฐ์ค : ๊ฐ์ฅ ์ค์ํ ๋ณ์๋ petal length. ์ง๋๊ณ์์ ์ํธ๋กํผ๊ฐ ์ ์ฌํจ.