72. Python TreeModel 연습문제
문1) load_breast_cancer 데이터 셋을 이용하여 다음과 같이 Decision Tree 모델을 생성하시오.
<조건1> 75:25비율 train/test 데이터 셋 구성
<조건2> y변수 : cancer.target, x변수 : cancer.data
<조건3> tree 최대 깊이 : 5
<조건4> decision tree 시각화 & 중요변수 확인
from sklearn import model_selection
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
# tree visualization
from sklearn.tree import export_graphviz
from graphviz import Source #pip install graphviz
데이터 셋 load
# Problem 1: fit a depth-limited decision tree on the breast-cancer data,
# report test accuracy, and export the fitted tree for visualisation.
cancer = load_breast_cancer()

# <Step 1> y variable: cancer.target, x variables: cancer.data
feature_names = cancer.feature_names
class_names = cancer.target_names
X = cancer.data
y = cancer.target

# <Step 2> Build the 75:25 train/test split (fixed seed for repeatability).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=123)

# <Step 3> Decision tree with maximum depth 5.
tree = DecisionTreeClassifier(max_depth=5, random_state=123)
model = tree.fit(X=X_train, y=y_train)

test_score = model.score(X=X_test, y=y_test)
print('accuracy =', test_score)  # recorded run: accuracy = 0.972027972027972

# <Step 4> Visualise the decision tree and check the important variable.
export_graphviz(model, out_file="tree_exam.dot",
                feature_names=feature_names,
                class_names=class_names)

# Read the exported .dot text back through a context manager so the file
# handle is released even if read() raises; the encoding is stated explicitly
# instead of relying on the platform default.
with open("tree_exam.dot", encoding="utf-8") as dot_file:
    graph = dot_file.read()

# Bare expression: the graph is only displayed when this is evaluated in an
# interactive session (notebook / IPython console).
Source(graph)  # important variable noted in the original run: worst_radius
문2) 당료병(diabetes.csv) 데이터 셋을 이용하여 다음과 같은 단계로 RandomForest 모델을 생성하시오.
<단계1> 데이터셋 로드 & 칼럼명 적용
<단계2> x, y 변수 선택 : x변수 : 1 ~ 8번째 칼럼, y변수 : 9번째 칼럼
<단계3> 500개의 트리를 이용하여 모델 생성
<단계4> 중요변수 시각화 : feature names 적용
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt #์ค์๋ณ์ ์๊ฐํ
단계1. 데이터셋 로드
# Problem 2: fit a 500-tree random forest on the diabetes CSV and plot the
# feature importances with the column names on the y axis.

# The CSV has no header row, so header=None makes pandas number the columns.
dia = pd.read_csv('C:/ITWILL/4_Python-2/data/diabetes.csv',
                  header=None)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
dia.info()

# Attach column names.
dia.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
               'Insulin', 'BMI', 'DiabetesPedigree', 'Age', 'Outcome']
dia.info()
# Recorded output:
#  0  Pregnancies       759 non-null  float64
#  1  Glucose           759 non-null  float64
#  2  BloodPressure     759 non-null  float64
#  3  SkinThickness     759 non-null  float64
#  4  Insulin           759 non-null  float64
#  5  BMI               759 non-null  float64
#  6  DiabetesPedigree  759 non-null  float64
#  7  Age               759 non-null  float64
#  8  Outcome           759 non-null  int64    <- y variable
# (Meaning of the columns: pregnancies, blood glucose, blood pressure,
#  skin thickness, insulin, body-mass index, diabetes pedigree, age, outcome)
type(dia)

# Step 2. Build the x, y variables.
cols = list(dia.columns)
cols
X = dia[cols[:-1]]  # DataFrame holding every column except the last
X.shape  # recorded: (759, 8)
y = dia['Outcome']  # Series holding the target column

# Step 3. Build the model with 500 trees.
model = RandomForestClassifier(n_estimators=500).fit(X, y)

# Step 4. Visualise the important variables.
model.feature_importances_
# Recorded output (no random_state is set, so values vary between runs):
# array([0.0786396 , 0.26605613, 0.0838773 , 0.0800118 , 0.08649011,
#        0.16625653, 0.12740268, 0.11126585])

x_names = cols[:-1]
size = len(x_names)  # 8
plt.barh(y=range(size), width=model.feature_importances_)
# y-axis ticks: the x variable names
plt.yticks(range(size), x_names)
plt.xlabel("feature_importances")
plt.show()
# Important variables in the recorded run: Glucose (blood glucose) > BMI
๋ฌธ3) iris dataset์ ์ด์ฉํ์ฌ ๋ค์๊ณผ ๊ฐ์ ๋จ๊ณ๋ก XGBoost model์ ์์ฑํ์์ค.
import pandas as pd #file read
from xgboost import XGBClassifier #model ์์ฑ
from xgboost import plot_importance #์ค์๋ณ์ ์๊ฐํ
from sklearn.model_selection import train_test_split #dataset split
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report # model ํ๊ฐ
단계1 : data set load
# Problem 3: multi-class XGBoost model on the iris CSV.
iris = pd.read_csv("C:/ITWILL/4_Python-2/data/iris.csv")

# Extract the variable names.
cols = list(iris.columns)
cols  # ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']
col_x = cols[:4]  # x names: ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
col_y = cols[-1]  # y name: 'Species'

X = iris[col_x]
iris[col_y].value_counts()
# Recorded output:
# setosa        50
# versicolor    50
# virginica     50

# Encode the string class labels as integer codes before fitting.
# The original run passed the strings directly and got:
#   UserWarning: The use of label encoder in XGBClassifier is deprecated and
#   will be removed in a future release.
# i.e. the automatic y encoding (use_label_encoder=True) is going away, and
# the remedy is to label-encode y up front.  sort=True orders the classes
# alphabetically, so code k corresponds to class_names[k].
y, class_names = pd.factorize(iris[col_y], sort=True)

# Step 2: build the train/test data sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)

# Step 3: build the model from the training data.
# eval_metric is given to the constructor: passing it to fit() is deprecated
# and rejected by current XGBoost releases.
model = XGBClassifier(objective="multi:softprob",
                      eval_metric='merror').fit(X=X_train, y=y_train)

# Step 4: predictions on the test data (integer class codes).
y_pred = model.predict(X=X_test)

# Step 5: check and visualise the important variables.
fscore = model.get_booster().get_score()
print('fscore =', fscore)
# Recorded output:
# fscore = {'Sepal.Length': 39.0, 'Sepal.Width': 71.0,
#           'Petal.Length': 115.0, 'Petal.Width': 104.0}
plot_importance(model)  # important variable noted in the original: 'Petal.Width'

# Step 6: evaluate the model: confusion matrix, accuracy, report.
con_mat = confusion_matrix(y_test, y_pred)
print(con_mat)

acc = accuracy_score(y_test, y_pred)
print(acc)

# target_names restores the species names in the report rows.
report = classification_report(y_test, y_pred,
                               target_names=list(class_names))
print(report)
๋ฌธ4) food๋ฅผ ๋์์ผ๋ก ๋ค์๊ณผ ๊ฐ์ด xgboost ๋ชจ๋ธ์ ์์ฑํ์์ค.
<์กฐ๊ฑด1> 6:4 ๋น์จ train/test set ์์ฑ
<์กฐ๊ฑด2> y๋ณ์ ; ํ์ _2๋ , x๋ณ์ ; ๋๋จธ์ง 20๊ฐ
<์กฐ๊ฑด3> ์ค์๋ณ์์ ๋ํ f1 score ์ถ๋ ฅ
<์กฐ๊ฑด4> ์ค์๋ณ์ ์๊ฐํ
<์กฐ๊ฑด5> accuracy์ model report ์ถ๋ ฅ
import pandas as pd #csv file read
from sklearn import model_selection, metrics #split, ํ๊ฐ ๋๊ตฌ
from xgboost import XGBClassifier #xgboost ๋ชจ๋ธ ์์ฑ
from xgboost import plot_importance #์ค์๋ณ์ ์๊ฐํ
중요변수 시각화
from matplotlib import font_manager, rc #ํ๊ธ ์ง์
# Problem 4: binary XGBoost model on the restaurant (food) data set.

# Use the Malgun Gothic font so the Korean column names render in the plot.
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

# Restaurant-industry data set.
food = pd.read_csv("C:/ITWILL/4_Python-2/data/food_dataset.csv",
                   encoding="utf-8", thousands=',')

# Drop rows with missing values.
food = food.dropna()
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
food.info()
# Recorded output:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 68796 entries, 0 to 70170
# Data columns (total 21 columns):

# <Condition 2> Select the X, y variables.
cols = list(food.columns)
cols  # 21 variables
# The target is the last column ('폐업_2년', the business-closure flag named in
# the exercise).  It is selected by position, so the non-ASCII column name
# does not have to be typed as a string literal.
y = food[cols[-1]]
X = food[cols[:-1]]  # remaining 20 variables
X.shape  # recorded: (68796, 20)

y.value_counts()
# Recorded output:
# 0    54284 : not closed
# 1    14512 : closed

# <Condition 1> Build the 6:4 train/test split.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=123)

# <Condition 3> Print the F score of the important variables.
model = XGBClassifier(objective="binary:logistic").fit(X_train, y_train)
score = model.get_booster().get_fscore()
print(score)
# Recorded output:
# {'소재지면적': 472.0, '위생업태명': 109.0, '주변': 341.0, '주변동종': 244.0,
#  '기간평균': 485.0, 'pop': 236.0, 'bank': 265.0, 'nonbank': 240.0,
#  'tax_sum': 210.0, '유동인구_주중_오전': 287.0, '유동인구_주중_오후': 304.0,
#  '유동인구_주말_오전': 299.0, '유동인구_주말_오후': 295.0,
#  'X1km_병원갯수': 127.0, 'X1km_초등학교갯수': 98.0, 'X3km_대학교갯수': 124.0,
#  'X1km_고등학교갯수': 98.0, 'X1km_영화관갯수': 90.0, 'X1km_지하철역갯수': 96.0}

# <Condition 4> Visualise the important variables.
plot_importance(model)

# <Condition 5> Print the accuracy and the model report.
y_pred = model.predict(X=X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print(acc)  # recorded: 0.7898542824957302

y_test.value_counts()
# Recorded output:
# 0    21730
# 1     5789

report = metrics.classification_report(y_test, y_pred)
print(report)
# Recorded output:
#               precision    recall  f1-score   support
#            0       0.80      0.98      0.88     21730
#            1       0.50      0.08      0.14      5789
#     accuracy                           0.79     27519
#    macro avg       0.65      0.53      0.51     27519
# weighted avg       0.74      0.79      0.73     27519
๋ฌธ5) wine dataset์ ์ด์ฉํ์ฌ ๋ค์๊ณผ ๊ฐ์ด ๋คํญ๋ถ๋ฅ ๋ชจ๋ธ์ ์์ฑํ์์ค.
<์กฐ๊ฑด1> tree model 200๊ฐ ํ์ต
<์กฐ๊ฑด2> tree model ํ์ต๊ณผ์ ์์ ์กฐ๊ธฐ ์ข ๋ฃ 100ํ ์ง์
<์กฐ๊ฑด3> model์ ๋ถ๋ฅ์ ํ๋์ ๋ฆฌํฌํธ ์ถ๋ ฅ
from xgboost import XGBClassifier #model
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine #๋คํญ๋ถ๋ฅ
from sklearn.metrics import accuracy_score, classification_report
1. XGBoost Hyper Parameter
1. dataset load
# Problem 5: multi-class XGBoost model on the wine data with early stopping.
wine = load_wine()
print(wine.feature_names)  # 13 features
print(wine.target_names)   # ['class_0' 'class_1' 'class_2']

# Reuse the Bunch that is already loaded instead of calling load_wine() again.
X, y = wine.data, wine.target

# 2. Build the train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3)

# 3. Build the multi-class model (softmax-style objective) with 200 trees.
# eval_metric and early_stopping_rounds are estimator parameters: passing them
# to fit() is deprecated since XGBoost 1.6 and rejected by 2.x, so they are
# set here (requires xgboost >= 1.6).
xgb = XGBClassifier(objective='multi:softprob',
                    n_estimators=200,
                    eval_metric='merror',
                    early_stopping_rounds=100)

# 4. Train with early stopping.
# NOTE(review): the test split doubles as the early-stopping evaluation set,
# as in the original exercise, so the reported accuracy is not measured on
# data the training procedure never saw.
eval_set = [(X_test, y_test)]  # evaluation set
model = xgb.fit(X_train, y_train,
                eval_set=eval_set)

# 5. Evaluate the model.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy =', acc)  # recorded: accuracy = 0.9814814814814815

report = classification_report(y_test, y_pred)
print(report)
# Recorded output:
#               precision    recall  f1-score   support
#            0       1.00      1.00      1.00        23
#            1       0.96      1.00      0.98        23
#            2       1.00      0.88      0.93         8
#     accuracy                           0.98        54
#    macro avg       0.99      0.96      0.97        54
# weighted avg       0.98      0.98      0.98        54