๊ฐœ์ธ๊ณต๋ถ€/Python

72. Python TreeModel Practice Problems

LEE_BOMB 2021. 12. 7. 20:11
๋ฌธ1) load_breast_cancer ๋ฐ์ดํ„ฐ ์…‹์„ ์ด์šฉํ•˜์—ฌ ๋‹ค์Œ๊ณผ ๊ฐ™์ด Decision Tree ๋ชจ๋ธ์„ ์ƒ์„ฑํ•˜์‹œ์˜ค.
<์กฐ๊ฑด1> 75:25๋น„์œจ train/test ๋ฐ์ดํ„ฐ ์…‹ ๊ตฌ์„ฑ 
<์กฐ๊ฑด2> y๋ณ€์ˆ˜ : cancer.target, x๋ณ€์ˆ˜ : cancer.data
<์กฐ๊ฑด3> tree ์ตœ๋Œ€ ๊นŠ์ด : 5 
<์กฐ๊ฑด4> decision tree ์‹œ๊ฐํ™” & ์ค‘์š”๋ณ€์ˆ˜ ํ™•์ธ


from sklearn import model_selection
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
# tree visualization
from sklearn.tree import export_graphviz
from graphviz import Source #pip install graphviz


๋ฐ์ดํ„ฐ ์…‹ load 

cancer = load_breast_cancer()


<๋‹จ๊ณ„1> y๋ณ€์ˆ˜ : cancer.target, x๋ณ€์ˆ˜ : cancer.data 

feature_names = cancer.feature_names
class_names = cancer.target_names
X = cancer.data
y = cancer.target


<๋‹จ๊ณ„2> 75:25๋น„์œจ train/test ๋ฐ์ดํ„ฐ ์…‹ ๊ตฌ์„ฑ

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=123)


<๋‹จ๊ณ„3> tree ์ตœ๋Œ€ ๊นŠ์ด : 5

tree = DecisionTreeClassifier(max_depth=5, random_state=123)
model = tree.fit(X=X_train, y=y_train)

test_score = model.score(X=X_test, y=y_test)
print('accuracy =', test_score) #accuracy = 0.972027972027972
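Since the exercise fixes random_state=123, the 0.972 figure is reproducible, but a single 75:25 split can still flatter or punish the model. As an optional sketch (not part of the exercise), 5-fold cross-validation gives a steadier accuracy estimate:

from sklearn.model_selection import cross_val_score

# optional sanity check: 5-fold CV accuracy over the full dataset
cv_scores = cross_val_score(
    DecisionTreeClassifier(max_depth=5, random_state=123), X, y,
    cv=5, scoring='accuracy')
print('CV accuracy = %.3f (+/- %.3f)' % (cv_scores.mean(), cv_scores.std()))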


<๋‹จ๊ณ„4> decision tree ์‹œ๊ฐํ™” & ์ค‘์š”๋ณ€์ˆ˜ ํ™•์ธ 

export_graphviz(model, out_file="tree_exam.dot",
                    feature_names=feature_names, 
                    class_names=class_names)

with open("tree_exam.dot") as file: # read the exported dot source
    graph = file.read()

Source(graph) # renders the tree; most important variable (root split): worst radius
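The same ranking can also be read off numerically instead of from the graphviz rendering. A minimal sketch, assuming numpy is installed:

import numpy as np

# rank features by impurity-based importance, largest first
order = np.argsort(model.feature_importances_)[::-1]
for i in order[:5]:
    print(feature_names[i], round(model.feature_importances_[i], 3))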


๋ฌธ2) ๋‹น๋ฃŒ๋ณ‘(diabetes.csv) ๋ฐ์ดํ„ฐ ์…‹์„ ์ด์šฉํ•˜์—ฌ ๋‹ค์Œ๊ณผ ๊ฐ™์€ ๋‹จ๊ณ„๋กœ RandomForest ๋ชจ๋ธ์„ ์ƒ์„ฑํ•˜์‹œ์˜ค.

<๋‹จ๊ณ„1> ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ & ์นผ๋Ÿผ๋ช… ์ ์šฉ 
<๋‹จ๊ณ„2> x, y ๋ณ€์ˆ˜ ์„ ํƒ : x๋ณ€์ˆ˜ : 1 ~ 8๋ฒˆ์งธ ์นผ๋Ÿผ, y๋ณ€์ˆ˜ : 9๋ฒˆ์งธ ์นผ๋Ÿผ
<๋‹จ๊ณ„3> 500๊ฐœ์˜ ํŠธ๋ฆฌ๋ฅผ ์ด์šฉํ•˜์—ฌ ๋ชจ๋ธ ์ƒ์„ฑ   
<๋‹จ๊ณ„4> ์ค‘์š”๋ณ€์ˆ˜ ์‹œ๊ฐํ™” : feature names ์ ์šฉ 


from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt # feature importance plot


๋‹จ๊ณ„1. ํ…Œ์ดํ„ฐ์…‹ ๋กœ๋“œ  

dia = pd.read_csv('C:/ITWILL/4_Python-2/data/diabetes.csv', 
                  header=None) # no header row
print(dia.info())


์นผ๋Ÿผ๋ช… ์ถ”๊ฐ€ 

dia.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness',
               'Insulin','BMI','DiabetesPedigree','Age','Outcome']
print(dia.info())

 #   Column            Non-Null Count  Dtype
 0   Pregnancies       759 non-null    float64
 1   Glucose           759 non-null    float64
 2   BloodPressure     759 non-null    float64
 3   SkinThickness     759 non-null    float64
 4   Insulin           759 non-null    float64
 5   BMI               759 non-null    float64
 6   DiabetesPedigree  759 non-null    float64
 7   Age               759 non-null    float64
 8   Outcome           759 non-null    int64   <- y variable

type(dia) # pandas.core.frame.DataFrame


๋‹จ๊ณ„2. x,y ๋ณ€์ˆ˜ ์ƒ์„ฑ 

cols = list(dia.columns)
cols

X = dia[cols[:-1]] # list of columns -> DataFrame
X.shape  #(759, 8)

y = dia['Outcome'] # single column -> Series


๋‹จ๊ณ„3. model ์ƒ์„ฑ

model = RandomForestClassifier(n_estimators=500).fit(X, y)
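Note that this model is fit on the full dataset with no hold-out split, so model.score(X, y) would be overly optimistic. One hedged alternative is the out-of-bag estimate that bagging gives for free (random_state=123 below is an added assumption, not from the exercise):

# refit with OOB scoring enabled (bootstrap=True is the default)
model_oob = RandomForestClassifier(n_estimators=500, oob_score=True,
                                   random_state=123).fit(X, y)
print('OOB accuracy =', model_oob.oob_score_)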


๋‹จ๊ณ„4. ์ค‘์š”๋ณ€์ˆ˜ ์‹œ๊ฐํ™” 

model.feature_importances_
# array([0.0786396 , 0.26605613, 0.0838773 , 0.0800118 , 0.08649011,
#       0.16625653, 0.12740268, 0.11126585])

x_names = cols[:-1]
size = len(x_names) #8

plt.barh(y=range(size), width=model.feature_importances_)
#y์ถ• ๋ˆˆ๊ธˆ : x๋ณ€์ˆ˜ ์ด๋ฆ„  
plt.yticks(range(size), x_names)
plt.xlabel("feature_importances")
plt.show()

#์ค‘์š”๋ณ€์ˆ˜ : Glucose(ํ˜ˆ๋‹น) > BMI(๋น„๋งŒ๋„์ง€์ˆ˜)


๋ฌธ3) iris dataset์„ ์ด์šฉํ•˜์—ฌ ๋‹ค์Œ๊ณผ ๊ฐ™์€ ๋‹จ๊ณ„๋กœ XGBoost model์„ ์ƒ์„ฑํ•˜์‹œ์˜ค.


import pandas as pd # file read
from xgboost import XGBClassifier # build the model
from xgboost import plot_importance # feature importance plot
from sklearn.model_selection import train_test_split # dataset split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report # model evaluation


๋‹จ๊ณ„1 : data set load 

iris = pd.read_csv("C:/ITWILL/4_Python-2/data/iris.csv")


๋ณ€์ˆ˜๋ช… ์ถ”์ถœ 

cols=list(iris.columns)
cols #['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']


col_x = cols[:4] # x variable names: ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']
col_y = cols[-1] # y variable name: 'Species'

X = iris[col_x]
y = iris[col_y] 
y.value_counts()

setosa        50
versicolor    50
virginica     50


๋‹จ๊ณ„2 : ํ›ˆ๋ จ/๊ฒ€์ • ๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)
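This split is unseeded, so the accuracy in step 6 changes from run to run. A hedged variant that fixes the seed and stratifies by class (both added here, not in the original):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123, stratify=y)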

๋‹จ๊ณ„3 : model ์ƒ์„ฑ : train data ์ด์šฉ

model = XGBClassifier(objective="multi:softprob").fit(
    X=X_train, y=y_train, eval_metric='merror')

UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release.
- The use_label_encoder=True default automatically label-encodes the y variable.
- It is scheduled for removal in a future release, hence the warning.
- To silence the warning: 1) label-encode the y variable yourself, then 2) pass use_label_encoder=False (a minimal sketch follows).

๋‹จ๊ณ„4 :์˜ˆ์ธก์น˜ ์ƒ์„ฑ : test data ์ด์šฉ  

y_pred = model.predict(X = X_test)

๋‹จ๊ณ„5 : ์ค‘์š”๋ณ€์ˆ˜ ํ™•์ธ & ์‹œ๊ฐํ™”  

fscore = model.get_booster().get_score()
print('fscore =', fscore)

fscore = {'Sepal.Length': 39.0, 'Sepal.Width': 71.0, 'Petal.Length': 115.0, 'Petal.Width': 104.0}

plot_importance(model) # top variable by split count in the fscore above: 'Petal.Length'
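plot_importance() ranks by importance_type='weight' (how often a feature is used to split) by default, which is what the fscore dict above reflects. Gain-based ranking can put a different feature on top; a hedged comparison sketch:

# 'gain' = average loss reduction per split; the order may differ from 'weight'
gain = model.get_booster().get_score(importance_type='gain')
print('gain =', gain)
plot_importance(model, importance_type='gain')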



๋‹จ๊ณ„6 : model ํ‰๊ฐ€ : confusion matrix, accuracy, report

con_mat = confusion_matrix(y_test, y_pred)
print(con_mat)

acc = accuracy_score(y_test, y_pred)
print(acc) 

report = classification_report(y_test, y_pred)
print(report)


๋ฌธ4) food๋ฅผ ๋Œ€์ƒ์œผ๋กœ ๋‹ค์Œ๊ณผ ๊ฐ™์ด xgboost ๋ชจ๋ธ์„ ์ƒ์„ฑํ•˜์‹œ์˜ค.

<์กฐ๊ฑด1> 6:4 ๋น„์œจ train/test set ์ƒ์„ฑ 
<์กฐ๊ฑด2> y๋ณ€์ˆ˜ ; ํ์—…_2๋…„, x๋ณ€์ˆ˜ ; ๋‚˜๋จธ์ง€ 20๊ฐœ 
<์กฐ๊ฑด3> ์ค‘์š”๋ณ€์ˆ˜์— ๋Œ€ํ•œ  f1 score ์ถœ๋ ฅ
<์กฐ๊ฑด4> ์ค‘์š”๋ณ€์ˆ˜ ์‹œ๊ฐํ™”  
<์กฐ๊ฑด5> accuracy์™€ model report ์ถœ๋ ฅ 


import pandas as pd # csv file read
from sklearn import model_selection, metrics # split & evaluation tools
from xgboost import XGBClassifier # build the xgboost model
from xgboost import plot_importance # feature importance plot


์ค‘์š”๋ณ€์ˆ˜ ์‹œ๊ฐํ™” 

from matplotlib import font_manager, rc #ํ•œ๊ธ€ ์ง€์›
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)


์™ธ์‹์—…์ข… ๊ด€๋ จ data set

food = pd.read_csv("C:/ITWILL/4_Python-2/data/food_dataset.csv",
                   encoding="utf-8", thousands=',')


๊ฒฐ์ธก์น˜ ์ œ๊ฑฐ

food=food.dropna()  
print(food.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68796 entries, 0 to 70170
Data columns (total 21 columns):

food['ํ์—…_2๋…„'].value_counts()

0    54284 : not closed
1    14512 : closed

<์กฐ๊ฑด2> X, y๋ณ€์ˆ˜ ์„ ํƒ 

cols = list(food.columns)
cols # 21 variables
y = food[cols[-1]] # food['폐업_2년']
X = food[cols[:-1]] # 20 variables
X.shape # (68796, 20)


<์กฐ๊ฑด1> 6:4 ๋น„์œจ train/test set ์ƒ์„ฑ 

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=123)


<์กฐ๊ฑด3> ์ค‘์š”๋ณ€์ˆ˜์— ๋Œ€ํ•œ f1 score ์ถœ๋ ฅ

model = XGBClassifier(objective = "binary:logistic").fit(X_train, y_train)

score = model.get_booster().get_fscore() 
print(score)

{'์†Œ์žฌ์ง€๋ฉด์ ': 472.0, '์œ„์ƒ์—…ํƒœ๋ช…': 109.0, '์ฃผ๋ณ€': 341.0, '์ฃผ๋ณ€๋™์ข…': 244.0, '๊ธฐ๊ฐ„ํ‰๊ท ': 485.0, 'pop': 236.0, 'bank': 265.0, 'nonbank': 240.0, 'tax_sum': 210.0, '์œ ๋™์ธ๊ตฌ_์ฃผ์ค‘_์˜ค์ „': 287.0, '์œ ๋™์ธ๊ตฌ_์ฃผ์ค‘_์˜คํ›„': 304.0, '์œ ๋™์ธ๊ตฌ_์ฃผ๋ง_์˜ค์ „': 299.0, '์œ ๋™์ธ๊ตฌ_์ฃผ๋ง_์˜คํ›„': 295.0, 'X1km_๋ณ‘์›๊ฐฏ์ˆ˜': 127.0, 'X1km_์ดˆ๋“ฑํ•™๊ต๊ฐฏ์ˆ˜': 98.0, 'X3km_๋Œ€ํ•™๊ต๊ฐฏ์ˆ˜': 124.0, 'X1km_๊ณ ๋“ฑํ•™๊ต๊ฐฏ์ˆ˜': 98.0, 'X1km_์˜ํ™”๊ด€๊ฐฏ์ˆ˜': 90.0, 'X1km_์ง€ํ•˜์ฒ ์—ญ๊ฐฏ์ˆ˜': 96.0}

<์กฐ๊ฑด4> ์ค‘์š”๋ณ€์ˆ˜ ์‹œ๊ฐํ™”

plot_importance(model)


<์กฐ๊ฑด5> accuracy์™€ model report ์ถœ๋ ฅ 

y_pred = model.predict(X = X_test)

acc = metrics.accuracy_score(y_test, y_pred)
print(acc)  #0.7898542824957302  

y_test.value_counts()

0    21730
1     5789

report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.80      0.98      0.88     21730
           1       0.50      0.08      0.14      5789

    accuracy                           0.79     27519
   macro avg       0.65      0.53      0.51     27519
weighted avg       0.74      0.79      0.73     27519
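The report makes the imbalance problem explicit: recall for the closed class (1) is only 0.08, so the model predicts "open" almost everywhere. A common remedy, sketched here as an assumption rather than part of the exercise, is to weight the positive class with XGBoost's scale_pos_weight:

# weight positives by the negative/positive ratio (about 3.7 for this data)
ratio = (y_train == 0).sum() / (y_train == 1).sum()
model2 = XGBClassifier(objective="binary:logistic",
                       scale_pos_weight=ratio).fit(X_train, y_train)
print(metrics.classification_report(y_test, model2.predict(X_test)))

This usually lifts minority-class recall at some cost to overall accuracy.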


๋ฌธ5) wine dataset์„ ์ด์šฉํ•˜์—ฌ ๋‹ค์Œ๊ณผ ๊ฐ™์ด ๋‹คํ•ญ๋ถ„๋ฅ˜ ๋ชจ๋ธ์„ ์ƒ์„ฑํ•˜์‹œ์˜ค. 
<์กฐ๊ฑด1> tree model 200๊ฐœ ํ•™์Šต
<์กฐ๊ฑด2> tree model ํ•™์Šต๊ณผ์ •์—์„œ ์กฐ๊ธฐ ์ข…๋ฃŒ 100ํšŒ ์ง€์ •
<์กฐ๊ฑด3> model์˜ ๋ถ„๋ฅ˜์ •ํ™•๋„์™€ ๋ฆฌํฌํŠธ ์ถœ๋ ฅ   

 

from xgboost import XGBClassifier # model
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine # multiclass dataset
from sklearn.metrics import accuracy_score, classification_report


XGBoost hyperparameter practice

1. Load the dataset

wine = load_wine()
print(wine.feature_names) # 13 features
print(wine.target_names) # ['class_0' 'class_1' 'class_2']

X, y = load_wine(return_X_y=True)

2. Create train/test sets

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3)



3. Build the model: multiclass classification

xgb = XGBClassifier(objective='multi:softprob',
                    n_estimators=200) # softmax-based multiclass objective



4. Early stopping during model training

eval_set = [(X_test, y_test)] # evaluation set

model = xgb.fit(X_train, y_train, 
                eval_set = eval_set,
                eval_metric='merror',
                early_stopping_rounds=100)
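With early_stopping_rounds=100, training stops once merror on eval_set has not improved for 100 consecutive rounds; since only 200 trees are grown here, it may or may not trigger. When fit() is given early_stopping_rounds, the sklearn wrapper records the best round, which can be inspected:

# populated because early_stopping_rounds was passed to fit()
print('best iteration =', model.best_iteration)
print('best score (merror) =', model.best_score)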



5. Evaluate the model

y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('accuracy =', acc) #accuracy = 0.9814814814814815

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       0.96      1.00      0.98        23
           2       1.00      0.88      0.93         8

    accuracy                           0.98        54
   macro avg       0.99      0.96      0.97        54
weighted avg       0.98      0.98      0.98        54