DAY52. Python Classification (2) TF-IDF sparse Matrix (단어 추출)
Tfidf Vectorizer
단어생성기(TfidfVectorizer) : 문장 -> 단어 추출
TFiDF 단어 생성기 : TfidfVectorizer
1. 단어 생성기[word tokenizer] : 문장(sentences) -> 단어(word) 생성
2. 단어 사전[word dictionary] : (word, 고유수치)
3. 희소행렬[sparse matrix] : 단어 출현 비율에 의해서 가중치 적용 행렬
1) TF 가중치 : 단어출현빈도수
2) TFiDF 가중치 : 단어출현빈도수(TF) x 문서출현빈도수의 역수(iDF)
사용분야 : 문서분류기에서 사용될 텍스트 전처리
from sklearn.feature_extraction.text import TfidfVectorizer #class
문장(sentence) : 3개 문장
sentences = [
"Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.",
"Professor Plum has a green plant in his study.",
"Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."
]
print(sentences)
1. 단어 생성기[word tokenizer]
tfidf = TfidfVectorizer()
tfidf
2. 단어 사전
# Learn the vocabulary from the example sentences.
fit = tfidf.fit(sentences) # apply the sentences
voca = fit.vocabulary_
print(voca) # {'word': unique number} - unique numbers follow ascending alphabetical order
len(voca) # 31
3. 희소행렬(sparse matrix)
sp_max = tfidf.fit_transform(sentences)
print(sp_max)
(doc,word) TFiDF 가중치
(0, 3) 0.2205828828763741
(0, 16) 0.2205828828763741
(0, 25) 0.2205828828763741
(0, 17) 0.2205828828763741
scipy -> numpy 희소행렬 변경
sp_max_arr = sp_max.toarray()
print(sp_max_arr)
[[0. 0.22058288 0.22058288 0.22058288 0. 0.26055961
0. 0. 0. 0.16775897 0.22058288 0.22058288
0. 0. 0.44116577 0.22058288 0.22058288 0.22058288
0. 0. 0. 0. 0. 0.16775897
0.44116577 0.22058288 0. 0. 0. 0.
0.22058288]
[0. 0. 0. 0. 0. 0.26903992
0.45552418 0. 0.34643788 0.34643788 0. 0.
0. 0. 0. 0. 0. 0.
0. 0.34643788 0.34643788 0.34643788 0. 0.34643788
0. 0. 0. 0. 0. 0.
0. ]
[0.27054288 0. 0. 0. 0.27054288 0.15978698
0. 0.27054288 0.20575483 0. 0. 0.
0.27054288 0.27054288 0. 0. 0. 0.
0.27054288 0.20575483 0.20575483 0.20575483 0.27054288 0.
0. 0. 0.27054288 0.27054288 0.27054288 0.27054288
0. ]]
sp_max_arr.shape #(3, 31)
Tfidf sparseMatrix
<작업순서>
1. csv file 가져오기
2. texts, target 전처리
3. max features
4. sparse matrix
import pandas as pd # csv file read
from sklearn.feature_extraction.text import TfidfVectorizer # sparse matrix
1. csv file 가져오기
path = 'C:/ITWILL/4_Python-II/workspace/chap07_Classification/data'
spam_data = pd.read_csv(path + '/temp_spam_data.csv',header=None)
print(spam_data)
0 1
0 ham ์ฐ๋ฆฌ๋๋ผ ๋ํ๋ฏผ๊ตญ, ์ฐ๋ฆฌ๋๋ผ ๋ง์ธ
1 spam ๋น์๊ทธ๋ผ 500GRAM ์ ๋ ฅ ์ต๊ณ !
2 ham ๋๋ ๋ํ๋ฏผ๊ตญ ์ฌ๋
3 spam ๋ณดํ๋ฃ 15000์์ ํ์ ๋ณด์ฅ ๋ง๊ฐ ์๋ฐ
4 ham ๋๋ ํ๊ธธ๋
2. texts, target ์ ์ฒ๋ฆฌ
1) target ์ ์ฒ๋ฆฌ : dummy๋ณ์
target = spam_data[0]
target
list + for
target = [0 if t=='ham' else 1 for t in target]
target # [0, 1, 0, 1, 0]
import string # texts preprocessing
def text_prepro(texts):
    """Normalize raw sentences before TF-IDF vectorization.

    Each sentence is lower-cased, stripped of ASCII punctuation and ASCII
    digits, and has every run of whitespace collapsed to a single space
    (leading and trailing whitespace is dropped).

    Args:
        texts: Iterable of strings, e.g. a list or a pandas Series of
            sentences.

    Returns:
        A new list of cleaned strings, one per input sentence, in input
        order.
    """
    # One translation table deletes punctuation and digits in a single pass
    # instead of two character-by-character filtering passes.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    # split() without arguments splits on any whitespace run and discards
    # empty pieces, so re-joining with ' ' both trims and collapses spacing.
    return [' '.join(x.lower().translate(strip_table).split()) for x in texts]
2) texts 전처리 : 불용어(공백,특수문자,문장부호,숫자)
# Column 1 of the CSV holds the message text.
texts = spam_data[1]
texts # before preprocessing
texts = text_prepro(texts)
texts # after preprocessing
3. max features : 희소행렬에 사용될 단어 개수
# Fit once without a vocabulary cap to find out how many distinct words
# the preprocessed texts contain.
tfidf = TfidfVectorizer() # word tokenizer
fit = tfidf.fit(texts) # apply the texts
voca = fit.vocabulary_
print(voca)
len(voca) # 16
max_features = len(voca) # use every word
* max_features = 10 : 중요단어 10개만 선정하여 희소행렬
4. sparse matrix
tfidf = TfidfVectorizer(max_features=max_features)
sp_mat = tfidf.fit_transform(texts)
print(sp_mat)
numpy matrix
sp_mat_arr = sp_mat.toarray()
print(sp_mat_arr)
[[0. 0. 0.33939315 0. 0.42066906 0.
0. 0. 0. 0.84133812 0. 0.
0. 0. 0. 0. ]
[0.5 0. 0. 0. 0. 0.
0. 0.5 0. 0. 0. 0.
0.5 0.5 0. 0. ]
[0. 0.53177225 0.53177225 0. 0. 0.
0. 0. 0.659118 0. 0. 0.
0. 0. 0. 0. ]
[0. 0. 0. 0.40824829 0. 0.40824829
0.40824829 0. 0. 0. 0.40824829 0.40824829
0. 0. 0.40824829 0. ]
[0. 0.62791376 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0.77828292]]
Tfidf sparseMatrix2
<작업순서>
1. csv file 가져오기[수정]
2. texts, target 전처리
3. max features[수정]
4. sparse matrix
5. train/test split[추가]
6. file save[추가]
import pandas as pd # csv file read
from sklearn.feature_extraction.text import TfidfVectorizer # sparse matrix
1. csv file 가져오기[수정]
path = 'C:/ITWILL/4_Python-2/workspace/chap07_Classification/data'
spam_data = pd.read_csv(path + '/temp_spam_data2.csv',header=None)
print(spam_data)
0 1
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
2. texts, target ์ ์ฒ๋ฆฌ
1) target ์ ์ฒ๋ฆฌ : dummy๋ณ์
# Column 0 of the CSV holds the class label.
target = spam_data[0]
target
# list comprehension: encode 'ham' as 0 and every other label as 1
target = [0 if t=='ham' else 1 for t in target]
target # begins [0, 0, 1, 0, 0, ...] for the rows printed above (ham, ham, spam, ham, ham)
import string # texts preprocessing
def text_prepro(texts):
    """Normalize raw sentences before TF-IDF vectorization.

    Each sentence is lower-cased, stripped of ASCII punctuation and ASCII
    digits, and has every run of whitespace collapsed to a single space
    (leading and trailing whitespace is dropped).

    Args:
        texts: Iterable of strings, e.g. a list or a pandas Series of
            sentences.

    Returns:
        A new list of cleaned strings, one per input sentence, in input
        order.
    """
    # One translation table deletes punctuation and digits in a single pass
    # instead of two character-by-character filtering passes.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    # split() without arguments splits on any whitespace run and discards
    # empty pieces, so re-joining with ' ' both trims and collapses spacing.
    return [' '.join(x.lower().translate(strip_table).split()) for x in texts]
2) texts 전처리 : 불용어(공백,특수문자,문장부호,숫자)
# Column 1 of the CSV holds the message text.
texts = spam_data[1]
texts # before preprocessing
texts = text_prepro(texts)
texts # after preprocessing
3. max features : 희소행렬에 사용될 단어 개수
# Fit once without a vocabulary cap to find out how many distinct words
# the preprocessed texts contain.
tfidf = TfidfVectorizer() # word tokenizer
fit = tfidf.fit(texts) # apply the texts
voca = fit.vocabulary_
print(voca)
len(voca) # 8603
max_features = 5000 # build the sparse matrix from 5,000 important words only
4. sparse matrix
tfidf = TfidfVectorizer(max_features=max_features)
sp_mat = tfidf.fit_transform(texts)
print(sp_mat)
numpy matrix
sp_mat_arr = sp_mat.toarray()
print(sp_mat_arr)
[[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
sp_mat_arr.shape #(5574, 5000)
5. train/test split[추가]
from sklearn.model_selection import train_test_split
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(
sp_mat_arr, target, test_size=0.3)
X_train.shape #(3901, 5000)
X_test.shape #(1673, 5000)
list -> numpy
y_train = np.array(y_train)
y_test = np.array(y_test)
y_train.shape #(3901,)
y_test.shape #(1673,)
6. file save[추가] : np.save()
# Bundle the four splits so a single file round-trips them together.
# The splits have different shapes, so they go into an explicit 1-D object
# array: handing np.save the bare tuple makes NumPy try to build a ragged
# array, which raises ValueError on recent NumPy releases.
spam_train_test = np.empty(4, dtype=object)
for i, part in enumerate((X_train, X_test, y_train, y_test)):
    spam_train_test[i] = part
# usage: np.save("file", object)
np.save(path + "/spam_train_test.npy", spam_train_test)
# usage: np.load("file")
# Object arrays are pickled on save, so loading needs allow_pickle=True.
# NOTE: unpickling can execute code - only load .npy files you wrote yourself.
X_train,X_test,y_train,y_test = np.load(path + "/spam_train_test.npy",allow_pickle=True)
X_train.shape #(3901, 5000)
ham spam classifier
문서분류기
NB vs SVM
NB : ์ฐ์์๋ ๋น ๋ฆ
SVM : 정확도 높음
import numpy as np # np.load()
from sklearn.naive_bayes import MultinomialNB # nb model
from sklearn.svm import SVC # svm model
from sklearn.metrics import accuracy_score, confusion_matrix # evaluation
import time # time measurement
1. dataset load
path = 'C:/ITWILL/4_Python-II/workspace/chap07_Classification/data'
X_train,X_test,y_train,y_test = np.load(path + "/spam_train_test.npy",allow_pickle=True)
input data : X -> 희소행렬
X_train.shape #(3901, 5000)
X_test.shape #(1673, 5000)
output data : y - dummy(0 or 1)
y_train[:10] #array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
NB model
# Time only the fit call. perf_counter is a monotonic high-resolution clock,
# so the measurement is not skewed by wall-clock adjustments.
start = time.perf_counter()
nb_model = MultinomialNB().fit(X=X_train, y=y_train)
end = time.perf_counter() - start  # elapsed seconds (variable name kept)
# The Korean labels below were mis-encoded; restored to readable text.
print('실행시간 : ', end)
# Score the fitted model on the test split.
y_pred = nb_model.predict(X = X_test)
y_true = y_test
acc = accuracy_score(y_true, y_pred)
print('NB model 분류정확도 :', acc)
SVM model
# Time only the fit call. perf_counter is a monotonic high-resolution clock,
# so the measurement is not skewed by wall-clock adjustments.
start = time.perf_counter()
svm_model = SVC(kernel='linear').fit(X=X_train, y=y_train)
end = time.perf_counter() - start  # elapsed seconds (variable name kept)
# The Korean labels below were mis-encoded; restored to readable text.
print('실행시간 : ', end)
# Score the fitted model on the test split; y_true / y_pred are left in
# scope for any evaluation code that follows.
y_pred = svm_model.predict(X = X_test)
y_true = y_test
acc = accuracy_score(y_true, y_pred)
print('SVM model 분류정확도 :', acc)
실행시간 : 0.10205936431884766
NB model 분류정확도 : 0.9575612671846981
실행시간 : 7.592423677444458
SVM model 분류정확도 : 0.9760908547519426
불균형 비율
con_mat = confusion_matrix(y_true, y_pred)
print(con_mat)
0 1
0 [[1437 1] = 1438
1 [ 39 196]] = 235
정확률 : 예측치 yes(1) -> yes(1)
p = con_mat[1,1] / con_mat[:,1].sum() #0.9949238578680203
재현율=민감도 : 관측치 YES(1) -> YES(1)
r = con_mat[1,1] / con_mat[1,:].sum() #0.8340425531914893
f1 score : 조화평균
f1_score = 2 * ((p*r) / (p+r))
print('f1 score =', f1_score) #f1 score = 0.9074074074074074