개인공부/Python
106. 파이널 프로젝트 (12)네이버 리뷰 감성분석
LEE_BOMB
2022. 1. 15. 21:55
0. 모듈 임포트
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import konlpy
import re
from konlpy.tag import Okt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
1. 데이터 가져오기 & 결측치 제거
# Load the review dataset into a DataFrame.
# The path below is a placeholder ("<directory name>\<file name>.csv");
# replace it with the real location of the CSV before running.
df = pd.read_csv(r"경로명\파일명.csv")
df.head()
df.shape  # (11450, 2) in the author's recorded run
df.isnull().sum()  # number of missing values per column
score 0
review 104
dtype: int64
# Discard every row that contains a missing value, then re-check the size
# and the per-column count of missing values.
df = df.dropna(axis="index")
df.shape  # (11346, 2) in the author's recorded run
df.isna().sum()
score 0
review 0
dtype: int64
2. 데이터 전처리
# Matches every character that is NOT a space, a Hangul compatibility jamo
# (ㄱ-ㅣ) or a complete Hangul syllable (가-힣). Compiled once at import time
# rather than on every call.
_NON_HANGUL_RE = re.compile(r'[^ ㄱ-ㅣ 가-힣]')


def apply_regular_expression(review):
    """Return ``review`` with everything except Hangul and spaces removed.

    Args:
        review: The text to clean. The value is passed through ``str()``
            first, so non-string input is cleaned via its string form.

    Returns:
        A string containing only the spaces, Hangul jamo and Hangul
        syllables of the input, in their original order.
    """
    return _NON_HANGUL_RE.sub('', str(review))
# Clean each review individually. Passing the whole Series to the function
# would clean str(Series) -- the truncated printed representation -- rather
# than the review texts themselves.
df['review'].apply(apply_regular_expression)
3. 형태소 분석 (명사 단위 추출)
okt = Okt()  # KoNLPy Okt tagger; used below to extract nouns

# Nouns of the first remaining review. .iloc[0] selects by position: rows were
# dropped earlier, so the index label 0 is not guaranteed to still exist.
nouns = okt.nouns(apply_regular_expression(df['review'].iloc[0]))
nouns  # list of noun strings found in that review

# Build one corpus string from all reviews. Joining with a space keeps the
# last word of one review from fusing with the first word of the next, which
# would distort the noun extraction; astype(str) guards against non-text cells.
corpus = " ".join(df['review'].astype(str))
corpus  # long string; a notebook shows only part of it
전체 형태소 분석
# Strip non-Hangul characters from the whole corpus once and reuse the result;
# the corpus is large, so cleaning it a second time only wastes work.
cleaned_corpus = apply_regular_expression(corpus)
nouns = okt.nouns(cleaned_corpus)
print(nouns)
4. 전처리
# Count how often each extracted noun occurs and show the ten most frequent.
counter = Counter(nouns)
counter.most_common(10)

# Drop single-character nouns, keeping the counts of the remaining ones.
available_counter = Counter(
    {noun: freq for noun, freq in counter.items() if len(noun) > 1}
)
available_counter.most_common(10)
# Load the shared Korean stop-word list (one word per line).
# header=None stops the first word from being consumed as a column name, and
# selecting column 0 gives a flat list of strings. The previous
# `.values.tolist()` produced a list of one-element lists, so a test such as
# `word in stopwords` could never match a plain string.
stopwords = pd.read_csv(
    "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
    header=None,
    dtype=str,
)[0].tolist()
stopwords[:10]

# Extra stop words specific to movie reviews.
# NOTE(review): these literals were reconstructed from mis-encoded text; a few
# (e.g. '이제', '이것', '하다') are best guesses -- confirm against the original post.
movie_stopwords = ['영화', '이제', '때문', '정도', '부분', '이것', '이거', '하다', '내가', '위해', '저희', '따라', '의해']
stopwords.extend(movie_stopwords)
5. bow vector 생성
# Matches any character that is neither a space, a Hangul jamo (ㄱ-ㅣ) nor a
# Hangul syllable (가-힣). Compiled once instead of on every call.
_TEXT_CLEANING_NON_HANGUL = re.compile(r'[^ ㄱ-ㅣ 가-힣]')
# One tagger shared by all calls; building a new Okt() for every review
# repeats the same set-up work for each row of the dataset.
_TEXT_CLEANING_TAGGER = Okt()


def text_cleaning(review):
    """Tokenise one review into its meaningful nouns.

    The review is reduced to Hangul and spaces, its nouns are extracted with
    the Okt tagger, and nouns that are a single character long or listed in
    the module-level ``stopwords`` collection are discarded.

    Args:
        review: The review text (a string).

    Returns:
        A list of noun strings, in the order the tagger produced them.
    """
    hangul_only = _TEXT_CLEANING_NON_HANGUL.sub('', review)
    return [
        noun
        for noun in _TEXT_CLEANING_TAGGER.nouns(hangul_only)
        if len(noun) > 1 and noun not in stopwords
    ]
# Build the bag-of-words matrix, tokenising every review with text_cleaning.
vect = CountVectorizer(tokenizer=text_cleaning)
bow_vect = vect.fit_transform(df['review'].tolist())

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older releases.
if hasattr(vect, "get_feature_names_out"):
    word_list = vect.get_feature_names_out().tolist()
else:
    word_list = vect.get_feature_names()

# Column sums taken directly on the sparse matrix, which avoids building the
# full dense (reviews x vocabulary) array just to add up its columns.
count_list = np.asarray(bow_vect.sum(axis=0)).ravel()

word_list  # vocabulary terms
count_list  # total occurrences of each term across all reviews
bow_vect.toarray()  # per-review occurrence counts of each term (dense view)
bow_vect.shape  # (11346, 6489) in the author's recorded run
word_count_dict = dict(zip(word_list, count_list))
word_count_dict  # term -> total occurrence count
6. TF-IDF๋ก ๋ณํ
# Re-weight the raw term counts with TF-IDF.
tfidf_vectorizer = TfidfTransformer()
tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)
print(tf_idf_vect.shape)  # (11346, 6489) in the author's recorded run

# TF-IDF weights of the first review: the sparse row prints only its
# non-zero entries, the dense form below includes the zeros as well.
first_review_weights = tf_idf_vect[0]
print(first_review_weights)
first_review_dense = first_review_weights.toarray()
print(first_review_dense.shape)  # (1, 6489)
print(first_review_dense)  # [[0. 0. 0. ... 0. 0. 0.]]

# Term -> column index mapping, and its inverse (column index -> term).
vect.vocabulary_
invert_index_vectorizer = dict(
    zip(vect.vocabulary_.values(), vect.vocabulary_.keys())
)
print(str(invert_index_vectorizer)[:100]+'...')
7. 감성분류 예측모델
# Look at ten randomly chosen rows, then plot a histogram of the ratings.
df.sample(n=10)
df['score'].hist()
# Ratings above 6 are labelled 1 and all other ratings 0 (per the original
# note: 1-6 is treated as negative, 7-10 as positive).
def rating_to_label(score):
    """Convert a numeric rating into a binary sentiment label.

    Args:
        score: The review rating (any value comparable with an int).

    Returns:
        1 if ``score`` is greater than 6, otherwise 0.
    """
    return 1 if score > 6 else 0
# Derive the binary target column from the ratings and check the class balance.
df['y'] = df['score'].apply(rating_to_label)
df['y'].value_counts()
1 8007
0 3339
훈련/검정 set 나누기
# Features are the TF-IDF matrix; targets are the binary labels.
x, y = tf_idf_vect, df['y']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)
x_train.shape, y_train.shape  # ((7942, 6489), (7942,))
x_test.shape, y_test.shape  # ((3404, 6489), (3404,))
8. 모델 학습
# Fit a logistic-regression classifier on the training split.
lr = LogisticRegression(random_state=0)
lr.fit(x_train, y_train)

# Predict labels for the held-out split.
y_pred = lr.predict(x_test)

# Report the classification metrics on the test split. The trailing comments
# are the values from the author's recorded run.
for template, metric in (
    ('accuracy: %.2f', accuracy_score),    # 0.77
    ('precision: %.2f', precision_score),  # 0.77
    ('recall: %.2f', recall_score),        # 0.95
    ('F1: %.2f', f1_score),                # 0.85
):
    print(template % metric(y_test, y_pred))
#ํผ๋ํ๋ ฌ
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted labels, drawn as an annotated heat map.
confu = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4, 3))
sns.heatmap(
    confu,
    annot=True,
    fmt='.10g',
    cmap='OrRd',
    annot_kws={'size': 15},
)
plt.title('Confusion Matrix')
plt.show()
10. 시각화
# Bar chart of the fitted coefficients (first row of lr.coef_), one bar per
# feature column.
plt.figure(figsize=(10, 8))
coefficients = lr.coef_[0]
plt.bar(range(len(coefficients)), coefficients)
참고
군산대 감성분석 https://github.com/park1200656/KnuSentiLex
KoNLPy https://konlpy-ko.readthedocs.io/ko/v0.4.3/morph/
감성분석 https://cyc1am3n.github.io/2018/11/10/classifying_korean_movie_review.html
https://hyemin-kim.github.io/2020/08/29/E-Python-TextMining-2/