๊ฐœ์ธ๊ณต๋ถ€/Python

106. 파이널 프로젝트 (12)네이버 리뷰 감성분석

LEE_BOMB 2022. 1. 15. 21:55

0. 모듈 임포트

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import konlpy
import re

from konlpy.tag import Okt
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



1. 데이터 가져오기 & 결측치 제거

# Load the review dataset from CSV into a DataFrame.
# NOTE(review): the path literal looks like a mis-encoded placeholder — point it at the real CSV file.
df = pd.read_csv(r"๊ฒฝ๋กœ๋ช…\ํŒŒ์ผ๋ช….csv")

df.head()  # first rows of the frame
df.shape # (11450, 2) in the original run
df.isnull().sum()  # number of missing values per column

score       0
review    104
dtype: int64

df = df.dropna(axis=0) # drop every row that contains a missing value
df.shape # (11346, 2) in the original run
df.isnull().sum()  # re-check the per-column missing-value counts

score     0
review    0
dtype: int64



2. 데이터 전처리

def apply_regular_expression(review):
    """Return ``review`` with everything except Hangul and spaces removed.

    Args:
        review: The text to clean. Non-string values are converted with
            ``str()`` before the pattern is applied.

    Returns:
        A string made up only of spaces, Hangul compatibility jamo
        (U+3131-U+3163) and Hangul syllables (U+AC00-U+D7A3).
    """
    # The ranges are written as \u escapes so the pattern does not depend on
    # how this file is encoded; the literal-character form had been corrupted
    # into an invalid character range that re.compile() rejects.
    hangul = re.compile(r'[^ \u3131-\u3163\uac00-\ud7a3]')
    return hangul.sub('', str(review))

# Clean each review individually. Passing the whole Series to the function
# would clean str(Series) — the truncated repr — rather than the reviews.
df['review'].apply(apply_regular_expression)



3. 형태소 분석 (명사 단위 추출)

okt = Okt()  # KoNLPy Okt tagger, used here for noun extraction
# .iloc[0] selects the first remaining row by position; after dropna() the
# index label 0 is not guaranteed to exist, so df['review'][0] could fail.
nouns = okt.nouns(apply_regular_expression(df['review'].iloc[0]))
nouns  # original run: ['평점', '알바', '특', '내용', '얘기', '배우', '연기', '단말', '장창', '함']

# Build one corpus string. Join with a space so the last word of one review
# does not fuse with the first word of the next; astype(str) guards against
# any non-string cell.
corpus = " ".join(df['review'].astype(str))
corpus  # partial output shown in the original post


전체 형태소 분석

apply_regular_expression(corpus)  # cleaned corpus; the result is not stored
nouns = okt.nouns(apply_regular_expression(corpus))  # nouns extracted from the cleaned corpus
print(nouns)



4. 전처리

#๋นˆ๋„ ๋ณด๊ธฐ
counter = Counter(nouns)
counter.most_common(10)

#ํ•œ ๊ธ€์ž ๋ช…์‚ฌ ์ œ๊ฑฐ
available_counter = Counter({x: counter[x] for x in counter if len(x) > 1})
available_counter.most_common(10)

# Load the Korean stopword list (one word per line).
# header=None keeps the first word as data instead of consuming it as a column
# name, and selecting column 0 yields a flat list of strings. The previous
# `.values.tolist()` produced one-element lists, so `word not in stopwords`
# never matched any entry loaded from the file.
stopwords = pd.read_csv(
    "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
    header=None,
)[0].tolist()
stopwords[:10]

# Extra stopwords specific to movie reviews (literals restored from mis-encoded text)
movie_stopwords = ['영화', '이제', '때문', '정도', '부분', '이것', '이거', '하다', '내가', '의해', '저희', '따라']

# Append only the words that are not already present
for word in movie_stopwords:
    if word not in stopwords:
        stopwords.append(word)



5. bow vector 생성

_okt_tagger = None  # shared Okt instance, created on first use by text_cleaning


def text_cleaning(review):
    """Tokenize ``review`` into the nouns used as bag-of-words features.

    Steps: strip everything except Hangul and spaces, extract nouns with
    Okt, drop one-character nouns, then drop nouns found in the
    module-level ``stopwords`` collection.

    Args:
        review: The review text. Non-string values are converted with
            ``str()`` first, matching ``apply_regular_expression``.

    Returns:
        A list of noun strings.
    """
    global _okt_tagger
    if _okt_tagger is None:
        # Reuse one tagger instead of constructing a new Okt() per review.
        _okt_tagger = Okt()

    # Ranges as \u escapes: Hangul compatibility jamo (U+3131-U+3163) and
    # Hangul syllables (U+AC00-U+D7A3). The literal-character form had been
    # corrupted into an invalid character range.
    hangul = re.compile(r'[^ \u3131-\u3163\uac00-\ud7a3]')
    result = hangul.sub('', str(review))

    nouns = _okt_tagger.nouns(result)
    # Single pass: longer than one character and not a stopword
    return [x for x in nouns if len(x) > 1 and x not in stopwords]

# text_cleaning is already a callable tokenizer; no wrapper lambda is needed.
vect = CountVectorizer(tokenizer=text_cleaning)
bow_vect = vect.fit_transform(df['review'].tolist())

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vect, "get_feature_names_out"):
    word_list = vect.get_feature_names_out().tolist()
else:
    word_list = vect.get_feature_names()

# Sum the sparse matrix column-wise directly instead of materialising the
# full dense (reviews x vocabulary) array first; the result is a 1-D array.
count_list = np.asarray(bow_vect.sum(axis=0)).ravel()


word_list # vocabulary terms
count_list # total number of occurrences of each term across all reviews
bow_vect.toarray() # per-review term counts as a dense array
bow_vect.shape #(11346, 6489)

word_count_dict = dict(zip(word_list, count_list))  # pairs each term with its total count
word_count_dict # term -> total occurrence count



6. TF-IDF๋กœ ๋ณ€ํ™˜

# Convert the bag-of-words counts into TF-IDF weights.
tfidf_vectorizer = TfidfTransformer()
tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)

print(tf_idf_vect.shape) #(11346, 6489)
print(tf_idf_vect[0]) # TF-IDF weights of the first review -> only the non-zero entries are printed

# TF-IDF weights of every vocabulary term for the first review (zeros included)
print(tf_idf_vect[0].toarray().shape) #(1, 6489)
print(tf_idf_vect[0].toarray()) #[[0. 0. 0. ... 0. 0. 0.]]

# term -> column index mapping learned by the CountVectorizer
vect.vocabulary_ 

# Inverted mapping: column index -> term
invert_index_vectorizer = {v: k for k, v in vect.vocabulary_.items()}
print(str(invert_index_vectorizer)[:100]+'...')  # print only the first 100 characters



7. 감성분류 예측모델

df.sample(10)  # 10 randomly sampled rows
df['score'].hist()  # histogram of the score column

#Scores 1-6 are labelled negative (0); scores 7-10 are labelled positive (1)
def rating_to_label(score):
    """Map a review score to a binary sentiment label.

    Returns 1 when ``score`` is greater than 6, otherwise 0.
    """
    return 1 if score > 6 else 0
    
# Derive the binary target column from the score column
df['y'] = df['score'].apply(rating_to_label)

# Class balance of the derived labels
df["y"].value_counts()

1    8007
0    3339

ํ›ˆ๋ จ/๊ฒ€์ •set ๋‚˜๋ˆ„๊ธฐ

x = tf_idf_vect  # features: the TF-IDF matrix
y = df['y']  # target: the binary label column
# Hold out 30% of the rows for testing; random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=1)

x_train.shape, y_train.shape #((7942, 6489), (7942,))
x_test.shape, y_test.shape #((3404, 6489), (3404,))



8. 모델 학습

#fit in training set
lr = LogisticRegression(random_state = 0)
lr.fit(x_train, y_train)

#predict in test set
y_pred = lr.predict(x_test)

# Evaluate the classification results (trailing values are from the original run)
print('accuracy: %.2f' % accuracy_score(y_test, y_pred)) #0.77
print('precision: %.2f' % precision_score(y_test, y_pred)) #0.77
print('recall: %.2f' % recall_score(y_test, y_pred)) #0.95
print('F1: %.2f' % f1_score(y_test, y_pred)) #0.85

# Confusion matrix
from sklearn.metrics import confusion_matrix

confu = confusion_matrix(y_true = y_test, y_pred = y_pred)

# Draw the confusion matrix as an annotated heatmap
plt.figure(figsize=(4, 3))
sns.heatmap(confu, annot=True, annot_kws={'size':15}, cmap='OrRd', fmt='.10g')
plt.title('Confusion Matrix')
plt.show()

 

 

10. 시각화

# Bar chart of the fitted logistic-regression coefficients, one bar per coefficient
plt.figure(figsize=(10, 8))
plt.bar(range(len(lr.coef_[0])), lr.coef_[0])

 

 

 

 

 

 

참고

군산대 감성분석 https://github.com/park1200656/KnuSentiLex

KoNLPy https://konlpy-ko.readthedocs.io/ko/v0.4.3/morph/

๊ฐ์„ฑ๋ถ„์„ https://cyc1am3n.github.io/2018/11/10/classifying_korean_movie_review.html

https://hyemin-kim.github.io/2020/08/29/E-Python-TextMining-2/