DAY59. Python TextMining Cosine similarity (์ฝ์ฌ์ธ ์ ์ฌ๋)
cosine_similarity
<์์
์ ์ฐจ>
1. ๋์ ๋ฌธ์(์์ฐ์ด) -> ํฌ์ํ๋ ฌ(DTM:๋ฌธ์๋จ์ดํ๋ ฌ)
2. ์ฝ์ฌ์ธ ์ ์ฌ๋ ์ ์ฉ
-> ๋ฌธ์๋ฅผ ๊ตฌ์ฑํ๋ ๋จ์ด๋ค ๊ฐ์ ์ ์ฌ๋ ์ธก์ (-1 ~ +1)
from sklearn.feature_extraction.text import TfidfVectorizer #class. ํฌ์ํ๋ ฌ(sparse matrix)
from sklearn.metrics.pairwise import cosine_similarity #function. ์ฝ์ฌ์ธ ์ ์ฌ๋
๋ฌธ์ฅ(sentence) : 3๊ฐ ๋ฌธ์ฅ(์์ฐ์ด)
# Corpus: three natural-language sentences to vectorize and compare.
sentences = [
"Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.",
"Professor Plum has a green plant in his study.",
"Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."
]
print(sentences)
len(sentences) #3 sentences
1. ๋์ ๋ฌธ์(์์ฐ์ด) -> ํฌ์ํ๋ ฌ(DTM:๋ฌธ์๋จ์ดํ๋ ฌ)
# Build the document-term matrix (DTM) from the corpus with TF-IDF weights.
tfidf = TfidfVectorizer() #1) vectorizer (word generator)
๋จ์ด ๋ณด๊ธฐ
# Fit on the corpus, then inspect the learned word -> column-index mapping.
fit = tfidf.fit(sentences) #apply to the sentences
voca = fit.vocabulary_
print(voca)
#2) sparse matrix (DTM: document-term matrix)
sp_mat = tfidf.fit_transform(sentences) #fit + transform in one step
print(sp_mat)
(ํ,์ด)
(0, 3) 0.2205828828763741
scipy -> numpy
# Densify: scipy sparse -> numpy ndarray, the input cosine_similarity expects.
sp_mat_arr = sp_mat.toarray()
print(sp_mat_arr)
sp_mat_arr.shape #(3, 31) -> (n documents, n words)
2. ์ฝ์ฌ์ธ ์ ์ฌ๋ ์ ์ฉ
1) ๊ฒ์์ฟผ๋ฆฌ : ๊ฒ์ํ ๋ฌธ์
# Search query as a one-document list (transform expects an iterable of docs).
query = ['green plant in his study']
2) ํฌ์ํ๋ ฌ(DTM)
# NOTE: transform() (not fit_transform) so the query reuses the fitted vocabulary.
query_sp_mat = tfidf.transform(query) #caution: use transform(), not fit_transform()
numpy ํ๋ ฌ
query_sp_mat_arr = query_sp_mat.toarray()
3) ์ฝ์ฌ์ธ ์ ์ฌ๋ ๊ณ์ฐ
# Cosine similarity of the single query row against each document row.
sim = cosine_similarity(query_sp_mat_arr, sp_mat_arr)
print(sim) #[[0.25069697 0.74327606 0.24964024]]
sim.shape #(1, 3)
2d -> 1d
sim1d = sim.reshape(3)
sim1d # [0.25069697, 0.74327606, 0.24964024]
4) ๋ด๋ฆผ์ฐจ์ ์ ๋ ฌ(์์ธ ๊ธฐ์ค)
# Indices sorted by similarity, descending: most similar document first.
sim_idx = sim1d.argsort()[::-1] #[1, 0, 2]
5) query์ ๊ฐ์ฅ ์ ์ฌ๋๊ฐ ๋์ ์์ผ๋ก ๋ฌธ์ฅ ๊ฒ์
for idx in sim_idx :
print(f'์ ์ฌ๋ : {sim1d[idx]}, ๋ฌธ์ฅ : {sentences[idx]}')
movie recomm
์ ์ฌ ๋ฌธ์ ๊ฒ์ ์์คํ
์ํ ๊ฒ์(์ถ์ฒ) ์์คํ
: ์ฝ์ฌ์ธ ์ ์ฌ๋ ๊ธฐ๋ฐ
ex) ์ํ ํค์๋ -> ์ํ ํ๊ธฐ ํ
์คํธ์์ ๊ด๋ จ ์ํ ์ค๊ฑฐ๋ฆฌ ์ ๊ณต
import pandas as pd #csv file read
from sklearn.feature_extraction.text import TfidfVectorizer #class. ํฌ์ํ๋ ฌ(sparse matrix)
from sklearn.metrics.pairwise import cosine_similarity #function. ์ฝ์ฌ์ธ ์ ์ฌ๋
1. dataset load
# 1. Load the movie-review dataset (columns: reviews, title, label).
data = pd.read_csv(r'C:\ITWILL\4_Python-2\data\movie_reviews.csv')
data.info()
RangeIndex: 1492 entries, 0 to 1491
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 reviews 1492 non-null object : ์ํํ๊ธฐ
1 title 1492 non-null object : ์ํ์ ๋ชฉ
2 label 1492 non-null int64 : ๊ธ์ /๋ถ์
data.head()
2. ์ ์ฒ๋ฆฌ : ๊ฒฐ์ธก์น ์ ๊ฑฐ
# 2. Preprocessing: drop rows with missing values.
data_df = data.dropna()
data_df.info()
3. ํฌ์ํ๋ ฌ(DTM) : reviews ๋์
reviews = data_df['reviews']
print(reviews)
1) ๋จ์ด์์ฑ๊ธฐ-๋ถ์ฉ์ด ์ ๊ฑฐ
# 1) Vectorizer with English stop-word removal.
tfidf = TfidfVectorizer(stop_words='english')
2) ํฌ์ํ๋ ฌ(sparse matrix)
# 2) TF-IDF sparse matrix over all reviews, then densify to numpy.
movie_sm = tfidf.fit_transform(reviews)
movie_sm.shape #(1492, 34641) - DTM
numpy array ๋ณํ
movie_sm_arr = movie_sm.toarray()
movie_sm_arr.shape #(1492, 34641) - DTM
print(movie_sm_arr)
title = data_df['title'] #movie titles, row-aligned with the DTM
#4. query ์์ฑ -> ํฌ์ํ๋ ฌ -> ์ ์ฌ๋๊ณ์ฐ -> Top5 ์ํ ์ถ์ฒ
def movie_search(query) :
    """Search movies whose reviews are most similar to a keyword query.

    Prints the Top-5 movie titles ranked by cosine similarity between the
    TF-IDF vector of `query` and each review row of the DTM.

    Relies on module-level globals: `tfidf` (fitted TfidfVectorizer),
    `movie_sm_arr` (reviews DTM as a numpy array) and `title`
    (pandas Series of movie titles aligned with the DTM rows).
    """
    #1) wrap the raw keyword string: transform() expects an iterable of docs
    user_query = [query]
    #2) project the query into the fitted TF-IDF space (transform, not fit_transform)
    query_sm = tfidf.transform(user_query)
    query_sm_arr = query_sm.toarray() #numpy array
    #3) cosine similarity: one query row vs every review row
    sim = cosine_similarity(query_sm_arr, movie_sm_arr)
    print(sim.shape) #(1, 1492)
    #2d -> 1d; reshape(-1) avoids hard-coding the review count (was 1492)
    sim1d = sim.reshape(-1)
    #4) review indices sorted by similarity, highest first
    sim_idx = sim1d.argsort()[::-1]
    print('top5 index : ', sim_idx[:5])
    #top5 index : [1281 1304 373 554 260]
    #5) Top-5 recommendations; .iloc is positional, so the title lookup stays
    #   aligned with DTM row order even if dropna() left gaps in index labels
    for idx in sim_idx[:5] :
        print(f'์ ์ฌ๋ : {sim1d[idx]}, ์ํ์ ๋ชฉ : {title.iloc[idx]}')
ํจ์ ํธ์ถ : ์ํ๊ด๋ จ ํค์๋(ํค๋ณด๋ ์
๋ ฅ)
# Interactive search: prompt for a keyword and print Top-5 similar movies.
movie_search(input('search query input : '))
search query input : action
์ ์ฌ๋ : 0.20192921485638887, ์ํ์ ๋ชฉ : Soldier (1998)
์ ์ฌ๋ : 0.1958404700223592, ์ํ์ ๋ชฉ : Romeo Must Die (2000)
์ ์ฌ๋ : 0.18885169874338412, ์ํ์ ๋ชฉ : Aliens (1986)
์ ์ฌ๋ : 0.18489066174805405, ์ํ์ ๋ชฉ : Speed 2: Cruise Control (1997)
์ ์ฌ๋ : 0.16658803590038168, ์ํ์ ๋ชฉ : Total Recall (1990)
search query input : drama
์ ์ฌ๋ : 0.1931737274266525, ์ํ์ ๋ชฉ : Apollo 13 (1995)
์ ์ฌ๋ : 0.11796112357272329, ์ํ์ ๋ชฉ : Double Jeopardy (1999)
์ ์ฌ๋ : 0.11374906390472769, ์ํ์ ๋ชฉ : Practical Magic (1998)
์ ์ฌ๋ : 0.11037479275255738, ์ํ์ ๋ชฉ : Civil Action, A (1998)
์ ์ฌ๋ : 0.09607905933279662, ์ํ์ ๋ชฉ : Truman Show, The (1998)
word2vec
์ ์ฌ ๋จ์ด ๊ฒ์
1. pip install gensim
2. spyder ์์ import
Word2Vec ์๊ณ ๋ฆฌ์ฆ
1. CBOW
2. Skip-Gram
from gensim.models import Word2Vec #์ ์ฌ๋จ์ด ์์ธก ๋ชจ๋ธ
import nltk #nltk(Natural Language Toolkit) : ์์ฐ์ด ์ฒ๋ฆฌ ๋๊ตฌ
nltk.download('punkt') #nltk data download
from nltk.tokenize import word_tokenize #๋ฌธ์ฅ -> ๋จ์ด ์ถ์ถ
from nltk.tokenize import sent_tokenize #ํ
์คํธ -> ๋ฌธ์ฅ ์ถ์ถ
import pandas as pd #csv file read
1. dataset load
์ถ์ฒ : https://www.kaggle.com/rounakbanik/the-movies-dataset
# 1. Load movie metadata (source: Kaggle "the-movies-dataset").
data = pd.read_csv('C:/ITWILL/4_Python-2/data/movies_metadata.csv')
data.info()
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
2. ๋ณ์ ์ ํ & ์ ์ฒ๋ฆฌ
# 2. Keep only the title and plot-overview columns, drop missing rows.
df = data[['title', 'overview']] #extract movie title and overview only
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44506 entries, 0 to 45465
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 title 44506 non-null object : ์ํ ์ ๋ชฉ
1 overview 44506 non-null object : ์ํ ์ค๊ฑฐ๋ฆฌ
df.head()
3. ํ ํฐ(token) ์์ฑ
1) sentence -> word
# Tokenization demo: sentence -> words.
sent = "my name is hong."
words = word_tokenize(sent)
print(words) #['my', 'name', 'is', 'hong', '.']
len(words) #5
2) text -> sentence
text = "my name is hong. my hobby is reading."
sents = sent_tokenize(text)
print(sents) #['my name is hong.', 'my hobby is reading.']
3) overview ๋จ์ด ๋ฒกํฐ ์์ฑ
# 3) Build one token list per overview (the format Word2Vec expects).
overview = df['overview'].tolist() #columns -> list conversion
overview[:5]
len(overview) #44506
result = [] #holds one word list per overview document
for row in overview :
words = word_tokenize(row) #sentence -> word extraction
result.append(words) #[[words of doc 1], [words of doc 2], ...]
print(result)
result[0] #word list of the first overview
result[-1] #word list of the last overview
4. word2vec ๋ชจ๋ธ ์์ฑ
# 4. Train Word2Vec: sg=1 -> Skip-Gram, window=5 context words,
#    min_count=1 keeps every word (no frequency pruning).
model = Word2Vec(sentences=result, window = 5, min_count = 1, sg = 1)
sentences : ๋จ์ด ๋ฒกํฐ
window : 1ํ ํ์ตํ ๋จ์ด ์
min_count : ์ต์ ์ถํ ๋น๋์
sg : 0-CBOW, 1-Skip-Gram
5. ์ ์ฌ ๋จ์ด ๊ฒ์
def word_search(keyword, topn=5) :
    """Print the `topn` words most similar to `keyword`.

    Uses the module-level Word2Vec `model`. `topn` defaults to 5, so the
    original behaviour (print the 5 best matches) is unchanged; the result
    list is also returned for programmatic use.
    """
    #most_similar() takes a list of positive example words
    search_re = model.wv.most_similar([keyword], topn=topn)
    print(f'top{topn} :', search_re[:topn])
    return search_re
# Interactive similar-word search (try keywords: husband, woman, success).
word_search(input('key word input :')) #husband -> woman -> success
('top5 : ', word_search[:5])
top5 : [('boyfriend', 0.8590863347053528),
('lover', 0.8467974066734314),
('fiancé', 0.7997056245803833),
('ex-husband', 0.7850815653800964),
('fiance', 0.7803053855895996)]
print('top5 : ', word_search[:5])
top5 : [('man', 0.8099219799041748),
('girl', 0.7905499339103699),
('schoolgirl', 0.7901395559310913),
('lady', 0.7746134996414185),
('spinster', 0.7675780653953552)]
('top5 : ', word_search[:5])
top5 : [('fame', 0.8123695850372314),
('stardom', 0.7987002730369568),
('commercial', 0.7903648614883423),
('popularity', 0.7882120609283447),
('achieves', 0.7871276140213013)]