01. 다음은 15명의 면접자를 대상으로 가치관, 전문지식, 자격증 유무 등을 토대로 종합점수에 근거하여 합격여부를 결정한 자료이다. 다음과 같은 단계로 계층적 군집분석을 수행하여 군집수(cluster)를 탐색하고, 각 군집별로 서브셋을 작성하여 각 군집의 특성을 분석하시오.

# Point the session at the folder that holds the exercise CSV files, so the
# relative file names passed to read.csv() below resolve.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
setwd("c:/ITWILL/2_Rwork/data")


단계1 : dataset 가져오기

# Load the interview data set from the working directory.
interview <- read.csv(file = "interview.csv")
names(interview) # column names
head(interview) # first rows, as a quick sanity check


단계2 : 유클리디안 거리 계산

# Keep columns 2 through 8 only; per the author's note this leaves out the
# applicant number and the pass/fail result.
inter_df <- interview[2:8]
# Pairwise distances between applicants (dist() defaults to Euclidean).
idist <- dist(inter_df)
head(idist)


단계3 : 계층적 군집분석 & 덴드로그램 시각화

# Agglomerative hierarchical clustering on the distance matrix.
hc <- hclust(idist)
hc

# Dendrogram; a negative `hang` draws every leaf label down at the baseline.
plot(hc, hang = -1)
# Outline the three-cluster solution in red on top of the dendrogram.
rect.hclust(hc, k = 3, border = "red")


단계4 : 군집별 서브셋 만들기 : cutree()함수 이용

# Cut the tree into three clusters and store each row's membership
# alongside the data.
ghc <- cutree(hc, k = 3)
inter_df$ghc <- ghc
inter_df

# One data frame per cluster label.
g1 <- inter_df[inter_df$ghc == 1, ]
g2 <- inter_df[inter_df$ghc == 2, ]
g3 <- inter_df[inter_df$ghc == 3, ]


단계5 : 각 군집별 특성 분석 : summary()함수 이용

# Descriptive statistics per cluster. The trailing notes are the values the
# author recorded from one run (certificate value, then the column means).
summary(g1) # certificate: 1;      means 19, 14.4, 15.6, 14.8, 11.8, 75.6
summary(g2) # certificate: 0 or 1; means 11, 15.2, 19.4, 11, 6.2, 62.8
summary(g3) # certificate: 0;      means 14.4, 18.8, 10.8, 9.4, 18.2, 71.6

g1

# Full original records for rows 1, 2, 4, 6 and 13.
# NOTE(review): presumably the members of g1 -- confirm against the output.
interview[c(1, 2, 4, 6, 13), ]






02. 다음과 같은 조건을 이용하여 각 단계별로 비계층적 군집분석을 수행하시오.
조건1) 대상 파일 : c:/Rwork/Part-IV/product_sales.csv
조건2) 변수 설명 : tot_price : 총구매액, buy_count : 구매횟수, visit_count : 매장방문횟수, avg_price : 평균구매액

# Load the product sales data; the first line of the file holds column names.
sales <- read.csv(file = "product_sales.csv", header = TRUE)
head(sales)



단계1: 비계층적 군집분석 : 3개 군집으로 군집화

# Partition the observations into three groups with k-means.
# NOTE(review): no seed is set, so the cluster numbering (and possibly the
# partition itself) can differ from run to run.
model <- kmeans(x = sales, centers = 3)
model

sizes 38, 50, 62
Within cluster sum of squares by cluster: 응집도
[1] 23.88395 22.51380 39.95968
(between_SS / total_SS =  87.4 %) : 모델 우수성

단계2: 원형데이터에 군집수 추가

# Attach each observation's assigned cluster label as a new column.
sales[["cluster"]] <- model[["cluster"]]
head(sales)


단계3 : tot_price 변수와 가장 상관계수가 높은 변수와 군집분석 시각화
(1) 상관관계 분석 : cor()

# Correlation matrix over every column of `sales` (Pearson, the default).
# Recorded result: tot_price vs avg_price = 0.87175416.
cor(x = sales)


(2) 비계층적 군집분석 시각화 : 그룹으로 색상 표시

# Scatter plot of tot_price (y) against avg_price (x); each point is coloured
# by its cluster label, so label k is drawn in palette colour k.
plot(sales$tot_price ~ sales$avg_price, col = sales$cluster)


단계4. 군집의 중심점 표시

model$centers # per-cluster mean of every variable

# Overlay the three centroids on the existing scatter plot
# (x = avg_price, y = tot_price) as large star symbols.
points(
  x = model$centers[, "avg_price"],
  y = model$centers[, "tot_price"],
  col = c(3, 1, 2), pch = 8, cex = 5
)

[해설] avg_price 변수가 tot_price 변수에 비해서 군집에 영향도 높음

library(factoextra) # provides fviz_cluster()

# Draw the k-means result on the first two principal components.
# `sales` has gained a `cluster` label column since the model was fitted, so
# restrict the data to the variables the model was trained on (the column
# names of model$centers). Otherwise the label itself is fed into the
# projection as if it were a feature.
# NOTE(review): Dim1/Dim2 percentages recorded from runs that included the
# label column will not match this plot.
fviz_cluster(model, data = sales[, colnames(model$centers), drop = FALSE])

Dim1(57.8%) : 공헌도 58% -> avg_price 주성분
Dim2(27.6%) : 공헌도 28%

 

# Inspect observation 110 together with its assigned cluster label.
sales[110, , drop = FALSE]

      tot_price visit_count buy_count avg_price cluster
110       6.1         1.4       2.6       5.6   1 -> cluster1

# Inspect observation 36 together with its assigned cluster label.
sales[36, , drop = FALSE]

36         5           1         2       3.5    3 -> cluster3 

# Inspect observation 17 together with its assigned cluster label.
sales[17, , drop = FALSE]

17       4.5         0.3       2.3       1.3    2 -> cluster2

 

 


๊ฐ ๊ตฐ์ง‘๋ณ„ ํŠน์„ฑ ๋ถ„์„(x์ถ• ๊ธฐ์ค€ : avg_price) 
cluster1 : ์ด๊ตฌ๋งค๊ธˆ์•ก๊ณผ ํ‰๊ท ๊ตฌ๋งค์•ก ๊ฐ€์žฅ ๋†’์€ ๊ทธ๋ฃน 
cluster2 : ์ด๊ตฌ๋งค๊ธˆ์•ก๊ณผ ํ‰๊ท ๊ตฌ๋งค์•ก ๊ฐ€์žฅ ๋‚ฎ์€ ๊ทธ๋ฃน 
cluster3 : ์ด๊ตฌ๋งค๊ธˆ์•ก๊ณผ ํ‰๊ท ๊ตฌ๋งค์•ก ์ค‘๊ฐ„์ธ ๊ทธ๋ฃน 

+ Recent posts