๊ฐœ์ธ๊ณต๋ถ€/R

19. R EDA, 데이터전처리 연습문제

LEE_BOMB 2021. 10. 3. 15:08

01. mtcars 데이터셋의 qsec(1/4마일 소요시간) 변수를 대상으로 극단치(상위 0.3%)를 발견하고, 정제하여 mtcars_df 이름으로 서브셋을 생성하시오.

library(ggplot2)
# Inspect the structure of the built-in mtcars data set
# ('data.frame': 32 obs. of 11 variables).
str(object = mtcars)


(1) 이상치 통계 확인

# Draw a box plot of qsec (1/4 mile time) and print the five statistics it is
# built from: lower whisker, lower hinge, median, upper hinge, upper whisker.
qsec_box <- boxplot(mtcars$qsec)
qsec_box$stats

일반적으로 상하위 0.3% 극단치로 본다.

(2) 서브셋 생성 : subset() 이용

# Keep only the cars whose qsec lies inside the box-plot whiskers, i.e. drop
# the qsec outliers. The bounds come from boxplot.stats() instead of being
# typed in by hand; for mtcars they evaluate to 14.5 and 20.22.
qsec_stats <- boxplot.stats(mtcars$qsec)$stats
qsec_lower <- qsec_stats[1] # extreme of the lower whisker
qsec_upper <- qsec_stats[5] # extreme of the upper whisker
mtcars_df <- subset(mtcars, qsec >= qsec_lower & qsec <= qsec_upper)
# The exercise asks for the name mtcars_df; mtcars_sub is kept as an alias
# because the check plot further down in this script refers to it.
mtcars_sub <- mtcars_df

 

(3) 정제 결과 확인 : boxplot() 이용

# Redraw the qsec box plot on the cleaned subset to check the result.
boxplot(mtcars_sub[["qsec"]])

 

 



02. 본문에서 생성된 dataset2의 resident 칼럼을 대상으로 NA 값을 제거한 후 dataset5 변수에 저장하시오.


방법1) subset 이용 : 특정 칼럼 결측치가 아닌 경우

# Method 1: keep only the rows whose resident value is not missing.
# The dimensions are printed before and after so the dropped rows can be seen.
dim(dataset2)
has_resident <- !is.na(dataset2$resident)
dataset5 <- dataset2[has_resident, , drop = FALSE]
dim(dataset5)


방법2) na.omit 이용

# Method 2: drop the rows whose resident value is NA, using na.omit().
# na.omit() on a vector returns only the non-missing values, so the result is
# shorter than the data frame whenever an NA exists and cannot be assigned
# back as a column. Instead, the positions na.omit() removed (recorded in its
# "na.action" attribute) are used to drop the matching rows.
resident <- na.omit(dataset2$resident) # non-missing values of this column only
dropped_rows <- attr(resident, "na.action")
if (is.null(dropped_rows)) {
  # No NA in resident: nothing to remove.
  dataset5 <- dataset2
} else {
  dataset5 <- dataset2[-as.integer(dropped_rows), , drop = FALSE]
}

 

 




03. 본문에서 생성된 dataset2의 직급(position) 칼럼을 대상으로 1급 -> 5급, 5급 -> 1급 형식으로 역코딩하여 position2 칼럼에 추가하시오.

# Reverse-code position (1 <-> 5, 2 <-> 4, 3 stays) and store the result in a
# new position2 column.
position_codes <- dataset2$position
dataset2$position2 <- 6 - position_codes



 

 

 


04. dataset2의 gender 칼럼을 대상으로 1->"남자", 2->"여자" 형태로 코딩 변경하여 gender2 칼럼에 추가하고, 파이 차트로 결과를 확인하시오.

# Recode gender into a character label column gender2:
# 1 -> "남자" (male), 2 -> "여자" (female).
# The label literals were mis-encoded (UTF-8 Hangul shown through a Thai code
# page); they are restored here to the labels the exercise specifies.
# Rows with any other gender value are not assigned a label by this block.
is_male <- dataset2$gender == 1
is_female <- dataset2$gender == 2
dataset2$gender2[is_male] <- "남자"
dataset2$gender2[is_female] <- "여자"
# Frequency of each label
table(dataset2$gender2)

남자 여자
119   87

# Show the gender2 label frequencies as a pie chart.
gender_counts <- table(dataset2$gender2)
pie(gender_counts)

 

 

 



05. 나이를 30세 이하 -> 1, 31~55 -> 2, 56이상 -> 3 으로 리코딩하여 age3 칼럼에 추가한 후 age, age2, age3 칼럼만 확인하시오.

# Recode age into three bands and store them in a new age3 column:
#   age <= 30 -> 1, 30 < age <= 55 -> 2, age > 55 -> 3
dataset2$age3[dataset2$age <= 30] <- 1
dataset2$age3[dataset2$age > 30 & dataset2$age <= 55] <- 2
dataset2$age3[dataset2$age > 55] <- 3

# Check the result: show only the three age columns.
# Columns must be selected by name as strings; bare age/age2/age3 would be
# looked up as objects, which this script never defines.
# NOTE(review): age2 is not created in this script; presumably it already
# exists in dataset2 - confirm.
dataset2[c("age", "age2", "age3")]

 

 

 



06. ์ •์ œ๋œ dataset2๋ฅผ ๋Œ€์ƒ์œผ๋กœ ์ž‘์—… ๋””๋ ‰ํ„ฐ๋ฆฌ(c:/itwill/2_Rwork/output)์— cleanData.csv ํŒŒ์ผ๋ช…์œผ๋กœ ๋”ฐ์˜ดํ‘œ์™€ ํ–‰ ์ด๋ฆ„์„ ์ œ๊ฑฐํ•˜์—ฌ ์ €์žฅํ•˜๊ณ , new_data๋ณ€์ˆ˜๋กœ ์ฝ์–ด์˜ค์‹œ์˜ค.

(1) 정제된 데이터 저장

# Save the cleaned data as cleanData.csv without row names and without
# quoting. TRUE/FALSE are spelled out because T/F are ordinary variables that
# can be reassigned.
# NOTE(review): the path is relative, so the file goes to the current working
# directory; the exercise expects c:/itwill/2_Rwork/output - confirm it is set.
write.csv(dataset2, file = "cleanData.csv", row.names = FALSE, quote = FALSE)


(2) 저장된 파일 불러오기/확인

# Read the saved file back from the current working directory into new_data.
new_data <- read.csv(file = "cleanData.csv")

 

 




07. user_data.csv์™€ return_data.csv ํŒŒ์ผ์„ ์ด์šฉํ•˜์—ฌ ๊ฐ ๊ณ ๊ฐ๋ณ„ ๋ฐ˜ํ’ˆ์‚ฌ์œ ์ฝ”๋“œ(return_code)๋ฅผ ๋Œ€์ƒ์œผ๋กœ ๋‹ค์Œ๊ณผ ๊ฐ™์ด ํŒŒ์ƒ๋ณ€์ˆ˜๋ฅผ ์ถ”๊ฐ€ํ•˜์‹œ์˜ค.
<์กฐ๊ฑด1> ๋ฐ˜ํ’ˆ์‚ฌ์œ ์ฝ”๋“œ์— ๋Œ€ํ•œ ํŒŒ์ƒ๋ณ€์ˆ˜ ์นผ๋Ÿผ๋ช… ์„ค๋ช… 
์ œํ’ˆ์ด์ƒ(1) : return_code1, ๋ณ€์‹ฌ

(2) : return_code2, 
์›์ธ๋ถˆ๋ช…(3) :> return_code3, ๊ธฐํƒ€

(4) : return_code4 

 

# Load the return records (one row per user_id / return_code pair, as the
# sample output below shows) and preview the first rows.
return_data <- read.csv(file = "return_data.csv")
head(x = return_data)

  user_id return_code
1    1008           1
2    1009           2

# Reshape to one row per user_id with one column per return_code; each cell
# holds the number of records for that user/code pair (aggregated by length).
library(reshape2)
return_df <- dcast(
  return_data,
  user_id ~ return_code,
  length
)
head(x = return_df)

  user_id 1 2 3 4
1    1008 1 0 0 0
2    1009 0 1 0 0

# Count how many records there are for each return code.
table(return_data[["return_code"]])

# Rename the reshaped count columns "1", "2", ... to "return_code1",
# "return_code2", ... The new names are built from the existing ones, so the
# result is still correct when a return code is absent from the data or an
# extra one appears. With codes 1-4 present this yields exactly:
# user_id, return_code1, return_code2, return_code3, return_code4.
col_names <- names(return_df)
# Skip the key column and any column that already carries the prefix, so that
# re-running this step does not prefix twice.
needs_prefix <- col_names != "user_id" & !startsWith(col_names, "return_code")
names(return_df)[needs_prefix] <- paste0("return_code", col_names[needs_prefix])
head(return_df)



<조건2> 고객별 반품사유코드를 고객정보(user_data) 테이블에 추가(join)

# Add the per-customer return-code counts to the customer table, matching on
# user_id (plyr::join defaults to a left join, so every user_data row is kept).
# join() is plyr's function and no library(plyr) call is visible in this
# script, so the call is namespace-qualified to work without attaching plyr.
if (!exists("user_data")) {
  # NOTE(review): user_data is never read in the visible script; the exercise
  # names user_data.csv as its source, so it is loaded here only when missing.
  user_data <- read.csv("user_data.csv")
}
user_return_data <- plyr::join(user_data, return_df, by = "user_id")
head(user_return_data, 10)