๊ฐœ์ธ๊ณต๋ถ€/R

28. R T검정 연습문제

LEE_BOMB 2021. 10. 12. 22:07

01. ์šฐ๋ฆฌ๋‚˜๋ผ ์ „์ฒด ์ค‘ํ•™๊ต 2ํ•™๋…„ ์—ฌํ•™์ƒ ํ‰๊ท  ํ‚ค๊ฐ€ 148.5cm๋กœ ์•Œ๋ ค์ ธ ์žˆ๋Š” ์ƒํƒœ์—์„œ A์ค‘ํ•™๊ต 2ํ•™๋…„ ์ „์ฒด 500๋ช…์„ ๋Œ€์ƒ์œผ๋กœ 10%์ธ 50๋ช…์„ ํ‘œ๋ณธ์œผ๋กœ ์„ ์ •ํ•˜์—ฌ ํ‘œ๋ณธํ‰๊ท ์‹ ์žฅ์„ ๊ณ„์‚ฐํ•˜๊ณ , ๋ชจ์ง‘๋‹จ์˜ ํ‰๊ท ๊ณผ ์ฐจ์ด๊ฐ€ ์žˆ๋Š”์ง€๋ฅผ ๊ฒ€์ •ํ•˜์‹œ์˜ค.(๋‹จ์ผํ‘œ๋ณธ T๊ฒ€์ •)

 

# Point the session at the folder holding the exercise CSV files.
# NOTE(review): the path is machine-specific; later read.csv() calls rely on it.
setwd("C:/ITWILL/2_Rwork/data")

 

단계1 : 데이터셋 가져오기

# Read the sampled student heights and pull out the height column.
stheight <- read.csv(file = "student_height.csv")
stheight
height <- stheight$height
head(height, n = 6L)

 


단계2 : 기술통계량/결측치 확인

# Descriptive statistics and missing-value handling for the sampled heights.
# Fix: the original called `ength(height)`, a typo for `length(height)` that
# stops the script with "could not find function".
length(height) # 50 (sample size recorded by the author)
summary(height) # mean recorded as 149.4

# Drop missing values; `x` is the cleaned vector used by the later tests.
x <- na.omit(height)
x # cleaned data
mean(x) # 149.4 -> mean height of the sample

 

 

단계3 : 정규성 검정 - 기본가정

# Normality check on the cleaned heights.
# Recorded result: p-value = 0.0001853 < 0.05, so normality is rejected.
shapiro.test(x)
# Recorded observation: the histogram leans to the left.
hist(x)

 


단계4 : 가설검정 - 양측검정 : 비모수검정

# Two-sided one-sample Wilcoxon signed-rank test against the known value 148.5.
# Recorded result: V = 826, p-value = 0.067 > 0.05
wilcox.test(x, mu = 148.5, alternative = "two.sided")

[해설] 모평균과 차이가 없다.

 

 

 



02. 교육방법에 따라 시험성적에 차이가 있는지 검정하시오.(독립표본 T검정)
조건1) 변수 : method : 교육방법, score : 시험성적
조건2) 모델 : 교육방법(범주형변수)  ->  시험성적(연속형변수)
조건3) 전처리 : 결측치 제거 : 평균으로 대체

단계1. 실습파일 가져오기

# Load the two-teaching-method data set.
Data <- read.csv(file = "twomethod.csv", header = TRUE)
# Recorded check: three variables -> id method score
head(Data)

 


단계2. 두 집단 subset 작성

# Inspect which teaching methods exist and how many rows each has.
unique(Data[["method"]]) # recorded values: 1 2
table(Data[["method"]])


변수 선택 -> 서브셋 생성

# Keep only the grouping variable and the response.
data_df <- Data[, c("method", "score")]
data_df



단계3. 데이터 분리
1) 집단(교육방법)으로 분리

# Split the rows by teaching method.
method1 <- subset(data_df, subset = method == 1)
method2 <- subset(data_df, subset = method == 2)
# Recorded sizes: method1 is 24 x 2, method2 is 39 x 2.
dim(method1)
dim(method2)


2) 교육방법에서 시험성적 추출

# Extract the exam scores of each teaching-method group as plain vectors.
score1 <- method1[["score"]]
score2 <- method2[["score"]]



단계4 : 분포모양 검정 : 등분산성 검정(연속형변수)

# F test for equality of the two groups' variances.
# Recorded result: p-value = 0.8494
var.test(x = score1, y = score2)

 

 

단계5: 가설검정

# Two-sided two-sample t-test on the exam scores (t.test() defaults).
# Recorded result: null hypothesis rejected;
# t = -5.6056 (read as absolute value), df = 43.705,
# p-value = 1.303e-06 = 0.000001303
t.test(x = score1, y = score2)


방향성이 있는 연구가설 검정(기각) : score1 > score2

# One-sided test of the directional hypothesis score1 > score2.
# `alternative` is spelled out instead of relying on partial matching of
# `alter`, and `conf.int` is dropped: t.test() has no such argument, so it was
# silently absorbed by `...`. The confidence interval is reported regardless.
t.test(score1, score2, alternative = "greater", conf.level = 0.95)


방향성이 있는 연구가설 검정(채택) : score1 < score2

# One-sided test of the directional hypothesis score1 < score2.
# `alternative` is spelled out instead of relying on partial matching of
# `alter`, and `conf.int` is dropped: t.test() has no such argument, so it was
# silently absorbed by `...`. The confidence interval is reported regardless.
t.test(score1, score2, alternative = "less", conf.level = 0.95)

 

 




03.datas๋ฅผ ๋Œ€์ƒ์œผ๋กœ ์—ฐ๋ น๋ณ„(age) ๋งŒ์กฑ๋„(satis)์— ์ฐจ์ด๊ฐ€ ์žˆ๋Š”์ง€ ๊ฒ€์ •ํ•˜์‹œ์˜ค. (์ผ์›๋ฐฐ์น˜ ๋ถ„์‚ฐ๋ถ„์„ : ๋ชจ์ˆ˜ ๊ฒ€์ •)    

단계1 : dataset 생성
20대 만족도(10점 만족)

# Respondents in their twenties: ten satisfaction scores plus a matching
# age label for every score.
satis20 <- c(5, 7, 10, 6, 8, 3, 9, 5, 6, 5)
age20 <- rep(20, times = length(satis20))
df1 <- data.frame(age = age20, satis = satis20)


30대 만족도

# Respondents in their thirties: ten satisfaction scores plus a matching
# age label for every score.
satis30 <- c(8, 7, 10, 6, 8, 5, 9, 7, 6, 6)
age30 <- rep(30, times = length(satis30))
df2 <- data.frame(age = age30, satis = satis30)


40대 만족도

# Respondents in their forties: ten satisfaction scores plus a matching
# age label for every score.
satis40 <- c(8, 9, 10, 6, 8, 7, 9, 7, 9, 8)
age40 <- rep(40, times = length(satis40))
df3 <- data.frame(age = age40, satis = satis40)


DataFrame 생성

# Stack the three age-group frames into one data set (columns: age, satis).
datas <- do.call(rbind, list(df1, df2, df3))
datas
str(datas)


독립변수 요인형 변환 : 집단변수 생성(숫자변수 : 사후검정 시 오류)

# Turn the numeric age label into a grouping factor for the ANOVA.
datas[["age"]] <- as.factor(datas[["age"]])
# Recorded structure: $ age : Factor w/ 3 levels
str(datas)



단계2 : 등분산성 검정 : 연령에 따른 만족도의 분산 차이

# Bartlett test of equal satisfaction variance across the age groups.
# Recorded result: p-value = 0.2494
bartlett.test(satis ~ age, data = datas)



단계3 : 분산분석

# Fit the one-way ANOVA of satisfaction on age group.
model <- aov(formula = satis ~ age, data = datas)


단계4. 분산분석 결과 해석

# Print the ANOVA table for the fitted model.
# Recorded row:
# age          2  14.47   7.233   2.607 0.0922 .
summary(model)

[해설] 연령별(20,30,40) 만족도에 차이가 없다.

단계5. 사후검정

# Post-hoc pairwise comparison of the age groups (95% family-wise level).
TukeyHSD(model, conf.level = 0.95)

         diff        lwr      upr     p adj
30-20  0.8 -1.0468164 2.646816 0.5379671 -> 차이 없음
40-20  1.7 -0.1468164 3.546816 0.0756158 -> 차이 없음
40-30  0.9 -0.9468164 2.746816 0.4586358 -> 차이 없음

# Draw the confidence intervals of the pairwise differences.
plot(TukeyHSD(model, conf.level = 0.95))

[해설] 3집단 모두 신뢰구간에 0을 포함하고 있음

 

 

 


04.airquality๋ฅผ ๋Œ€์ƒ์œผ๋กœ ์›”๋ณ„(Month)๋กœ ์˜ค์กด๋Ÿ‰(Ozone)์— ์ฐจ์ด๊ฐ€ ์žˆ๋Š”์ง€ ๊ฒ€์ •ํ•˜์‹œ์˜ค. (์ผ์›๋ฐฐ์น˜ ๋ถ„์‚ฐ๋ถ„์„ : ๋น„๋ชจ์ˆ˜ ๊ฒ€์ •)  

# Load the built-in airquality data and look at its structure.
data("airquality")
str(airquality)
# Ozone -> y : continuous response
# Month -> x : grouping variable
table(airquality[["Month"]]) # recorded months: 5 6 7 8 9

 


단계 1: 전처리(결측치 제거)

# Preprocessing: the summary reveals missing values, so rows with any NA are
# dropped before testing.
# NOTE(review): na.omit() removes rows with an NA in ANY column, not only Ozone.
summary(airquality)
dataset <- na.omit(airquality)

# Month becomes the grouping factor.
dataset[["Month"]] <- as.factor(dataset[["Month"]])
# Recorded structure: $ Month : Factor w/ 5 levels
str(dataset)

 


단계 2: 등분산성 검정

# Bartlett test of equal Ozone variance across months.
bartlett.test(Ozone ~ Month, data = dataset)

* p-value = 0.007395 : 비모수 검정


단계 3: 분산분석(모수 vs 비모수) & 해석

# Kruskal-Wallis rank sum test of Ozone across months (non-parametric).
kruskal.test(Ozone ~ Month, data = dataset)

* p-value = 2.742e-05 : 적어도 한 집단 이상 평균차이


단계 4: 사후검정 : 집단별 평균(dplyr 패키지 이용)

# Attach dplyr, installing it first only when it is not already available.
# The original reinstalled the package on every run of the script.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

 

월별 오존량의 평균

# Mean Ozone per month, one row per Month level.
dataset %>%
  group_by(Month) %>%
  summarise(avg = mean(Ozone))

Month   avg
<fct> <dbl>
1 5      24.1
2 6      29.4
3 7      59.1
4 8      60  
5 9      31.4
[해설] 8월에 평균 오존량이 가장 많고, 5월이 가장 적다.