01. 데이터 불러오기

1) 키보드 입력 : 소량의 자료 (테스트할 때 사용)
# Read numeric values typed at the console.
num <- scan()
num
sum(num)

# scan() can read character input as well, not only numbers.
names <- scan(what = character())
names

 

2) 파일 자료 읽기(불러오기)
칼럼 단위로 구분 : excel, csv

준비
# Show the current working directory, then switch to the data folder.
getwd()
setwd("C:/ITWILL/2_Rwork/data") # change the working directory

 

 

 

 

 

02. 대표 함수
(1) read.table() : 공백, 특수문자로 칼럼 구분

# File without a header row: read.table() supplies default column names.
st1 <- read.table("student.txt", header = FALSE)
st1 # default column names: V1 V2 V3 V4

# header = TRUE uses the first row as the column names.
st2 <- read.table("student2.txt", header = TRUE)
st2

# sep = ";" splits columns on a special character instead of whitespace.
# NOTE(review): this reads student2.txt again, the same file as st2 --
# confirm whether a separate semicolon-delimited file was intended.
st3 <- read.table("student2.txt", header = TRUE, sep = ";")
st3

 

(2) read.csv() : 콤마로 칼럼 구분

# read.csv() defaults: header = TRUE, sep = ","
st4 <- read.csv(file = "student4.txt")
st4
str(st4)

 

ํŠน์ˆ˜๋ฌธ์ž(-) -> NA(๊ฒฐ์ธก์น˜)๋กœ ๋ณ€๊ฒฝ

# Re-read the file, turning "-" entries into missing values (NA).
st4 <- read.csv("student4.txt", na.strings = "-")
st4 # per the original note, the column type changes from chr to int
str(st4)
# Mean of the "키" column with NA values dropped. The column name was
# mis-encoded in this file and is restored here; [[ ]] with a string avoids
# relying on a non-ASCII bare symbol.
mean(st4[["키"]], na.rm = TRUE)

 

ํƒ์ƒ‰๊ธฐ ์ œ๊ณต : file ์„ ํƒ

# Choose the CSV file through a file-selection pop-up dialog.
test <- read.csv(file = file.choose())
str(test)

 

(3) read.excel() : excel์ „์šฉ ๋ณ„๋„์˜ ํŒจํ‚ค์ง€ ์„ค์น˜ ํ•„์š”

# readxl is a separate package; install it only when it is not yet available
# so that re-running the script does not reinstall it every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)

help("read_excel")
st_excel <- read_excel("studentexcel.xlsx")
st_excel

 

 

 

 

 

03. 인터넷 파일 읽기

데이터 셋 제공 사이트
https://vincentarelbundock.github.io/Rdatasets/datasets.html
https://r-dir.com/reference/datasets.html - Dataset site
http://www.rdatamining.com/datasets

 

사례

# Read a CSV file directly from a URL.
titanic <- read.csv(
  "https://vincentarelbundock.github.io/Rdatasets/csv/COUNT/titanic.csv"
)
str(titanic)

 

범주형 변수의 빈도수

# Frequency count of the class column.
table(titanic[["class"]])

 

성별 빈도수

# Frequency count of the sex column (the original call misspelled table()).
table(titanic$sex)

 

์ƒ์กด ์œ ๋ฌด

# Frequency count of the survived column.
table(titanic[["survived"]])

 

๊ต์ฐจ๋ถ„ํ• ํ‘œ

# Cross-tabulation of sex by survived; used to check whether the two
# variables are associated.
table(titanic[["sex"]], titanic[["survived"]])

 

[예제] 남성 생존 비율 구하기

# Male survival rate. 175 and 694 are hard-coded counts, presumably read off
# the sex-by-survived cross-tabulation -- verify against the data.
# The label string was mis-encoded in this file and is restored here.
cat('남성 생존비율=', 175/(694+175)) # cat(): label string plus an expression
print(175/(694+175)) # print(): a constant or an expression

 

2) 파일 자료 저장
read.table() <-> write.table()
read.csv() <-> write.csv()
read_excel() <-> write_xlsx()

 

(1) write.csv()

# Build a subset that keeps only the class, sex and survived columns.
titanic_df <- subset(titanic, select = c(class, sex, survived))
str(titanic_df)
titanic_df # the original referenced a misspelled, undefined name here

 

행 이름, 이중부호 제외하고 저장하기

# Save without row names and without quoting, then read the file back.
write.csv(titanic_df, file = "titanic.csv", quote = FALSE, row.names = FALSE)

df <- read.csv(file = "titanic.csv")
df

 

(2) write_xlsx() - 설치 필요

# writexl is a separate package; install it only when it is not yet available
# so that re-running the script does not reinstall it every time.
if (!requireNamespace("writexl", quietly = TRUE)) {
  install.packages("writexl")
}
library(writexl)
# Write the st_excel data frame to an Excel workbook.
write_xlsx(st_excel, path = "student_ex.xlsx")

 

 

 

 

 

04. subset 만들기

# Three equal-length vectors that become the columns of a small data frame.
x <- seq_len(5)
y <- 6:10
z <- letters[seq_len(5)]

df <- data.frame(x = x, y = y, z = z)
df

help("subset")

 

1) subset์กฐ๊ฑด์‹ : ํ–‰ ์„ ํƒ

# Row selection: keep the rows where y is at least 8.
df2 <- subset(df, y >= 8)
df2

 

2) select=c(칼럼명1, 칼럼명2, ...) : 열 선택

# Column selection: keep only the x and y columns.
df3 <- subset(
  df,
  select = c(x, y)
)
df3

 

3) 칼럼명, %in%, (list) : 괄호 안의 목록과 일치하는 것만 출력

# Row-wise extraction: keep the rows whose z value is in the given list.
df4 <- subset(df, subset = z %in% c("a", "c", "e"))
df4

 

 

[예시]

data("iris") # load the iris data set (the original note says "provided by RStudio")
str(iris) # iris flower data set

'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 꽃받침 길이
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 꽃받침 넓이
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 꽃잎 길이
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 꽃잎 넓이
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... 꽃의 종 3levels (집단변수=요인형)

 

문1) 1,3,5번 칼럼을 선택해서 subset 작성

# Columns 1, 3 and 5 of iris are Sepal.Length, Petal.Length and Species.
# The original selected Petal.Width (column 4), which also left the later
# Petal.Length filter on iris_df without a column to work on.
iris_df <- subset(iris, select = c(Sepal.Length, Petal.Length, Species))
str(iris_df)

 

문2) 문1의 결과에서 2번 칼럼의 평균값 이상 출력

# Overall mean of Petal.Length (3.758 per the original note), then keep the
# rows of iris_df at or above that mean.
mean(iris[["Petal.Length"]])
iris_df2 <- subset(iris_df, Petal.Length >= mean(iris[["Petal.Length"]]))
str(iris_df2)


๋ฒ”์ฃผํ˜•์— ๋Œ€ํ•œ ์œ ํ˜• 3๊ฐ€์ง€ ์ถœ๋ ฅ

๋ฌธ3) ๋ฌธ1์˜ ๊ฒฐ๊ณผ์—์„œ Species์นผ๋Ÿผ์„ ๋Œ€์ƒ์œผ๋กœ "setosa" ๊ฝƒ์˜ ์ข… ์ถœ๋ ฅ

# Keep only the rows whose Species is "setosa".
iris_df3 <- subset(iris_df, subset = Species %in% "setosa")
str(iris_df3)

 

 

 

 

 

연습문제
01. 본문에서 작성한 titanic 변수를 다음과 같은 단계를 통해서 “titanic.csv” 파일로 저장한 후 파일을 불러오시오.
[단계 1] 'C:/ITWILL/2_Rwork/output' 폴더에 'titanic.csv'로 저장한다.
힌트: write.csv() 함수 사용

# Switch to the output folder and save titanic there as titanic.csv.
getwd()
setwd("C:/ITWILL/2_Rwork/output")
write.csv(titanic, file = "titanic.csv")


[๋‹จ๊ณ„ 2] 'titanic.csv' ํŒŒ์ผ์„ titanicData ๋ณ€์ˆ˜๋กœ ๊ฐ€์ ธ์™€์„œ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜๊ณ , titanicData์˜ ๊ด€์ธก์น˜์™€ ์นผ๋Ÿผ์ˆ˜๋ฅผ ํ™•์ธํ•œ๋‹ค.
ํžŒํŠธ: str() ํ•จ์ˆ˜ ์‚ฌ์šฉ

# Read the saved file back and inspect its observations and columns.
titanicData <- read.csv(file = "titanic.csv")
str(titanicData)


[단계 3] 1번, 3번 칼럼을 제외한 나머지 칼럼을 대상으로 상위 6개의 관측치를 확인한다.

# First index selects rows 1 to 6; the negative second index drops
# columns 1 and 3.
titanicData[seq_len(6), c(-1, -3)]




02. R에서 제공하는 quakes 데이터셋을 대상으로 다음과 같이 처리하시오

# Load and inspect the quakes data set (earthquake epicenter data).
data("quakes")
quakes
str(quakes)
# Expected str() header: 'data.frame': 1000 obs. of  5 variables:


단계1) 현재 경로에 row.names, quote 없이 "quakes_df.csv" 파일명으로 저장

# Written as quakes_df.csv in the current working directory (the output
# folder, per the original note), without row names or quoting.
write.csv(quakes, file = "quakes_df.csv", quote = FALSE, row.names = FALSE)


๋‹จ๊ณ„2) quakes_data๋กœ ํŒŒ์ผ ์ฝ๊ธฐ

# Read the saved CSV back into quakes_data.
quakes_data <- read.csv(file = "quakes_df.csv")
quakes_data

    
단계3) mag 변수를 대상으로 평균 계산하기

# Pull a single column out as a vector, then average it.
mag <- quakes_data[["mag"]]
mean(mag)




03. R에서 제공하는 CO2 데이터셋을 대상으로 다음과 같이 파일로 저장하시오.
힌트 : subset() 함수 이용

# Load the CO2 data set and print it.
data(list = "CO2")
CO2


단계1) Treatment 칼럼 값이 'nonchilled'인 경우만 'CO2_df1.csv' 파일로 저장하기

# Keep only the nonchilled rows. `==` is the comparison operator, while
# `=` / `<-` assign.
df1 <- subset(CO2, Treatment == "nonchilled")
# The step asks for the result to be saved as CO2_df1.csv; the original
# never wrote the file.
write.csv(df1, "CO2_df1.csv", row.names = FALSE)
CO2
str(CO2)

* treatment์นผ๋Ÿผ์€ ์„ธ๋ฒˆ์งธ. ๋‘ ๊ฐœ์˜ ๋ฒ”์ฃผํ˜• ๊ฐ’์„ ๊ฐ€์ง€๊ณ  ์žˆ์Œ(=levels)

단계2) Treatment 칼럼 값이 'chilled'인 경우만 'CO2_df2.csv' 파일로 저장

# Keep only the chilled rows and save them as CO2_df2.csv. The original
# stored the subset in `df` but wrote `df2`, so a stale, unrelated data frame
# ended up in the file.
df2 <- subset(CO2, Treatment == "chilled")
write.csv(df2, "CO2_df2.csv", row.names = FALSE)



+ Recent posts