0. 필요한 패키지 불러오기

# Packages this script depends on. The attach order of the original is kept
# (plyr before dplyr); the duplicated "readxl" entry has been removed.
library1 <- c("plyr", "dplyr", "ggplot2", "stringr", "tidyr", "readxl", "xlsx")

# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at a less obvious place.
invisible(lapply(library1, library, character.only = TRUE))




1. dataset 불러오기

# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because the relative file names used by this script ("hospital.csv",
# "population.csv", "total2.xlsx") are resolved against it.
setwd("C:/ITWILL/2_Rwork/00/SemiProject")

hospital <- read.csv("hospital.csv")
# check.names = FALSE keeps the header text exactly as written in the file
# instead of rewriting it into syntactic R names.
population <- read.csv("population.csv", check.names = FALSE, header = TRUE)

head(hospital, n = 10)   # one row of information per hospital
head(population, n = 10) # population counts by region
str(population)




2. dataset 정제
결측치 처리

# Count missing values in each raw dataset; each check is its own statement
# so the trailing comment no longer hides the second one.
sum(is.na(hospital))   # result recorded by the author: 0
sum(is.na(population)) # result recorded by the author: 0


비 노인인구 행 추가하기 (전체인구 수-노인인구 수-영유아인구 수)

# Remaining population per region: total minus (elderly + infants).
# The Korean column names are restored from their mis-encoded form; [[ ]] is
# used so the lookup is an exact match on the header text.
# NOTE(review): confirm these names against the header of population.csv.
pop <- population[["총인구수"]] -
  (population[["노인인구"]] + population[["영유아인구"]])

population <- cbind(population, pop)
head(population)


노인 인구, 영유아 인구, 비 노인인구의 비율 행 추가하기

# Share (in percent) of the total population held by each group.
# The Korean column names are restored from their mis-encoded form.
# NOTE(review): confirm these names against the header of population.csv.
total_population <- population[["총인구수"]]

pop_be <- (population[["pop"]] / total_population) * 100
pop_old_be <- (population[["노인인구"]] / total_population) * 100
pop_child_be <- (population[["영유아인구"]] / total_population) * 100

# Appended in the same order as before: pop_be, pop_old_be, pop_child_be.
population <- cbind(population, pop_be, pop_old_be, pop_child_be)
head(population)


변수명 영어로 변환

# Replace the column names of both data frames with English names.
hospital_names <- c("division", "area1", "area2")
population_names <- c(
  "area", "total_pop", "pop_child", "pop_old",
  "65~69", "70~74", "75~79", "80~84", "85~89", "90~94", "95~99", "100~",
  "pop", "pop_be", "pop_old_be", "pop_child_be"
)

# Names are assigned by position, so a column-count mismatch would silently
# mislabel or leave columns unnamed; fail loudly instead.
stopifnot(
  ncol(hospital) == length(hospital_names),
  ncol(population) == length(population_names)
)

colnames(hospital) <- hospital_names
colnames(population) <- population_names

head(hospital)
head(population)


모든 문자 공백 -> 단일 스페이스 공백으로 치환

# Normalise whitespace in the two area columns and store the result back.
# The original only printed the trimmed values and left the data unchanged.
# str_squish() removes leading/trailing whitespace and collapses every
# internal run of whitespace to a single space.
hospital$area1 <- str_squish(hospital$area1)
hospital$area2 <- str_squish(hospital$area2)


hospital์—์„œ ์›ํ•˜๋Š” ์ •๋ณด(์ง€์—ญ๋ณ„ ๋ณ‘์› ๊ฐœ์ˆ˜) ์ถ”์ถœ

# Collapse the per-hospital rows into one row per (area1, area2) pair, with
# `count` holding the number of hospitals in that pair.
# .groups = "drop" returns an ungrouped result, so later steps do not run on
# leftover grouping and summarise() no longer reports grouped output.
hospital <- hospital %>%
  group_by(area1, area2) %>%
  dplyr::summarize(count = n(), .groups = "drop")

str(hospital) # now an ungrouped tibble (was grouped_df before .groups = "drop")

[경고] `summarise()` has grouped output by 'area1'. You can override using the `.groups` argument.
[해결] 마지막 변수 삭제 알리는 단순 경고메시지이므로 무시하고 진행

hospital์—์„œ area1๊ณผ area2๋ฅผ ๋”ํ•ด ใ…‡ใ…‡์‹œ ใ…‡ใ…‡๊ตฐ ํ˜•ํƒœ๋กœ ๋งŒ๋“ค๊ธฐ

# Join area1 and area2 with a single space into one `area` column.
# unite() removes the two input columns by default.
hospital <- unite(hospital, "area", c(area1, area2), sep = " ")
head(hospital)


두 데이터 프레임을 공통값(area:지역)으로 묶기

# Full outer join on `area`: all = TRUE keeps regions that appear in only one
# of the two data frames, filling the other side's columns with NA.
total <- merge(hospital, population, by = "area", all = TRUE)
head(total)


[문제점] 결측치 발생

# Number of NA cells produced by the outer join.
sum(is.na(total)) # result recorded by the author: 87


1. -출장소 지역은 총 병원 수 NA, 인구 수 0
2. -시, -도 단위의 총 병원 수가 출력되지 않음

[해결1] 출장소에는 병원과 인구가 모두 없으므로 의미없는 데이터라고 판단, 해당 행 모두 삭제

# Each statement is on its own line; previously everything after the first
# inline comment was commented out, so the rows were never removed.

# Turn every 0 in the table into NA (the author's intent: the all-zero
# population figures of branch-office regions).
# NOTE(review): this hits every column, so a genuine 0 elsewhere (e.g. an
# empty age bracket) also becomes NA and is counted by the checks below.
total[total == 0] <- NA

total[is.na(total$pop), ]           # inspect the rows about to be removed
total <- total[!is.na(total$pop), ] # drop rows with no population figure (pop)

sum(is.na(total))                   # result recorded by the author: 27
total[is.na(total$count), ]         # rows still lacking a hospital count


[ํ•ด๊ฒฐ2] sql์ด์šฉํ•ด์„œ ๊ฐ ์ง€์—ญ๋ณ„ ๋ณ‘์› ์ด ๊ฐœ์ˆ˜ ํ™•์ธํ•œ ๋’ค ์—‘์…€ํŒŒ์ผ๋กœ ์ง์ ‘ ์ˆ˜์น˜ ์ž…๋ ฅ.

# Export the merged table so the missing hospital counts can be filled in by
# hand outside R.
write.xlsx(
  total,
  "C:/ITWILL/2_Rwork/00/SemiProject/total.xlsx",
  col.names = TRUE,
  row.names = TRUE,
  append = FALSE
)

# Reads a different file from the one written above, resolved against the
# working directory.
# NOTE(review): presumably total2.xlsx is the manually edited copy of
# total.xlsx; since row.names = TRUE was used on export, check whether it
# carries an extra leading row-name column.
total <- read_excel("total2.xlsx")


도 단위 총 병원 수 또한 의미없는 데이터라고 판단, 해당 행 모두 삭제

# Drop every row that still has no hospital count, then confirm none remain.
total <- total[!is.na(total$count), ]
sum(is.na(total$count)) # result recorded by the author: 0

+ Recent posts