01. 트럼프 연설문(trump.txt)과 오바마 연설문(obama.txt)을 대상으로 빈도수가 2회 이상 단어를 대상으로 단어구름 시각화하시오.
[단계1], [단계4] ~ [단계8]

# Pick the Obama speech file interactively and read it line by line as UTF-8.
obama <- file(file.choose(), encoding = "UTF-8")
obama_data <- readLines(obama)
close(obama) # destroy the connection object instead of leaving it for GC
str(obama_data) # the original author noted 496 lines here
obama_data[1:6]


말뭉치

# Build a corpus from the speech lines (one element of obama_data per document).
myCorpus <- Corpus(VectorSource(obama_data))
myCorpus

inspect(myCorpus[100])

# Preprocessing: strip punctuation, strip digits, then lower-case the text.
myCorpusPrepro <- tm_map(myCorpus, removePunctuation)
myCorpusPrepro <- tm_map(myCorpusPrepro, removeNumbers)
# tolower is a base R function rather than a tm transformation, so it is
# wrapped in content_transformer() to keep the corpus documents intact.
myCorpusPrepro <- tm_map(myCorpusPrepro, content_transformer(tolower))

# Show the built-in English stop-word list, then remove those words.
stopwords("english")
myCorpusPrepro <- tm_map(myCorpusPrepro, removeWords, stopwords("english"))


전처리 결과 확인

# Show the corpus summary (the original note reads "Content: documents: 76"),
# then the first three preprocessed documents.
print(myCorpusPrepro)
inspect(myCorpusPrepro[seq_len(3)])


단어선별
(1) 한글 단어길이 2음절 ~ 8음절(한글 1개 2byte)

# Term-document matrix of the preprocessed corpus, built with the
# wordLengths = c(2, 8) control option.
myCorpusPrepro_term <- TermDocumentMatrix(
  myCorpusPrepro,
  control = list(wordLengths = c(2, 8))
)

print(myCorpusPrepro_term)


(2) Corpus -> 평서문 변환 : matrix -> data.frame 변경

# Convert the term-document matrix to a plain matrix, then to a data frame
# (terms in rows, documents in columns).
myTerm_df <- as.data.frame(
  as.matrix(myCorpusPrepro_term)
)
dim(myTerm_df) # the original author noted 1021 x 496


단어 빈도수
(1) 단어 빈도수 내림차순 정렬

# Total occurrences of each term across all documents, most frequent first.
wordResult <- sort(
  rowSums(myTerm_df),
  decreasing = TRUE
)
print(wordResult[seq_len(10)]) # ten most frequent terms


(2) 불용어 제거

# Extend the English stop-word list with transcript annotations
# ("applause", "cheers") and remove all of them from the corpus.
myStopwords <- c(stopwords("english"), "applause", "cheers")
myCorpusPrepro <- tm_map(myCorpusPrepro, removeWords, myStopwords)


(3) 단어 선별과 평서문 변환

# Rebuild the term-document matrix after the extra stop-word removal, now with
# wordLengths = c(4, 16). The original note describes this range as
# "2 to 8 syllables", i.e. it assumes 2 bytes per Hangul syllable.
myCorpusPrepro_term <- TermDocumentMatrix(
  myCorpusPrepro,
  control = list(wordLengths = c(4, 16))
)


(4) 말뭉치 객체를 평서문으로 변환

# Dense data-frame view of the rebuilt term-document matrix.
myTerm_df <- as.data.frame(
  as.matrix(myCorpusPrepro_term)
)


(5) 단어 출현 빈도수 구하기

# Per-term totals across documents, sorted from most to least frequent.
wordResult <- sort(
  rowSums(myTerm_df),
  decreasing = TRUE
)
print(wordResult[seq_len(10)])


단어 구름 시각화
(1) 단어 이름 생성 -> 빈도수의 이름

# The terms themselves are the names of the frequency vector.
myName <- names(x = wordResult)


(2) 단어이름과 빈도수로 data.frame 생성

# Pair every term with its frequency in a two-column data frame.
word.df <- data.frame(
  word = myName,
  freq = wordResult
)
str(word.df) # columns: word, freq
head(word.df)


(3) 단어 색상과 글꼴 지정

# Twelve-colour "Paired" palette for the word cloud.
# Alternative from the original note: pal <- brewer.pal(9, "Set1") (Set1 to Set3).
pal <- brewer.pal(n = 12, name = "Paired")


폰트 설정세팅 : "맑은 고딕", "서울남산체 B"

# Register the Windows font "맑은 고딕" (Malgun Gothic) under the family name
# "malgun". The font name must be real Hangul text; the mis-encoded literal
# did not name any installed font. Windows only.
windowsFonts(malgun = windowsFont("맑은 고딕"))


(4) 단어 구름 시각화: 크기,최소빈도수,순서,회전,색상,글꼴 지정

# Draw the word cloud: terms seen at least twice, most frequent terms placed
# first, 10% of words rotated, using the palette and the "malgun" font family.
wordcloud(word.df$word, word.df$freq,
          scale = c(3, 1), min.freq = 2,
          random.order = FALSE, # spelled out: F is an ordinary reassignable variable
          rot.per = .1, colors = pal, family = "malgun")

 





02. 공공데이터 사이트에서 관심분야 데이터 셋을 다운로드 받아서 빈도수가 5회 이상 단어를 이용하여 단어 구름으로 시각화 하시오. 공공데이터 사이트 : www.data.go.kr 또는 기타 사이트

# Pick the data file interactively and read it line by line as UTF-8.
women <- file(file.choose(), encoding = "UTF-8")
women_data <- readLines(women)
close(women) # destroy the connection object instead of leaving it for GC
str(women_data)


지정 불용어 제거 > 빈도수에서 확인해보기

# Delete the hand-picked stop words "여성" and "우리" from every line.
# The patterns must be real Hangul text; the mis-encoded literals could never
# match UTF-8 input.
women_data <- gsub("여성", "", women_data)
women_data <- gsub("우리", "", women_data)


단어 추출

# Extract the nouns from a sentence and return them as one string.
#
# x is coerced to character, passed to extractNoun(), and the resulting
# nouns are joined with single spaces.
exNouns <- function(x) {
  sentence <- as.character(x)
  nouns <- extractNoun(sentence)
  paste(nouns, collapse = " ")
}

# Extract the nouns of every line. vapply() guarantees a character vector
# (one string per line, named by the input line) even when women_data is
# empty, where sapply() would return an empty list instead.
women_nouns <- vapply(women_data, exNouns, character(1))
women_nouns


자료 전처리

# Wrap the extracted nouns in a corpus (one document per element) and print
# its summary.
womenCorpus <- Corpus(
  VectorSource(women_nouns)
)
print(womenCorpus)


내용보기

# Look at the first document of the corpus built above (the original referred
# to an undefined object, data_unlist).
inspect(womenCorpus[1])

# Preprocess womenCorpus step by step. Each step takes the previous step's
# result; the original started from the unrelated myCorpus and then restarted
# from the raw womenCorpus on every line, so only the last step took effect.
womenCorpusP <- tm_map(womenCorpus, removePunctuation)
womenCorpusP <- tm_map(womenCorpusP, removeNumbers)
# tolower is a base R function rather than a tm transformation, so it is
# wrapped in content_transformer().
womenCorpusP <- tm_map(womenCorpusP, content_transformer(tolower))

inspect(womenCorpusP[1])

 

전처리 결과 확인

# Print the preprocessed corpus summary and its first document.
print(womenCorpusP)
inspect(womenCorpusP[1])


단어 선별

# Term-document matrix of the preprocessed corpus with wordLengths = c(4, 16).
# NOTE(review): these bounds appear chosen on the assumption of 2 bytes per
# Hangul syllable; confirm how the installed tm version measures term length
# so that two-syllable nouns are not dropped unintentionally.
womenCorpusP_term <- TermDocumentMatrix(
  womenCorpusP,
  control = list(wordLengths = c(4, 16))
)
print(womenCorpusP_term)


평서문 변환

# Dense data-frame view of the term-document matrix (reuses the name myTerm_df).
myTerm_df <- as.data.frame(
  as.matrix(womenCorpusP_term)
)
dim(myTerm_df) # the original author noted 548 x 1
str(myTerm_df)


빈도수 구하기

# Per-term totals across documents, most frequent first.
wordResult <- sort(
  rowSums(myTerm_df),
  decreasing = TRUE
)
print(wordResult[seq_len(10)])


시각화

# The terms are the names of the frequency vector.
myName <- names(x = wordResult)


(2) 단어이름과 빈도수로 data.frame 생성

# Two-column data frame pairing each term with its frequency.
word.df <- data.frame(
  word = myName,
  freq = wordResult
)
str(word.df)


(3) 단어 색상과 글꼴 지정

# Five-colour "Paired" palette for the word cloud.
pal <- brewer.pal(5, "Paired")
# Register the Windows font "맑은 고딕" (Malgun Gothic) as family "malgun".
# The font name must be real Hangul text; the mis-encoded literal did not name
# any installed font. Windows only.
windowsFonts(malgun = windowsFont("맑은 고딕"))


(4) 단어 구름 시각화: 크기,최소빈도수,순서,회전,색상,글꼴 지정

# Draw the word cloud: terms seen at least five times, most frequent terms
# placed first, 10% of words rotated, using the palette and "malgun" family.
wordcloud(word.df$word, word.df$freq,
          scale = c(2, 1), min.freq = 5,
          random.order = FALSE, # spelled out: F is an ordinary reassignable variable
          rot.per = .1, colors = pal, family = "malgun")
# Recompute the term frequencies from myCorpusPrepro (not womenCorpusP).
# This overwrites myTerm_df and wordResult with the values for that corpus.
myCorpusPrepro_term <- TermDocumentMatrix(
  myCorpusPrepro,
  control = list(wordLengths = c(4, 16))
)

myTerm_df <- as.data.frame(
  as.matrix(myCorpusPrepro_term)
)

wordResult <- sort(
  rowSums(myTerm_df),
  decreasing = TRUE
)
print(wordResult)

+ Recent posts