1. 程式人生 > R語言:詞雲圖

R語言:詞雲圖

這是當時在琢磨文字挖掘時用到的小技術,貼出來分享一下。

library(Rwordseg) # Chinese word-segmentation package

# Import the data; keep text columns as plain character vectors.
data <- read.csv("C:\\Users\\hormy\\Documents\\諮詢資料.csv", stringsAsFactors = FALSE)

# Strip digits, English letters and underscores from the text column.
data$內容 <- gsub("[a-z0-9A-Z_]", "", data$內容)

# Segment the text into words with Rwordseg. Per the original author's note,
# a custom segmentation dictionary file in the working directory was added
# by hand beforehand.
words <- segmentCN(data$內容)

# Read the stop-word list stopwordsCN.txt (one entry per line, looked up in
# the working directory) and make sure the entries are UTF-8 encoded.
# readLines() already returns a character vector, so no extra coercion is
# needed.
stopwordsCN <- readLines("stopwordsCN.txt")
stopwordsCN <- enc2utf8(stopwordsCN)
# Keep only entries that carry a declared encoding.
# NOTE(review): Encoding() reports "unknown" for pure-ASCII strings, so any
# ASCII-only stop words are discarded by this filter as well - confirm that
# this is intended.
stopwordsCN <- stopwordsCN[Encoding(stopwordsCN) != "unknown"]

# Stop-word removal function.
#
# Drops every token of `x` that appears in `stopwords`.
#
# Args:
#   x:         vector of tokens (normally a character vector).
#   stopwords: vector of words to discard.
#
# Returns:
#   A character vector with the surviving tokens of `x`, in their original
#   order and with duplicates kept; character(0) when `x` is empty or when
#   every token is filtered out. NA tokens are always dropped.
removeStopWords <- function(x, stopwords) {
  # %in% never yields NA, so an NA entry in `stopwords` can no longer make
  # every token look like a match (the element-wise `==` test used before
  # removed all tokens in that case). The mask is built in one vectorised
  # pass instead of growing the result one element at a time.
  keep <- !is.na(x) & !(x %in% stopwords)
  # c(character(0), ...) keeps the result a character vector even when `x`
  # is empty or NULL.
  c(character(0), x[keep])
}

# Remove stop words from each element of `words`.
words <- lapply(words, removeStopWords, stopwords = stopwordsCN)

# Word cloud.
library(wordcloud)
# Count word frequencies, sort them in ascending order and keep the last 250
# entries, i.e. the 250 most frequent words.
wordsnum <- tail(sort(table(unlist(words))), 250)
# Draw the word cloud.
# NOTE(review): the font family "myFont3" is not defined anywhere in the
# visible script - presumably it is registered elsewhere; confirm.
# NOTE(review): brewer.pal() is not loaded explicitly here; presumably it
# becomes available through library(wordcloud) - confirm.
wordcloud(
  names(wordsnum),
  as.vector(wordsnum),
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(8, "Dark2"),
  family = "myFont3"
)