R語言:詞雲圖
阿新 • • 發佈:2019-02-04
這是當時在琢磨文字挖掘時用到的小技巧,貼出來跟大家分享一下。
# Word-cloud pipeline: segment Chinese text, remove stop words, count word
# frequencies and plot the 250 most frequent words.
library(Rwordseg)  # Chinese word segmentation (segmentCN)
library(wordcloud) # word-cloud plotting

# Import the data.
data <- read.csv(
  "C:\\Users\\hormy\\Documents\\諮詢資料.csv",
  stringsAsFactors = FALSE
)

# Remove digits, Latin letters and underscores from the text column.
data$內容 <- gsub("[a-z0-9A-Z_]", "", data$內容)

# Segment the text into words with Rwordseg.
# NOTE(review): the original author mentions a manually added segmentation
# dictionary kept as a file in the working directory; nothing in this script
# installs it, so presumably it is registered elsewhere -- confirm.
words <- segmentCN(data$內容)

# Read the stop-word list (stopwordsCN.txt, one entry per line) and convert it
# to UTF-8.
stopwordsCN <- as.character(readLines("stopwordsCN.txt"))
stopwordsCN <- enc2utf8(stopwordsCN)
# Keep only entries that carry a declared encoding.
# NOTE(review): Encoding() reports "unknown" for pure-ASCII strings (including
# empty lines), so this filter also discards any ASCII-only stop words --
# confirm that is intended.
stopwordsCN <- stopwordsCN[Encoding(stopwordsCN) != "unknown"]

# Remove stop words from a vector of words.
#
# x:         character vector of words (one segmented document).
# stopwords: character vector of words to remove.
#
# Returns the elements of `x` that are not in `stopwords`, in their original
# order. NA words are dropped. Matching uses %in%, so an NA inside `stopwords`
# no longer causes every word to be removed, and no vector is grown
# element by element.
removeStopWords <- function(x, stopwords) {
  x[!is.na(x) & !(x %in% stopwords)]
}

# Remove stop words from every segmented document.
words <- lapply(words, removeStopWords, stopwordsCN)

# Count word frequencies over all documents and sort them in ascending order.
wordsnum <- table(unlist(words))
wordsnum <- sort(wordsnum)

# Keep the 250 most frequent words (the tail of the ascending sort).
wordsnum <- tail(wordsnum, 250)

# Draw the word cloud; random.order = FALSE plots words in decreasing
# frequency.
# NOTE(review): the font family "myFont3" is not defined in this script;
# presumably it is registered elsewhere (e.g. via windowsFonts()) -- confirm.
wordcloud(
  names(wordsnum),
  as.vector(wordsnum),
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(8, "Dark2"),
  family = "myFont3"
)