0
私は以下のファイルを作成しました(悪名高いコースラコース以降)。何か変わったかどうかは分かりませんが、今は動作していないようで、何も変えていません。テキスト解析プログラムが動作していましたが、今は動作しません
最初に動作しないように見えるのは、特殊文字を削除するforループです。
次に、Plan Text Docとして扱うと、雲という言葉は働きたくないようです。
最後に、トークナイザ関数は同じチャートを作成しています。本質的によく使用される単一の単語対プログラムされたngramです。それぞれのngramが同じチャートを作成していることを意味します。最も頻繁に使用される単語とngramの2,3,4単語など...
パッケージの更新やRの更新が原因であるかどうかは不明です。
どのような考えですか?
#Set working directory and read file
cname <- file.path("c:/texts")
cname
dir(cname)
setwd("c:/texts")
library("RColorBrewer")
library("tm")
library("knitr")
library("devtools")
library("plyr")
library("ggplot2")
library("wordcloud")
library("rJava")
library("RWeka")
library("stringi")
library("XLConnect")
library("XLConnectJars")
df<- readWorksheetFromFile("uars.xlsx", sheet=1, startRow=1)
df1 <- df[df$Business %in% "FRAUD", ]
#Load the R package for text mining and then load your texts into R.
library(tm)
docs <- Corpus(VectorSource(df1))
summary(docs)
#read your documents in the R terminal using
inspect(docs)
#Preprocessing
#Removing punctuation
docs <- tm_map(docs, removePunctuation)
# remove special characters.
for(j in seq(docs))
{
docs[[j]] <- gsub("/", " ", docs[[j]])
docs[[j]] <- gsub("@", " ", docs[[j]])
docs[[j]] <- gsub("\\|", " ", docs[[j]])
}
#Removing numbers:
docs <- tm_map(docs, removeNumbers)
#Converting to lowercase:
docs <- tm_map(docs, tolower)
#Removing "stopwords" (common words) that usually have no analytic value
docs <- tm_map(docs, removeWords, c(stopwords("english"), "bank", "account", "customer", "transactions", "sent", "received", "company",
"wire", "wires", "payment", "payments", "wells", "fargo", "transaction", "fraud", "wholesale", "wholesal", "uar", "email"))
#Removing common word endings (e.g., "ing", "es", "s")
library(SnowballC)
docs <- tm_map(docs, stemDocument)
#Stripping unnecesary whitespace from your documents:
docs <- tm_map(docs, stripWhitespace)
#treat your preprocessed documents as text documents.
docs <- tm_map(docs, PlainTextDocument)
#Stage the Data
#To proceed, create a document term matrix
dtm <- DocumentTermMatrix(docs)
dtm
inspect(dtm)
#transpose of this matrix
tdm <- TermDocumentMatrix(docs)
tdm
##TCorpus <- tm_map(TCorpus, removeWords, badWords)
wordcloud(docs, scale=c(3,0.5), min.freq=5, max.words=100, random.order=TRUE,
rot.per=0.5, colors=brewer.pal(8, "Set1"), use.r.layout=FALSE)
#Tokenizer functions
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
quadgram <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))
fivegram <- function(x) NGramTokenizer (x, Weka_control(min=5, max=5))
sixgram <- function(x) NGramTokenizer (x, Weka_control(min=6, max=6))
#Word/phrase count function
freq_df <- function(tdm){
# Helper function to tabulate frequency
freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
freq_df <- data.frame(word=names(freq), freq=freq)
return(freq_df)
}
#Creating the n-grams
corpus.unigram <- TermDocumentMatrix(docs)
corpus.unigram <- removeSparseTerms(corpus.unigram, 0.99)
corpus.unigram.freq <- freq_df(corpus.unigram)
corpus.bigram <- TermDocumentMatrix(docs, control=list(tokenize=bigram))
corpus.bigram <- removeSparseTerms(corpus.bigram, 0.999)
corpus.bigram.freq <- freq_df(corpus.bigram)
corpus.trigram <- TermDocumentMatrix(docs, control=list(tokenize=trigram))
corpus.trigram <- removeSparseTerms(corpus.trigram, 0.99)
corpus.trigram.freq <- freq_df(corpus.trigram)
corpus.quadgram <- TermDocumentMatrix(docs, control=list(tokenize=quadgram))
corpus.quadgram <- removeSparseTerms(corpus.quadgram, 0.9999)
corpus.quadgram.freq <- freq_df(corpus.quadgram)
corpus.fivegram <- TermDocumentMatrix(docs, control=list(tokenize=fivegram))
corpus.fivegram <- removeSparseTerms(corpus.fivegram, 0.9999)
corpus.fivegram.freq <- freq_df(corpus.fivegram)
corpus.sixgram <- TermDocumentMatrix(docs, control=list(tokenize=sixgram))
corpus.sixgram <- removeSparseTerms(corpus.sixgram, 0.9999)
corpus.sixgram.freq <- freq_df(corpus.sixgram)
top_50 <- function(df1, title, color) {
ggplot(df[1:50,], aes(x = seq(1:50), y = freq)) +
geom_bar(stat = "identity", fill = color, colour = "black", width = 0.80) +
coord_cartesian(xlim = c(0, 51)) +
labs(title = title) +
xlab("Words") +
ylab("Count") +
scale_x_continuous(breaks = seq(1, 50, by = 1), labels = df$word[1:50]) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
top_50(corpus.unigram.freq,"Top 50 words","green")
top_50(corpus.bigram.freq,"Top 2 word combos","yellow")
top_50(corpus.trigram.freq,"Top 3 word combos","orange")
top_50(corpus.quadgram.freq,"Top 4 word combos","red")
top_50(corpus.fivegram.freq,"Top 5 word combos","blue")
top_50(corpus.sixgram.freq,"Top 6 word combos","purple")