比较两个文档中的词袋并找到第二个文档中的匹配词及其频率答案

【问题标题】：Compare the bag of words in two document and find the matching word and their frequency in second document比较两个文档中的词袋并找到第二个文档中的匹配词及其频率
【发布时间】：2018-01-29 03:46:04
【问题描述】：

我计算了“yelp.csv”、“yelpp.csv”、“yelpn.csv”的词袋，并创建了个人数据集词频矩阵。现在，我想将 yelp 的词袋与 yelpn 进行比较，并检查 yelp 中有多少词出现在 yelpn 中以及它们的频率，并将其存储在一个变量中作为矩阵，然后对于 yelpp 也是如此。叫喊包含正面和负面。 yelpp，只有正面和 yelpn，只有负面。任何人都可以完成代码吗？我不知道这段代码是否相关，我希望如此。

getwd()
setwd("/Users/ash/RProjects/exc")
getwd()
df <- read.csv("yelp.CSV",header = TRUE,quote="\"",stringsAsFactors= TRUE,
           strip.white = TRUE)
df
dfd<-as.character(df[,2])
dfd
df2<-as.character(df[,1])
df2
words <- readLines(system.file("stopwords", "english.dat",
                           package = "tm"))
s<-remove_stopwords(dfd, words, lines = TRUE)
s
print(paste("****Stopwords are removed successfully****"))
n<-removeNumbers(s)
n
t<-removePunctuation(n, preserve_intra_word_dashes = FALSE)
t

#pos
dfp <- read.csv("yelpp.CSV",header = TRUE,quote="\"",stringsAsFactors= TRUE,
           strip.white = TRUE)
dfp
dfdp<-as.character(dfp[,2])
dfdp
df2p<-as.character(dfp[,1])
df2p
wordsp <- readLines(system.file("stopwords", "english.dat",
                           package = "tm"))
sp<-remove_stopwords(dfdp, words, lines = TRUE)
sp
print(paste("****Stopwords are removed successfully****"))
np<-removeNumbers(sp)
np
tp<-removePunctuation(np, preserve_intra_word_dashes = FALSE)
tp

#neg
dfn <- read.csv("yelpn.CSV",header = TRUE,quote="\"",stringsAsFactors=   TRUE,
           strip.white = TRUE)
dfn
dfdn<-as.character(dfn[,2])
dfdn
df2n<-as.character(dfn[,1])
df2n
wordsn <- readLines(system.file("stopwords", "english.dat",
                           package = "tm"))
sn<-remove_stopwords(dfdn, words, lines = TRUE)
sn
print(paste("****Stopwords are removed successfully****"))
nn<-removeNumbers(sn)
nn
tn<-removePunctuation(nn, preserve_intra_word_dashes = FALSE)
tn



#bag
b<-bag_o_words(t, apostrophe.remove = TRUE)
b
b.mat = as.matrix(b)
b.mat
bp<-bag_o_words(tp, apostrophe.remove = TRUE)
bp
bp.mat = as.matrix(bp)
bp.mat
bn<-bag_o_words(tn, apostrophe.remove = TRUE)
bn
bn.mat = as.matrix(bn)
bn.mat

#frequent terms
frequent_terms <- freq_terms(b.mat, 2000)
frequent_terms
frequent_termsp <- freq_terms(tp, 2000)
frequent_termsp
frequent_termsn <- freq_terms(tn, 2000)
frequent_termsn

【问题讨论】：

为每个文件创建语料库后，使用tm包，使用findFreqTerms(...)获取常用词列表，然后使用yelp*.csv比较这些常用词 findAssocs(...)
@parth 你能给我一个 findAssocs() 的例子吗？假设所有三个语料库的 findfreqterm() 变量为 o、p、q 和语料库 c1、c2、c3。当我在 findassocs() 中使用 corlimit 时，它会给出与它相关的术语。但是，我需要在 yelp*.csv 中重复的确切单词及其以整数表示的重复频率。
我的意思是，如果两个语料库相似，语料库中的常用词可能有更高的相似性，很快就会分享一个例子

标签： r text-mining tm qdap sentimentr

【解决方案1】：

我正在以来自wiki Text mining 的语料库为例。使用tm包和findFreqTerms,agrep函数是这种方法的要点。

agrep

使用广义的 Levenshtein 编辑距离（将一个字符串转换为另一个）。

方法步骤：

文本 -> 语料库 -> 数据清理 -> findfreqterms -> 与其他术语文档矩阵比较

library(tm)

c1 <- Corpus(VectorSource("Text mining, also referred to as text data mining, roughly equivalent to text analytics, is the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning"))

c2 <- Corpus(VectorSource("Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output"))

c3 <- Corpus(VectorSource("Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)"))

# Data Cleaning and transformation
c1 <- tm_map(c1, content_transformer(tolower))
c2 <- tm_map(c2, content_transformer(tolower))
c3 <- tm_map(c3, content_transformer(tolower))

c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, removeNumbers)
c1 <- tm_map(c1, removeWords, stopwords("english"))
c1 <- tm_map(c1, stripWhitespace)

c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, removeNumbers)
c2 <- tm_map(c2, removeWords, stopwords("english"))
c2 <- tm_map(c2, stripWhitespace)

c3 <- tm_map(c3, removePunctuation)
c3 <- tm_map(c3, removeNumbers)
c3 <- tm_map(c3, removeWords, stopwords("english"))
c3 <- tm_map(c3, stripWhitespace)

dtm1 <- DocumentTermMatrix(c1, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm2 <- DocumentTermMatrix(c2, control = list(weighting = weightTfIdf, stopwords = TRUE))
dtm3 <- DocumentTermMatrix(c3, control = list(weighting = weightTfIdf, stopwords = TRUE))

ft1 <- findFreqTerms(dtm1)
ft2 <- findFreqTerms(dtm2)
ft3 <- findFreqTerms(dtm3)

#similarity between c1 and c2
common.c1c2 <- data.frame(term = character(0), freq = integer(0))
for(t in ft1){
  find <- agrep(t, ft2)
  if(length(find) != 0){
    common.c1c2 <- rbind(common.c1c2, data.frame(term = t, freq = length(find)))
  }
}
# Note : this for loop can be substituted by apply family functions if taking time for large text

common.c1c2 包含 corpus1 和 corpus2 之间有频率的常用词

> common.c1c2
      term freq
1     also    1
2     data    2
3  derived    1
4 deriving    1
5   mining    1
6  pattern    1
7 patterns    1
8  process    1
9     text    1

> ft1
 [1] "also"        "analytics"   "data"        "derived"     "deriving"    "devising"    "equivalent" 
 [8] "highquality" "information" "learning"    "means"       "mining"      "pattern"     "patterns"   
[15] "process"     "referred"    "roughly"     "statistical" "text"        "trends"      "typically"  

> ft2
 [1] "addition"       "along"          "data"           "database"       "derived"        "deriving"      
 [7] "evaluation"     "features"       "finally"        "input"          "insertion"      "interpretation"
[13] "involves"       "linguistic"     "mining"         "others"         "output"         "parsing"       
[19] "patterns"       "process"        "removal"        "structured"     "structuring"    "subsequent"    
[25] "text"           "usually"        "within"

此解决方案不是最有效的解决方案，但希望对您有所帮助。

【讨论】：