
I was wondering how the results of different packages, and hence their algorithms, differ, and whether the parameters can be set in a way that generates similar topics. I looked at the packages text2vec and topicmodels in particular. Can text2vec and topicmodels generate similar topics with suitable parameter settings for LDA?

I used the code below to compare the 10 topics (for the terms, see the code section) generated by each package. I was not able to generate sets of topics with similar meaning across the packages. For example, topic 10 from text2vec has something to do with "police", while none of the topics generated by topicmodels refers to "police" or similar terms. Furthermore, among the topics created by text2vec I could not identify a counterpart to topic 5 created by topicmodels, which relates to "life love family war".

I am a beginner with LDA, so my understanding might sound naive to experienced programmers. Intuitively, however, I would have asserted that it should be possible to generate sets of topics with similar meaning, as a way to demonstrate the validity/robustness of the results: of course not exactly the same sets of terms, but term lists addressing similar topics.

Maybe my human interpretation of these term lists simply fails to capture the similarity, but perhaps there are parameters that would increase the similarity for a human interpreter. Can someone guide me on how to set the parameters to achieve this, or otherwise provide explanations or pointers to suitable resources that would improve my understanding of the issue?

Here are some issues that might be relevant:

  • I know that text2vec uses WarpLDA rather than the standard Gibbs sampling in topicmodels, which is already an algorithmic difference. If my understanding is correct, the priors alpha and delta used in topicmodels are set as doc_topic_prior and topic_word_prior, respectively, in text2vec.
  • Furthermore, for post-processing, text2vec allows adapting lambda to sort the terms of a topic based on their frequency. I have not yet understood how topicmodels sorts its terms; is that comparable to lambda = 1? (I tried different lambdas between 0 and 1 without obtaining similar topics; see the relevance sketch right after this list.)
  • Another issue is that it seems difficult to generate fully reproducible examples even when setting a seed (see, e.g., this question). This is not my question directly, but it might make responses harder.
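On the lambda point: if I read the text2vec documentation correctly, lambda in get_top_words follows the "relevance" measure of Sievert & Shirley (2014, the LDAvis paper), relevance = lambda*log(p(term|topic)) + (1-lambda)*log(p(term|topic)/p(term)), so that lambda = 1 ranks terms purely by their within-topic probability. A minimal sketch of this re-ranking on made-up numbers (my own helper, not the packages' internal code):

top_terms_by_relevance <- function(phi, pw, lambda = 1, n = 10) { 
    #phi: topics x terms matrix of p(term|topic); pw: marginal term probabilities 
    apply(phi, 1, function(phi_k) { 
        relevance <- lambda * log(phi_k) + (1 - lambda) * log(phi_k/pw) 
        names(sort(relevance, decreasing = TRUE))[seq_len(n)] 
    }) 
} 

#toy example: 2 topics, 5 terms; "film" is frequent in both topics 
phi <- rbind(c(0.40, 0.30, 0.20, 0.05, 0.05), 
             c(0.40, 0.05, 0.05, 0.30, 0.20)) 
colnames(phi) <- c("film", "police", "crime", "love", "war") 
top_terms_by_relevance(phi, colMeans(phi), lambda = 1, n = 3)   #ranked by within-topic frequency 
top_terms_by_relevance(phi, colMeans(phi), lambda = 0.3, n = 3) #corpus-wide frequent "film" is demoted 

With lambda = 1 the term "film" tops both toy topics; with lambda = 0.3 it is demoted in favor of topic-exclusive terms.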

Sorry for the long question, and thanks in advance for any help or suggestions.

UPDATE 2: I have turned the content of my first update into an answer based on a more complete analysis.

UPDATE: Following a helpful comment by Dmitriy Selivanov, the author of the text2vec package, I can confirm that setting lambda = 1 increases the similarity between the topic term lists generated by the two packages.

Furthermore, I had a closer look at the differences between the term lists created by both packages via a quick check of length(setdiff()) and length(intersect()) across topics (see the code below). This rough check shows that text2vec discards a number of terms per topic, presumably via a threshold on the per-topic term probabilities? topicmodels keeps all terms for all topics. This explains part of the differences in the meanings that can be (humanly) derived from the term lists.

As mentioned above, generating a reproducible example seems difficult, so I have not included all of the data in the code below. The run times are short, so anyone can check the results on their own system.

    library(text2vec) 
    library(topicmodels) 
    library(slam) #to convert dtm to simple triplet matrix for topicmodels 

    ntopics <- 10 
    alphaprior <- 0.1 
    deltaprior <- 0.001 
    niter <- 1000 
    convtol <- 0.001 
    set.seed(0) #for text2vec 
    seedpar <- 0 #for topicmodels 

    #Generate document term matrix with text2vec  
    tokens = movie_review$review[1:1000] %>% 
      tolower %>% 
      word_tokenizer 

    it = itoken(tokens, ids = movie_review$id[1:1000], progressbar = FALSE) 

    vocab = create_vocabulary(it) %>% 
      prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2) 

    vectorizer = vocab_vectorizer(vocab) 

    dtm = create_dtm(it, vectorizer, type = "dgTMatrix") 


    #LDA model with text2vec 
    lda_model = text2vec::LDA$new(n_topics = ntopics 
            ,doc_topic_prior = alphaprior 
            ,topic_word_prior = deltaprior 
           ) 

    doc_topic_distr = lda_model$fit_transform(x = dtm 
               ,n_iter = niter 
               ,convergence_tol = convtol 
               ,n_check_convergence = 25 
               ,progressbar = FALSE 
              )  


    #LDA model with topicmodels 
    #note: the control list must be passed via the control argument; 
    #topicmodels' Gibbs sampler runs a fixed number of iterations (no convergence 
    #tolerance), so convtol is only used by text2vec above 
    ldatopicmodels <- LDA(as.simple_triplet_matrix(dtm), k = ntopics, method = "Gibbs", 
          control = list(burnin = 100 
                ,delta = deltaprior 
                ,alpha = alphaprior 
                ,iter = niter 
                ,keep = 50 
                ,seed = seedpar 
                ,initialize = "seeded" 
          ) 
    ) 

    #show top 15 words 
    lda_model$get_top_words(n = 15, topic_number = c(1:10), lambda = 0.3) 
    #  [,1]  [,2]  [,3]  [,4]  [,5]   [,6]   [,7]   [,8]  [,9]   [,10]  
    # [1,] "finally" "men"  "know"  "video" "10"   "king"  "five"  "our"  "child"  "cop"  
    # [2,] "re"  "always" "ve"  "1"  "doesn"  "match"  "atmosphere" "husband" "later"  "themselves" 
    # [3,] "three"  "lost"  "got"  "head"  "zombie"  "lee"  "mr"   "comedy" "parents" "mary"  
    # [4,] "m"   "team"  "say"  "girls" "message" "song"  "de"   "seem" "sexual"  "average" 
    # [5,] "gay"  "here"  "d"   "camera" "start"  "musical" "may"  "man"  "murder"  "scenes"  
    # [6,] "kids"  "within" "funny"  "kill"  "3"   "four"  "especially" "problem" "tale"  "police"  
    # [7,] "sort"  "score"  "want"  "stupid" "zombies" "dance"  "quality" "friends" "television" "appears" 
    # [8,] "few"  "thriller" "movies" "talking" "movies"  "action"  "public"  "given" "okay"  "trying"  
    # [9,] "bit"  "surprise" "let"  "hard"  "ask"  "fun"  "events"  "crime" "cover"  "waiting" 
    # [10,] "hot"  "own"  "thinking" "horrible" "won"  "tony"  "u"   "special" "stan"  "lewis"  
    # [11,] "die"  "political" "nice"  "stay"  "open"  "twist"  "kelly"  "through" "uses"  "imdb"  
    # [12,] "credits" "success" "never"  "back"  "davis"  "killer"  "novel"  "world" "order"  "candy"  
    # [13,] "two"  "does"  "bunch"  "didn"  "completely" "ending"  "copy"  "show" "strange" "name"  
    # [14,] "otherwise" "beauty" "hilarious" "room"  "love"  "dancing" "japanese" "new"  "female"  "low"  
    # [15,] "need"  "brilliant" "lot"  "minutes" "away"  "convincing" "far"  "mostly" "girl"  "killing"  

    terms(ldatopicmodels, 15) 
    #  Topic 1  Topic 2 Topic 3  Topic 4 Topic 5 Topic 6  Topic 7  Topic 8  Topic 9 Topic 10 
    # [1,] "show"  "where" "horror"  "did"  "life" "such"  "m"   "films"  "man"  "seen"  
    # [2,] "years" "minutes" "pretty"  "10"  "young" "character" "something" "music"  "new"  "movies"  
    # [3,] "old"  "gets" "best"   "now"  "through" "while"  "re"  "actors"  "two"  "plot"  
    # [4,] "every" "guy"  "ending"  "why"  "love" "those"  "going"  "role"  "though" "better"  
    # [5,] "series" "another" "bit"   "saw"  "woman" "does"  "things" "performance" "big"  "worst"   
    # [6,] "funny" "around" "quite"  "didn" "us"  "seems"  "want"  "between"  "back"  "interesting" 
    # [7,] "comedy" "nothing" "little"  "say"  "real" "book"  "thing"  "love"  "action" "your"  
    # [8,] "again" "down" "actually"  "thought" "our"  "may"  "know"  "play"  "shot"  "money"  
    # [9,] "tv"  "take" "house"  "still" "war"  "work"  "ve"  "line"  "together" "hard"  
    # [10,] "watching" "these" "however"  "end"  "father" "far"  "here"  "actor"  "against" "poor"  
    # [11,] "cast"  "fun"  "cast"   "got"  "find" "scenes" "doesn"  "star"  "title" "least"  
    # [12,] "long"  "night" "entertaining" "2"  "human" "both"  "look"  "never"  "go"  "say"   
    # [13,] "through" "scene" "must"   "am"  "shows" "yet"  "isn"  "played"  "city"  "director" 
    # [14,] "once"  "back" "each"   "done" "family" "audience" "anything" "hollywood" "came"  "probably" 
    # [15,] "watched" "dead" "makes"  "3"  "mother" "almost" "enough" "always"  "match" "video" 

#UPDATE 

#number of terms in each model is the same 
length(ldatopicmodels@terms) 
# [1] 2170 
nrow(vocab) 
# [1] 2170 

#number of NA entries for termlist of first topic differs 
sum(is.na(
      lda_model$get_top_words(n = nrow(vocab), topic_number = c(1:10), lambda = 1)[,1] 
     ) 
    ) 
#[1] 1778 

sum(is.na(
      terms(ldatopicmodels, length(ldatopicmodels@terms)) 
     ) 
    ) 
#[1] 0 


#function to check number of terms that differ between two sets of topic collections (excluding NAs) 
lengthsetdiff <- function(x, y) { 

    apply(x, 2, function(i) { 

    apply(y, 2, function(j) { 

     length(setdiff(i[!is.na(i)],j[!is.na(j)])) 
    }) 

    }) 

} 


#apply the check 
    termstopicmodels <- terms(ldatopicmodels, length(ldatopicmodels@terms)) 
termstext2vec <- lda_model$get_top_words(n = nrow(vocab), topic_number = c(1:10), lambda = 1) 


lengthsetdiff(termstopicmodels, 
      termstopicmodels) 
# Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10 
# Topic 1  0  0  0  0  0  0  0  0  0  0 
# Topic 2  0  0  0  0  0  0  0  0  0  0 
# Topic 3  0  0  0  0  0  0  0  0  0  0 
# Topic 4  0  0  0  0  0  0  0  0  0  0 
# Topic 5  0  0  0  0  0  0  0  0  0  0 
# Topic 6  0  0  0  0  0  0  0  0  0  0 
# Topic 7  0  0  0  0  0  0  0  0  0  0 
# Topic 8  0  0  0  0  0  0  0  0  0  0 
# Topic 9  0  0  0  0  0  0  0  0  0  0 
# Topic 10  0  0  0  0  0  0  0  0  0  0 

lengthsetdiff(termstext2vec, 
       termstext2vec) 
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] 
# [1,] 0 340 318 335 292 309 320 355 294 322 
# [2,] 355 0 321 343 292 319 311 346 302 339 
# [3,] 350 338 0 316 286 309 311 358 318 322 
# [4,] 346 339 295 0 297 310 301 335 309 332 
# [5,] 345 330 307 339 0 310 310 354 309 333 
# [6,] 350 345 318 340 298 0 311 342 308 325 
# [7,] 366 342 325 336 303 316 0 364 311 325 
# [8,] 355 331 326 324 301 301 318 0 311 335 
# [9,] 336 329 328 340 298 309 307 353 0 314 
# [10,] 342 344 310 341 300 304 299 355 292  0 

lengthsetdiff(termstopicmodels, 
       termstext2vec) 
# Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10 
# [1,] 1778 1778 1778 1778 1778 1778 1778 1778 1778  1778 
# [2,] 1793 1793 1793 1793 1793 1793 1793 1793 1793  1793 
# [3,] 1810 1810 1810 1810 1810 1810 1810 1810 1810  1810 
# [4,] 1789 1789 1789 1789 1789 1789 1789 1789 1789  1789 
# [5,] 1831 1831 1831 1831 1831 1831 1831 1831 1831  1831 
# [6,] 1819 1819 1819 1819 1819 1819 1819 1819 1819  1819 
# [7,] 1824 1824 1824 1824 1824 1824 1824 1824 1824  1824 
# [8,] 1778 1778 1778 1778 1778 1778 1778 1778 1778  1778 
# [9,] 1820 1820 1820 1820 1820 1820 1820 1820 1820  1820 
# [10,] 1798 1798 1798 1798 1798 1798 1798 1798 1798  1798 

lengthsetdiff(termstext2vec, 
       termstopicmodels) 
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] 
# Topic 1  0 0 0 0 0 0 0 0 0  0 
# Topic 2  0 0 0 0 0 0 0 0 0  0 
# Topic 3  0 0 0 0 0 0 0 0 0  0 
# Topic 4  0 0 0 0 0 0 0 0 0  0 
# Topic 5  0 0 0 0 0 0 0 0 0  0 
# Topic 6  0 0 0 0 0 0 0 0 0  0 
# Topic 7  0 0 0 0 0 0 0 0 0  0 
# Topic 8  0 0 0 0 0 0 0 0 0  0 
# Topic 9  0 0 0 0 0 0 0 0 0  0 
# Topic 10 0 0 0 0 0 0 0 0 0  0 

#also the intersection can be checked between the two sets 
lengthintersect <- function(x, y) { 

    apply(x, 2, function(i) { 

    apply(y, 2, function(j) { 

     length(intersect(i[!is.na(i)], j[!is.na(j)])) 
    }) 

    }) 

} 

lengthintersect(termstopicmodels, 
       termstext2vec) 

# Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10 
# [1,]  392  392  392  392  392  392  392  392  392  392 
# [2,]  377  377  377  377  377  377  377  377  377  377 
# [3,]  360  360  360  360  360  360  360  360  360  360 
# [4,]  381  381  381  381  381  381  381  381  381  381 
# [5,]  339  339  339  339  339  339  339  339  339  339 
# [6,]  351  351  351  351  351  351  351  351  351  351 
# [7,]  346  346  346  346  346  346  346  346  346  346 
# [8,]  392  392  392  392  392  392  392  392  392  392 
# [9,]  350  350  350  350  350  350  350  350  350  350 
# [10,]  372  372  372  372  372  372  372  372  372  372 

Also the optimal k can differ between the algorithms, depending on the metric used to find it. I suppose lambda = 1 corresponds to the way topicmodels sorts the terms. –


Thank you for the quick and helpful response. I have updated my question with some findings on the differences in the output. –

Answer


Even after updating my question with some comparison results, I was still interested in the details. Therefore, I ran the LDA models on the full movie_review data set included in text2vec (5000 documents). To produce halfway realistic results, I introduced some mild preprocessing and stop-word removal. (Apologies for the long code example below.)

My conclusion is that some of the "good" topics (from a subjective standpoint) produced by the two packages are comparable to a certain extent (some topics are really not good and hard to compare). However, when looking at similar topics across the two packages, each produced different (subjective) associations per topic. So the standard Gibbs sampling and the WarpLDA algorithm seem to capture similar local regions of the given data, but the topics express a different "mood".
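To make this topic matching less dependent on my subjective ordering (done manually in the code below), one could compare the fitted topic-word distributions directly. A rough sketch with my own helper, assuming the two models fitted below and that both term matrices carry the vocabulary as column names (topic_word_distribution in text2vec, posterior()$terms in topicmodels; both are topics x terms):

jsd <- function(p, q) { 
    #Jensen-Shannon divergence between two discrete distributions 
    m <- (p + q)/2 
    kl <- function(a, b) sum(a * log(a/b), na.rm = TRUE) #0*log(0) terms are dropped 
    (kl(p, m) + kl(q, m))/2 
} 

phi_t2v <- ldatext2vec$topic_word_distribution 
phi_tm <- posterior(ldatopicmodels)$terms 
phi_tm <- phi_tm[, colnames(phi_t2v)] #align the term order of both matrices 

jsdmat <- outer(seq_len(nrow(phi_t2v)), seq_len(nrow(phi_tm)), 
                Vectorize(function(i, j) jsd(phi_t2v[i, ], phi_tm[j, ]))) 
apply(jsdmat, 1, which.min) #closest topicmodels topic for each text2vec topic 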

I see the main reason in the fact that the WarpLDA algorithm seems to discard terms, introducing NA values into the beta matrix (the term-topic distribution); see the example below. Hence, its faster convergence appears to be achieved at the cost of completeness.
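A quick way to check this claim directly (again using the models fitted below) is to count, per topic, the entries of the fitted term distributions that carry no probability mass:

beta_t2v <- ldatext2vec$topic_word_distribution 
beta_tm <- posterior(ldatopicmodels)$terms 

#text2vec: per topic, many terms without probability mass (NA or 0) 
rowSums(is.na(beta_t2v) | beta_t2v == 0) 
#topicmodels: expected to be 0 for every topic 
rowSums(is.na(beta_tm) | beta_tm == 0) 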

I do not want to judge which topics are subjectively "better", so I leave that to your own judgment.

One important limitation of this analysis is that I have not (yet) checked the results for an optimal number of topics; I only used k = 10. Hence, the comparability of the topics might increase for an optimal k, since the quality would improve in any case, and with it maybe the "mood".
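Finally, a sketch of how one might screen different values of k, using the log-likelihood of the topicmodels Gibbs fits as one possible criterion (dedicated packages such as ldatuning offer further metrics); it assumes dtm and control_Gibbs_topicmodels from the code below and takes correspondingly longer to run:

ks <- c(5, 10, 20, 40) 
fits <- lapply(ks, function(k) 
    LDA(as.simple_triplet_matrix(dtm), k = k, method = "Gibbs", 
        control = control_Gibbs_topicmodels)) 
sapply(fits, logLik) #pick the k where the log-likelihood levels off 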

library(text2vec) 
library(topicmodels) 
library(slam) #to convert dtm to simple triplet matrix for topicmodels 
library(LDAvis) 
library(tm) #for stopwords only 

ntopics <- 10 
alphaprior <- 0.1 
deltaprior <- 0.001 
niter <- 1000 
convtol <- 0.001 
set.seed(0) #for text2vec 
seedpar <- 0 #for topicmodels 

docs <- movie_review$review 

preproc_fun <- function(x) { 
    tolower(x) %>% 
    { gsub("[\\W]+", " ", ., perl=T) } %>%  #replace non-word characters with spaces 
    { gsub("[\\d]+", " ", ., perl=T) } %>%  #replace digits with spaces 
    { gsub(paste0("(?<=\\b)(\\w{1,", 2, "})(?=\\b)"), "", ., perl=T) } %>% #drop words with at most 2 characters 
    { gsub("\\s+", " ", . , perl=T) } %>%  #collapse multiple spaces 
    { gsub("^\\s|\\s$", "", ., perl=T) } %>% #trim leading/trailing whitespace 
    return() 
} 

#Generate document term matrix with text2vec  
tokens = docs %>% 
    preproc_fun %>% 
    word_tokenizer 

it = itoken(tokens, ids = movie_review$id, progressbar = FALSE) 

vocab = create_vocabulary(it, stopwords = tm::stopwords()) %>% 
    prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2) 

vectorizer = vocab_vectorizer(vocab) 

dtm = create_dtm(it, vectorizer, type = "dgTMatrix") 
dim(dtm) 
# [1] 5000 7407 

#LDA model with text2vec 
ldatext2vec = text2vec::LDA$new(n_topics = ntopics 
           ,doc_topic_prior = alphaprior 
           ,topic_word_prior = deltaprior 
) 

doc_topic_distr = ldatext2vec$fit_transform(x = dtm 
              ,n_iter = niter 
              ,convergence_tol = convtol 
              ,n_check_convergence = 25 
              ,progressbar = FALSE 
)  


control_Gibbs_topicmodels <- list(
          alpha = alphaprior 
          ,delta = deltaprior 
          ,iter = niter 
          ,burnin = 100 
          ,keep = 50 
          ,nstart = 1 
          ,best = TRUE 
          ,seed = seedpar 
          ) 

#LDA model with topicmodels 
ldatopicmodels <- LDA(as.simple_triplet_matrix(dtm) 
         ,k = ntopics 
         ,method = "Gibbs" 
         ,control = control_Gibbs_topicmodels 
         ) 


#I ordered the topics manually after printing the top 15 terms, putting similar topics (at least from my subjective standpoint) first 
topicsterms_ldatopicmodels <- terms(ldatopicmodels, length(ldatopicmodels@terms))[,c(6,8,10,3,5,9,7,4,1,2)] 
topicsterms_ldatext2vec <- ldatext2vec$get_top_words(n = nrow(vocab), topic_number = c(1:10), lambda = 1)[, c(9,6,4,10,5,3,7,2,8,1)] 

#show top 15 words 
topicsterms_ldatext2vec[1:15,] 
#  [,1]  [,2]   [,3]   [,4]  [,5]  [,6]  [,7] [,8]  [,9]  [,10]  
# [1,] "show"  "performance" "films"  "war"  "horror" "say"  "man" "love"  "know"  "man"  
# [2,] "series" "role"  "director" "american" "killer" "better" "back" "life"  "say"  "woman" 
# [3,] "funny" "films"  "scenes"  "book"  "doesn" "nothing" "last" "big"  "life"  "life" 
# [4,] "still" "music"  "audience" "may"  "little" "watching" "match" "real"  "didn"  "police" 
# [5,] "original" "love"  "though"  "world" "isn"  "know"  "big" "women" "going"  "father" 
# [6,] "years" "cast"  "may"   "young" "guy"  "worst"  "men" "job"  "now"  "world" 
# [7,] "version" "john"  "quite"  "family" "actually" "didn"  "takes" "black" "something" "black" 
# [8,] "episode" "play"  "real"  "mother" "gets"  "something" "woman" "new"  "things" "wife" 
# [9,] "now"  "man"   "seems"  "true"  "dead"  "actors" "take" "money" "back"  "goes" 
# [10,] "dvd"  "played"  "work"  "years" "look"  "minutes" "young" "work"  "saw"  "new"  
# [11,] "saw"  "actor"  "scene"  "novel" "house" "films"  "life" "game"  "family" "without" 
# [12,] "old"  "excellent" "actors"  "however" "looks" "least"  "city" "world" "love"  "around" 
# [13,] "watching" "young"  "interesting" "small" "poor"  "script" "town" "still" "thought" "scene" 
# [14,] "watched" "perfect"  "rather"  "quite" "pretty" "budget" "dance" "comedy" "got"  "shot" 
# [15,] "better" "high"  "yet"   "history" "stupid" "lot"  "rock" "american" "thing"  "another" 

topicsterms_ldatopicmodels[1:15,] 
#  Topic 6 Topic 8  Topic 10 Topic 3  Topic 5 Topic 9 Topic 7 Topic 4 Topic 1 Topic 2  
# [1,] "show" "performance" "films"  "war"   "horror" "funny" "man"  "love"  "life" "little" 
# [2,] "years" "role"   "director" "american" "house" "better" "wife" "book"  "love" "music"  
# [3,] "series" "cast"   "something" "documentary" "scene" "say"  "gets" "films" "world" "action" 
# [4,] "now"  "actor"  "enough" "part"  "killer" "know"  "father" "version" "young" "fun"  
# [5,] "episode" "play"   "doesn"  "world"  "sex" "watching" "back" "still" "family" "big"  
# [6,] "old"  "performances" "nothing" "history"  "scenes" "thing" "goes" "original" "real" "rock"  
# [7,] "back" "comedy"  "actually" "america"  "gore" "pretty" "new"  "quite" "may" "king"  
# [8,] "love" "played"  "things" "new"   "blood" "guy"  "woman" "music" "man" "animation" 
# [9,] "saw"  "director"  "seems"  "hollywood" "around" "didn"  "later" "years" "work" "films"  
# [10,] "shows" "job"   "know"  "japanese" "little" "got"  "home" "scenes" "little" "black"  
# [11,] "new"  "john"   "without" "white"  "woman" "worst" "money" "old"  "lives" "song"  
# [12,] "family" "actors"  "real"  "shot"  "night" "thought" "son"  "scene" "mother" "pretty" 
# [13,] "dvd"  "star"   "far"  "despite"  "dead" "wasn"  "police" "better" "men" "quite"  
# [14,] "still" "excellent" "might"  "still"  "zombie" "minutes" "husband" "bit"  "find" "musical" 
# [15,] "know" "work"   "fact"  "early"  "scary" "stupid" "town" "times" "women" "effects" 

#the number of total terms is the same for each model; 
#however, ldatext2vec from text2vec contains NA values 
length(ldatopicmodels@terms) 
# [1] 7407 
length(ldatopicmodels@terms[ !is.na(ldatopicmodels@terms)]) 
# [1] 7407 

terms_ldatext2vec <- unique(as.character(topicsterms_ldatext2vec)) 
length(terms_ldatext2vec) 
# [1] 7408 
length(terms_ldatext2vec[!is.na(terms_ldatext2vec)]) 
# [1] 7407 

#number of NA entries in topic/termlists of text2vec ldatext2vec 
dim(topicsterms_ldatext2vec) 
#[1] 7407 10 
sum(is.na(topicsterms_ldatext2vec)) 
# [1] 60368 
#share of NA values 
sum(is.na(topicsterms_ldatext2vec))/(dim(topicsterms_ldatext2vec)[1]*dim(topicsterms_ldatext2vec)[2]) 
#[1] 0.8150128 

#no NA values in ldatopicmodels 
sum(is.na(terms(ldatopicmodels, length(ldatopicmodels@terms)))) 
#[1] 0 

#function to check number of terms that differ between two sets of topic collections (excluding NAs) 
lengthsetdiff <- function(x, y) { 
    apply(x, 2, function(i) { 
    apply(y, 2, function(j) { 
     length(setdiff(i[!is.na(i)],j[!is.na(j)])) 
    }) 
    }) 
} 

#also the intersection can be checked between the two sets 
lengthintersect <- function(x, y) { 
    apply(x, 2, function(i) { 
    apply(y, 2, function(j) { 
     length(intersect(i[!is.na(i)], j[!is.na(j)])) 
    }) 
    }) 
} 

#since especially the top words are of interest, we first check the intersection of top 20 words 
#please note that the order of the topics, especially the last 3 is subjective 
lengthintersect(topicsterms_ldatopicmodels[1:20,], 
       topicsterms_ldatext2vec[1:20,]) 
#   Topic 6 Topic 8 Topic 10 Topic 3 Topic 5 Topic 9 Topic 7 Topic 4 Topic 1 Topic 2 
# [1,]  13  1  0  2  0  3  1  7  1  2 
# [2,]  1  9  1  0  0  0  2  4  5  4 
# [3,]  0  4  8  0  2  0  0  4  3  2 
# [4,]  3  0  0  5  2  0  1  5  6  2 
# [5,]  1  0  3  0  7  7  1  1  1  2 
# [6,]  2  3  6  0  0  10  0  3  0  1 
# [7,]  4  2  1  2  1  0  8  1  4  3 
# [8,]  3  4  2  5  1  1  2  3  8  5 
# [9,]  10  0  4  0  0  8  1  3  3  1 
# [10,]  1  0  1  3  3  0  7  1  5  2 



#apply the check with the topics ordered as shown above for the top 15 words 

#all terms appear in each topic 
lengthsetdiff(topicsterms_ldatopicmodels, 
       topicsterms_ldatopicmodels) 
#    Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10 
# Topic 1  0  0  0  0  0  0  0  0  0  0 
# Topic 2  0  0  0  0  0  0  0  0  0  0 
# Topic 3  0  0  0  0  0  0  0  0  0  0 
# Topic 4  0  0  0  0  0  0  0  0  0  0 
# Topic 5  0  0  0  0  0  0  0  0  0  0 
# Topic 6  0  0  0  0  0  0  0  0  0  0 
# Topic 7  0  0  0  0  0  0  0  0  0  0 
# Topic 8  0  0  0  0  0  0  0  0  0  0 
# Topic 9  0  0  0  0  0  0  0  0  0  0 
# Topic 10  0  0  0  0  0  0  0  0  0  0 

#not all words appear in each topic 
lengthsetdiff(topicsterms_ldatext2vec , 
       topicsterms_ldatext2vec) 
#  [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] 
# [1,] 0 1188 1216 1241 1086 1055 1196 1131 1126 1272 
# [2,] 1029 0 1203 1223 1139 1073 1188 1140 1188 1260 
# [3,] 1032 1178 0 1224 1084 1024 1186 1122 1164 1238 
# [4,] 1075 1216 1242 0 1175 1139 1202 1152 1207 1271 
# [5,] 1011 1223 1193 1266 0 1082 1170 1170 1160 1214 
# [6,] 993 1170 1146 1243 1095 0 1178 1119 1092 1206 
# [7,] 1078 1229 1252 1250 1127 1122 0 1200 1195 1227 
# [8,] 1030 1198 1205 1217 1144 1080 1217 0 1171 1211 
# [9,] 966 1187 1188 1213 1075 994 1153 1112 0 1198 
# [10,] 1095 1242 1245 1260 1112 1091 1168 1135 1181  0 

#difference of terms in topics per topic between the two models 
lengthsetdiff(topicsterms_ldatopicmodels, 
       topicsterms_ldatext2vec) 
#   Topic 6 Topic 8 Topic 10 Topic 3 Topic 5 Topic 9 Topic 7 Topic 4 Topic 1 Topic 2 
# [1,] 6157 6157  6157 6157 6157 6157 6157 6157 6157 6157 
# [2,] 5998 5998  5998 5998 5998 5998 5998 5998 5998 5998 
# [3,] 5973 5973  5973 5973 5973 5973 5973 5973 5973 5973 
# [4,] 5991 5991  5991 5991 5991 5991 5991 5991 5991 5991 
# [5,] 6082 6082  6082 6082 6082 6082 6082 6082 6082 6082 
# [6,] 6095 6095  6095 6095 6095 6095 6095 6095 6095 6095 
# [7,] 6039 6039  6039 6039 6039 6039 6039 6039 6039 6039 
# [8,] 6056 6056  6056 6056 6056 6056 6056 6056 6056 6056 
# [9,] 5997 5997  5997 5997 5997 5997 5997 5997 5997 5997 
# [10,] 5980 5980  5980 5980 5980 5980 5980 5980 5980 5980 

lengthsetdiff(topicsterms_ldatext2vec, 
       topicsterms_ldatopicmodels) 
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] 
# Topic 6  0 0 0 0 0 0 0 0 0  0 
# Topic 8  0 0 0 0 0 0 0 0 0  0 
# Topic 10 0 0 0 0 0 0 0 0 0  0 
# Topic 3  0 0 0 0 0 0 0 0 0  0 
# Topic 5  0 0 0 0 0 0 0 0 0  0 
# Topic 9  0 0 0 0 0 0 0 0 0  0 
# Topic 7  0 0 0 0 0 0 0 0 0  0 
# Topic 4  0 0 0 0 0 0 0 0 0  0 
# Topic 1  0 0 0 0 0 0 0 0 0  0 
# Topic 2  0 0 0 0 0 0 0 0 0  0 