# Sample corpus from the question: one sentence per row in a single
# 'text' column.
df = pd.DataFrame(
    data=[
        "Anyone who reads Old and Middle English literary texts will be familiar with the mid-brown volumes of the EETS, with the symbol of Alfreds jewel embossed on the front cover",
        "Most of the works attributed to King Alfred or to Aelfric, along with some of those by bishop Wulfstan and much anonymous prose and verse from the pre-Conquest period, are to be found within the Society's three series",
        "all of the surviving medieval drama, most of the Middle English romances, much religious and secular prose and verse including the English works of John Gower, Thomas Hoccleve and most of Caxton's prints all find their place in the publications",
        "Without EETS editions, study of medieval English texts would hardly be possible.",
    ],
    columns=['text'],
)

また、問題についてもう少し説明し、予想される出力を追加できますか? –


文とトークンリストの類似度を計算し、各トークンリストに対して最も類似度の高い文を出力文として選択します。あるいは、より簡単な方法として、各トークンリストのトークンが文中に出現する回数を数え、出現回数が最大の文をそのトークンリストの出力として選択します。 – mutux



以前述べたように、この投稿は私の問題の一例です。私はクラスタリング問題に取り組んでおり、そのためにLDAとK-meansアルゴリズムを使用しました。トークンリストに適した文を見つけるために、K-meansの距離(各文からクラスタ中心までの距離)を使用しました。

import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer 
import lda 
from sklearn.feature_extraction.text import CountVectorizer 
import logging 
from sklearn.cluster import MiniBatchKMeans 
from sklearn import preprocessing 

# Sample corpus: each row pairs a sentence ('text') with its list of stemmed
# keywords ('tokens'). The keyword lists are collapsed into one
# comma-separated string per row so they can be fed to the text vectorizers.
df = pd.DataFrame(
    {
        'text': [
            "Anyone who reads Old and Middle English literary texts will be familiar with the mid-brown volumes of the EETS, with the symbol of Alfreds jewel embossed on the front cover",
            "Most of the works attributed to King Alfred or to Aelfric, along with some of those by bishop Wulfstan and much anonymous prose and verse from the pre-Conquest period, are to be found within the Society's three series",
            "all of the surviving medieval drama, most of the Middle English romances, much religious and secular prose and verse including the English works of John Gower, Thomas Hoccleve and most of Caxton's prints all find their place in the publications",
            "Without EETS editions, study of medieval English texts would hardly be possible.",
        ],
        'tokens': [
            ['middl engl', 'mid-brown', 'symbol'],
            ["king", 'anonym', 'series'],
            ['mediev', 'romance', 'relig'],
            ['hocclev', 'edit', 'publ'],
        ],
    }
).assign(tokens=lambda frame: frame['tokens'].str.join(','))

# TF-IDF weighted unigram/bigram features of the joined token strings.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    min_df=1,
)
vz = vectorizer.fit_transform(df.tokens)

# Raw unigram/bigram counts of the same strings, built with the same settings.
cvectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    min_df=1,
)
cvz = cvectorizer.fit_transform(df.tokens)

# LDA settings: number of topics and number of sampling iterations.
n_topics, n_iter = 4, 2000

# Fit LDA on the count matrix; X_topics is what fit_transform returns for it.
lda_model = lda.LDA(n_iter=n_iter, n_topics=n_topics)
X_topics = lda_model.fit_transform(cvz)

num_clusters = 4

# First clustering: mini-batch K-means on the TF-IDF matrix `vz`.
# `fit` hands back the estimator itself, so `kmeans` is `kmeans_model`.
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                               init_size=1000, batch_size=1000, verbose=False,
                               max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)      # cluster label per document
kmeans_distances = kmeans.transform(vz)   # per-document distance to every centre

# Second clustering: same settings, but on the LDA output `X_topics`.
# A separate estimator is created for it: calling `kmeans_model.fit` again
# would refit the one shared object in place, leaving `kmeans` and `kmeans1`
# as two names for the same (topic-space) model and discarding the TF-IDF fit.
X_all = X_topics
kmeans_model1 = MiniBatchKMeans(**kmeans_model.get_params())
kmeans1 = kmeans_model1.fit(X_all)
kmeans_clusters1 = kmeans1.predict(X_all)      # cluster label per document
kmeans_distances1 = kmeans1.transform(X_all)   # per-document distance to every centre
# For one cluster of the second K-means run, print every sentence assigned to
# it and remember the sentence with the smallest distance to that cluster's
# centre.
#   d   -- maps 'Cluster<k>' to "distance: <min distance> <closest sentence>"
#   l   -- smallest distance seen so far for the inspected cluster
#   num -- label of the cluster being inspected
d = dict()
# Start from infinity so the first matching sentence always becomes the
# current best; a fixed starting value of 1 would be reported as a distance
# whenever every real distance is >= 1.
l = float("inf")
# Bound before the loop so the summary print below works even if the frame
# has no rows.
num = 3

for i, desc in enumerate(df.text):
    # NOTE(review): only the first three rows are examined, exactly as before;
    # confirm whether skipping the remaining rows is intended.
    if i >= 3:
        continue
    cluster = kmeans_clusters1[i]
    if cluster != num:
        continue
    distance = kmeans_distances1[i][cluster]
    if distance < l:
        # Record distance and sentence together, so the stored text is always
        # the sentence the stored distance belongs to.
        l = distance
        d['Cluster' + str(cluster)] = "distance: " + str(l) + " " + desc
    print("Cluster " + str(cluster) + ": " + desc +
          "(distance: " + str(distance) + ")")
# Summary line: closest sentence for the inspected cluster (None if no row matched).
print("Cluster " + str(num) + " " + str(d.get('Cluster' + str(num))))
