import numpy as np 
    from copy import deepcopy 
    from string import punctuation 
    from random import shuffle 
    import chardet 
    from sklearn.manifold import TSNE 
    from sklearn.preprocessing import scale 

    import bokeh.plotting as bp 
    from bokeh.models import HoverTool, BoxSelectTool 
    from bokeh.plotting import figure, show, output_notebook 

    import gensim 
    from gensim.models.word2vec import Word2Vec 
    LabeledSentence = gensim.models.doc2vec.LabeledSentence 

    import pandas as pd 
    pd.options.mode.chained_assignment = None 

    from tqdm import tqdm 

    from nltk.tokenize import TweetTokenizer 
    tokenizer = TweetTokenizer() 

    from sklearn.model_selection import train_test_split 
    from sklearn.feature_extraction.text import TfidfVectorizer 

    def ingest(filename): 
     with open(filename, 'rb') as f: 
      result = chardet.detect(f.read()) 
     data = pd.read_csv(filename, encoding=result['encoding']) 
     data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True) 
     data = data[data.Sentiment.isnull() == False] 
     data['Sentiment'] = data['Sentiment'].map({4:1, 0:0}) 
     data = data[data['SentimentText'].isnull() == False] 
     data.drop('index', axis=1, inplace=True) 
     print('dataset loaded with shape {}', format(data.shape)) 

     return data 

    def tokenize(tweet): 
      tweet = unicode(tweet.decode('utf-8').lower()) 
      tokens = tokenizer.tokenize(tweet) 
      tokens = filter(lambda t: not t.startswith('@'), tokens) 
      tokens = filter(lambda t: not t.startswith('#'), tokens) 
      tokens = filter(lambda t: not t.startswith('http'), tokens) 
      return tokens 
      return 'NC' 

    def postprocess(data, n=100): 
     data = data.head(n) 
     data['tokens'] = data['SentimentText'].progress_map(tokenize) 
     data = data[data.tokens != 'NC'] 
     data.drop('index', inplace=True, axis=1) 
     return data 

    def labelizeTweets(tweets, label_type): 
     labelized = [] 
     for i,v in enumerate(tweets): 
      label = '%s_%s'%(label_type,i) 
      labelized.append(LabeledSentence(v, [label])) 
     return labelized 

    def labelizeTweets(tweets, label_type): 
     labelized = [] 
     for i,v in tqdm(enumerate(tweets)): 
      label = '%s_%s'%(label_type,i) 
      labelized.append(LabeledSentence(v, [label])) 
     return labelized 

    def buildWordVector(tokens, size): 
     vec = np.zeros(size).reshape((1, size)) 
     count = 0. 
     for word in tokens: 
       vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] 
       count += 1. 
      except KeyError: 

     if count != 0: 
      vec /= count 
     return vec 

    if __name__ == '__main__': 

     filename = './training.csv' 

     #n = 1000000 
     n = 100 
     n_dim = 200 

     data = ingest(filename) 
     #data = data.head(5) 
     data = postprocess(data, n) 

     x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2) 

     print("training length X", len(x_train)) 

     print("training length Y", len(y_train)) 

     x_train = labelizeTweets(x_train, 'TRAIN') 
     x_test = labelizeTweets(x_test, 'TEST') 

     print("jljkjkjlkjlj", len(x_train)) 

     tweet_w2v = Word2Vec(size=n_dim, min_count=10) 
     #tweet_w2v.build_vocab([x.words for x in tqdm(x_train)]) 
     tweet_w2v.build_vocab([x.words for x in x_train]) 

     #tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter) 
     tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter) 


     if True: 
      print('building tf-idf matrix ...') 
      vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10) 
      matrix = vectorizer.fit_transform([x.words for x in x_train]) 
      tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)) 
      print('vocab size :', len(tfidf)) 

      train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))]) 
      train_vecs_w2v = scale(train_vecs_w2v) 

      test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))]) 
      test_vecs_w2v = scale(test_vecs_w2v) 

      model = Sequential() 
      model.add(Dense(32, activation='relu', input_dim=200)) 
      model.add(Dense(1, activation='sigmoid')) 

      model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2) 

      score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2) 
      print (score[1]) 

    plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors", 
     x_axis_type=None, y_axis_type=None, min_border=1) 

    word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]] 

    tsne_model = TSNE(n_components=2, verbose=1, random_state=0) 
    tsne_w2v = tsne_model.fit_transform(word_vectors) 

    tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y']) 
    tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000] 

    plot_tfidf.scatter(x='x', y='y', source=tsne_df) 
    hover = plot_tfidf.select(dict(type=HoverTool)) 
    hover.tooltips={"word": "@words"} 


C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial 
    warnings.warn("detected Windows; aliasing chunkize to chunkize_serial") 
dataset loaded with shape {} (505, 2) 
progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s] 
training length X 0 
training length Y 0 
0it [00:00, ?it/s] 
0it [00:00, ?it/s] 
jljkjkjlkjlj 0 
Traceback (most recent call last): 
    File "Sentiment_Analysis.py", line 127, in <module> 
    tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter) 
    File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train 
    raise RuntimeError("you must first build vocabulary before training the model") 
RuntimeError: you must first build vocabulary before training the model 

これを試すことができますか? :tweet_w2v = Word2Vec(文章= [x_trainのxのx.words、サイズ= n_dim、min_count = 10) –


またはそれ以上 - build_vocabを呼び出した後にtweet_w2v.wv.vocabを印刷/検査できますか?あなたはスタックトレースを持っていますか? –


私は印刷物( "トレーニング長X"、len(x_train))を返して0を返すので、問題はコード内にあると思います。 – user573014




私の回避策は、3.xの代わりにPython 2.7で同じ正確なコードを実行するとスムーズに実行されるということでした。しかし、Python 3.xに移植することができれば、より速いデータ/メモリアクセス速度が得られ、これは非常に望ましいことです。

編集:問題が見つかりました。現在はPython 3でも動作します。これに対応するコードセグメントを編集し、問題なくビルドする必要があります。

def tokenize(tweet): 
      tweet = unicode(tweet.decode('utf-8').lower()) 
      tokens = tokenizer.tokenize(tweet) 
      tokens = list(filter(lambda t: not t.startswith('@'), tokens)) 
      tokens = list(filter(lambda t: not t.startswith('#'), tokens)) 
      tokens = list(filter(lambda t: not t.startswith('http'), tokens)) 
      return tokens 
      return 'NC' 