
I tried to create a word-embedding plot and got it working, but now I want to get the coordinates of each word vector in the plot. What should I do to find those coordinates? I am using Python and TensorFlow, and my code below is based on one of the examples on GitHub. How do I get the word-vector coordinates?

example

What I mean is that I want to get the coordinates of each of the words shown. I have attached my code below just in case.

# -*- coding: UTF-8 -*- 

from __future__ import absolute_import 
from __future__ import print_function 


import collections 
import math 
import os 
import random 
import zipfile 

import numpy as np 
from six.moves import urllib 
from six.moves import xrange # pylint: disable=redefined-builtin 
import tensorflow as tf 

import matplotlib 
import matplotlib.font_manager as fm 
font_location = "c:\\windows\\fonts\\malgun.ttf" 
font_name = fm.FontProperties(fname=font_location).get_name() 
matplotlib.rc('font', family=font_name) 


# Step 1 
filename = "text8.zip" 

def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = f.read(f.namelist()[0]).split()
    for i, item in enumerate(data):
        data[i] = item.decode('utf-8')
    return data

words = read_data(filename) 
print('Data size', len(words)) 

# Step 2 
vocabulary_size = 10000 

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words) 
del words # Hint to reduce memory. 
print('Most common words (+UNK)', count[:5]) 
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) 

data_index = 0 

# Step 3 
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1) 
for i in range(8): 
    print(batch[i], reverse_dictionary[batch[i]], 
     '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) 

# Step 4 

batch_size = 128 
embedding_size = 128 
skip_window = 1  
num_skips = 2   

valid_size = 16  
valid_window = 100 
valid_examples = np.random.choice(valid_window, valid_size, replace=False) 
num_sampled = 64  

graph = tf.Graph() 

with graph.as_default():

    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):

        # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss.
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0/math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                       num_sampled, vocabulary_size))

    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between the validation words and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings/norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

# Step 5 
num_steps = 100001 

with tf.Session(graph=graph) as session:

    tf.initialize_all_variables().run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbours
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)

    # The trained, length-normalized word vectors as a NumPy array.
    final_embeddings = normalized_embeddings.eval()


# Step 6 

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, tmlab in enumerate(labels):
        # slabel = tmlab.decode('utf-8')
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(tmlab,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

    plt.savefig(filename)

try: 
    from sklearn.manifold import TSNE 
    import matplotlib.pyplot as plt 

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000) 
    plot_only = 750 
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:]) 
    labels = [reverse_dictionary[i] for i in xrange(plot_only)] 
    plot_with_labels(low_dim_embs, labels) 

except ImportError: 
    print("Please install sklearn and matplotlib to visualize embeddings.") 

Answer


Your embedding dimension can be arbitrarily large (50-300 or more), but you want to plot the words in two-dimensional space. Since your embedding size of 128 is much larger than two, you need to project these embeddings down into a 2-D space. The best-known method for this problem is t-SNE. You can use the code from this Quora answer; it worked for me. If this solves your problem, an upvote would be appreciated!
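For example, here is a minimal sketch of that projection step, reusing final_embeddings and reverse_dictionary from the question's script (the variable names are assumed from the code above); each row of low_dim_embs is the (x, y) coordinate of one word in the plot:

from sklearn.manifold import TSNE

# Project the 128-dimensional embeddings down to 2-D with t-SNE.
# final_embeddings and reverse_dictionary come from the question's script.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 750
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])

# low_dim_embs[i] is the (x, y) position of word i in the scatter plot.
for i in range(10):
    print(reverse_dictionary[i], low_dim_embs[i, 0], low_dim_embs[i, 1])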


But what if I just want to save the original coordinates for later computation? Which object are they stored in? – dorien
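As a side note on that question: in the script above, the raw 128-dimensional vectors end up in the NumPy array final_embeddings (the evaluated normalized_embeddings), and the 2-D plot positions end up in low_dim_embs. A minimal sketch for saving and reusing them (variable names assumed from the code above, and 'anarchism' is just a placeholder token) could look like this:

import numpy as np

# Persist both the full 128-D vectors and the 2-D t-SNE coordinates.
np.save('final_embeddings.npy', final_embeddings)  # shape: (vocabulary_size, 128)
np.save('low_dim_embs.npy', low_dim_embs)          # shape: (plot_only, 2)

# Look up a particular word later; 'anarchism' is a placeholder token.
idx = dictionary['anarchism']
vector_128d = final_embeddings[idx]
if idx < low_dim_embs.shape[0]:
    x, y = low_dim_embs[idx]  # the 2-D coordinate used in the plot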
