
Wide & Deep Learning: inserting 1MM+ rows into the wide and deep learning model throws ValueError: GraphDef cannot be larger than 2GB:

Traceback (most recent call last): 
    File "search_click.py", line 207, in <module> 
    tf.app.run() 
    File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run 
    sys.exit(main(sys.argv)) 
    File "search_click.py", line 204, in main 
    train_and_eval() 
    File "search_click.py", line 181, in train_and_eval 
    m.fit(input_fn=lambda: input_fn(df_train), steps=FLAGS.train_steps) 
    File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 182, in fit 
    monitors=monitors) 
    File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 458, in _train_model 
    summary_writer=graph_actions.get_summary_writer(self._model_dir)) 
    File "/usr/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/graph_actions.py", line 76, in get_summary_writer 
    graph=ops.get_default_graph()) 
    File "/usr/lib/python2.7/site-packages/tensorflow/python/training/summary_io.py", line 113, in __init__ 
    self.add_graph(graph=graph, graph_def=graph_def) 
    File "/usr/lib/python2.7/site-packages/tensorflow/python/training/summary_io.py", line 204, in add_graph 
    true_graph_def = graph.as_graph_def(add_shapes=True) 
    File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2117, in as_graph_def 
    raise ValueError("GraphDef cannot be larger than 2GB.") 
ValueError: GraphDef cannot be larger than 2GB. 

My input_fn is defined the same way as in the example:

def input_fn(df):
    """Input builder function."""
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        shape=[df[k].size, 1])
        for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

Is there an alternative to tf.constant/tf.SparseTensor that makes it possible to feed the data in batches and avoid the memory error?


Instead of loading the data as constants, you should use an input pipeline/queues. – soloice
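As an illustration of this comment, here is a minimal sketch using the tf.data API from later TensorFlow 1.x releases; the arrays features and labels are hypothetical stand-ins for the dataframe columns. Feeding the arrays through placeholders keeps them out of the GraphDef, whereas wrapping them in tf.constant, as the tutorial's input_fn does, embeds them in the graph and hits the 2GB limit.

import numpy as np
import tensorflow as tf

# Hypothetical in-memory arrays standing in for df[k].values.
features = np.random.rand(100, 4).astype(np.float32)
labels = np.random.randint(0, 2, size=100).astype(np.int64)

# Placeholders keep the arrays out of the serialized graph; the data is
# supplied once, at iterator-initialization time, via feed_dict.
features_ph = tf.placeholder(features.dtype, features.shape)
labels_ph = tf.placeholder(labels.dtype, labels.shape)

dataset = tf.data.Dataset.from_tensor_slices((features_ph, labels_ph)).batch(32)
iterator = dataset.make_initializable_iterator()
next_batch = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer,
             feed_dict={features_ph: features, labels_ph: labels})
    x, y = sess.run(next_batch)  # one mini-batch of 32 rows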

Answers


This example requires loading the whole dataset into memory. If you have a large amount of data, you need tf.decode_csv for CSV-formatted input; if the input format is self-defined, you will have to write a custom data reader.
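For example, a minimal sketch of such a queue-based CSV pipeline (TensorFlow 1.x API; the file name data.csv and its one-float-feature, one-int-label layout are assumptions for illustration):

import tensorflow as tf

# Only the file name is stored in the graph; rows are read and decoded
# on the fly, so the GraphDef stays small regardless of the data size.
filename_queue = tf.train.string_input_producer(['data.csv'], num_epochs=1)
reader = tf.TextLineReader()
_, line = reader.read(filename_queue)

# record_defaults fixes the column types: one float feature, one int label.
feature, label = tf.decode_csv(line, record_defaults=[[0.0], [0]])
feature_batch, label_batch = tf.train.batch([feature, label], batch_size=128)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(),
              tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while not coord.should_stop():
            f, l = sess.run([feature_batch, label_batch])
    except tf.errors.OutOfRangeError:
        pass  # one epoch finished
    finally:
        coord.request_stop()
        coord.join(threads)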


Thank you! –


The solution created by @ilblackdragon uses dataframe queues. It works and, unlike DNNClassifier, does not give key errors on the encoded categories (e.g. KeyError: 'Embarked_ids'). Unfortunately, the categorical variables must first be encoded as integers, otherwise you get the error:

ValueError: Data types for extracting pandas data must be int, float, or bool. Found: 'sex' type='object', 'embarked' type='object' 

In the post, the author creates his own classifier model.

Example code:

# -*- coding: utf-8 -*- 
# flake8: noqa ignore=E501 
import tempfile 

import pandas as pd 
import tensorflow as tf 
import tensorflow.contrib.learn as tf_learn 
import tensorflow.contrib.layers as tf_layers 
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split 

# Define the column names for the data sets. 
LABEL_COLUMN = 'Survived' 
CONTINUOUS_COLUMNS = ['Age', 'SibSp', 'Parch', 'Fare'] 
CATEGORICAL_COLUMNS = ['Pclass', 'Sex', 'Embarked'] 
CATEGORICAL_ID_COLUMNS = [col + '_ids' for col in CATEGORICAL_COLUMNS] 
FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_ID_COLUMNS 

HIDDEN_UNITS = [10, 20, 10] 
CATEGORICAL_EMBED_SIZE = 10 

LABEL_ENCODERS = {} 

def get_feature_cols():
    # used in DNNClassifier which doesn't work
    continuous_features = [tf_layers.real_valued_column(col)
                           for col in CONTINUOUS_COLUMNS]
    categorical_features = [
        tf_layers.embedding_column(
            tf_layers.sparse_column_with_integerized_feature(
                col + '_ids', len(LABEL_ENCODERS[col].classes_)),
            CATEGORICAL_EMBED_SIZE)
        for col in CATEGORICAL_COLUMNS
    ]
    return continuous_features + categorical_features


def pandas_input_fn(X, y=None, batch_size=128, num_epochs=None):
    def input_fn():
        if y is not None:
            X['target'] = y
        queue = tf_learn.dataframe.queues.feeding_functions.enqueue_data(
            X, 1000, shuffle=num_epochs is None, num_epochs=num_epochs)
        if num_epochs is None:
            features = queue.dequeue_many(batch_size)
        else:
            features = queue.dequeue_up_to(batch_size)

        features = dict(zip(['index'] + list(X.columns), features))

        if y is not None:
            target = features.pop('target')
            return features, target
        return features

    return input_fn


def encode_categorical(df):
    global LABEL_ENCODERS
    for col in CATEGORICAL_COLUMNS:
        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)
        encoder = LabelEncoder().fit(df[col])
        df[col + '_ids'] = encoder.transform(df[col])
        df.pop(col)
        LABEL_ENCODERS[col] = encoder
    return df, LABEL_ENCODERS


def dnn_tanh(features, target, hidden_units=HIDDEN_UNITS):
    global LABEL_ENCODERS
    target = tf.one_hot(target, 2, 1.0, 0.0)

    # Organize continuous features.
    final_features = [tf.expand_dims(tf.cast(features[col], tf.float32), 1)
                      for col in CONTINUOUS_COLUMNS]

    # Embed categorical variables into distributed representation.
    for col in CATEGORICAL_COLUMNS:
        feature = tf_learn.ops.categorical_variable(
            features[col + '_ids'],
            len(LABEL_ENCODERS[col].classes_),
            embedding_size=CATEGORICAL_EMBED_SIZE,
            name=col)
        final_features.append(feature)

    # Concatenate all features into one vector.
    features = tf.concat(1, final_features)

    # Deep Neural Network
    logits = tf_layers.stack(features,
                             tf_layers.fully_connected,
                             stack_args=hidden_units,
                             activation_fn=tf.tanh)
    prediction, loss = tf_learn.models.logistic_regression(logits, target)
    train_op = tf_layers.optimize_loss(loss,
                                       tf.contrib.framework.get_global_step(),
                                       optimizer='SGD',
                                       learning_rate=0.05)
    return tf.argmax(prediction, dimension=1), loss, train_op


def process_input_df(df): 
    df, label_encoders = encode_categorical(df) 
    y = df.pop(LABEL_COLUMN) 
    X = df[CATEGORICAL_ID_COLUMNS + CONTINUOUS_COLUMNS].fillna(0) 
    return X, y 


def train(X, y, steps=100):
    model_dir = tempfile.mkdtemp()
    print("model dir: ", model_dir)
    classifier = tf_learn.Estimator(model_fn=dnn_tanh, model_dir=model_dir)
    classifier.fit(input_fn=pandas_input_fn(X, y), steps=steps)

    '''
    # Using DNNClassifier gives KeyError (e.g on EmbedIds)
    classifier = tf_learn.DNNClassifier(
        hidden_units=[10, 20, 10],
        n_classes=2,
        feature_columns=get_feature_cols(),
        optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
    classifier.fit(X, y, batch_size=128, steps=500)
    '''

    return classifier


def predict(classifier, X):
    return list(classifier.predict(input_fn=pandas_input_fn(X, num_epochs=1),
                                   as_iterable=True))


def evaluate(classifier, X, y, steps=1):
    results = classifier.evaluate(input_fn=pandas_input_fn(X[FEATURE_COLUMNS], y),
                                  steps=steps)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))


if __name__ == '__main__':
    # DOWNLOAD TITANIC TRAIN DATA
    data = pd.read_csv('~/titanic_train.csv')  # LOAD DATA
    X, y = process_input_df(data)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    classifier = train(X_train, y_train, steps=100)
    print("accuracy_score", accuracy_score(y_test, predict(classifier, X_test)))
    evaluate(classifier, X_test, y_test, steps=1)

Thank you! –


Inspired by the others, I finally solved this problem with the code below. I hope it helps someone else. Thanks for the comments above.

def input_fn(batch_size, filename):
    examples_op = tf.contrib.learn.read_batch_examples(
        filename,
        batch_size=batch_size,
        reader=tf.TextLineReader,
        num_epochs=1,
        parse_fn=lambda x: tf.decode_csv(
            x, [tf.constant([''], dtype=tf.string)] * len(HEADERS)))

    examples_dict = {}
    for i, header in enumerate(HEADERS):
        examples_dict[header] = examples_op[:, i]

    feature_cols = {k: tf.string_to_number(examples_dict[k], out_type=tf.float32)
                    for k in CONTINUOUS_COLUMNS}

    feature_cols.update({k: dense_to_sparse(examples_dict[k])
                         for k in CATEGORICAL_COLUMNS})

    label = tf.string_to_number(examples_dict[LABEL_COLUMN], out_type=tf.int32)

    return feature_cols, label

def input_fn_pre(batch_size, filename):
    examples_op = tf.contrib.learn.read_batch_examples(
        filename,
        batch_size=batch_size,
        reader=tf.TextLineReader,
        num_epochs=1,
        parse_fn=lambda x: tf.decode_csv(
            x, [tf.constant([''], dtype=tf.string)] * len(HEADERS)))

    examples_dict = {}
    for i, header in enumerate(HEADERS):
        examples_dict[header] = examples_op[:, i]

    feature_cols = {k: tf.string_to_number(examples_dict[k], out_type=tf.float32)
                    for k in CONTINUOUS_COLUMNS}

    feature_cols.update({k: dense_to_sparse(examples_dict[k])
                         for k in CATEGORICAL_COLUMNS})

    return feature_cols

def dense_to_sparse(dense_tensor):
    # Build [row, 0] indices so a 1-D dense string tensor becomes a
    # single-column SparseTensor of shape [num_rows, 1].
    indices = tf.to_int64(tf.transpose(
        [tf.range(tf.shape(dense_tensor)[0]),
         tf.zeros_like(dense_tensor, dtype=tf.int32)]))
    values = dense_tensor
    shape = tf.to_int64([tf.shape(dense_tensor)[0], tf.constant(1)])

    return tf.SparseTensor(
        indices=indices,
        values=values,
        shape=shape
    )

def train_and_eval():
    """Train and evaluate the model."""
    data = pd.read_csv('spark_traindata_forrun_no_nanindex.csv',
                       skipinitialspace=True, engine="python")
    value_range = {}
    for column in CATEGORICAL_COLUMNS:
        data[column] = data[column].astype(str)
        value_range[column] = list(set(data[column]))
    model_dir = './model6'
    print("model directory = %s" % model_dir)
    test = pd.read_csv('test.csv', names=HEADERS)
    m = build_estimator(model_dir, value_range)
    m.fit(input_fn=lambda: input_fn(128, 'train.csv'), steps=FLAGS.train_steps)
    results = m.evaluate(input_fn=lambda: input_fn(5000, 'test.csv'), steps=1)
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))