これが私（質問者）のソリューションです。変換とシリアル化の手順がかなり遅く、1000サンプルあたり約3秒かかります。もっと効率的な方法があれば大変ありがたいです。
import tensorflow as tf
######################################
# Define Feature conversion functions
######################################
def int64_feature(value):
    """Wrap a scalar in a single-element int64 tf.train.Feature."""
    as_int = int(value)
    int_list = tf.train.Int64List(value=[as_int])
    return tf.train.Feature(int64_list=int_list)
def float_feature(value):
    """Wrap a scalar in a single-element float tf.train.Feature."""
    as_float = float(value)
    float_list = tf.train.FloatList(value=[as_float])
    return tf.train.Feature(float_list=float_list)
def bytes_feature(value):
    """Wrap a value in a single-element bytes tf.train.Feature.

    Bug fix: ``tf.train.BytesList`` only accepts ``bytes``. The original
    passed ``str(value)``, which is unicode on Python 3 and raises a
    TypeError there; encode explicitly instead. Values that are already
    bytes are passed through untouched.
    """
    if isinstance(value, bytes):
        raw = value
    else:
        raw = str(value).encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))
####################################################
# Define tensorflow data feed from pandas DataFrame
####################################################
def input_fn(df, label_col_name, int_col_names, float_col_names, cat_col_names, num_epochs, batch_size, shuffle=False):
    """Build a batched (features, label) TF input pipeline from a pandas DataFrame.

    Each row of ``df`` is converted to a serialized ``tf.train.Example``;
    the serialized strings are fed through a slice-input queue, batched,
    and parsed back into tensors with ``tf.parse_example``.

    Args:
        df: pandas DataFrame holding all columns listed below.
        label_col_name: name of the (int64) label column.
        int_col_names: names of int64 feature columns.
        float_col_names: names of float32 feature columns.
        cat_col_names: names of string (categorical) feature columns,
            parsed as VarLenFeature (sparse tensors).
        num_epochs: epochs for the input queue (None = unlimited).
        batch_size: rows per batch.
        shuffle: whether the queue shuffles rows.

    Returns:
        (feature_batch, label_batch): dict of feature tensors keyed by
        column name, and the label tensor.

    Fixes/perf vs. the original:
      * ``dict.iteritems()`` (Python-2-only) removed — now Python-3 safe.
      * Column-role membership tests use sets instead of rebuilding and
        scanning lists on every loop iteration.
      * Row serialization no longer goes through ``to_dict('records')``
        (one dict allocation per row); instead the raw numpy column
        arrays are zipped row-wise — this is the hot path the ~3 s /
        1000 samples was spent in.
    """
    # Column groupings; label first so it is always covered below.
    feature_col_names = int_col_names + float_col_names + cat_col_names
    all_col_names = [label_col_name] + feature_col_names

    # O(1) membership tests (the originals scanned lists per column,
    # and rebuilt int_col_names + [label_col_name] each iteration).
    cat_set = set(cat_col_names)
    float_set = set(float_col_names)
    int_set = set(int_col_names) | {label_col_name}

    # Per-column converter function and parse spec.
    converters = {}
    parse_dict = {}
    for col in all_col_names:
        if col in cat_set:
            converters[col] = bytes_feature
            parse_dict[col] = tf.VarLenFeature(tf.string)
        elif col in float_set:
            converters[col] = float_feature
            parse_dict[col] = tf.FixedLenFeature([], tf.float32)
        elif col in int_set:
            converters[col] = int64_feature
            parse_dict[col] = tf.FixedLenFeature([], tf.int64)

    # Hoist per-row lookups out of the loop: one converter list and one
    # raw numpy array per column, zipped row-wise.
    col_converters = [converters[c] for c in all_col_names]
    col_values = [df[c].values for c in all_col_names]
    serialized_examples = [
        tf.train.Example(features=tf.train.Features(feature={
            name: convert(value)
            for name, convert, value in zip(all_col_names, col_converters, row)
        })).SerializeToString()
        for row in zip(*col_values)
    ]

    # Queue of serialized examples -> fixed-size batches.
    example_queue = tf.train.slice_input_producer([serialized_examples], num_epochs=num_epochs, shuffle=shuffle)
    example_batch = tf.train.batch(example_queue, batch_size=batch_size, capacity=30, allow_smaller_final_batch=True)

    # Parse the whole batch at once, then split label from features.
    parsed_example_batch = tf.parse_example(example_batch, parse_dict)
    feature_batch = {k: parsed_example_batch[k] for k in feature_col_names}
    label_batch = parsed_example_batch[label_col_name]
    return feature_batch, label_batch
使用例:
import functools
import numpy as np
import pandas as pd
# Create toy dataset
df = pd.DataFrame(np.random.randint(1, 4, [7, 3]), columns=['c0', 'c1', 'c2'])
df['c1'] = df['c1'].astype(str) + 'g'
df['c2'] = (df['c2'] > 2.5).astype(int)
# Specify feature names
cat_feats = ['c1']
float_feats = []
int_feats = ['c0']
label_feat = 'c2'
# Create parameterless input function
epochs = 3
batch_size = 2
input_fn_train = functools.partial(input_fn, df, label_feat, int_feats, float_feats, cat_feats, epochs, batch_size)
# Define features
continuous_features = [tf.contrib.layers.real_valued_column(feat) for feat in float_feats+int_feats]
categorical_features = [tf.contrib.layers.sparse_column_with_hash_bucket(feat, hash_bucket_size=1000) for feat in cat_feats]
features = continuous_features + categorical_features
# Create and fit model
model = tf.contrib.learn.LinearClassifier(feature_columns=features)
model.fit(input_fn=input_fn_train, steps=1000)