
I followed the TensorFlow wide & deep tutorial and used it with my own dataset; essentially I only changed the column names and the tensors. I want to predict a float value, "valeur", but I keep getting an accuracy of 0. Can someone explain why? Here is a line from my CSV dataset (47,000 rows for training, 10,000 for test):

NSVCUCOG; PCE; 05; DENMOY; H; AF; 0.619

import argparse
import sys
import tempfile
import urllib.request  # needed for urllib.request.urlretrieve (Python 3)

import pandas as pd
import tensorflow as tf

COLUMNS = ["idPCE", "typeObj", "heure", "typeG", "pas",
           "qualite", "valeur"]
LABEL_COLUMN = "label"
CATEGORICAL_COLUMNS = ["idPCE", "typeG", "pas", "qualite"]
CONTINUOUS_COLUMNS = ["heure"]


def maybe_download(train_data, test_data):
    """Maybe downloads training data and returns train and test file names."""
    if train_data:
        train_file_name = train_data
    else:
        train_file = tempfile.NamedTemporaryFile(delete=False)
        urllib.request.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)  # pylint: disable=line-too-long
        train_file_name = train_file.name
        train_file.close()
        print("Training data is downloaded to %s" % train_file_name)

    if test_data:
        test_file_name = test_data
    else:
        test_file = tempfile.NamedTemporaryFile(delete=False)
        urllib.request.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)  # pylint: disable=line-too-long
        test_file_name = test_file.name
        test_file.close()
        print("Test data is downloaded to %s" % test_file_name)

    return train_file_name, test_file_name


def build_estimator(model_dir, model_type): 
    """Build an estimator.""" 
    # Sparse base columns. 
    idPCE = tf.contrib.layers.sparse_column_with_hash_bucket("idPCE", hash_bucket_size=1000) 
    typeG = tf.contrib.layers.sparse_column_with_keys(column_name="typeG", 
                keys=["DENMOY","ENETER","ETHMOY","METMOY","PCSMOY","PREMOY","TEMMOY","VOLBAL","VOLBCP","VOLBCR","VOLCAL","VOLCCU","VOLTER"]) 
    pas = tf.contrib.layers.sparse_column_with_keys(column_name="pas", 
                keys=["H","J"]) 
    qualite = tf.contrib.layers.sparse_column_with_keys(column_name="qualite", 
                keys=["A","AA","AD","AF","CS","M"]) 

    # Continuous base columns. 
    heure = tf.contrib.layers.real_valued_column("heure") 



    # Transformations. 
    heure_buckets = tf.contrib.layers.bucketized_column(heure, 
                boundaries=[ 
                 6, 12, 18 
                ]) 

    # Wide columns and deep columns. 
    wide_columns = [idPCE, typeG, pas, 
        qualite,heure_buckets, 
        tf.contrib.layers.crossed_column([typeG, qualite], 
                hash_bucket_size=int(1e4)), 
        tf.contrib.layers.crossed_column(
         [heure_buckets, idPCE, pas], 
         hash_bucket_size=int(1e6)), 
        tf.contrib.layers.crossed_column([heure_buckets, qualite], 
                hash_bucket_size=int(1e4))] 

    deep_columns = [ 
     tf.contrib.layers.embedding_column(qualite, dimension=3), 
     tf.contrib.layers.embedding_column(pas, dimension=1), 
     tf.contrib.layers.embedding_column(typeG, dimension=4), 
     tf.contrib.layers.embedding_column(idPCE, 
             dimension=3), 
     heure 
    ] 

    if model_type == "wide": 
    m = tf.contrib.learn.LinearClassifier(model_dir=model_dir, 
              feature_columns=wide_columns) 
    elif model_type == "deep": 
    m = tf.contrib.learn.DNNClassifier(model_dir=model_dir, 
             feature_columns=deep_columns, 
             hidden_units=[100, 50]) 
    else: 
    m = tf.contrib.learn.DNNLinearCombinedClassifier(
     model_dir=model_dir, 
     linear_feature_columns=wide_columns, 
     dnn_feature_columns=deep_columns, 
     dnn_hidden_units=[100, 50], 
     fix_global_step_increment_bug=True) 
    return m 


def input_fn(df): 
    """Input builder function.""" 
    # Creates a dictionary mapping from each continuous feature column name (k) to 
    # the values of that column stored in a constant Tensor. 
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS} 
    # Creates a dictionary mapping from each categorical feature column name (k) 
    # to the values of that column stored in a tf.SparseTensor. 
    categorical_cols = { 
     k: tf.SparseTensor(
      indices=[[i, 0] for i in range(df[k].size)], 
      values=df[k].values, 
      dense_shape=[df[k].size, 1]) 
     for k in CATEGORICAL_COLUMNS} 
    # Merges the two dictionaries into one. 
    feature_cols = dict(continuous_cols) 
    feature_cols.update(categorical_cols) 
    # Converts the label column into a constant Tensor. 
    label = tf.constant(df[LABEL_COLUMN].values) 
    # Returns the feature columns and the label. 
    return feature_cols, label 


def train_and_eval(model_dir, model_type, train_steps, train_data, test_data): 
    """Train and evaluate the model.""" 
    train_file_name, test_file_name = maybe_download(train_data, test_data) 
    df_train = pd.read_csv(
     tf.gfile.Open(train_file_name), 
     sep=';', 
     names=COLUMNS, 
     skipinitialspace=True, 
     engine="python") 
    df_test = pd.read_csv(
     tf.gfile.Open(test_file_name), 
     sep=';', 
     names=COLUMNS, 
     skipinitialspace=True, 
     engine="python") 

    # remove NaN elements 
    df_train = df_train.dropna(how='any', axis=0) 
    df_test = df_test.dropna(how='any', axis=0) 

    df_train[LABEL_COLUMN] = pd.to_numeric(df_train["valeur"]) 
    df_test[LABEL_COLUMN] = pd.to_numeric(df_test["valeur"]) 


    model_dir = tempfile.mkdtemp() if not model_dir else model_dir 
    print("model directory = %s" % model_dir) 

    m = build_estimator(model_dir, model_type) 
    m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps) 
    results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1) 
    for key in sorted(results):
        print("%s: %s" % (key, results[key]))


FLAGS = None 


def main(_): 
    train_and_eval(FLAGS.model_dir, FLAGS.model_type, FLAGS.train_steps, 
       FLAGS.train_data, FLAGS.test_data) 


if __name__ == "__main__": 
    parser = argparse.ArgumentParser() 
    parser.register("type", "bool", lambda v: v.lower() == "true") 
    parser.add_argument(
     "--model_dir", 
     type=str, 
     default="", 
     help="Base directory for output models." 
) 
    parser.add_argument(
     "--model_type", 
     type=str, 
     default="wide_n_deep", 
     help="Valid model types: {'wide', 'deep', 'wide_n_deep'}." 
) 
    parser.add_argument(
     "--train_steps", 
     type=int, 
     default=200, 
     help="Number of training steps." 
) 
    parser.add_argument(
     "--train_data", 
     type=str, 
     default="", 
     help="Path to the training data." 
) 
    parser.add_argument(
     "--test_data", 
     type=str, 
     default="", 
     help="Path to the test data." 
) 
    FLAGS, unparsed = parser.parse_known_args() 
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 

This is the output I get:

accuracy: 0.0 
accuracy/baseline_label_mean: 46.8802 
accuracy/threshold_0.500000_mean: 0.0 
auc: 0.5 
global_step: 200 
labels/actual_label_mean: 46.8802 
labels/prediction_mean: 1.0 
loss: -5.34063e+07 
precision/positive_threshold_0.500000_mean: 0.998047 
recall/positive_threshold_0.500000_mean: 1.0 

Answer

Data labels need to be class indices in a range such as [0, 1, 2, ...], for example:

tf.contrib.learn.DNNLinearCombinedClassifier(n_classes=5, ...)

Every float value is treated as a label, and the default number of classes is 2, which is all I can explain.
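
To make that suggestion concrete, here is a minimal sketch of the combined branch of build_estimator with an explicit class count; n_classes=5 is only the answerer's example value, not something derived from the "valeur" column:

# Sketch only: the answer's suggestion with n_classes set explicitly.
# n_classes=5 is an illustrative value, not derived from the data.
m = tf.contrib.learn.DNNLinearCombinedClassifier(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50],
    n_classes=5,
    fix_global_step_increment_bug=True)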


Thanks for the answer, but 5 classes makes no sense here: my output can be any float, so a DNNClassifier cannot solve this (it is not a classification problem at all). Setting n_classes still only gets me a loss of 6.98953e+06... and I now have even more data (200,000 rows). –
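
Since the target "valeur" is a continuous float, a regression estimator is the more natural fit. As a hedged sketch (not part of the original thread), the classifier could be swapped for tf.contrib.learn.DNNLinearCombinedRegressor, reusing the same wide and deep feature columns:

# Hedged sketch: treat "valeur" as a regression target instead of a class label.
# Assumes the same wide_columns / deep_columns built in build_estimator above.
m = tf.contrib.learn.DNNLinearCombinedRegressor(
    model_dir=model_dir,
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50])
# input_fn would then return the label as a float tensor, e.g.
# tf.constant(df[LABEL_COLUMN].astype(float).values)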
