I seem to be running into some problems using the implementation from here. My situation is somewhat similar to the poster's, in that I am trying to map an input to an output: my input is the samples of an audio file, and my output is a feature vector of length 14 (the output length is fixed). Since the audio files differ in length, the sequence length is variable, so the vectors holding the samples also have different lengths. I suspect the zero-padding is not being done correctly.
I am not solving a classification problem but a regression one, so the task is slightly different.
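For what it's worth, since my output is one fixed-length vector per file rather than a label per frame, I imagine the softmax head and cross-entropy cost in the code below would have to be replaced by something along these lines: take the last relevant RNN output of each sequence and regress it onto the 14 targets with a mean squared error. This is only my own untested sketch (the helper name last_relevant is mine; output, weight, bias and self.length refer to the code below):

def last_relevant(output, length):
    # Pick the RNN output at the last real (unpadded) time step of each sequence.
    batch_size = tf.shape(output)[0]
    max_length = int(output.get_shape()[1])
    output_size = int(output.get_shape()[2])
    index = tf.range(0, batch_size) * max_length + (length - 1)
    flat = tf.reshape(output, [-1, output_size])
    return tf.gather(flat, index)

# last = last_relevant(output, self.length)      # (batch, num_hidden)
# prediction = tf.matmul(last, weight) + bias    # linear head, no softmax
# cost = tf.reduce_mean(tf.square(tf.reshape(self.target, [-1, 14]) - prediction))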
My code is shown below. As far as I understand it, the error comes from the use of max_length when feeding the input.
import tensorflow as tf
from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import rnn
import numpy as np
import librosa
import glob
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
import os
from os import walk
from os.path import splitext
from os.path import join
import time
rng = np.random
np.set_printoptions(threshold=np.nan)
import functools
start_time = time.time()
print "Preprocessing"
def lazy_property(function):
    attribute = '_' + function.__name__
    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper
## Class definition ##
class VariableSequenceLabelling:

    def __init__(self, data, target, num_hidden=200, num_layers=3):
        self.data = data
        self.target = target
        self._num_hidden = num_hidden
        self._num_layers = num_layers
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def length(self):
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
            rnn_cell.GRUCell(self._num_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        # Softmax layer.
        max_length = int(self.target.get_shape()[1])
        num_classes = int(self.target.get_shape()[2])
        weight, bias = self._weight_and_bias(self._num_hidden, num_classes)
        # Flatten to apply same weights to all time steps.
        output = tf.reshape(output, [-1, self._num_hidden])
        prediction = tf.nn.softmax(tf.matmul(output, weight) + bias)
        prediction = tf.reshape(prediction, [-1, max_length, num_classes])
        return prediction

    @lazy_property
    def cost(self):
        # Compute cross entropy for each frame.
        cross_entropy = self.target * tf.log(self.prediction)
        cross_entropy = -tf.reduce_sum(cross_entropy, reduction_indices=2)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        cross_entropy *= mask
        # Average over actual sequence lengths.
        cross_entropy = tf.reduce_sum(cross_entropy, reduction_indices=1)
        cross_entropy /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(cross_entropy)

    @lazy_property
    def optimize(self):
        learning_rate = 0.0003
        optimizer = tf.train.AdamOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 2), tf.argmax(self.prediction, 2))
        mistakes = tf.cast(mistakes, tf.float32)
        mask = tf.sign(tf.reduce_max(tf.abs(self.target), reduction_indices=2))
        mistakes *= mask
        # Average over actual sequence lengths.
        mistakes = tf.reduce_sum(mistakes, reduction_indices=1)
        mistakes /= tf.cast(self.length, tf.float32)
        return tf.reduce_mean(mistakes)

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)
#######################
#Converting file to .wav from .sph file format... God dammit!!!
#with open(train_filelist, 'r') as train_filelist, open(test_filelist, 'r') as test_filelist:
#    train_mylist = train_filelist.read().splitlines()
#    test_mylist = test_filelist.read().splitlines()
#    for line in train_mylist:
#        new_line = ' '.join(reversed(line))
#        index_start = new_line.find('h')
#        index_end = new_line.find('/')
#        edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#        new_file = edited_line + 'wav'
#        os.system(line + ' >> ' + dnn_train + new_file)
#    for line in test_mylist:
#        new_line = ' '.join(reversed(line))
#        index_start = new_line.find('h')
#        index_end = new_line.find('/')
#        edited_line = ''.join(reversed(new_line[index_start+5:index_end])).strip().replace(" ","")
#        new_file = edited_line + 'wav'
#        os.system(line + ' >> ' + dnn_test + new_file)
path_train = "/home/JoeS/kaldi-trunk/egs/start/s5/data/train"
path_test = "/home/JoeS/kaldi-trunk/egs/start/s5/data/test"
dnn_train = "/home/JoeS/kaldi-trunk/dnn/train/"
dnn_test = "/home/JoeS/kaldi-trunk/dnn/test/"
dnn = "/home/JoeS/kaldi-trunk/dnn/"
path = "/home/JoeS/kaldi-trunk/egs/start/s5/data/"
MFCC_dir = "/home/JoeS/kaldi-trunk/egs/start/s5/mfcc/raw_mfcc_train.txt"
train_filelist = path_train+"/wav_train.txt"
test_filelist = path_test+"/wav_test.txt"
os.chdir(path)
def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub) # use start += 1 to find overlapping matches
def load_sound_files(file_paths, names_input, data_input):
    raw_sounds = []
    names_output = []
    data_output = []
    class_output = []
    for fp in file_paths:
        X, sr = librosa.load(fp)
        raw_sounds.append(X)
        index = list(find_all(fp, '-'))
        input_index = names_input.index(fp[index[1]+1:index[2]])
        names_output.append(names_input[input_index])
        data_output.append(data_input[input_index])
        class_output.append(binify(data_input[input_index][0]))
    return raw_sounds, names_output, data_output, class_output
def generate_list_of_names_data(file_path):
    # Preprocess: extract name and data.
    name = []
    data = []
    with open(MFCC_dir) as mfcc_feature_list:
        content = [x.strip('\n') for x in mfcc_feature_list.readlines()] # remove endlines
        start_index_data = 0
        end_index_data = 2
        for number in range(0, 42):
            start = list(find_all(content[start_index_data], '['))[0]
            end = list(find_all(content[end_index_data], ']'))[0]
            end_name = list(find_all(content[start_index_data], ' '))[0]
            substring_data = content[start_index_data][start+1:] + content[end_index_data][:end]
            substring_name = content[start_index_data][:end_name]
            arr = np.array(substring_data.split(), dtype=float)
            data.append(arr)
            name.append(substring_name)
            start_index_data = start_index_data + 3
            end_index_data = end_index_data + 3
    return name, data
files_train_path = [dnn_train+f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_path = [dnn_test+f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]
files_train_name = [f for f in listdir(dnn_train) if isfile(join(dnn_train, f))]
files_test_name = [f for f in listdir(dnn_test) if isfile(join(dnn_test, f))]
os.chdir(dnn_train)
train_name,train_data = generate_list_of_names_data(files_train_path)
train_data, train_names, train_output_data, train_class_output = load_sound_files(files_train_path,train_name,train_data)
max_length = 0 ## Used for variable sequence input
for element in train_data:
    if element.size > max_length:
        max_length = element.size
NUM_EXAMPLES = len(train_data)/2
test_data = train_data[NUM_EXAMPLES:]
test_output = train_output_data[NUM_EXAMPLES:]
train_data = train_data[:NUM_EXAMPLES]
train_output = train_output_data[:NUM_EXAMPLES]
print("--- %s seconds ---" % (time.time() - start_time))
##-------------------MAIN----------------------------##
if __name__ == '__main__':
    data = tf.placeholder(tf.float32, [None, max_length, 1])
    target = tf.placeholder(tf.float32, [None, 14, 1])
    model = VariableSequenceLabelling(data, target)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    for epoch in range(10):
        for sample_set in range(100):
            batch_train = train_data[sample_set]
            batch_target = train_output[sample_set]
            sess.run(model.optimize, {data: batch_train, target: batch_target})
        test_set = test_data[epoch]
        test_set_output = test_output[epoch]
        error = sess.run(model.error, {data: test_set, target: test_set_output})
        print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))
The error message I get is:
Traceback (most recent call last):
  File "tensorflow_datapreprocess_mfcc_extraction_rnn.py", line 239, in <module>
    sess.run(model.optimize, {data: batch_train, target: batch_target})
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
    run_metadata_ptr)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 553, in _run
    % (np_val.shape, subfeed_t.name, str(subfeed_t.get_shape())))
ValueError: Cannot feed value of shape (63945,) for Tensor u'Placeholder:0', which has shape '(?, 138915, 1)'
so the fed value does not have the proper size. Does that mean the input is not being zero-padded correctly, or is something else wrong? If so, how do I fix it? The solution I'm looking for doesn't seem to be available natively in TensorFlow, while other frameworks do handle this natively. Would you recommend switching to a different framework because of this missing feature?
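For reference, this is the kind of zero-padding step I assumed would be needed before feeding the placeholder: stack each 1-D sample array into a (batch, max_length, 1) array and pad the tail with zeros. pad_batch is my own helper name, not something from TensorFlow:

def pad_batch(samples, max_length):
    # samples: list of 1-D numpy arrays of varying length.
    batch = np.zeros((len(samples), max_length, 1), dtype=np.float32)
    for i, s in enumerate(samples):
        batch[i, :len(s), 0] = s
    return batch

# Even a single example would then need the explicit batch dimension:
# feed = {data: pad_batch([train_data[sample_set]], max_length),
#         target: np.reshape(train_output[sample_set], (1, 14, 1))}
# sess.run(model.optimize, feed)

Is something along these lines what is needed, or does TensorFlow have a built-in way to do this?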