Cost function does not decrease in my gradient descent implementation

I am implementing multiple linear regression from scratch in Python. The cost after every epoch increases very rapidly and eventually overflows. What is happening here? Is there a logic error or something systematic in my implementation? Here is the code.
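For reference, the update I believe my loops are computing is the textbook batch gradient descent step (written out here for clarity, not copied verbatim from the code):

w \leftarrow w - \frac{\alpha}{m} X^{T}(Xw - y)

where m = X.shape[0] is the number of training examples and \alpha is the learning rate; the cost I track is the plain sum of squared errors, \sum_i (x^{(i)} \cdot w - y^{(i)})^2.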
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
class LinearRegression:
    def __init__(self):
        # initialize parameters with None values
        self.cost_history = None
        self.std_dev = None
        self.mean = None
        self.weights = None

    def set_mean_and_std_dev(self, X_data, axis=0):
        # mean and std deviation of the training data
        self.std_dev = np.array(np.std(X_data, axis=axis))
        self.mean = np.array(np.mean(X_data, axis=axis))
        print("Mean : ", self.mean)
        print("Standard deviation : ", self.std_dev)
        return

    def normalize(self, X_data):
        # normalizing the data
        X_data = (X_data - self.mean) / (self.std_dev + 0.00001)
        return

    def cost(self, X, y):
        # squared-error based cost
        return np.sum((np.dot(X, self.weights.T) - y) ** 2)

    def get_gradient(self, X, y, pred_y, learning_rate):
        grad = np.zeros(shape=(X.shape[1],), dtype='float64')
        # print("X.shape : ", X.shape, " grad.shape : ", grad.shape, "pred_y.shape", pred_y.shape, "y.shape : ", y.shape)
        for ix in range(X.shape[0]):
            # for each example in X
            x_example = X[ix, :]
            error = (pred_y[ix] - y[ix])
            for jx in range(grad.shape[0]):
                # for each feature of X
                grad[jx] = grad[jx] + x_example[jx] * error
        for ix in range(grad.shape[0]):
            # divide by the number of examples
            grad[jx] = grad[jx] * learning_rate * (1.0 / X.shape[0])
        return grad

    def gradient_descent(self, X, y, learning_rate=0.001, num_iterations=100):
        self.weights = np.zeros(shape=(X.shape[1],), dtype='float64')
        for ix in range(X.shape[1]):
            # initialize weights with random values
            self.weights[ix] = np.random.rand() * 100
        # initialize cost history
        self.cost_history = []
        for ix in range(num_iterations):
            pred_y = np.dot(X, self.weights.T)
            gradient = self.get_gradient(X=X, y=y, pred_y=pred_y, learning_rate=learning_rate)
            self.weights = self.weights - gradient
            cost = self.cost(X, y)
            self.cost_history.append(cost)
            if ix % 10 == 0:
                print("(learning_rate/(X.shape[0]))*gradient) : ", gradient)
                print("Cost at ", ix, " epoch is : ", cost)

    def predict(self, X):
        return np.dot(X, self.weights.T)

    def get_weights(self):
        return self.weights
# create a linear regression object
lr = LinearRegression()

# # Some random function generator
# y = 2.5 + 2*X1 - X2 - 3*X3 + 1.23*X4
def generate_data(X):
    y = np.zeros(shape=(X.shape[0],), dtype='float64')
    for ix in range(X.shape[0]):
        y[ix] = 1.23 + 2.5*X[ix, 0] + 2*X[ix, 1] - X[ix, 2] - 3*X[ix, 3]
    return y

X = np.zeros(shape=(300, 5), dtype='float64')
data_gen = [[np.random.rand()*107 for jx in range(4)] for ix in range(300)]
X[:, 1:] = np.array(data_gen, dtype='float64')
y = generate_data(X[:, 1:])

lr.set_mean_and_std_dev(X)
lr.normalize(X)
X[:, 0] = 1

print(X.shape, y.shape, X.dtype, y.dtype)
print(X[0], y[0])

X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:]
print(y_test)

lr.gradient_descent(X_train, y_train, 0.01, 500)
pred_y = lr.predict(X_test)
Have you tried a smaller learning rate? –
Yes, I also tried 0.00001 and even 0.0000001, but the cost keeps shooting up. –
I think the best approach is to take a small 2D dataset (300 x 2) and trace the values of the gradient, the weights, and the cost. That should help you single out the error. It also helps to work through an example you can solve on paper. –
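A minimal sketch of that suggestion (the tiny y = 3 + 2*x dataset, step size, and iteration count below are illustrative choices, not taken from the question):

import numpy as np

# tiny one-feature dataset that can be checked on paper: y = 3 + 2*x
X = np.array([[1.0, 1.0],
              [1.0, 2.0],
              [1.0, 3.0],
              [1.0, 4.0]])   # column 0 is the bias term
y = np.array([5.0, 7.0, 9.0, 11.0])

w = np.zeros(2)
alpha = 0.05
m = X.shape[0]

for it in range(50):
    pred = np.dot(X, w)
    grad = np.dot(X.T, pred - y) / m   # batch gradient, averaged over the examples
    w = w - alpha * grad
    cost = np.sum((np.dot(X, w) - y) ** 2)
    print("iter", it, "grad", grad, "w", w, "cost", cost)

If the cost printed by this loop decreases on such data while the class above does not, the difference has to be in one of the hand-written loops or in how the data is preprocessed before training.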