2017-06-12 20 views
0

人工データを使った線形回帰のシミュレーションを行い、RSEとR Squareを手動で計算しています。私は、モデルを訓練したIn Sampleデータセットに対してこれを行い、次にOut of Sampleデータセットでモデルをテストします。サンプル外およびサンプル内のデータは、同じ正規分布から引き出されますが、異なるシードが使用されます。私の数字は、サンプル外のデータセットに関しては意味がありません。バグを見つけるのを助けてくれますか?負の線形回帰のテストデータセットのR-二乗?

set.seed(1) 
z1 <- rnorm(100) 
z2 <- z1^2 
error <- rnorm(100, sd = 0.25) 
y1 <- 1 + 2 * z1 + error 
data1 <- data.table(y1, z1, z2) 
model_quad <- lm(y1 ~ z1 + z2, data1) 
model_lin <- lm(y1 ~ z1, data1) 

confint(model_lin) 
confint(model_quad) 

summary(model_lin) 
summary(model_quad) 

ggplot(data1) + 
    geom_point(aes(x = z1, y = y1), color = "blue", size = 3) + 
    geom_point(aes(x = z2, y = y1), color = "red", size = 3) + 
    geom_line(stat = "smooth", method = lm, aes(x = z1, y = y1), color = "blue", size = 2, alpha = 0.5) + 
    geom_line(stat = "smooth", method = lm, aes(x = z2, y = y1), color = "red", size = 2, alpha = 0.5) + 
    geom_ribbon(stat = "smooth", method = lm, aes(x = z1, y = y1), fill = "blue", alpha = 0.1) + 
    geom_ribbon(stat = "smooth", method = lm, aes(x = z2, y = y1), fill = "red", alpha = 0.1) 

set.seed(100) 
z12 <- rnorm(100) 
z22 <- z12^2 
error2 <- rnorm(100, sd = 0.25) 
y2 <- 1 + 2 * z12 + error2 
data2 <- data.table(y2, z12, z22) 

summary(model_lin) 
summary(model_quad) 

ggplot(data2) + 
    geom_point(aes(x = z12, y = y2), color = "blue", size = 3) + 
    geom_point(aes(x = z22, y = y2), color = "red", size = 3) + 
    geom_line(stat = "smooth", method = lm, aes(x = z12, y = y2), color = "blue", size = 2, alpha = 0.5) + 
    geom_line(stat = "smooth", method = lm, aes(x = z22, y = y2), color = "red", size = 2, alpha = 0.5) + 
    geom_ribbon(stat = "smooth", method = lm, aes(x = z12, y = y2), fill = "blue", alpha = 0.1) + 
    geom_ribbon(stat = "smooth", method = lm, aes(x = z22, y = y2), fill = "red", alpha = 0.1) + 
    geom_abline(intercept = 0.99, slope = 1.999, size = 2, color = "yellow", alpha = 0.3) 


predictions_in_sample_linear <- predict(model_lin, data1) 
predictions_in_sample_quadratic <- predict(model_quad, data1) 
predictions_out_of_sample_linear <- predict(model_lin, data2) 
predictions_out_of_sample_quadratic <- predict(model_quad, data2) 
TSE_in_sample <- (y1 - mean(y1)) %*% (y1 - mean(y1)) 
RSE_in_sample_linear <- (predictions_in_sample_linear - y1) %*% (predictions_in_sample_linear - y1) 
RSE_in_sample_quadratic <- (predictions_in_sample_quadratic - y1) %*% (predictions_in_sample_quadratic - y1) 
R_Square_in_sample_linear <- (TSE_in_sample - RSE_in_sample_linear)/TSE_in_sample 
R_Square_in_sample_quadratic<- (TSE_in_sample - RSE_in_sample_quadratic)/TSE_in_sample 
TSE_out_of_sample <- (y2 - mean(y2)) %*% (y2 - mean(y2)) 
RSE_out_of_sample_linear <- (predictions_out_of_sample_linear - y2) %*% (predictions_out_of_sample_linear - y2) 
RSE_out_of_sample_quadratic <- (predictions_out_of_sample_quadratic - y2) %*% (predictions_out_of_sample_quadratic - y2) 
R_Square_out_of_sample_linear <- (TSE_out_of_sample - RSE_out_of_sample_linear)/TSE_out_of_sample 
R_Square_out_of_sample_quadratic<- (TSE_out_of_sample - RSE_out_of_sample_quadratic)/TSE_out_of_sample 

predictions_in_sample_linear 
predictions_in_sample_quadratic 
predictions_out_of_sample_linear 
predictions_out_of_sample_quadratic 
TSE_in_sample 
RSE_in_sample_linear 
RSE_in_sample_quadratic 
R_Square_in_sample_linear 
R_Square_in_sample_quadratic 
TSE_out_of_sample 
RSE_out_of_sample_linear 
RSE_out_of_sample_quadratic 
R_Square_out_of_sample_linear 
R_Square_out_of_sample_quadratic 

このコードは、Out of Sampleデータに負のR_squareを返します。これは不合理です。

あなたのアドバイスは高く評価されます。

答えて

0

長い質問ですが短い答えです。あなたはこれが

RSE_out_of_sample_linear 
# 0.9902969 

RSE_out_of_sample_quadratic 
# 0.989241 
を与える

data2 <- data.frame(y1 = y2, z1 = z12, z2 = z22) 

使用する必要があります