2017-07-13 10 views
1

dplyr のチェーン内で列をループ処理する方法について。次のような構成のデータフレームがあります:

# Example input: one row per (name1, name2, year). Rows with name2 == "ALL"
# carry the totals for the given name1 and year.
df <- data.frame(
  name1 = c("A", "A", "B", "B", "A", "A", "B", "B"),
  name2 = c("B", "B", "C", "C", "ALL", "ALL", "ALL", "ALL"),
  pair_id = c(1, 1, 2, 2, 3, 3, 4, 4),
  year = c(2010, 2011, 2010, 2011, 2010, 2011, 2010, 2011),
  var1 = c(1.5, 2, 4, 5, 12, 15, 20, 18),
  var2 = c(8, 10, 24, 5.5, 35, 28, 27, 26),
  var3 = c(25, 6, 12, 18.5, 30, 41, 33, 38),
  # Keep the explicit character row names "1".."8".
  row.names = as.character(1:8),
  stringsAsFactors = FALSE
)

簡単のため、ここでは 3 つの変数(var1、var2、var3)のみを示しています。年と pair_id ごとに、手元にあるすべての変数(例: var1、var2、var3)について、合計(name2 が "ALL" の行)に対する割合を計算したいと考えています。希望する出力は次のとおりです:

# Desired result: the input columns plus one `<var>_share` column per variable,
# where each share is the row's value divided by the "ALL" value for the same
# name1 and year (so every "ALL" row has a share of 1).
df <- data.frame(
  name1 = c("A", "A", "B", "B", "A", "A", "B", "B"),
  name2 = c("B", "B", "C", "C", "ALL", "ALL", "ALL", "ALL"),
  pair_id = c(1, 1, 2, 2, 3, 3, 4, 4),
  year = c(2010, 2011, 2010, 2011, 2010, 2011, 2010, 2011),
  var1 = c(1.5, 2, 4, 5, 12, 15, 20, 18),
  # The "ALL" entries of var2/var3 are the totals the shares below were
  # computed from (e.g. 8 / 35 = 0.2286, 25 / 30 = 0.8333).
  var2 = c(8, 10, 24, 5.5, 35, 28, 27, 26),
  var3 = c(25, 6, 12, 18.5, 30, 41, 33, 38),
  var1_share = c(0.125, 0.133333333, 0.2, 0.277777778, 1, 1, 1, 1),
  var2_share = c(0.228571429, 0.357142857, 0.888888889, 0.211538462, 1, 1, 1, 1),
  var3_share = c(0.833333333, 0.146341463, 0.363636364, 0.486842105, 1, 1, 1, 1),
  # Keep the explicit character row names "1".."8".
  row.names = as.character(1:8),
  stringsAsFactors = FALSE
)

以下が私の試みです。構文は明らかに間違っています。

library(dplyr)

# For every variable named in `varlist`, add a `<var>_share` column holding the
# row's value divided by the "ALL" value for the same name1 and year.
# Shares are kept as fractions (not multiplied by 100).
# Assumes each (name1, year) group contains exactly one row with
# name2 == "ALL"; otherwise the denominator does not have length 1.
varlist <- c("var1", "var2", "var3")
for (var in varlist) {
  share_col <- paste0(var, "_share")
  df <- df %>%
    group_by(name1, year) %>%
    # `var` is a column *name* (a string), so the column itself is looked up
    # with `.data[[var]]`, and the output name is injected with `!!` and `:=`.
    mutate(!!share_col := .data[[var]] / .data[[var]][name2 == "ALL"]) %>%
    # Drop the grouping so it does not leak into later steps.
    ungroup()
}

ご意見をいただければ幸いです。 dplyr tidyr

+0

'var_share' をどのように求めるのかがよく分かりません。1 つの値を例にして計算手順を説明していただけますか? – CPak

+0

なぜ 'dataframe' を分割してから割り算しないのですか? @ChiPak。 – Wen

+0

パーセンテージの計算方法が分からないということでしょうか?これで明確になるかと思います。pair_id と year ごとに、var1_share = var1 / (var1[name2 == "ALL"]) を計算します。 – korone

答えて

0

library(dplyr) 
library(tidyr) 

# Compute every variable's share of its "ALL" total in long format, then go
# back to one row per (name1, name2, year). Wrapped in local() so the
# intermediates stay out of the workspace; the result is still auto-printed.
local({
  # Long format: one row per original row and variable
  # (`var` = variable name, `val` = its value).
  long_df <- gather(df, var, val, var1, var2, var3)

  # The "ALL" rows supply the denominators. After the join, `val.x` is the
  # row's own value and `val.y` the matching "ALL" value.
  totals <- filter(long_df, name2 == "ALL")
  paired <- inner_join(long_df, totals, c("name1", "year", "var"))

  # Compute the share, build the name of its output column, and keep only the
  # left-hand (`.x`) copies of name2 / pair_id.
  shares <- mutate(
    paired,
    var_share_val = val.x / val.y,
    var_share_var = paste0(var, "_share"),
    name2 = name2.x, name2.x = NULL,
    pair_id = pair_id.x, pair_id.x = NULL,
    pair_id.y = NULL, name2.y = NULL, val.y = NULL
  )

  # The two spreads leave each value on its own row, NA-padded in the other
  # value columns; max(na.rm = TRUE) per group collapses them into one row.
  wide_vals <- spread(shares, var, val.x)
  wide_all <- spread(wide_vals, var_share_var, var_share_val)
  summarise_all(
    group_by(wide_all, name1, name2, year),
    .funs = funs(max(., na.rm = TRUE))
  )
})

    name1 name2 year pair_id var1 var2 var3 var1_share var2_share var3_share 
    <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl> 
1  A ALL 2010  3 12.0 35.0 30.0 1.0000000 1.0000000 1.0000000 
2  A ALL 2011  3 15.0 28.0 41.0 1.0000000 1.0000000 1.0000000 
3  A  B 2010  1 1.5 8.0 25.0 0.1250000 0.2285714 0.8333333 
4  A  B 2011  1 2.0 10.0 6.0 0.1333333 0.3571429 0.1463415 
5  B ALL 2010  4 20.0 27.0 33.0 1.0000000 1.0000000 1.0000000 
6  B ALL 2011  4 18.0 26.0 38.0 1.0000000 1.0000000 1.0000000 
7  B  C 2010  2 4.0 24.0 12.0 0.2000000 0.8888889 0.3636364 
8  B  C 2011  2 5.0 5.5 18.5 0.2777778 0.2115385 0.4868421 
+0

ありがとうございます。あなたの解決策を試してみます。 – korone