2016-04-22 8 views
0

私はいくつかの大きな時系列データセットで作業しており、各ファイルには約2百万行もあります。これまではddplyを使用してデータを集計していましたが、残念ながら遅すぎるため、実際にはもっと速い方法が必要です。ここに私のコードです:dplyplyをdplyrパッケージで複製していますか? ddplyが遅すぎる

DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35))) 
    DF <- DF[which(DF$DateTime != 0),] 
    options(digits.secs = 3) 
    DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")  
    completecase <- DF[complete.cases(DF),] 
    midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2) 
    **# creating 10 millisecond time intervals** 
    cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01) 
    **#creating new Time variable with the 10 millisecond breaks** 
    midpoint$Time = cut(midpoint$DateTime, breaks = cuts) 
    **#summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument** 
    mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE) 
    mp$Time = as.POSIXct(mp$Time) 
    mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata') 
    mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"] 
    mp_xts = makeReturns(mp_xts) 

ddplyはあまりにも遅いです。私はdata.tableパッケージまたはdplyrパッケージを使ってこれを行うことができますが、.drop = FALSE引き数を複製することはできません。両方の他のパッケージはすべての空の時間間隔を削除しますが、分析のためにそれらを保持することが重要です。

私がしようとしているのは、基本的には、10ミリ秒ごとに自分のタイムシリーズを集計し、NAタイムフレームに0またはlocfを入力します。それが正しいのであれば、出力として6.25 * 60 * 60 * 100 = 2,250,000行が必要です。

ddplyと全く同じことをやっているうちに、これをもっと速くする方法はありますか?

> dput(DF[1:20,]) 
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806, 
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808, 
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813, 
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82, 
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85, 
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt" 
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L, 
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L, 
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L, 
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L, 
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L, 
    84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 
    84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L 
    ), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L, 
    82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L, 
    82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L, 
    84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 
    84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L 
    ), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L, 
    82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L, 
    82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L, 
    87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L, 
    84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L, 
    0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L, 
    82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L), 
    BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L 
    ), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L, 
    85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L, 
    84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1", 
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2", 
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4", 
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5", 
"AO5"), row.names = c(NA, 20L), class = "data.frame") 

他の情報を投稿する必要があり、本当に助けていただければ幸いです。

> sessionInfo() 
R version 3.2.2 (2015-08-14) 
Platform: x86_64-pc-linux-gnu (64-bit) 
Running under: Ubuntu precise (12.04.5 LTS) 

locale: 
[1] LC_CTYPE=en_IN.UTF-8  LC_NUMERIC=C    LC_TIME=en_IN.UTF-8  LC_COLLATE=en_IN.UTF-8  LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8 LC_PAPER=en_IN.UTF-8  
[8] LC_NAME=C     LC_ADDRESS=C    LC_TELEPHONE=C    LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C  

attached base packages: 
[1] stats  graphics grDevices utils  datasets methods base  

other attached packages: 
[1] highfrequency_0.4 xts_0.9-7   zoo_1.7-12  data.table_1.9.7 

loaded via a namespace (and not attached): 
[1] tools_3.2.2  grid_3.2.2  lattice_0.20-33 
+0

あなたがdata.table使用して、バックの空のレベルを追加することができますマージを使用しています。 – Heroka

+0

特に、あなたのコードが複数のpkgに依存する場合は、 'library'または' require'への呼び出しを含めるとよいでしょう。また、必要に応じてサンプルコードを小さくすると、人々に質問をさせるのに役立ちます。 – jangorecki

+0

@jangorecki私は実際に自分のコードでライブラリを呼び出す必要がありますが、これははるかに大きなコードのほんの一部です。私は質問を可能な限りきちんとしておきたいので残りの部分を投稿することを心配しませんでした – UtdMan

答えて

3

最新development version非エクイは、この種の問題のための非常にコンパクトで高速なソリューションを可能にする合流追加:

dt = as.data.table(df) # or convert in place using setDT 
breaks = dt[, {tmp = seq(min(DateTime) - 0.01, max(DateTime) + 0.01, 0.01); 
       .(Start = tmp, End = c(tail(tmp, -1), Inf))}]; 

dt[breaks, on = .(DateTime >= Start, DateTime < End), 
    lapply(.SD, mean), by = .EACHI] 
+1

@Frank私は[このFR](https://github.com/Rdatatable/data.table/issues/1615)はそれを修正すると思います - 彼らは本当に "Start"と "End"という名前にしてください – eddi

+0

私はエラーが発生していますこのソリューションは、私はdata.tableを新しくしているので、問題が何であるか把握できません。エラー: '' .data.table'のエラー(dt、breaks、lapply(.SD、mean)、on =):DateTime> =: 関数が見つかりませんでした "。" ' – UtdMan

+0

@UtdMan add 'sessionInfo'をデバッグするのがより簡単になります – jangorecki

2

Rcppで独自のC++実装を作成しようとすることを除いて、data.tableよりはるかに優れていることはありません。とにかく、cut()の代わりにfindInterval()を使用するdata.tableの解決法がありますが、それはわずかに速くなければなりません。また、私はminとmaxの外でブレークを拡張して、余分な空き区間をテストしました。 data.table

library(data.table); 
setDT(DF); 
breaks <- seq.POSIXt(min(DF$DateTime)-0.01,max(DF$DateTime)+0.01,0.01); 
breaks; 
## [1] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" 
## [4] "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" "2015-12-20 22:45:00 EST" 
## [7] "2015-12-20 22:45:00 EST" 
diff(breaks); 
## Time differences in secs 
## [1] 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 0.00999999 
ints <- findInterval(DF$DateTime,breaks); 
ints; 
## [1] 2 2 2 2 2 2 2 2 2 2 3 3 3 3 4 6 6 6 6 6 
agg <- DF[,lapply(.SD,mean),breaks[ints],.SDcols=-1L]; 
agg; 
##     breaks Seq  BP1 BQ1 BO1 AP1 AQ1 AO1  BP2 BQ2 BO2  AP2 AQ2 AO2  BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4  BP5 BQ5 BO5 AP5 AQ5 AO5 
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3 
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0 
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0 
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0 
emp <- setdiff(seq_along(breaks),ints); 
emp; 
## [1] 1 5 7 
agg <- rbind(fill=T,agg,data.table(breaks=breaks[emp])); 
agg; 
##     breaks Seq  BP1 BQ1 BO1 AP1 AQ1 AO1  BP2 BQ2 BO2  AP2 AQ2 AO2  BP3 BQ3 BO3 AP3 AQ3 AO3 BP4 BQ4 BO4 AP4 AQ4 AO4  BP5 BQ5 BO5 AP5 AQ5 AO5 
## 1: 2015-12-20 22:45:00 230.6 82342.5 1 1 75820 0.90 0.9 41215.5 0.5 0.5 67798.5 0.9 0.8 32822.00 0.4 0.4 59663 1.3 0.7 16407 0.2 0.2 51954 0.6 0.6 8003.5 0.1 0.1 26340 0.3 0.3 
## 2: 2015-12-20 22:45:00 398.5 82630.0 1 1 83800 1.25 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82253.75 1.0 1.0 84805 2.0 1.0 82050 1.0 1.0 85365 1.0 1.0 81535.0 1.0 1.0 85380 1.0 1.0 
## 3: 2015-12-20 22:45:00 583.0 82630.0 1 1 83800 2.00 1.0 82525.0 1.0 1.0 84230.0 1.0 1.0 82320.00 1.0 1.0 84805 2.0 1.0 82055 1.0 1.0 84980 1.0 1.0 82035.0 1.0 1.0 85365 1.0 1.0 
## 4: 2015-12-20 22:45:00 936.4 82710.0 1 1 83796 1.20 1.0 82567.0 1.0 1.0 83886.0 1.8 1.0 82402.00 1.0 1.0 84345 1.2 1.0 82164 1.0 1.0 84759 1.6 1.0 82057.0 1.0 1.0 85022 1.2 1.0 
## 5: 2015-12-20 22:45:00 NA  NA NA NA NA NA NA  NA NA NA  NA NA NA  NA NA NA NA NA NA NA NA NA NA NA NA  NA NA NA NA NA NA 
## 6: 2015-12-20 22:45:00 NA  NA NA NA NA NA NA  NA NA NA  NA NA NA  NA NA NA NA NA NA NA NA NA NA NA NA  NA NA NA NA NA NA 
## 7: 2015-12-20 22:45:00 NA  NA NA NA NA NA NA  NA NA NA  NA NA NA  NA NA NA NA NA NA NA NA NA NA NA NA  NA NA NA NA NA NA 
関連する問題