私はいくつかの大きな時系列データセットで作業しており、各ファイルには約2百万行もあります。これまではddplyを使用してデータを集計していましたが、残念ながら遅すぎるため、実際にはもっと速い方法が必要です。ここに私のコードです:dplyplyをdplyrパッケージで複製していますか? ddplyが遅すぎる
DF <- read.csv(file = "NSE/20151221/AUROPHARMA15DECFUT_20151221_ob.csv",header = FALSE,sep = "", col.names = c("DateTime","Seq","BP1","BQ1","BO1","AP1","AQ1","AO1","BP2","BQ2","BO2","AP2","AQ2","AO2","BP3","BQ3","BO3","AP3","AQ3","AO3","BP4","BQ4","BO4","AP4","AQ4","AO4","BP5","BQ5","BO5","AP5","AQ5","AO5","BP6","BQ6","BO6","AP6","AQ6","AO6","BP7","BQ7","BO7","AP7","AQ7","AO7","BP8","BQ8","BO8","AP8","AQ8","AO8","BP9","BQ9","BO9","AP9","AQ9","AO9","BP10","BQ10","BO10","AP10","AQ10","AO10", "C", "Price", "Qty", "OldPrice", "OldQty"), colClasses = c(NA, rep("integer",31), rep("NULL", 35)))
DF <- DF[which(DF$DateTime != 0),]
options(digits.secs = 3)
DF$DateTime = as.POSIXct(DF$DateTime/(10^9), origin="1970-01-01")
completecase <- DF[complete.cases(DF),]
midpoint = data.frame(DateTime=completecase$DateTime, MP=(completecase$BP1+completecase$AP1)/2)
**# creating 10 millisecond time intervals**
cuts = seq.POSIXt(from = min(midpoint$DateTime), to = max(midpoint$DateTime), by = .01)
**#creating new Time variable with the 10 millisecond breaks**
midpoint$Time = cut(midpoint$DateTime, breaks = cuts)
**#summarizing the MP variable every 10millisecond while keeping the empty time frames with the .drop = FALSE argument**
mp = ddply(midpoint, .(Time), summarise, MP = mean(MP), .drop = FALSE)
mp$Time = as.POSIXct(mp$Time)
mp_xts = xts(mp$MP, mp$Time, tzone = 'Asia/Kolkata')
mp_xts = mp_xts["2015-12-21 09:15:00.000/2015-12-21 15:30:00.000"]
mp_xts = makeReturns(mp_xts)
ddplyはあまりにも遅いです。私はdata.tableパッケージまたはdplyrパッケージを使ってこれを行うことができますが、.drop = FALSE引き数を複製することはできません。両方の他のパッケージはすべての空の時間間隔を削除しますが、分析のためにそれらを保持することが重要です。
私がしようとしているのは、基本的には、10ミリ秒ごとに自分のタイムシリーズを集計し、NAタイムフレームに0またはlocfを入力します。それが正しいのであれば、出力として6.25 * 60 * 60 * 100 = 2,250,000行が必要です。
ddplyと全く同じことをやっているうちに、これをもっと速くする方法はありますか?
> dput(DF[1:20,])
structure(list(DateTime = structure(c(1450669500.804, 1450669500.806,
1450669500.806, 1450669500.807, 1450669500.807, 1450669500.808,
1450669500.812, 1450669500.813, 1450669500.813, 1450669500.813,
1450669500.814, 1450669500.819, 1450669500.82, 1450669500.82,
1450669500.827, 1450669500.85, 1450669500.85, 1450669500.85,
1450669500.851, 1450669500.851), class = c("POSIXct", "POSIXt"
), tzone = ""), Seq = c(104L, 163L, 169L, 190L, 198L, 227L, 301L,
315L, 319L, 320L, 326L, 404L, 429L, 435L, 583L, 928L, 931L, 932L,
944L, 947L), BP1 = c(82055L, 82055L, 82055L, 82055L, 82055L,
82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L, 82630L,
82630L, 82630L, 82630L, 82630L, 82630L, 82830L, 82830L), BQ1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), BO1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), AP1 = c(0L, 87800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83800L,
83800L, 83800L, 83800L, 83800L, 83800L, 83800L, 83795L, 83795L,
83795L, 83795L), AQ1 = c(0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO1 = c(0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), BP2 = c(0L, 0L, 0L, 0L, 0L, 82055L, 82525L, 82525L,
82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L, 82525L,
82525L, 82525L, 82630L, 82630L), BQ2 = c(0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO2 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP2 = c(0L, 0L, 87800L, 84805L,
84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L, 84230L,
84230L, 84230L, 84230L, 84230L, 83800L, 83800L, 83800L, 83800L
), AQ2 = c(0L, 0L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), AO2 = c(0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP3 = c(0L, 0L, 0L, 0L, 0L, 0L, 82055L, 82055L, 82055L,
82055L, 82055L, 82320L, 82320L, 82320L, 82320L, 82320L, 82320L,
82320L, 82525L, 82525L), BQ3 = c(0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
BO3 = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), AP3 = c(0L, 0L, 0L, 87800L,
84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L, 84805L,
84805L, 84805L, 84805L, 84805L, 84230L, 84230L, 84230L, 84230L
), AQ3 = c(0L, 0L, 0L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L), AO3 = c(0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), BP4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 82035L, 82035L,
82035L, 82055L, 82055L, 82055L, 82055L, 82060L, 82060L, 82060L,
82320L, 82320L), BQ4 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BO4 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), AP4 = c(0L, 0L, 0L, 0L, 87800L, 87800L,
87800L, 85380L, 85380L, 85380L, 85365L, 85365L, 85365L, 85365L,
84980L, 84980L, 84805L, 84805L, 84805L, 84400L), AQ4 = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 1L), AO4 = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BP5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 80035L, 80035L, 82035L, 82035L,
82035L, 82035L, 82055L, 82055L, 82055L, 82060L, 82060L),
BQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), BO5 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), AP5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 87800L, 87800L, 87800L,
85380L, 85380L, 85380L, 85380L, 85365L, 85365L, 84980L, 84980L,
84980L, 84805L), AQ5 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), AO5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("DateTime", "Seq", "BP1", "BQ1",
"BO1", "AP1", "AQ1", "AO1", "BP2", "BQ2", "BO2", "AP2", "AQ2",
"AO2", "BP3", "BQ3", "BO3", "AP3", "AQ3", "AO3", "BP4", "BQ4",
"BO4", "AP4", "AQ4", "AO4", "BP5", "BQ5", "BO5", "AP5", "AQ5",
"AO5"), row.names = c(NA, 20L), class = "data.frame")
他の情報を投稿する必要があり、本当に助けていただければ幸いです。
> sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu precise (12.04.5 LTS)
locale:
[1] LC_CTYPE=en_IN.UTF-8 LC_NUMERIC=C LC_TIME=en_IN.UTF-8 LC_COLLATE=en_IN.UTF-8 LC_MONETARY=en_IN.UTF-8 LC_MESSAGES=en_IN.UTF-8 LC_PAPER=en_IN.UTF-8
[8] LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_IN.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] highfrequency_0.4 xts_0.9-7 zoo_1.7-12 data.table_1.9.7
loaded via a namespace (and not attached):
[1] tools_3.2.2 grid_3.2.2 lattice_0.20-33
あなたがdata.table使用して、バックの空のレベルを追加することができますマージを使用しています。 – Heroka
特に、あなたのコードが複数のpkgに依存する場合は、 'library'または' require'への呼び出しを含めるとよいでしょう。また、必要に応じてサンプルコードを小さくすると、人々に質問をさせるのに役立ちます。 – jangorecki
@jangorecki私は実際に自分のコードでライブラリを呼び出す必要がありますが、これははるかに大きなコードのほんの一部です。私は質問を可能な限りきちんとしておきたいので残りの部分を投稿することを心配しませんでした – UtdMan