`jsonlite::fromJSON` を使用して、JSON文書をデータフレームに読み込んでいます。問題は、結果が完全には平坦化されないことです。その理由は私にはわかりません(ただし、以下のコメントでは、該当部分が辞書の形式になっているため平坦化されないのだと指摘されています)。
R:既存のデータフレーム内にある、データフレームを含む列を展開する
df <- jsonlite::fromJSON(out, flatten = TRUE, simplifyDataFrame = TRUE)
n <- df$hits$total # 1 entry
dat <- df$hits$hits
# dat is a dataframe
dim(dat)
[1] 2 5
# class of each column in the dataframe
lapply(dat, class)
$`_index`
[1] "character"
$`_type`
[1] "character"
$`_id`
[1] "character"
$`_score`
[1] "numeric"
$`_source.samples`
[1] "list"
データフレーム `dat` には5つの列があり、そのうち列 dat$`_source.samples` は実際にはデータフレームのリストです。データフレームを含むこの列を展開して、既存のデータフレーム内で2つの新しい列にするにはどうすればよいでしょうか。リストの各要素の次元は次のとおりです:
> dim(dat$`_source.samples`[[1]])
[1] 21 2
> dim(dat$`_source.samples`[[2]])
[1] 21 2
展開の結果として、最初の4列の値は各行に繰り返し複製されることになります。以下に例を示します。`dat` の内容は次のとおりです:
# the first four columns look like this
> head(dat[,1:4])
_index _type _id _score
1 pnoc genes ENSG00000131051.20 1
2 pnoc genes ENSG00000000457.13 1
# the fifth column `_source.samples` that has dataframes looks like this
# just showing the dataframe in the first row of `dat`
> head(dat$`_source.samples`[[1]])
sample_id rsem.fpkm
1 C021_0001_20140916_tumor_RNASeq 39.11
2 CPBT_0001_1_tumor_RNASeq 184.56
3 CPBT_0007_1_tumor_RNASeq 41.29
4 C021_0010_001774_tumor_RNASeq 86.31
5 C021_0003_001409_tumor_RNASeq 79.24
6 CPBT_0005_1_tumor_RNASeq 66.20
つまり、次のような結果を得たいと考えています:
_index _type _id _score sample_id
1 pnoc genes ENSG00000000457.13 1 C021_0001_20140916_tumor_RNASeq
2 pnoc genes ENSG00000000457.13 1 CPBT_0001_1_tumor_RNASeq
3 pnoc genes ENSG00000000457.13 1 CPBT_0007_1_tumor_RNASeq
4 pnoc genes ENSG00000000457.13 1 C021_0010_001774_tumor_RNASeq
5 pnoc genes ENSG00000000457.13 1 C021_0003_001409_tumor_RNASeq
6 pnoc genes ENSG00000000457.13 1 CPBT_0005_1_tumor_RNASeq
rsem.fpkm
1 1.39
2 5.58
3 1.93
4 3.64
5 5.20
6 3.69
以下は再現可能なデータセットです。
> dput(dat)
structure(list(`_index` = c("pnoc", "pnoc"), `_type` = c("genes",
"genes"), `_id` = c("ENSG00000131051.20", "ENSG00000000457.13"
), `_score` = c(1, 1), `_source.samples` = list(structure(list(
sample_id = c("C021_0001_20140916_tumor_RNASeq", "CPBT_0001_1_tumor_RNASeq",
"CPBT_0007_1_tumor_RNASeq", "C021_0010_001774_tumor_RNASeq",
"C021_0003_001409_tumor_RNASeq", "CPBT_0005_1_tumor_RNASeq",
"CPBT_0008_1_tumor_RNASeq", "C021_0002_001113_tumor_RNASeq",
"C021_0013_001872_tumor_RNASeq", "C021_0005_001661_tumor_RNASeq",
"C021_0007_001669_tumor_RNASeq", "C021_0008_001699_tumor_RNASeq",
"CPBT_0006_1_tumor_RNASeq", "C021_0011_001786_tumor_RNASeq",
"C021_0009_001766_tumor_RNASeq", "CPBT_0004_1_tumor_RNASeq",
"CPBT_0003_1_tumor_RNASeq", "CPBT_0009_1_tumor_RNASeq", "C021_0006_001666_tumor_RNASeq",
"C021_0012_001825_tumor_RNASeq", "C021_0004_001418_tumor_RNASeq"
), rsem.fpkm = c(39.11, 184.56, 41.29, 86.31, 79.24, 66.2,
42.13, 88.78, 78.73, 96.79, 38.5, 105.12, 129.16, 145.13,
117.96, 86.53, 75.43, 179.01, 0, 61.61, 98.64)), .Names = c("sample_id",
"rsem.fpkm"), class = "data.frame", row.names = c(NA, 21L)),
structure(list(sample_id = c("C021_0001_20140916_tumor_RNASeq",
"CPBT_0001_1_tumor_RNASeq", "CPBT_0007_1_tumor_RNASeq", "C021_0010_001774_tumor_RNASeq",
"C021_0003_001409_tumor_RNASeq", "CPBT_0005_1_tumor_RNASeq",
"CPBT_0008_1_tumor_RNASeq", "C021_0002_001113_tumor_RNASeq",
"C021_0013_001872_tumor_RNASeq", "C021_0005_001661_tumor_RNASeq",
"C021_0007_001669_tumor_RNASeq", "C021_0008_001699_tumor_RNASeq",
"CPBT_0006_1_tumor_RNASeq", "C021_0011_001786_tumor_RNASeq",
"C021_0009_001766_tumor_RNASeq", "CPBT_0004_1_tumor_RNASeq",
"CPBT_0003_1_tumor_RNASeq", "CPBT_0009_1_tumor_RNASeq", "C021_0006_001666_tumor_RNASeq",
"C021_0012_001825_tumor_RNASeq", "C021_0004_001418_tumor_RNASeq"
), rsem.fpkm = c(1.39, 5.58, 1.93, 3.64, 5.2, 3.69, 1.75,
5.38, 3.46, 4.14, 0.96, 3.93, 4.47, 3.17, 4.38, 2.8, 2.27,
7.4, 0, 2.76, 5.55)), .Names = c("sample_id", "rsem.fpkm"
), class = "data.frame", row.names = c(NA, 21L)))), .Names = c("_index",
"_type", "_id", "_score", "_source.samples"), class = "data.frame", row.names = 1:2)
ありがとうございます!
Googleドライブのリンクは、おそらくいつまでも残っているわけではありません... `dput` を使って、最小限の例をテキストとして提供してもらえますか? – dash2
@dash2 再現可能な例を加えて質問を更新します。 –