2017-06-21 10 views
1

ディレクトリ内の複数のJSONファイルからRのデータフレームにデータをロードして処理する方法は?

複数のJSONファイルがディレクトリに格納されています。これらのJSONファイルはネストした構造になっています。これらのJSONファイルそれぞれからデータを読み取るために、次のコードを書きました:

library("jsonlite")

# list.files() interprets `pattern` as a regular expression, not a shell
# glob, so match the literal ".JSON" extension at the end of the file name.
temp <- list.files(pattern = "\\.JSON$")

for (files in temp) {
  # Parse one file; `data` is overwritten on every iteration, so any per-file
  # processing has to happen inside the loop body.
  data <- fromJSON(files, flatten = TRUE)
  ...
}

ここで class(data) は、data が "list" であることを示します。names(data) は列名として "a" "b" "c" "d" "e" "f" ... などを返します。

列 "a" はネストしており、names(data$a) は "nest1" "nest2" "nest3" ... などを返します。

すべてのJSONファイルを読み込み、data$e == 1 かつ data$a$nest1 == TRUE であれば count_nest1 += 1 とするロジックを書きたいと考えています。

{"scans": {"Bkav": {"detected": false, "version": "1.3.0.8876", "result": null, "update": "20170613"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170613"}, "MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170613"}, "nProtect": {"detected": false, "version": "2017-06-13.02", "result": null, "update": "20170613"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170613"}, "CAT-QuickHeal": {"detected": false, "version": "14.00", "result": null, "update": "20170613"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170613"}, "Malwarebytes": {"detected": false, "version": "2.1.1.1115", "result": null, "update": "20170613"}, "Zillya": {"detected": false, "version": "2.0.0.3311", "result": null, "update": "20170613"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170613"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1623", "result": null, "update": "20170612"}, "K7GW": {"detected": false, "version": "10.15.23651", "result": null, "update": "20170613"}, "K7AntiVirus": {"detected": false, "version": "10.15.23640", "result": null, "update": "20170613"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170613"}, "Baidu": {"detected": false, "version": "1.0.0.2", "result": null, "update": "20170613"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170613"}, "Symantec": {"detected": false, "version": "1.3.1.0", "result": null, "update": "20170613"}, "ESET-NOD32": {"detected": false, "version": "15577", "result": null, "update": "20170613"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170613"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "ClamAV": {"detected": false, "version": "0.99.2.0", 
"result": null, "update": "20170613"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170613"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170613"}, "NANO-Antivirus": {"detected": false, "version": "1.0.76.17389", "result": null, "update": "20170613"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170613"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170613"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170613"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170613"}, "Comodo": {"detected": false, "version": "27271", "result": null, "update": "20170613"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170613"}, "DrWeb": {"detected": false, "version": "7.0.28.2020", "result": null, "update": "20170613"}, "VIPRE": {"detected": false, "version": "58800", "result": null, "update": "20170613"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170613"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170613"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170613"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170613"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170613"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170613"}, "Avira": {"detected": false, "version": "8.3.3.4", "result": null, "update": "20170613"}, "Kingsoft": {"detected": false, "version": "2013.8.14.323", "result": null, 
"update": "20170613"}, "Endgame": {"detected": false, "version": "0.7.0", "result": null, "update": "20170612"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170613"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170613"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "GData": {"detected": false, "version": "A:25.12848B:25.9761", "result": null, "update": "20170613"}, "AhnLab-V3": {"detected": false, "version": "3.9.1.17781", "result": null, "update": "20170613"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170613"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170613"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170613"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170613"}, "Rising": {"detected": false, "version": "28.0.0.1", "result": null, "update": "20170613"}, "Yandex": {"detected": false, "version": "5.5.1.3", "result": null, "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170613"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170613"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170613"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170613"}}, "scan_id": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7-1497385194", "sha1": "c6a6e3977402e76379f48f09a052f0f3c50f5964", "resource": "00D9D7D8E563AE71DCECC808F35F7D0845FFD91A1731D3F69E6EA5204FD7A3D7", "response_code": 1, "scan_date": "2017-06-13 20:19:54", "permalink": 
"https://www.virustotal.com/file/00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7/analysis/1497385194/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 0, "sha256": "00d9d7d8e563ae71dcecc808f35f7d0845ffd91a1731d3f69e6ea5204fd7a3d7", "md5": "8d95236c637c042ff7df7fd7cc502ddb"} 

最終的には、nest1 == TRUE であるすべてのインスタンスの数、nest2 == TRUE であるすべてのインスタンスの数……というように集計したいと考えています。上記が実際のデータファイル1です。

実際のデータファイル2:

{"scans": {"MicroWorld-eScan": {"detected": false, "version": "12.0.250.0", "result": null, "update": "20170610"}, "nProtect": {"detected": false, "version": "2017-06-10.02", "result": null, "update": "20170610"}, "CMC": {"detected": false, "version": "1.1.0.977", "result": null, "update": "20170610"}, "CAT-QuickHeal": {"detected": true, "version": "14.00", "result": "TrojDownloader.NSIS.Genome.V", "update": "20170610"}, "ALYac": {"detected": false, "version": "1.0.1.9", "result": null, "update": "20170610"}, "Malwarebytes": {"detected": true, "version": "2.1.1.1115", "result": "PUP.Optional.MyPCBackup", "update": "20170610"}, "Zillya": {"detected": false, "version": "2.0.0.3308", "result": null, "update": "20170610"}, "AegisLab": {"detected": false, "version": "4.2", "result": null, "update": "20170610"}, "TheHacker": {"detected": false, "version": "6.8.0.5.1596", "result": null, "update": "20170607"}, "K7GW": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "K7AntiVirus": {"detected": false, "version": "10.14.23624", "result": null, "update": "20170610"}, "Arcabit": {"detected": false, "version": "1.0.0.806", "result": null, "update": "20170610"}, "TrendMicro": {"detected": false, "version": "9.740.0.1012", "result": null, "update": "20170610"}, "Baidu": {"detected": true, "version": "1.0.0.2", "result": "Win32.Trojan.WisdomEyes.16070401.9500.9976", "update": "20170608"}, "F-Prot": {"detected": false, "version": "4.7.1.166", "result": null, "update": "20170610"}, "Symantec": {"detected": true, "version": "1.3.1.0", "result": "PUA.MyPCBackup", "update": "20170610"}, "TotalDefense": {"detected": false, "version": "37.1.62.1", "result": null, "update": "20170610"}, "TrendMicro-HouseCall": {"detected": false, "version": "9.900.0.1004", "result": null, "update": "20170610"}, "Paloalto": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ClamAV": {"detected": false, "version": "0.99.2.0", "result": null, 
"update": "20170610"}, "Kaspersky": {"detected": false, "version": "15.0.1.13", "result": null, "update": "20170610"}, "BitDefender": {"detected": false, "version": "7.2", "result": null, "update": "20170610"}, "NANO-Antivirus": {"detected": true, "version": "1.0.76.17389", "result": "Riskware.Win32.Unwanted.dmgktv", "update": "20170610"}, "SUPERAntiSpyware": {"detected": false, "version": "5.6.0.1032", "result": null, "update": "20170610"}, "Avast": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Tencent": {"detected": false, "version": "1.0.0.1", "result": null, "update": "20170610"}, "Ad-Aware": {"detected": false, "version": "3.0.3.1010", "result": null, "update": "20170610"}, "Emsisoft": {"detected": false, "version": "4.0.1.883", "result": null, "update": "20170610"}, "Comodo": {"detected": false, "version": "27254", "result": null, "update": "20170610"}, "F-Secure": {"detected": false, "version": "11.0.19100.45", "result": null, "update": "20170610"}, "DrWeb": {"detected": true, "version": "7.0.28.2020", "result": "Program.Unwanted.567", "update": "20170610"}, "VIPRE": {"detected": false, "version": "58730", "result": null, "update": "20170610"}, "Invincea": {"detected": false, "version": "6.3.0.25415", "result": null, "update": "20170607"}, "McAfee-GW-Edition": {"detected": false, "version": "v2015", "result": null, "update": "20170610"}, "Sophos": {"detected": false, "version": "4.98.0", "result": null, "update": "20170610"}, "Ikarus": {"detected": false, "version": "0.1.5.2", "result": null, "update": "20170610"}, "Cyren": {"detected": false, "version": "5.4.30.7", "result": null, "update": "20170610"}, "Jiangmin": {"detected": false, "version": "16.0.100", "result": null, "update": "20170610"}, "Webroot": {"detected": false, "version": "1.0.0.207", "result": null, "update": "20170610"}, "Avira": {"detected": true, "version": "8.3.3.4", "result": "PUA/MyPCBackup.Gen", "update": "20170610"}, "Kingsoft": {"detected": 
false, "version": "2013.8.14.323", "result": null, "update": "20170610"}, "Endgame": {"detected": false, "version": "0.5.0", "result": null, "update": "20170515"}, "Microsoft": {"detected": false, "version": "1.1.13804.0", "result": null, "update": "20170610"}, "ViRobot": {"detected": false, "version": "2014.3.20.0", "result": null, "update": "20170610"}, "ZoneAlarm": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "GData": {"detected": true, "version": "A:25.12800B:25.9740", "result": "NSIS.Adware.MyPCBackup.E", "update": "20170610"}, "AhnLab-V3": {"detected": false, "version": "3.9.0.17697", "result": null, "update": "20170610"}, "McAfee": {"detected": false, "version": "6.0.6.653", "result": null, "update": "20170610"}, "AVware": {"detected": false, "version": "1.5.0.42", "result": null, "update": "20170610"}, "VBA32": {"detected": false, "version": "3.12.26.4", "result": null, "update": "20170609"}, "Zoner": {"detected": false, "version": "1.0", "result": null, "update": "20170610"}, "ESET-NOD32": {"detected": true, "version": "15562", "result": "MSIL/MyPCBackup.D potentially unwanted", "update": "20170610"}, "Rising": {"detected": true, "version": "28.0.0.1", "result": "Malware.Undefined!8.C (cloud:I1YBt1VpobT) ", "update": "20170610"}, "Yandex": {"detected": true, "version": "5.5.1.3", "result": "Riskware.Agent!", "update": "20170608"}, "SentinelOne": {"detected": false, "version": "1.0.0.12", "result": null, "update": "20170516"}, "Fortinet": {"detected": false, "version": "5.4.233.0", "result": null, "update": "20170610"}, "AVG": {"detected": false, "version": "8.0.1489.320", "result": null, "update": "20170610"}, "Panda": {"detected": false, "version": "4.6.4.2", "result": null, "update": "20170610"}, "CrowdStrike": {"detected": false, "version": "1.0", "result": null, "update": "20170420"}, "Qihoo-360": {"detected": false, "version": "1.0.0.1120", "result": null, "update": "20170610"}}, "scan_id": 
"00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873-1497129945", "sha1": "7b890323abfe8f3bd33be0bc439076b5525d03b0", "resource": "00D468FA26813736CD14FF91E84F5E31FE30EAEF6B35AF44CAFE540870EA7873", "response_code": 1, "scan_date": "2017-06-10 21:25:45", "permalink": "https://www.virustotal.com/file/00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873/analysis/1497129945/", "verbose_msg": "Scan finished, information embedded", "total": 60, "positives": 11, "sha256": "00d468fa26813736cd14ff91e84f5e31fe30eaef6b35af44cafe540870ea7873", "md5": "45922155c9628e11441aa869c6287bb7"} 

実際のデータファイル3:

{"response_code": 0, "resource": "0E28BEDFBA37CEE5BD639AC86AC08A422C8944C3749CD2C5D7F5A0C2B37115B3", "verbose_msg": "The requested resource is not among the finished, queued or pending scans"} 

ファイルを読み込んで応答コード(response_code)を確認したいと考えています。応答コードが「0」であれば count_not_detected += 1 とし、そうでなければJSONデータを読み込んで、各アンチウイルスが検出したサンプル数を数えます。最終的に、アンチウイルスAは全500ファイル中323個を検出し、アンチウイルスBは全500ファイル中224個を検出した、などと言えるようにしたいのです。

データを完全に平坦化して、すべてをデータフレームに格納できれば素晴らしいです。このためにtidyjsonパッケージを調べましたが、うまくいきませんでした。

答えて

0

これらの変更はまだCRANにリリースされていませんが、tidyjsonの開発版があなたのニーズにうまく合うと思います。devtools::install_github('jeremystan/tidyjson')で最新の安定した開発版をインストールできます。

とはいえ、あなたが何を求めているのかを正確に理解するのに少し苦労しています。オブジェクトのサイズや構造を把握したいのであれば、json_structure()、json_lengths()、json_types()を使って調べることができます:

# Attach the packages quietly so their startup messages stay out of the output.
suppressMessages({
  library(jsonlite)
  library(dplyr)
  library(tidyjson)
})

# Convert each sample (given here by file name) to a tbl_json.
rawjson1 <- as.tbl_json("raw_json_1.json")
rawjson2 <- as.tbl_json("raw_json_2.json")
rawjson3 <- as.tbl_json("raw_json_3.json")

# Print the structure table of the first document (output shown below).
json_structure(rawjson1)
#> # A tbl_json: 313 x 9 tibble with a "JSON" attribute 
#>   `attr(., "JSON")` document.id parent.id level index child.id 
#>      <chr>  <int>  <chr> <int> <int> <chr> 
#> 1 "{\"scans\":{\"Bkav..."   1  <NA>  0  1  1 
#> 2 "{\"Bkav\":{\"detec..."   1   1  1  1  1.1 
#> 3 "\"00d9d7d8e563ae..."   1   1  1  2  1.2 
#> 4 "\"c6a6e3977402e7..."   1   1  1  3  1.3 
#> 5 "\"00D9D7D8E563AE..."   1   1  1  4  1.4 
#> 6      1   1   1  1  5  1.5 
#> 7 "\"2017-06-13 20:..."   1   1  1  6  1.6 
#> 8 "\"https://www.vi..."   1   1  1  7  1.7 
#> 9 "\"Scan finished,..."   1   1  1  8  1.8 
#> 10      60   1   1  1  9  1.9 
#> # ... with 303 more rows, and 4 more variables: seq <list>, name <chr>, 
#> # type <fctr>, length <int> 


# Print the length of the value under each top-level key of the first document.
json_lengths(gather_object(rawjson1))
#> # A tbl_json: 12 x 3 tibble with a "JSON" attribute 
#>   `attr(., "JSON")` document.id   name length 
#>      <chr>  <int>   <chr> <int> 
#> 1 "{\"Bkav\":{\"detec..."   1   scans  60 
#> 2 "\"00d9d7d8e563ae..."   1  scan_id  1 
#> 3 "\"c6a6e3977402e7..."   1   sha1  1 
#> 4 "\"00D9D7D8E563AE..."   1  resource  1 
#> 5      1   1 response_code  1 
#> 6 "\"2017-06-13 20:..."   1  scan_date  1 
#> 7 "\"https://www.vi..."   1  permalink  1 
#> 8 "\"Scan finished,..."   1 verbose_msg  1 
#> 9      60   1   total  1 
#> 10      0   1  positives  1 
#> 11 "\"00d9d7d8e563ae..."   1  sha256  1 
#> 12 "\"8d95236c637c04..."   1   md5  1 

# Print the JSON type of the value under each top-level key of the first document.
json_types(gather_object(rawjson1))
#> # A tbl_json: 12 x 3 tibble with a "JSON" attribute 
#>   `attr(., "JSON")` document.id   name type 
#>      <chr>  <int>   <chr> <fctr> 
#> 1 "{\"Bkav\":{\"detec..."   1   scans object 
#> 2 "\"00d9d7d8e563ae..."   1  scan_id string 
#> 3 "\"c6a6e3977402e7..."   1   sha1 string 
#> 4 "\"00D9D7D8E563AE..."   1  resource string 
#> 5      1   1 response_code number 
#> 6 "\"2017-06-13 20:..."   1  scan_date string 
#> 7 "\"https://www.vi..."   1  permalink string 
#> 8 "\"Scan finished,..."   1 verbose_msg string 
#> 9      60   1   total number 
#> 10      0   1  positives number 
#> 11 "\"00d9d7d8e563ae..."   1  sha256 string 
#> 12 "\"8d95236c637c04..."   1   md5 string 

とはいえ、最終的な目的がさらなる調査のためのデータフレームを得ることであれば、spread_all()で非常に横長のdata_frameを得ることができます。あるいは、私がより有用だと考えるデータセット(レベル1のキーを列にし、各スキャンを1行にしたもの)を作ることもできます。ここでは複数のファイルを同時に処理している点に注意してください(各ファイルに固有のdocument.idが割り当てられます)。

# Both scan reports are processed in a single pass.
files <- c("raw_json_1.json", "raw_json_2.json")

j <- as.tbl_json(files)

# Step 1: spread the level-1 keys into columns.
# Step 2: enter "scans" and gather its entries, one row per scan.
# Step 3: spread each scan's own keys into columns.
clean <- j %>%
  spread_all(recursive = FALSE) %>%
  enter_object("scans") %>%
  gather_object() %>%
  spread_all(recursive = FALSE)

names(clean)
#> [1] "document.id" "scan_id"  "sha1"   "resource"  
#> [5] "response_code" "scan_date"  "permalink"  "verbose_msg" 
#> [9] "total"   "positives"  "sha256"  "md5"   
#> [13] "name"   "detected"  "version"  "result"  
#> [17] "update" 

## use as_tibble() (the replacement for the deprecated tbl_df()) when done
## parsing to strip the JSON component, then count scans and detections
## per document
clean %>%
  as_tibble() %>%
  group_by(document.id) %>%
  summarize(count = n(), detected_count = sum(detected))
#> # A tibble: 2 x 3 
#> document.id count detected_count 
#>   <int> <int>   <int> 
#> 1   1 60    0 
#> 2   2 60    11 

## look at those with detected==TRUE
## (as_tibble() replaces the deprecated tbl_df())
clean %>%
  as_tibble() %>%
  filter(detected) %>%
  select(document.id, name, version, result)
#> # A tbl_json: 11 x 4 tibble with a "JSON" attribute 
#>   `attr(., "JSON")` document.id   name    version 
#>      <chr>  <int>   <chr>    <chr> 
#> 1 "{\"detected\":tru..."   2 CAT-QuickHeal    14.00 
#> 2 "{\"detected\":tru..."   2 Malwarebytes   2.1.1.1115 
#> 3 "{\"detected\":tru..."   2   Baidu    1.0.0.2 
#> 4 "{\"detected\":tru..."   2  Symantec    1.3.1.0 
#> 5 "{\"detected\":tru..."   2 NANO-Antivirus  1.0.76.17389 
#> 6 "{\"detected\":tru..."   2   DrWeb   7.0.28.2020 
#> 7 "{\"detected\":tru..."   2   Avira    8.3.3.4 
#> 8 "{\"detected\":tru..."   2   GData A:25.12800B:25.9740 
#> 9 "{\"detected\":tru..."   2  ESET-NOD32    15562 
#> 10 "{\"detected\":tru..."   2   Rising   28.0.0.1 
#> 11 "{\"detected\":tru..."   2   Yandex    5.5.1.3 
#> # ... with 1 more variables: result <chr> 
関連する問題