2017-11-23 24 views
0

興味のある場所に関する情報を含むJSONファイル 'data.json' があります。Rで、長さの異なるネストされたJSONをデータフレームに変換したいと考えています。ファイルは次のコードで読み込みました。

# Read the file line by line and parse each line as its own JSON document.
data <- lapply(X = readLines(con = "data.json"), FUN = fromJSON)

これにより、異なる長さの入れ子リストが作成されます。ここに最初の4行のサンプルがあります。

list(structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.9459720|-2.1971226|20|within_50m|4\"]", 
    latitude = "56.945972", locality = "Stonehaven", `_records_touched` = "{\"crawl\":8,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "The Lodge, Dunottar", email = "[email protected]", 
    existence_ml = 0.569423821765872, domain_aggregate = "", 
    name = "Dunnottar Castle", search_tags = c("Dunnottar Castle Aberdeenshire", 
    "Dunotter Castle"), admin_region = "Scotland", existence = 1L, 
    category_labels = structure(c("Landmarks", "Buildings and Structures" 
    ), .Dim = 1:2), post_town = "Stonehaven", region = "Kincardineshire", 
    review_count = "719", geocode_level = "within_50m", tel = "01569 762173", 
    placerank = 65L, longitude = "-2.197123", placerank_ml = 37.2791607346447, 
    fax = "01330 860325", category_ids_text_search = "", website = "http://www.dunnottarcastle.co.uk", 
    status = "1", geocode_confidence = "20", postcode = "AB39 2TL", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "4"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "existence_ml", "domain_aggregate", "name", "search_tags", 
"admin_region", "existence", "category_labels", "post_town", 
"region", "review_count", "geocode_level", "tel", "placerank", 
"longitude", "placerank_ml", "fax", "category_ids_text_search", 
"website", "status", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality")), uuid = "3867aaf3-12ab-434f-b12b-5d627b3359c3"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"56.237480|-5.073578|20|within_50m|4\"]", 
    latitude = "56.237480", locality = "Inveraray", `_records_touched` = "{\"crawl\":11,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "Cherry Park", email = "enqu[email protected]", 
    longitude = "-5.073578", domain_aggregate = "", name = "Inveraray Castle", 
    admin_region = "Scotland", search_tags = c("Inveraray Castle Tea Room", 
    "Inverary Castle"), existence = 1L, category_labels = structure(c("Social", 
    "Food and Dining", "Restaurants"), .Dim = c(1L, 3L)), region = "Argyll", 
    review_count = "532", geocode_level = "within_50m", tel = "01499 302203", 
    placerank = 67L, post_town = "Inveraray", placerank_ml = 41.1997808735227, 
    fax = "01499 302421", category_ids_text_search = "", website = "http://www.inveraray-castle.com", 
    status = "1", geocode_confidence = "20", postcode = "PA32 8XE", 
    category_ids = 347L, country = "gb", `_geocode_quality` = "4", 
    existence_ml = 0.791488110284778), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "_records_touched", "address", 
"email", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "geocode_confidence", 
"postcode", "category_ids", "country", "_geocode_quality", "existence_ml" 
)), uuid = "8278ab80-2cd1-4dbd-9685-0d0036b681eb"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    geo_virtual = "[\"51.483872|-0.606820|100|rooftop|2\"]", 
    latitude = "51.483872", locality = "Windsor Castle", hours_display = "Mon-Sat 11:30 AM-11:00 PM; Sun 12:00 PM-11:00 PM", 
    `_records_touched` = "{\"crawl\":7,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":2,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    address = "", longitude = "-0.606820", domain_aggregate = "", 
    name = "Windsor Castle", admin_region = "England", search_tags = c("The Windsor Castle", 
    "The Windsor Castle Pub", "The Windsor Castle Public House", 
    "Pub Food", "British"), existence = 1L, category_labels = structure(c("Landmarks", 
    "Buildings and Structures"), .Dim = 1:2), region = "Berkshire", 
    review_count = "", geocode_level = "rooftop", tel = "020 7766 7304", 
    placerank = 62L, post_town = "Windsor", placerank_ml = 28.1160845346327, 
    fax = "01753 832290", category_ids_text_search = "", website = "http://www.royalcollection.org.uk/visit/windsorcastle", 
    status = "1", hours = "{\"monday\":[[\"11:30\",\"23:00\"]],\"tuesday\":[[\"11:30\",\"23:00\"]],\"wednesday\":[[\"11:30\",\"23:00\"]],\"thursday\":[[\"11:30\",\"23:00\"]],\"friday\":[[\"11:30\",\"23:00\"]],\"saturday\":[[\"11:30\",\"23:00\"]],\"sunday\":[[\"12:00\",\"23:00\"]]}", 
    neighborhood = "Chalvey", geocode_confidence = "100", postcode = "SL4 1NJ", 
    category_ids = 108L, country = "gb", `_geocode_quality` = "2", 
    existence_ml = 0.885705196944165, email = "[email protected]"), .Names = c("existence_full", 
"geo_virtual", "latitude", "locality", "hours_display", "_records_touched", 
"address", "longitude", "domain_aggregate", "name", "admin_region", 
"search_tags", "existence", "category_labels", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "placerank_ml", 
"fax", "category_ids_text_search", "website", "status", "hours", 
"neighborhood", "geocode_confidence", "postcode", "category_ids", 
"country", "_geocode_quality", "existence_ml", "email")), uuid = "c5f7d8a9-0851-46ef-8da7-ad55e187d3a8"), .Names = c("payload", 
"uuid")), structure(list(payload = structure(list(existence_full = 1L, 
    category_ids_text_search = "", placerank_ml = 31.9857184762157, 
    longitude = "-2.191955", name = "Pitmedden Garden", domain_aggregate = "", 
    admin_region = "Scotland", languages = "English", region = "Aberdeenshire", 
    review_count = "2", geocode_level = "rooftop", tel = "01651 842352", 
    placerank = 57L, post_town = "Ellon", category_labels = structure(c("Landmarks", 
    "Gardens"), .Dim = 1:2), existence = 1L, fax = "0844 493 2102", 
    website = "http://www.nts.org.uk/Property/Pitmedden-Garden", 
    status = "1", geocode_confidence = "100", postcode = "AB41 7PD", 
    country = "gb", category_ids = 109L, `_geocode_quality` = "4", 
    existence_ml = 0.849871115334588, email = "[email protected]", 
    address = "", `_records_touched` = "{\"crawl\":6,\"lssi\":0,\"polygon_centroid\":0,\"geocoder\":0,\"user_submission\":0,\"tdc\":0,\"gov\":0}", 
    locality = "Pitmedden", latitude = "57.343233", geo_virtual = "[\"57.343233|-2.191955|100|rooftop|4\"]"), .Names = c("existence_full", 
"category_ids_text_search", "placerank_ml", "longitude", "name", 
"domain_aggregate", "admin_region", "languages", "region", "review_count", 
"geocode_level", "tel", "placerank", "post_town", "category_labels", 
"existence", "fax", "website", "status", "geocode_confidence", 
"postcode", "country", "category_ids", "_geocode_quality", "existence_ml", 
"email", "address", "_records_touched", "locality", "latitude", 
"geo_virtual")), uuid = "bb57a153-740f-42be-aa4d-ae12d4eb57d4"), .Names = c("payload", 
"uuid"))) 

入れ子になったリストの各値を別々の列に展開して、これをデータフレームに変換したいと考えています。リスト内の各リストには特定の場所に関する情報が含まれており、それぞれuuidで識別されます。したがって、データフレームの各行は、1つのuuid(1つの場所)に関する情報を持つことになります。対応する値を持たない列には、NAを表示したいです。

この概念に類似した質問で言及されたアプローチのいくつかを試しましたが、失敗しました。

どのようなご意見も大歓迎です!ありがとう

答えて

0

元のデータレイアウトについてもっと詳しい説明があればよかったのですが、このオブジェクトの上位レベルの構造として見えているものに基づいて推測します。その構造が `dat` という名前だと仮定します。

> lapply(dat, names) 
[[1]] 
[1] "payload" "uuid" 

[[2]] 
[1] "payload" "uuid" 

[[3]] 
[1] "payload" "uuid" 

[[4]] 
[1] "payload" "uuid" 

そして、それらを「横並び」に結合します(`payloads` と `uuids` は下のコードで作成するので、そちらを先に実行してください)。

# Bind each payload data frame to its uuid data frame column-wise; the
# single-row uuid frame is recycled to the payload's row count.
# SIMPLIFY = FALSE guarantees a list of data frames: with the default,
# mapply() collapses the results into a list-matrix whenever every bound
# data frame happens to have the same number of columns.
newdat <- mapply(cbind, payloads, uuids, SIMPLIFY = FALSE)

まず、payloadとuuidをそれぞれデータフレームのリストとして抽出します(このコードは上の `mapply` 呼び出しより先に実行する必要があります)。

# Convert every record's payload into its own data frame.
payloads <- lapply(
  X = dat,
  FUN = function(x) {
    data.frame(x$payload)
  }
)
# Wrap every record's uuid in a one-column data frame. The column ends up
# named "x.uuid" because data.frame() derives the name from the expression
# `x$uuid`, so the expression is intentionally left as-is.
uuids <- lapply(
  X = dat,
  FUN = function(x) {
    data.frame(x$uuid)
  }
)

次に、それぞれの次元を確認して、1行のデータフレームが複数行のデータフレームに正しく複製されているかどうかを確かめます。ご要望のとおり、値が存在しない項目はNAになります。「uuid」は明らかに識別子なので、cbind操作によって、その列の内容が「payload」と同じ長さの列へ複製されます。

> lapply(payloads, dim) 
[[1]] 
[1] 2 32 

[[2]] 
[1] 2 33 

[[3]] 
[1] 5 35 

[[4]] 
[1] 1 32 

> lapply(uuids, dim) 
[[1]] 
[1] 1 1 

[[2]] 
[1] 1 1 

[[3]] 
[1] 1 1 

[[4]] 
[1] 1 1 

> lapply(mapply(cbind, payloads, uuids), dim) 
[[1]] 
[1] 2 33 

[[2]] 
[1] 2 34 

[[3]] 
[1] 5 36 

[[4]] 
[1] 1 33 

列名が互いによく似ているので、連結の次の段階として、すべてのデータフレームを「上下に積み重ねて」まとめることができそうです:

lapply(newdat, names) 
[[1]] 
[1] "existence_full"   "geo_virtual"    "latitude"     
[4] "locality"     "X_records_touched"  "address"     
[7] "email"     "existence_ml"    "domain_aggregate"   
[10] "name"      "search_tags"    "admin_region"    
[13] "existence"    "category_labels.1"  "category_labels.2"  
[16] "post_town"    "region"     "review_count"    
[19] "geocode_level"   "tel"      "placerank"    
[22] "longitude"    "placerank_ml"    "fax"      
[25] "category_ids_text_search" "website"     "status"     
[28] "geocode_confidence"  "postcode"     "category_ids"    
[31] "country"     "X_geocode_quality"  "x.uuid"     

[[2]] 
[1] "existence_full"   "geo_virtual"    "latitude"     
[4] "locality"     "X_records_touched"  "address"     
[7] "email"     "longitude"    "domain_aggregate"   
[10] "name"      "admin_region"    "search_tags"    
[13] "existence"    "category_labels.1"  "category_labels.2"  
[16] "category_labels.3"  "region"     "review_count"    
[19] "geocode_level"   "tel"      "placerank"    
[22] "post_town"    "placerank_ml"    "fax"      
[25] "category_ids_text_search" "website"     "status"     
[28] "geocode_confidence"  "postcode"     "category_ids"    
[31] "country"     "X_geocode_quality"  "existence_ml"    
[34] "x.uuid"     

[[3]] 
[1] "existence_full"   "geo_virtual"    "latitude"     
[4] "locality"     "hours_display"   "X_records_touched"  
[7] "address"     "longitude"    "domain_aggregate"   
[10] "name"      "admin_region"    "search_tags"    
[13] "existence"    "category_labels.1"  "category_labels.2"  
[16] "region"     "review_count"    "geocode_level"   
[19] "tel"      "placerank"    "post_town"    
[22] "placerank_ml"    "fax"      "category_ids_text_search" 
[25] "website"     "status"     "hours"     
[28] "neighborhood"    "geocode_confidence"  "postcode"     
[31] "category_ids"    "country"     "X_geocode_quality"  
[34] "existence_ml"    "email"     "x.uuid"     

[[4]] 
[1] "existence_full"   "category_ids_text_search" "placerank_ml"    
[4] "longitude"    "name"      "domain_aggregate"   
[7] "admin_region"    "languages"    "region"     
[10] "review_count"    "geocode_level"   "tel"      
[13] "placerank"    "post_town"    "category_labels.1"  
[16] "category_labels.2"  "existence"    "fax"      
[19] "website"     "status"     "geocode_confidence"  
[22] "postcode"     "country"     "category_ids"    
[25] "X_geocode_quality"  "existence_ml"    "email"     
[28] "address"     "X_records_touched"  "locality"     
[31] "latitude"     "geo_virtual"    "x.uuid"  

Hadleyのplyrパッケージにある `rbind.fill` 関数を使えば、これを効率的に行うことができます。

# Install plyr only when it is missing, so re-running this snippet does not
# download and reinstall the package every time.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
# rbind.fill() accepts a list of data frames directly, so do.call() is not
# needed; columns absent from a given data frame are filled with NA.
newdat3 <- plyr::rbind.fill(newdat)
newdat3

いくつかの列を見ると、これは要件を満たしているようです:

> newdat3[ , c("locality", "category_labels.3", "neighborhood")] 
     locality category_labels.3 neighborhood 
1  Stonehaven    <NA>   <NA> 
2  Stonehaven    <NA>   <NA> 
3  Inveraray  Restaurants   <NA> 
4  Inveraray  Restaurants   <NA> 
5 Windsor Castle    <NA>  Chalvey 
6 Windsor Castle    <NA>  Chalvey 
7 Windsor Castle    <NA>  Chalvey 
8 Windsor Castle    <NA>  Chalvey 
9 Windsor Castle    <NA>  Chalvey 
10  Pitmedden    <NA>   <NA> 
関連する問題