2017-11-17 9 views
1

後で解析するために、深くネストされたjsonファイルを正規化(normalize)しようとしています。私が苦労しているのは、正規化する際に複数のレベルまで深く掘り下げる方法です。(dictの中にあるdictに対するjson_normalize)

pandas.io.json.json_normalizeはまさに私がやりたいことを行ってくれるので、そのドキュメントを調べました。

一部は正規化することができ、辞書がどのように機能するのかも理解できてきましたが、まだ完全にはできていません。

以下のコードでは、最初のレベルしか取得できません。

import json 
import pandas as pd 
from pandas.io.json import json_normalize 

# Load the sample JSON file into a Python object.
with open('authors_sample.json') as f: 
    d = json.load(f) 

# Flatten the list stored under d['hits']['hits'] into a table.
raw = json_normalize(d['hits']['hits']) 

# Use '_source' as the record path and carry '_id' plus selected
# '_source' fields (journal, title, normalized_venue_name) along as metadata.
# NOTE(review): per the accompanying text, this only reaches the first level
# of nesting; the nested 'authors' entries are not expanded here.
authors = json_normalize(data = d['hits']['hits'], 
         record_path = '_source', 
         meta = ['_id', ['_source', 'journal'], ['_source', 'title'], 
           ['_source', 'normalized_venue_name'] 
           ]) 

以下のコードで 'authors' の辞書まで「掘り下げ」ようとしていますが、record_path = ['_source', 'authors'] を指定すると TypeError: string indices must be integers が発生します。私の理解する限りjson_normalizeのロジックは正しいはずですが、dictやlistを使ってjsonの中に潜り込む方法を、私はまだあまり理解できていません。

私はこの簡単な例(example)を参考にしました。

# Attempt to expand the nested '_source' -> 'authors' list into one row per
# author, keeping '_id' and selected '_source' fields as metadata columns.
# NOTE(review): as reported in the accompanying text, this call raises
# "TypeError: string indices must be integers" on the sample data. The sample
# contains a record whose 'authors' value is None, which is presumably the
# cause -- confirm by filtering such records out before normalizing.
authors = json_normalize(data = d['hits']['hits'], 
         record_path = ['_source', 'authors'], 
         meta = ['_id', ['_source', 'journal'], ['_source', 'title'], 
           ['_source', 'normalized_venue_name'] 
           ]) 

以下は、jsonファイルのチャンク(5レコード)です。

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5}, 
u'hits': {u'hits': [{u'_id': u'7CB3F2AD', 
    u'_index': u'scibase_listings', 
    u'_score': 1.0, 
    u'_source': {u'authors': None, 
    u'deleted': 0, 
    u'description': None, 
    u'doi': u'', 
    u'is_valid': 1, 
    u'issue': None, 
    u'journal': u'Physical Review Letters', 
    u'link': None, 
    u'meta_description': None, 
    u'meta_keywords': None, 
    u'normalized_venue_name': u'phys rev lett', 
    u'pages': None, 
    u'parent_keywords': [u'Chromatography', 
     u'Quantum mechanics', 
     u'Particle physics', 
     u'Quantum field theory', 
     u'Analytical chemistry', 
     u'Quantum chromodynamics', 
     u'Physics', 
     u'Mass spectrometry', 
     u'Chemistry'], 
    u'pub_date': u'1987-03-02 00:00:00', 
    u'pubtype': None, 
    u'rating_avg_weighted': 0, 
    u'rating_clarity': 0.0, 
    u'rating_clarity_weighted': 0.0, 
    u'rating_innovation': 0.0, 
    u'rating_innovation_weighted': 0.0, 
    u'rating_num_weighted': 0, 
    u'rating_reproducability': 0, 
    u'rating_reproducibility_weighted': 0.0, 
    u'rating_versatility': 0.0, 
    u'rating_versatility_weighted': 0.0, 
    u'review_count': 0, 
    u'tag': [u'mass spectra', u'elementary particles', u'bound states'], 
    u'title': u'Evidence for a new meson: A quasinuclear NN-bar bound state', 
    u'userAvg': 0.0, 
    u'user_id': None, 
    u'venue_name': u'Physical Review Letters', 
    u'views_count': 0, 
    u'volume': None}, 
    u'_type': u'listing'}, 
    {u'_id': u'7AF8EBC3', 
    u'_index': u'scibase_listings', 
    u'_score': 1.0, 
    u'_source': {u'authors': [{u'affiliations': [u'Punjabi University'], 
     u'author_id': u'780E3459', 
     u'author_name': u'munish puri'}, 
     {u'affiliations': [u'Punjabi University'], 
     u'author_id': u'48D92C79', 
     u'author_name': u'rajesh dhaliwal'}, 
     {u'affiliations': [u'Punjabi University'], 
     u'author_id': u'7D9BD37C', 
     u'author_name': u'r s singh'}], 
    u'deleted': 0, 
    u'description': None, 
    u'doi': u'', 
    u'is_valid': 1, 
    u'issue': None, 
    u'journal': u'Journal of Industrial Microbiology & Biotechnology', 
    u'link': None, 
    u'meta_description': None, 
    u'meta_keywords': None, 
    u'normalized_venue_name': u'j ind microbiol biotechnol', 
    u'pages': None, 
    u'parent_keywords': [u'Nuclear medicine', 
     u'Psychology', 
     u'Hydrology', 
     u'Chromatography', 
     u'X-ray crystallography', 
     u'Nuclear fusion', 
     u'Medicine', 
     u'Fluid dynamics', 
     u'Thermodynamics', 
     u'Physics', 
     u'Gas chromatography', 
     u'Radiobiology', 
     u'Engineering', 
     u'Organic chemistry', 
     u'High-performance liquid chromatography', 
     u'Chemistry', 
     u'Organic synthesis', 
     u'Psychotherapist'], 
    u'pub_date': u'2008-04-04 00:00:00', 
    u'pubtype': None, 
    u'rating_avg_weighted': 0, 
    u'rating_clarity': 0.0, 
    u'rating_clarity_weighted': 0.0, 
    u'rating_innovation': 0.0, 
    u'rating_innovation_weighted': 0.0, 
    u'rating_num_weighted': 0, 
    u'rating_reproducability': 0, 
    u'rating_reproducibility_weighted': 0.0, 
    u'rating_versatility': 0.0, 
    u'rating_versatility_weighted': 0.0, 
    u'review_count': 0, 
    u'tag': [u'flow rate', 
     u'operant conditioning', 
     u'packed bed reactor', 
     u'immobilized enzyme', 
     u'specific activity'], 
    u'title': u'Development of a stable continuous flow immobilized enzyme reactor for the hydrolysis of inulin', 
    u'userAvg': 0.0, 
    u'user_id': None, 
    u'venue_name': u'Journal of Industrial Microbiology & Biotechnology', 
    u'views_count': 0, 
    u'volume': None}, 
    u'_type': u'listing'}, 
    {u'_id': u'7521A721', 
    u'_index': u'scibase_listings', 
    u'_score': 1.0, 
    u'_source': {u'authors': [{u'author_id': u'7FF872BC', 
     u'author_name': u'barbara eileen ryan'}], 
    u'deleted': 0, 
    u'description': None, 
    u'doi': u'', 
    u'is_valid': 1, 
    u'issue': None, 
    u'journal': u'The American Historical Review', 
    u'link': None, 
    u'meta_description': None, 
    u'meta_keywords': None, 
    u'normalized_venue_name': u'american historical review', 
    u'pages': None, 
    u'parent_keywords': [u'Social science', 
     u'Politics', 
     u'Sociology', 
     u'Law'], 
    u'pub_date': u'1992-01-01 00:00:00', 
    u'pubtype': None, 
    u'rating_avg_weighted': 0, 
    u'rating_clarity': 0.0, 
    u'rating_clarity_weighted': 0.0, 
    u'rating_innovation': 0.0, 
    u'rating_innovation_weighted': 0.0, 
    u'rating_num_weighted': 0, 
    u'rating_reproducability': 0, 
    u'rating_reproducibility_weighted': 0.0, 
    u'rating_versatility': 0.0, 
    u'rating_versatility_weighted': 0.0, 
    u'review_count': 0, 
    u'tag': [u'social movements'], 
    u'title': u"Feminism and the women's movement : dynamics of change in social movement ideology, and activism", 
    u'userAvg': 0.0, 
    u'user_id': None, 
    u'venue_name': u'The American Historical Review', 
    u'views_count': 0, 
    u'volume': None}, 
    u'_type': u'listing'}, 
    {u'_id': u'7DAEB9A4', 
    u'_index': u'scibase_listings', 
    u'_score': 1.0, 
    u'_source': {u'authors': [{u'author_id': u'0299B8E9', 
     u'author_name': u'fraser j harbutt'}], 
    u'deleted': 0, 
    u'description': None, 
    u'doi': u'', 
    u'is_valid': 1, 
    u'issue': None, 
    u'journal': u'The American Historical Review', 
    u'link': None, 
    u'meta_description': None, 
    u'meta_keywords': None, 
    u'normalized_venue_name': u'american historical review', 
    u'pages': None, 
    u'parent_keywords': [u'Superconductivity', 
     u'Nuclear fusion', 
     u'Geology', 
     u'Chemistry', 
     u'Metallurgy'], 
    u'pub_date': u'1988-01-01 00:00:00', 
    u'pubtype': None, 
    u'rating_avg_weighted': 0, 
    u'rating_clarity': 0.0, 
    u'rating_clarity_weighted': 0.0, 
    u'rating_innovation': 0.0, 
    u'rating_innovation_weighted': 0.0, 
    u'rating_num_weighted': 0, 
    u'rating_reproducability': 0, 
    u'rating_reproducibility_weighted': 0.0, 
    u'rating_versatility': 0.0, 
    u'rating_versatility_weighted': 0.0, 
    u'review_count': 0, 
    u'tag': [u'iron'], 
    u'title': u'The iron curtain : Churchill, America, and the origins of the Cold War', 
    u'userAvg': 0.0, 
    u'user_id': None, 
    u'venue_name': u'The American Historical Review', 
    u'views_count': 0, 
    u'volume': None}, 
    u'_type': u'listing'}, 
    {u'_id': u'7B3236C5', 
    u'_index': u'scibase_listings', 
    u'_score': 1.0, 
    u'_source': {u'authors': [{u'author_id': u'7DAB7B72', 
     u'author_name': u'richard m freeland'}], 
    u'deleted': 0, 
    u'description': None, 
    u'doi': u'', 
    u'is_valid': 1, 
    u'issue': None, 
    u'journal': u'The American Historical Review', 
    u'link': None, 
    u'meta_description': None, 
    u'meta_keywords': None, 
    u'normalized_venue_name': u'american historical review', 
    u'pages': None, 
    u'parent_keywords': [u'Political Science', u'Economics'], 
    u'pub_date': u'1985-01-01 00:00:00', 
    u'pubtype': None, 
    u'rating_avg_weighted': 0, 
    u'rating_clarity': 0.0, 
    u'rating_clarity_weighted': 0.0, 
    u'rating_innovation': 0.0, 
    u'rating_innovation_weighted': 0.0, 
    u'rating_num_weighted': 0, 
    u'rating_reproducability': 0, 
    u'rating_reproducibility_weighted': 0.0, 
    u'rating_versatility': 0.0, 
    u'rating_versatility_weighted': 0.0, 
    u'review_count': 0, 
    u'tag': [u'foreign policy'], 
    u'title': u'The Truman Doctrine and the origins of McCarthyism : foreign policy, domestic politics, and internal security, 1946-1948', 
    u'userAvg': 0.0, 
    u'user_id': None, 
    u'venue_name': u'The American Historical Review', 
    u'views_count': 0, 
    u'volume': None}, 
    u'_type': u'listing'}], 
    u'max_score': 1.0, 
    u'total': 36429433}, 
u'timed_out': False, 
u'took': 170} 
+0

それをほとんど不可能にしているのは None、つまり `u'authors': None` です。それらを除外(フィルタリング)してみることはできます... – MaxU

答えて

2
In [23]: lst = [l for l in raw['hits']['hits'] if l['_source'].get('authors')] 

In [24]: json_normalize(lst, 
         [['_source', 'authors']], 
         ['_id', ['_source', 'journal'], ['_source', 'title']]) 
Out[24]: 
      affiliations author_id   author_name  _id         _source.journal \ 
0 [Punjabi University] 780E3459   munish puri 7AF8EBC3 Journal of Industrial Microbiology & Biotechno... 
1 [Punjabi University] 48D92C79  rajesh dhaliwal 7AF8EBC3 Journal of Industrial Microbiology & Biotechno... 
2 [Punjabi University] 7D9BD37C   r s singh 7AF8EBC3 Journal of Industrial Microbiology & Biotechno... 
3     NaN 7FF872BC barbara eileen ryan 7521A721      The American Historical Review 
4     NaN 0299B8E9  fraser j harbutt 7DAEB9A4      The American Historical Review 
5     NaN 7DAB7B72 richard m freeland 7B3236C5      The American Historical Review 

             _source.title 
0 Development of a stable continuous flow immobi... 
1 Development of a stable continuous flow immobi... 
2 Development of a stable continuous flow immobi... 
3 Feminism and the women's movement : dynamics o... 
4 The iron curtain : Churchill, America, and the... 
5 The Truman Doctrine and the origins of McCarth... 
+0

これでほぼ完成です!しかし、'affiliations' がまだリストのままになっています。あなたの場合も同じようになりましたか?それを処理するために、'lst' の中でもう一つループを回すべきでしょうか? –

+0

@DanielVargas、あなたは 'affiliations'に複数の要素を持つことができますか、それとも常にNoneかoneですか? – MaxU

+0

複数の場合も、1つの場合も、ない場合(None)もあります。 –

関連する問題