2017-03-22 11 views
0

私はこのpythonスクリプトを使って、librivox.orgのWebサイトからいくつかのテキストを取ります。私はyamlとjsonの両方でオーディオブックの "記述"を保存しようとしています。私がこれをやろうとしているのは、yamlを生成してPythonをPythonに変換することです。なぜ私のyaml.loadが爆発しているのかわかりません

#!/usr/bin/env python 

import sys, getopt 
import json 
import yaml 
import requests 
import subprocess 
import re 

hiera_dir    = '/home/hiera/audiobooks' 

from HTMLParser import HTMLParser 

class MLStripper(HTMLParser): 
    def __init__(self): 
     self.reset() 
     self.fed = [] 
    def handle_data(self, d): 
     self.fed.append(d) 
    def get_data(self): 
     return ''.join(self.fed) 

def strip_tags(html): 
    s = MLStripper() 
    s.feed(html) 
    return s.get_data() 


def usage(msg): 
     print msg 


def write_file(data, fn): 
     print "Writing output to %s\n" % (fn) 
     with open(fn, "w") as fh: 
       fh.write(data) 

def main(argv): 
     global top 
     global version 
     global package 
     appname     = 'unknown' 
     librivox_id    = 'unknown' 
     app_image_url   = 'unknown' 
     email     = 'unknown' 
     acctpasswd    = 'unknown' 
     password    = 'XXXXXXX' 
     try: 
       opts, args = getopt.getopt(argv,"hn:l:t:v:k:p:i:e:P:",["appname", "id=","top=","version=","package=","password=","image_url=","email=","acctpasswd="]) 
     except getopt.GetoptError: 
       print 'make_hiera_data_from_librivox_api.py -n <appname> -l <librvox id> -e <developer email> -P <developer passwd> [-t <top>] [-v <version>] [-p <password>]' 
       sys.exit(2) 
     for opt, arg in opts: 
       if opt == '-h': 
         usage ('Help called') 
         sys.exit(0) 
       elif opt in ("-n", "--appname"): 
         appname = arg 
       elif opt in ("-l", "--id"): 
         librivox_id = arg 
       elif opt in ("-t", "--top"): 
         top = arg 
       elif opt in ("-v", "--version"): 
         version = arg 
       elif opt in ("-p", "--password"): 
         password = arg 
       elif opt in ("-k", "--package"): 
         password = arg 
       elif opt in ("-i", "--image_url"): 
         app_image_url = arg 
       elif opt in ("-e", "--email"): 
         email = arg 
       elif opt in ("-P", "--acctpasswd"): 
         acctpasswd = arg 

     if (appname == 'unknown'): 
       usage ("Please specify a appname") 
       sys.exit (1) 
     if (librivox_id == 'unknown'): 
       usage ("Please specify a librivox api id") 
       sys.exit (1) 

# https://librivox.org/api/feed/audiobooks/id/9485/extended/1/format/json 
     librivox_rest_url  = "https://librivox.org/api/feed/audiobooks/id/" + librivox_id + "/extended/1/format/json" 
     try: 
       parsed   = json.loads(requests.get(librivox_rest_url).text) 
     except: 
       e = sys.exc_info()[0] 
       print "Error on %s Error [%s]" % (librivox_rest_url, e) 
       sys.exit(1) 

     try: 
       book_key  = parsed['books'].keys()[0] 
     except: 
       e = sys.exc_info()[0] 
       print "Error on %s Error [%s]" % (librivox_rest_url, e) 
       sys.exit(1) 
     apptitle  = parsed['books'][book_key]['title'] 
     app_zip_url  = parsed['books'][book_key]['url_zip_file'] 
     description  = parsed['books'][book_key]['description'] 
     description  = strip_tags(parsed['books'][book_key]['description'].encode('ascii', 'ignore').decode('ascii')) 

     description  = re.sub("^"," ", description, flags=re.MULTILINE) 
     description  = re.sub("^$"," X", description, flags=re.MULTILINE) 
     description  = re.sub("^ $"," x", description, flags=re.MULTILINE) 
     for d in description.split("\n"): 
       print "d is [%s]\n" % (d) 

     amazon_app_id = 'junk' 
     top    = 'junk' 
     package   = 'junk' 
     version   = 'junk' 
     password  = 'junk' 
     yaml_version = """--- 
amazon_app_id: '%s' 
librivox_rest_url: '%s' 
librivox_id: '%s' 
top: '%s' 
package: '%s' 
version: '%s' 
password: '%s' 
description: | 
%s 

""" % (
                 amazon_app_id 
               ,  librivox_rest_url 
               ,  librivox_id 
               ,  top 
               ,  package 
               ,  version 
               ,  password 
               ,  description) 
     print yaml_version 
     write_file(yaml_version, hiera_dir + '/' + appname + '.yaml'); 
     myyaml = yaml.load(yaml_version) 
     json_version = json.dumps(yaml.load(yaml_version), sort_keys=True, indent=2) 
     print json_version 

     write_file(json_version, doc_root_audiobook_json + '/' + appname + '.json'); 

if __name__ == "__main__": 
    main(sys.argv[1:]) 

:私はに実行しています問題は、ここで

Traceback (most recent call last): 
    File "./test-get-description.py", line 143, in <module> 
    main(sys.argv[1:]) 
    File "./test-get-description.py", line 136, in main 
    myyaml = yaml.load(yaml_version) 
    File "/usr/lib64/python2.7/site-packages/yaml/__init__.py", line 71, in load 
    return loader.get_single_data() 
    File "/usr/lib64/python2.7/site-packages/yaml/constructor.py", line 37, in get_single_data 
    node = self.get_single_node() 
    File "/usr/lib64/python2.7/site-packages/yaml/composer.py", line 36, in get_single_node 
    document = self.compose_document() 
    File "/usr/lib64/python2.7/site-packages/yaml/composer.py", line 55, in compose_document 
    node = self.compose_node(None, None) 
    File "/usr/lib64/python2.7/site-packages/yaml/composer.py", line 84, in compose_node 
    node = self.compose_mapping_node(anchor) 
    File "/usr/lib64/python2.7/site-packages/yaml/composer.py", line 127, in compose_mapping_node 
    while not self.check_event(MappingEndEvent): 
    File "/usr/lib64/python2.7/site-packages/yaml/parser.py", line 98, in check_event 
    self.current_event = self.state() 
    File "/usr/lib64/python2.7/site-packages/yaml/parser.py", line 439, in parse_block_mapping_key 
    "expected <block end>, but found %r" % token.id, token.start_mark) 
yaml.parser.ParserError: while parsing a block mapping 
    in "<unicode string>", line 2, column 1: 
    amazon_app_id: 'junk' 
    ^
expected <block end>, but found '<scalar>' 
    in "<unicode string>", line 11, column 2: 
    x 
    ^

はスクリプトです...この行...

myyaml = yaml.load(yaml_version) 

が...そこにトレース出力に失敗したということです私はこのようにスクリプトを実行する:

[[email protected] scripts]$ ./test-get-description.py -n 'junk' -l 3269 

T彼ID 3269は、このURLに1を取る:

https://librivox.org/api/feed/audiobooks/id/3269/extended/1/format/json

私はこのようなルックスを書くYAMLファイル:

--- 
amazon_app_id: 'junk' 
librivox_rest_url: 'https://librivox.org/api/feed/audiobooks/id/3269/extended/1/format/json' 
librivox_id: '3269' 
top: 'junk' 
package: 'junk' 
version: 'junk' 
password: 'junk' 
description: | 
    It is the end of the 19th century. Like thousands of others, the Rudkus family has emigrated from Lithuania to America in search of a better life. As they settle into the Packingtown neighborhood of Chicago, they find their dreams are unlikely to be realized. In fact, just the opposite is quite likely to occur. Jurgis, the main character of the novel, has brought his father Antanas, his fiance Ona, her stepmother Teta Elzbieta, Teta Elzbieta's brother Jonas and her six children, and Ona's cousin Marija Berczynskas along. The family, nave to the ways of Chicago, quickly falls prey to con men and makes a series of bad decisions that lead them into wretched poverty and terrible living conditions. All are forced to find jobs in dismal working conditions for their very survival. Jurgis, broken and discouraged, eventually finds solace in the American Socialist movement. 
x 
This novel was written during a period in American history when Trusts were formed by multiple corporations to establish monopolies that stifled competition and fixed prices. Unthinkable working conditions and unfair business practices were the norm. The Jungles author, Upton Sinclair, was an ardent Socialist of the time. Sinclair was commissioned by the Appeal To Reason, a Socialist journal of the period, to write a fictional expose on the working conditions of the immigrant laborers in the meat packing industry in Chicago. Going undercover, Sinclair spent seven weeks inside the meatpacking plants gathering details for his novel. 
x 
The Reader wishes to gratefully acknowledge the assistance, and patience, of Professor Giedrius Subacius (University of Illinois) and the folks at Lituanus for their invaluable support as I struggled with Lithuanian pronunciations. Truly, this audio book would have been far more difficult, and far less authentic, without their help. 
x 
And now, feel free to wander into The Jungle. 
x 
(Summary by Tom Weiss) 
+1

PyYamlのエミッタ機能を使ってYAMLを生成してみませんか?そうすれば、構文エラーがないことを確認できます。 – flyx

+0

私はそれを聞いたことがないので推測します。あなたは精緻化できますか?ありがとう。 –

+0

Hmm。私はエミッタが "ブロック" yamlデータを扱うことはできないと思います。しかし、私はおそらく間違っています。私はpythonエミッターでyamlブロックを生成する方法を知っていますか? –

答えて

1

問題は、あなたのリテラルスカラーです。あなたはindentation explicitly the indent is determined from the first non-empty lineを与えないからです。他の行の一部が最初の行より少ないインデントを持っているので、あなたの場合、これは2である、あなたは明示的にインデントを指定する必要があります:

description: |1 
    It is the end ..... 

あなたのラインが整列する必要はありません。

あなたが制御されていないソースからYAMLを読まないことが100%でない場合、安全でないため​​を使用しないでください。代わりにsafe_load()を使用してください。

+0

すごい!あなたはそれを行うことができるか分からなかった! –

1

あなたは文字通りのブロックとindentantionの問題を抱えている、すべてのそれのラインは維持しなければなりません少なくとも同じくぼみレベル。

YAMLブロックスタイルでは、構造体はインデントによって決まります。 一般的には、インデントは0行以上の空白文字( )と定義されています。

だから、これは動作します:

description: | 
    It is the end of the 19th century. Like thousands of others, the Rudkus family has emigrated from Lithuania to America in search of a better life. As they settle into the Packingtown neighborhood of Chicago, they find their dreams are unlikely to be realized. In fact, just the opposite is quite likely to occur. Jurgis, the main character of the novel, has brought his father Antanas, his fiance Ona, her stepmother Teta Elzbieta, Teta Elzbieta's brother Jonas and her six children, and Ona's cousin Marija Berczynskas along. The family, nave to the ways of Chicago, quickly falls prey to con men and makes a series of bad decisions that lead them into wretched poverty and terrible living conditions. All are forced to find jobs in dismal working conditions for their very survival. Jurgis, broken and discouraged, eventually finds solace in the American Socialist movement. 
    x 
    This novel was written during a period in American history when Trusts were formed by multiple corporations to establish monopolies that stifled competition and fixed prices. Unthinkable working conditions and unfair business practices were the norm. The Jungles author, Upton Sinclair, was an ardent Socialist of the time. Sinclair was commissioned by the Appeal To Reason, a Socialist journal of the period, to write a fictional expose on the working conditions of the immigrant laborers in the meat packing industry in Chicago. Going undercover, Sinclair spent seven weeks inside the meatpacking plants gathering details for his novel. 
    x 
    The Reader wishes to gratefully acknowledge the assistance, and patience, of Professor Giedrius Subacius (University of Illinois) and the folks at Lituanus for their invaluable support as I struggled with Lithuanian pronunciations. Truly, this audio book would have been far more difficult, and far less authentic, without their help. 
    x 
    And now, feel free to wander into The Jungle. 
    x 
    (Summary by Tom Weiss) 
+0

ああ。私はそれを見る。私はそれを修正する正規表現を考え出そうとしています。しかし、これまで運がなかった。たぶんバックツーバックスペースを削除する正規表現ですか?私はそれを理解することさえできません。 –

+0

これを代わりに使用してください: 're.sub(r '^ \ s *(\ S)'、r '\ 1'、description、flags = re.MULTILINE)' – klashxx

+0

ありがとう。私はここで私がここから引っ張った説明でそれを試した:https://librivox.org/api/feed/audiobooks/id/3269/extended/1/format/jsonそれはそれを修正しません。 –

関連する問題