2017-01-31 19 views
0

私はPythonでクローラを作っています。私の関数はdataLayerを含むスクリプトを取得し、文字列として整形してからJSONに変換します。Pythonで、JSON文字列の一部を削除するにはどうすればよいでしょうか。

これはdataLayerです:

dataLayer = [{ 

    "site": { 
     "isMobile": false, 
     "source": (function() { 
      var userAgent = navigator.userAgent.toLocaleLowerCase(); 
      var source = "web"; 
      resultMatch = userAgent.match(/\[olx-source\/(\w+);/); 
      if (resultMatch) { 
       [, source] = resultMatch; 
      } 
      return source; 
     })() 
    }, 
    "page": { 
     "pageType": "ad_detail", 
     "detail": { 
      "parent_category_id": "2000", 
      "category_id": "2020", 
      "state_id": "2", 
      "region_id": "31", 

      "ad_id": "354269527", 
      "list_id": "295567499", 
      "city_id": "9190", 
      "zipcode":"32146045", 

     }, 

     "adDetail": { 
      "adID": "354269527", 
      "listID": "295567499", 
      "sellerName": "Glauber Marlon", 
      "adDate": "2017-01-23 18:35:26", 
     }, 

    }, 
    "session": { 
     "user": { 
      "userID": "", 
      "loginType": "" 
     } 
    }, 

    "pageType": "Ad_detail", 
    "abtestingEnable" : "1", 



// Listing information 

"listingCategory": "2020", 


// Ad information 
"adId": "354269527", 
"state": "2", 
"region": "31", 
"category": "2020", 

"pictures": "14", 
"listId": "295567499", 

//Account Information 

"loggedUser":"0", 

"referrer": "", 

//User Information 


}]; 

これが、整形してJSONに変換する私の関数です。

# Turn the JavaScript `dataLayer` literal embedded in the page into a Python
# object by rewriting it with regular expressions and feeding it to json.loads.
#
# NOTE(review): for the dataLayer shown in this post, json.loads() below raises
# ValueError, because the "source" value is a JavaScript function expression,
# which is not valid JSON and is not removed by any of the substitutions.

# Take the text of the script element at index 25 and normalise single quotes
# to double quotes, since JSON only accepts double-quoted strings.
s = page_ad.findAll('script')[25].text.replace('\'', '"')
# Printing s at this point and pasting it into JSONLint reports an error at
# the function.
s = re.search(r'\{.+\}', s, re.DOTALL).group()  # get json data (outermost {...})
s = re.sub(r'//.+\n', '', s)  # remove // line comments
# Strip all whitespace; note this also removes spaces inside string values.
s = re.sub(r'\s+', '', s)
s = re.sub(r',}', '}', s)  # get rid of the trailing , before each closing }

dataLayer = json.loads(s)

これは変換前のJSONです。私はこの中の一部のインデックス(キー)を削除したいと考えています。

{ 
    "site":{ 
     "isMobile":false, 
     "source":(function()  { 
      varuserAgent=navigator.userAgent.toLocaleLowerCase();varsource="web";resultMatch=userAgent.match(/\   [ 
       olx-source\/(\w+);/);if(resultMatch)   { 
       [ 
       , 
       source 
       ]    =resultMatch; 
      }   returnsource; 
     }  )() 
    }, 
    "page":{ 
     "pageType":"ad_detail", 
     "detail":{ 
      "parent_category_id":"2000", 
      "category_id":"2020", 
      "state_id":"2", 
      "region_id":"31", 
      "ad_id":"354269527", 
      "list_id":"295567499", 
      "city_id":"9190", 
      "zipcode":"32146045" 
     }, 
     "adDetail":{ 
      "adID":"354269527", 
      "listID":"295567499", 
      "sellerName":"GlauberMarlon", 
      "adDate":"2017-01-2318:35:26" 
     } 
    }, 
    "session":{ 
     "user":{ 
      "userID":"", 
      "loginType":"" 
     } 
    }, 
    "pageType":"Ad_detail", 
    "abtestingEnable":"1", 
    "listingCategory":"2020", 
    "adId":"354269527", 
    "state":"2", 
    "region":"31", 
    "category":"2020", 
    "pictures":"14", 
    "listId":"295567499", 
    "loggedUser":"0", 
    "referrer":"" 
} 

削除したいのは、エラーの原因となっている"site"の部分です。

Traceback (most recent call last): 
    File "crawler_olx_0.1.py", line 182, in <module> 
    run(link_base) 
    File "crawler_olx_0.1.py", line 52, in run 
    vehicleInformation = getVehicleInformation(page_ad) 
    File "crawler_olx_0.1.py", line 81, in getVehicleInformation 
    dataLayer = json.loads(s) 
    File "/usr/lib/python2.7/json/__init__.py", line 339, in loads 
    return _default_decoder.decode(s) 
    File "/usr/lib/python2.7/json/decoder.py", line 364, in decode 
    obj, end = self.raw_decode(s, idx=_w(s, 0).end()) 
    File "/usr/lib/python2.7/json/decoder.py", line 382, in raw_decode 
    raise ValueError("No JSON object could be decoded") 
ValueError: No JSON object could be decoded 
+2

どのようなエラーが表示されますか? – depperm

+1

いいえ... html/xml/json/...を正規表現で編集しないでください。正規表現は文字列置換のためのものです。これらは文脈自由言語であり、正規表現は正規言語のために作られています。 –

+2

「どこにエラーがありますか」は非常に良い質問です。あなたのコンピュータはどこですか? –

答えて

0

スクレイピングの際には、JavaScriptを実際に実行する方が良いでしょう(Seleniumなど)。以下は愚直なアプローチかもしれません(Python 2.7向けです。私の環境にはPython 3 + PyV8がインストールされていません)。

# Evaluate the page's JavaScript `dataLayer` literal in a V8 context and let
# JavaScript itself serialise it with JSON.stringify, instead of trying to
# repair the text with regular expressions. Written for Python 2.7 + PyV8.
import json  # needed for json.loads / json.dumps below

import PyV8

jsCtx = PyV8.JSContext()
jsCtx.enter()

# The embedded function reads navigator.userAgent, which only exists in a
# browser, so define a stand-in `navigator` object before evaluating the data.
jsCtx.eval('var navigator = { "userAgent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3" }')
JsFilledJSON = """dataLayer = [ 
    { 
     "site": { 
      "isMobile": false, 
      "source": (function() { 
       var userAgent = navigator.userAgent.toLocaleLowerCase(); 
       var source = "web"; 
       resultMatch = userAgent.match(/\[olx-source\/(\w+);/); 
       if (resultMatch) { 
        [, source] = resultMatch; 
       } 
       return source; 
      })() 
     }, 
     "page": { 
      "pageType": "ad_detail", 
      "detail": { 
       "parent_category_id": "2000", 
       "category_id": "2020", 
       "state_id": "2", 
       "region_id": "31", 

       "ad_id": "354269527", 
       "list_id": "295567499", 
       "city_id": "9190", 
       "zipcode":"32146045", 

      } 
     } 
    } 
] 
"""

# JSON.stringify runs inside V8, so the "source" function has already been
# called and replaced by its return value in the string that comes back.
x = jsCtx.eval("JSON.stringify(%s)" % JsFilledJSON.decode('utf-8'))
# Single-argument print(...) behaves the same on Python 2.7 and parses on 3.
print(json.dumps(json.loads(x), indent=4, sort_keys=True))

戻り値:

[ 
    { 
     "page": { 
      "detail": { 
       "ad_id": "354269527", 
       "category_id": "2020", 
       "city_id": "9190", 
       "list_id": "295567499", 
       "parent_category_id": "2000", 
       "region_id": "31", 
       "state_id": "2", 
       "zipcode": "32146045" 
      }, 
      "pageType": "ad_detail" 
     }, 
     "site": { 
      "isMobile": false, 
      "source": "web" 
     } 
    } 
] 

ただし、PyV8やその他のPython用JavaScriptラッパーを使えば、JS形式のJSONを評価できます。バイナリはGitHubにあります。

関連する問題