0
Pythonでクローラを書いています。私の関数はdataLayerを含むスクリプトを取得し、文字列として整形してからJSONに変換します。Pythonで、このJSON文字列の一部を削除するにはどうすればよいでしょうか?
これはdataLayerです:
dataLayer = [{
"site": {
"isMobile": false,
"source": (function() {
var userAgent = navigator.userAgent.toLocaleLowerCase();
var source = "web";
resultMatch = userAgent.match(/\[olx-source\/(\w+);/);
if (resultMatch) {
[, source] = resultMatch;
}
return source;
})()
},
"page": {
"pageType": "ad_detail",
"detail": {
"parent_category_id": "2000",
"category_id": "2020",
"state_id": "2",
"region_id": "31",
"ad_id": "354269527",
"list_id": "295567499",
"city_id": "9190",
"zipcode":"32146045",
},
"adDetail": {
"adID": "354269527",
"listID": "295567499",
"sellerName": "Glauber Marlon",
"adDate": "2017-01-23 18:35:26",
},
},
"session": {
"user": {
"userID": "",
"loginType": ""
}
},
"pageType": "Ad_detail",
"abtestingEnable" : "1",
// Listing information
"listingCategory": "2020",
// Ad information
"adId": "354269527",
"state": "2",
"region": "31",
"category": "2020",
"pictures": "14",
"listId": "295567499",
//Account Information
"loggedUser":"0",
"referrer": "",
//User Information
}];
これが、文字列を整形してJSONに変換する私の関数です:
s = page_ad.findAll('script')[25].text.replace('\'', '"')
// if print s this line and put in JsonLint show error in function.
s = re.search(r'\{.+\}', s, re.DOTALL).group() # get json data
s = re.sub(r'//.+\n', '', s) # replace comment
s = re.sub(r'\s+', '', s) # strip whitspace
s = re.sub(r',}', '}', s) # get rid of last , in the dict
dataLayer = json.loads(s)
これは変換する前のJSON文字列です。ここから「site」というインデックス(キー)を削除したいと考えています:
{
"site":{
"isMobile":false,
"source":(function() {
varuserAgent=navigator.userAgent.toLocaleLowerCase();varsource="web";resultMatch=userAgent.match(/\ [
olx-source\/(\w+);/);if(resultMatch) {
[
,
source
] =resultMatch;
} returnsource;
} )()
},
"page":{
"pageType":"ad_detail",
"detail":{
"parent_category_id":"2000",
"category_id":"2020",
"state_id":"2",
"region_id":"31",
"ad_id":"354269527",
"list_id":"295567499",
"city_id":"9190",
"zipcode":"32146045"
},
"adDetail":{
"adID":"354269527",
"listID":"295567499",
"sellerName":"GlauberMarlon",
"adDate":"2017-01-2318:35:26"
}
},
"session":{
"user":{
"userID":"",
"loginType":""
}
},
"pageType":"Ad_detail",
"abtestingEnable":"1",
"listingCategory":"2020",
"adId":"354269527",
"state":"2",
"region":"31",
"category":"2020",
"pictures":"14",
"listId":"295567499",
"loggedUser":"0",
"referrer":""
}
"site"エラーがどこにあるか。
Traceback (most recent call last):
File "crawler_olx_0.1.py", line 182, in <module>
run(link_base)
File "crawler_olx_0.1.py", line 52, in run
vehicleInformation = getVehicleInformation(page_ad)
File "crawler_olx_0.1.py", line 81, in getVehicleInformation
dataLayer = json.loads(s)
File "/usr/lib/python2.7/json/__init__.py", line 339, in loads
return _default_decoder.decode(s)
File "/usr/lib/python2.7/json/decoder.py", line 364, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib/python2.7/json/decoder.py", line 382, in raw_decode
raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded
どのようなエラーが表示されますか? – depperm
いいえ... html/xml/json/...を正規表現による文字列置換で編集しないでください。これらは文脈自由言語です。正規表現は正規言語のために作られたものです。 –
「どこにエラーがありますか」は非常に良い質問です。あなたのコンピュータはどこですか? –