Twitterから大きなデータセットを収集しました。このファイル(twitter.json)には、次のような行が含まれています。ElasticsearchでネストされたTwitterデータ(JSON)をマッピング/インデックスする方法について質問です。
[ {"created_at":"Sun Apr 16 00:00:10 +0000 2017","id":853397569807958016,"id_str":"853397569807958016","text":"\u3042\u3048\u3066\u8a00\u3046\u3051\u3069\u3001\u6642\u9593\u3060\u3088\uff01\u4f55\u304b\u3059\u308b\u3053\u3068\u3001\u3042\u3063\u305f\u3093\u3058\u3083\u306a\u3044\uff1f(\u30a8\u30b3\u30ed)","source":"\u003ca href=\"http:\/\/makebot.sh\" rel=\"nofollow\"\u003e\u3077\u3088\u3077\u3088\u30c9\u30e9\u30deCDbot\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2230278991,"id_str":"2230278991","name":"\u3077\u3088\u3077\u3088\u304a\u307e\u3051\u30dc\u30a4\u30b9bot","screen_name":"puyo_cd_bot","location":null,"url":"http:\/\/twpf.jp\/puyo_cd_bot","description":"\u53ea\u4eca\u591a\u5fd9\u306e\u305f\u3081\u66f4\u65b0\u304c\u505c\u6ede\u3057\u3066\u3044\u307e\u3059\u3001\u3054\u4e86\u627f\u304f\u3060\u3055\u3044\u3002\u3077\u3088\u3077\u3088\u30c9\u30e9\u30decd\u306e\u304a\u307e\u3051\u30dc\u30a4\u30b9\u306e\u5b9a\u671f\u3064\u3076\u3084\u304d\u3001\u4e00\u90e8\u30ea\u30d7\u30e9\u30a4\u3067\u306e\u53cd\u5fdc\u3092\u8003\u3048\u3066\u3044\u307e\u3059\u3002\u975e\u516c\u5f0f\u3002\u767b\u9332\u6e08\u307f\u30ad\u30e3\u30e9\u306a\u3069\u8a73\u3057\u304f\u306f\u3064\u3044\u3077\u308d\u306b\u3066","protected":false,"verified":false,"followers_count":181,"friends_count":115,"listed_count":3,"favourites_count":0,"statuses_count":44139,"created_at":"Wed Dec 04 17:43:08 +0000 
2013","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000837231449\/beca2ed4c8ce917b37dcbe188d0f9e31_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000837231449\/beca2ed4c8ce917b37dcbe188d0f9e31_normal.jpeg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"ja","timestamp_ms":"1492300810659"}
, { ... }
...
, { ... }
]
To give you a better visual, the 1st tweet line looks like this after validation:
{"created_at": "Sun Apr 16 00:00:10 +0000 2017",
"id": 853397569807958016,
"id_str": "853397569807958016",
"text": "\u3042\u3048\u3066\u8a00\u3046\u3051\u3069\u3001\u6642\u9593\u3060\u3088\uff01\u4f55\u304b\u3059\u308b\u3053\u3068\u3001\u3042\u3063\u305f\u3093\u3058\u3083\u306a\u3044\uff1f(\u30a8\u30b3\u30ed)",
"source": "\u003ca href=\"http:\/\/makebot.sh\" rel=\"nofollow\"\u003e\u3077\u3088\u3077\u3088\u30c9\u30e9\u30deCDbot\u003c\/a\u003e",
"truncated": false,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 2230278991,
"id_str": "2230278991",
"name": "\u3077\u3088\u3077\u3088\u304a\u307e\u3051\u30dc\u30a4\u30b9bot",
"screen_name": "puyo_cd_bot",
"location": null,
"url": "http:\/\/twpf.jp\/puyo_cd_bot",
"description": "\u53ea\u4eca\u591a\u5fd9\u306e\u305f\u3081\u66f4\u65b0\u304c\u505c\u6ede\u3057\u3066\u3044\u307e\u3059\u3001\u3054\u4e86\u627f\u304f\u3060\u3055\u3044\u3002\u3077\u3088\u3077\u3088\u30c9\u30e9\u30decd\u306e\u304a\u307e\u3051\u30dc\u30a4\u30b9\u306e\u5b9a\u671f\u3064\u3076\u3084\u304d\u3001\u4e00\u90e8\u30ea\u30d7\u30e9\u30a4\u3067\u306e\u53cd\u5fdc\u3092\u8003\u3048\u3066\u3044\u307e\u3059\u3002\u975e\u516c\u5f0f\u3002\u767b\u9332\u6e08\u307f\u30ad\u30e3\u30e9\u306a\u3069\u8a73\u3057\u304f\u306f\u3064\u3044\u3077\u308d\u306b\u3066",
"protected": false,
"verified": false,
"followers_count": 181,
"friends_count": 115,
"listed_count": 3,
"favourites_count": 0,
"statuses_count": 44139,
"created_at": "Wed Dec 04 17:43:08 +0000 2013",
"utc_offset": null,
"time_zone": null,
"geo_enabled": false,
"lang": "ja",
"contributors_enabled": false,
"is_translator": false,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png",
"profile_background_image_url_https": "https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png",
"profile_background_tile": false,
"profile_link_color": "1DA1F2",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"profile_image_url": "http:\/\/pbs.twimg.com\/profile_images\/378800000837231449\/beca2ed4c8ce917b37dcbe188d0f9e31_normal.jpeg",
"profile_image_url_https": "https:\/\/pbs.twimg.com\/profile_images\/378800000837231449\/beca2ed4c8ce917b37dcbe188d0f9e31_normal.jpeg",
"default_profile": true,
"default_profile_image": false,
"following": null,
"follow_request_sent": null,
"notifications": null
},
"geo": null,
"coordinates": null,
"place": null,
"contributors": null,
"is_quote_status": false,
"retweet_count": 0,
"favorite_count": 0,
"entities": {
"hashtags": [],
"urls": [],
"user_mentions": [],
"symbols": []
},
"favorited": false,
"retweeted": false,
"filter_level": "low",
"lang": "ja",
"timestamp_ms": "1492300810659"
}
問題:
curl -XPOST 'http://localhost:9200/twitter/tweet/1' --data-binary "@/Users/jz/Documents/elasticsearch-5.3.0/twitter.json"
上記のコマンドラインを使用して、この .json ファイルを Elasticsearch にインポートしようとしました。
しかし、次のエラーが返されます:
**{"error":"Content-Type header [application/x-www-form-urlencoded] is not supported","status":406}**
次のコマンドも試しましたが、やはり失敗しました:
curl --header "Content-Type:application/json" -XPOST 'http://localhost:9200/twitter/tweet/1' --data-binary "@/Users/jz/Documents/elasticsearch-5.3.0/twitter.json"
エラーメッセージは次のとおりです。
{"error":{"root_cause":[{"type":"mapper_parsing_exception","reason":"failed to parse"}],"type":"mapper_parsing_exception","reason":"failed to parse","caused_by":{"type":"not_x_content_exception","reason":"Compressor detection can only be called on some xcontent bytes or compressed xcontent bytes"}},"status":400}: {"error":{"root_cause":[{"type":"mapper_parsing_exception","reason":"failed to parse"}],"type":"mapper_parsing_exception","reason":"failed to parse","caused_by":{"type":"not_x_content_exception","reason":"Compressor detection can only be called on some xcontent bytes or compressed xcontent bytes"}},"status":400}
--data-binary の代わりに -d を使うことも試しましたが、やはり失敗しました:
curl -XPOST 'http://localhost:9200/twitter/tweet/1' -d "@/Users/jz/Documents/elasticsearch-5.3.0/twitter.json"
エラーメッセージは--data-binaryを使用したものと同じです:
更新:
curlを使用すると問題が発生するため、Pythonのelasticsearchライブラリを使用することにしました。localhostへの接続に成功した後、次のようなコードでサンプルデータをインデックスしようとしました:
# Connect to the Elasticsearch node on localhost:9200.
es = Elasticsearch([{"host": "localhost", "port": 9200}])

# Parse the whole sample file in one call; the loop below iterates over the
# parsed result, so the file is expected to hold a collection of documents.
with open('sample0.json') as json_data:
    json_docs = json.load(json_data)

# Index the documents one by one. "_id" is removed from each document and
# passed as the document id (None when the document has no "_id" key).
for json_doc in json_docs:
    my_id = json_doc.pop('_id', None)
    es.index(index='testdata', doc_type='generated', id=my_id, body=json.dumps(json_doc))
エラー:
C:\Anaconda2\lib\json\decoder.pyc in raw_decode(self, s, idx)
    378         """
    379         try:
--> 380             obj, end = self.scan_once(s, idx)
    381         except StopIteration:
    382             raise ValueError("No JSON object could be decoded")
ValueError: Expecting , delimiter: line 1 column 2241 (char 2240)
誰かが私に何かガイダンスをくれますか?ありがとう!
1行目の2240文字目付近のJSONファイルの内容を提示してもらえますか?