2016-06-24 21 views
3

Ajaxを使用しているサイトからデータを取得しようとしており、ヘッダーとボディを指定してXHRリクエストをシミュレートしていますが、リクエストが拒否されてしまいます(Forbidden)。このXHRリクエストはScrapyでは失敗しますが、Pythonのrequestsでは動作します。以下が私のScrapyのコードです:

from scrapy import Spider 
from scrapy import Request, FormRequest 
import json 

class jsonSpider(Spider): 
    """Spider that POSTs a fixed JSON payload to each URL in ``start_urls``
    and prints the raw response body."""
    name = 'json' 

    start_urls = [ 
     'http://m.ctrip.com/restapi/soa2/10932/hotel/Product/domestichotelget'] 

    def start_requests(self): 
     """Yield one POST ``Request`` per entry in ``start_urls``.

     Every request carries the same hand-written header set and the same
     pre-serialized JSON string as its body; responses go to ``parse``.
     """
     headers = { 
      "Host": "m.ctrip.com", 
      "User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16", 
      "Accept": "application/json", 
      "Accept-Language": "en-US,en;q=0.5", 
      "Accept-Encoding": "gzip, deflate", 
      "Content-Type": "application/json", 
      "cookieOrigin": "http://wap.ctrip.com", 
      "Cache-Control": "no-cache", 
      "Referer": "http://wap.ctrip.com/webapp/hotel/hoteldetail/426638.html?days=1&atime=20160623&contrl=2&num=1&biz=1", 
      # NOTE(review): hard-coded length. The answers on this page report that
      # the body is actually 477 bytes and that Scrapy sends this value in
      # addition to its own computed Content-Length, which reportedly makes
      # the server reply with Bad Request -- confirm, and consider removing.
      "Content-Length": "455", 
      "Origin": "http://wap.ctrip.com", 
      "Connection": "keep-alive"} 
     # Request body: an already-serialized JSON document, sent verbatim.
     data = '{"biz":1,"contrl":3,"facility":0,"faclist":[],"key":"","keytp":0,"pay":0,"querys":[],"couponlist":[],"setInfo":{"cityId":2,"dstId":0,"inDay":"2016-06-24","outDay":"2016-06-25"},"sort":{"dir":1,"idx":70,"ordby":0,"size":100},"qbitmap":0,"alliance":{"ishybrid":0},"head":{"ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":null,"extension":[{"name":"pageid","value":"212093"},{"name":"webp","value":0},{"name":"protocal","value":"http"}]},"contentType":"json"}' 
     for url in self.start_urls: 
      yield Request(
       url, 
       self.parse, 
       method='POST', 
       headers=headers, 
       body=data 
      ) 

    def parse(self, response): 
     """Print the raw body of ``response``; nothing is returned or yielded."""
     page = response.body 
     print(page) 

しかし、Pythonのrequestsで同じXHRをシミュレートすると正常に動作し、JSONレスポンスが得られました。以下がrequestsを使った私のコードです:

import requests 

# Stand-alone reproduction with the `requests` library: POST the same JSON
# payload with the same hand-written headers and print the raw reply bytes.
url = 'http://m.ctrip.com/restapi/soa2/10932/hotel/Product/domestichotelget'

headers = {
    "Host": "m.ctrip.com",
    "User-Agent": (
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) "
        "AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16"
    ),
    "Accept": "application/json",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Content-Type": "application/json",
    "cookieOrigin": "http://wap.ctrip.com",
    "Cache-Control": "no-cache",
    "Referer": "http://wap.ctrip.com/webapp/hotel/hoteldetail/426638.html?days=1&atime=20160623&contrl=2&num=1&biz=1",
    # NOTE(review): hard-coded value; a comment on this page reports that
    # requests recomputes and overrides it -- confirm before relying on it.
    "Content-Length": "455",
    "Origin": "http://wap.ctrip.com",
    "Connection": "keep-alive",
}

# Pre-serialized JSON request body. Adjacent literals are concatenated by the
# parser, so the resulting string is one unbroken JSON document.
body = (
    '{"biz":1,"contrl":3,"facility":0,"faclist":[],"key":"","keytp":0,"pay":0,'
    '"querys":[],"couponlist":[],'
    '"setInfo":{"cityId":2,"dstId":0,"inDay":"2016-06-24","outDay":"2016-06-25"},'
    '"sort":{"dir":1,"idx":70,"ordby":0,"size":100},'
    '"qbitmap":0,"alliance":{"ishybrid":0},'
    '"head":{"ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":null,'
    '"extension":[{"name":"pageid","value":"212093"},{"name":"webp","value":0},{"name":"protocal","value":"http"}]},'
    '"contentType":"json"}'
)

reply = requests.post(url, data=body, headers=headers)
response = reply.content
print(response)

私のScrapyコードに何か問題があるのでしょうか?

答えて

2

以下がコードです:

from scrapy import Spider 
from scrapy import Request, FormRequest 
import json 


class jsonSpider(Spider):
    """Spider that POSTs a JSON-encoded query to each URL in ``start_urls``
    and prints the raw response body.

    No ``Content-Length`` header is set by hand; only a minimal set of
    headers is supplied, so the length is left to the HTTP layer.
    """

    name = 'json_spider'

    start_urls = [
        'http://m.ctrip.com/restapi/soa2/10932/hotel/Product/domestichotelget',
    ]

    def start_requests(self):
        """Yield one POST ``Request`` per entry in ``start_urls``.

        The payload dict is serialized once with ``json.dumps`` and reused
        as the body of every request; responses are handed to ``parse``.
        """
        headers = {
            "Accept": "application/json",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        data = {
            "biz": 1,
            "contrl": 3,
            "facility": 0,
            "faclist": [],
            "key": "",
            "keytp": 0,
            "pay": 0,
            "querys": [],
            "couponlist": [],
            "setInfo": {
                "cityId": 2,
                "dstId": 0,
                "inDay": "2016-06-24",
                "outDay": "2016-06-25",
            },
            "sort": {"dir": 1, "idx": 70, "ordby": 0, "size": 100},
            "qbitmap": 0,
            "alliance": {"ishybrid": 0},
            "head": {
                "ctok": "",
                "cver": "1.0",
                "lang": "01",
                "sid": "8888",
                "syscode": "09",
                "auth": None,
                "extension": [
                    {"name": "pageid", "value": "212093"},
                    {"name": "webp", "value": 0},
                    {"name": "protocal", "value": "http"},
                ],
            },
            "contentType": "json",
        }
        # Serialize once: the payload is identical for every URL.
        body = json.dumps(data)
        for url in self.start_urls:
            # The yield must be nested inside the loop body; at the loop's
            # own indentation level the method does not even parse.
            yield Request(
                url,
                self.parse,
                method='POST',
                headers=headers,
                body=body,
            )

    def parse(self, response):
        """Print the raw body of ``response``; nothing is returned or yielded."""
        page = response.body
        print(page)
+0

同じヘッダーがPythonのrequestsでは動作し、Scrapyでは動作しない理由を説明してもらえますか? – MetalloyD

+0

@Metalloy 私は通常、ヘッダーには accept-encoding、content-type、accept などしか記述しません。Content-Length が実際の長さと食い違うと、Bad Request を引き起こす可能性があります。 –

2

ヘッダーから "Content-Length": "455", を削除して、Scrapy自身に長さを計算させてください。そうすれば動作し、200応答が返ります。dataの長さは477バイトなので、サーバーは受信データの最初の455バイトだけを受け取りますが、それは不完全なためJSONとして解析できません。その結果、Bad Requestを返します。

+0

Pythonのrequestsでは動作し、Scrapyでは動作しない理由を説明してもらえますか? – MetalloyD

+1

requestsはどうやらサイズを再計算して既存の 'Content-Length' ヘッダーを上書きしますが、Scrapyは別の 'Content-Length' を追加するだけです。以下はWiresharkでキャプチャした内容です: 'Content-Length: 477\r\nOrigin: http://wap.ctrip.com\r\nContent-Length: 455\r\n' – sardok

関連する問題