Ajaxを使用しているサイトからデータを取得しようとしており、ヘッダーとボディを指定してXHRリクエストをシミュレートしていますが、Forbidden(アクセス禁止)で拒否されます。このXHRリクエストはScrapyでは失敗しますが、Pythonのrequestsでは動作します。
from scrapy import Spider
from scrapy import Request, FormRequest
import json
class jsonSpider(Spider):
    """Spider that POSTs a fixed JSON query to each URL in ``start_urls``.

    The request imitates the XHR call a mobile browser makes: it sends a
    JSON body together with browser-like headers, and the callback prints
    the raw response body.
    """

    name = 'json'
    start_urls = [
        'http://m.ctrip.com/restapi/soa2/10932/hotel/Product/domestichotelget']

    def start_requests(self):
        """Yield one POST request, with JSON body, per entry in ``start_urls``."""
        # "Content-Length" is intentionally NOT set here. A hard-coded value
        # goes stale as soon as the body changes and may conflict with the
        # length the HTTP client derives from the body itself, which can make
        # the server reject the request.
        headers = {
            "Host": "m.ctrip.com",
            "User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16",
            "Accept": "application/json",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "application/json",
            "cookieOrigin": "http://wap.ctrip.com",
            "Cache-Control": "no-cache",
            "Referer": "http://wap.ctrip.com/webapp/hotel/hoteldetail/426638.html?days=1&atime=20160623&contrl=2&num=1&biz=1",
            "Origin": "http://wap.ctrip.com",
            "Connection": "keep-alive",
        }
        # Pre-serialized JSON payload, sent verbatim as the request body.
        data = '{"biz":1,"contrl":3,"facility":0,"faclist":[],"key":"","keytp":0,"pay":0,"querys":[],"couponlist":[],"setInfo":{"cityId":2,"dstId":0,"inDay":"2016-06-24","outDay":"2016-06-25"},"sort":{"dir":1,"idx":70,"ordby":0,"size":100},"qbitmap":0,"alliance":{"ishybrid":0},"head":{"ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":null,"extension":[{"name":"pageid","value":"212093"},{"name":"webp","value":0},{"name":"protocal","value":"http"}]},"contentType":"json"}'
        for url in self.start_urls:
            yield Request(
                url,
                callback=self.parse,
                method='POST',
                headers=headers,
                body=data,
            )

    def parse(self, response):
        """Print the raw body of ``response``."""
        page = response.body
        print(page)
しかし、Pythonのrequestsで同じXHRをシミュレートすると、正常に動作してJSONレスポンスが得られました。以下がrequestsを使用したコードです:
import requests

# Endpoint that receives the JSON query.
url = 'http://m.ctrip.com/restapi/soa2/10932/hotel/Product/domestichotelget'

# Pre-serialized JSON payload, sent verbatim as the POST body.
body = '{"biz":1,"contrl":3,"facility":0,"faclist":[],"key":"","keytp":0,"pay":0,"querys":[],"couponlist":[],"setInfo":{"cityId":2,"dstId":0,"inDay":"2016-06-24","outDay":"2016-06-25"},"sort":{"dir":1,"idx":70,"ordby":0,"size":100},"qbitmap":0,"alliance":{"ishybrid":0},"head":{"ctok":"","cver":"1.0","lang":"01","sid":"8888","syscode":"09","auth":null,"extension":[{"name":"pageid","value":"212093"},{"name":"webp","value":0},{"name":"protocal","value":"http"}]},"contentType":"json"}'

# Browser-like headers copied from the XHR call being imitated.
# NOTE(review): requests appears to recompute Content-Length from the body,
# so the hard-coded "455" below is presumably overridden — confirm.
headers = {
    "Host": "m.ctrip.com",
    "User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16",
    "Accept": "application/json",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Content-Type": "application/json",
    "cookieOrigin": "http://wap.ctrip.com",
    "Cache-Control": "no-cache",
    "Referer": "http://wap.ctrip.com/webapp/hotel/hoteldetail/426638.html?days=1&atime=20160623&contrl=2&num=1&biz=1",
    "Content-Length": "455",
    "Origin": "http://wap.ctrip.com",
    "Connection": "keep-alive",
}

# Send the POST, then print the raw response bytes.
reply = requests.post(url, data=body, headers=headers)
response = reply.content
print(response)
私のScrapyのコードのどこかが間違っているのでしょうか?
同じヘッダーがPythonのrequestsでは動作し、Scrapyでは動作しない理由を説明してもらえますか? – MetalloyD
@Metalloy 私が指定するヘッダーは accept-encoding、content-type、accept などだけです。Content-Length はボディの内容によって変わることがあり、固定値を指定すると不正なリクエスト(Bad Request)を引き起こす可能性があります。 –