2017-11-09 16 views
0

Scrapy のスパイダーを作成しており、次のページへ進むために POST リクエストをループで送信する必要があります。問題は、POST リクエストが 1 回しか送信されないことです。クエリ文字列の要素「currentPage」がページごとに変わるので、ページごとにこのキーの値を変更して POST を送信する必要があります。しかし、先に述べたとおり、最初の POST リクエストの後で停止してしまいます。おそらく dont_filter=True が必要です。（Scrapy でのループ POST リクエスト）

output scrapy

import scrapy 

    # Request headers captured from a browser/Postman session and replayed on
    # the AJAX POST.
    # NOTE(review): 'cookie' and 'postman-token' are hard-coded values from one
    # captured session; they will presumably expire -- confirm whether the
    # endpoint actually requires them before relying on this in a long crawl.
    headers = {
        'accept': "*/*",
        'origin': "http://www.**********.com",
        'x-requested-with': "XMLHttpRequest",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        'referer': "http://www.**********.com/venta/",
        'accept-encoding': "gzip, deflate",
        'accept-language': "en-US,en;q=0.8,es;q=0.6",
        # The cookie value was split across two physical lines inside one
        # string literal (a syntax error).  It is written here as two adjacent
        # literals, which Python concatenates into the same single string.
        'cookie': (
            "G_ENABLED_IDPS=google; cookieInterestedProject=416; visid_incap_434661=wjkf7tU+QPKDjpmWXz/BKSBz+1kAAAAAQUIPAAAAAAA7bs2fXOSL0JmeVSXo337M; incap_ses_223_434661=9zyRHwEdwGxtE8Ly00EYAxQw/VkAAAAAq7gkFJrJjsGdCgrRTwOfvg==; s_vnum=1512243236606%26vn%3D3; __utmz=other; s_cm=Natural%20Searchwww.google.com.co; s_v10=%5B%5B%27Natural%2520Search%27%2C%271509651236616%27%5D%2C%5B%27Natural%2520Search%27%2C%271509651249121%27%5D%2C%5B%27Natural%2520Search%27%2C%271509765142570%27%5D%2C%5B%27Natural%2520Search%27%2C%271509765184463%27%5D%5D; s_v8=%5B%5B%27natural%2520search%253A%2520google%253A%2520keyword%2520unavailable%27%2C%271509651236618%27%5D%2C%5B%27natural%2520search%253A%2520google%253A%2520keyword%2520unavailable%27%2C%271509651249123%27%5D%2C%5B%27natural%2520search%253A%2520google%253A%2520keyword%2520unavailable%27%2C%271509765142572%27%5D%2C%5B%27natural%2520search%253A%2520google%253A%2520keyword%2520unavailable%27%2C%271509765184465%27%5D%5D; ; s_cc=true; _ga=GA1.2.701497075.1509651237; _gid=GA1.2.1068485902.1509765143; NSC_nfuspdvbesbep-wt=ffffffff0975c87745525d5f4f58455e445a4a4229a2; OX_sd=1; OX_plg=pm; gpv_pn=metrocuadrado%3A%20buscar%3A%20resultados%20inmuebles%3A%20nuevo%20y%20usado; s_invisit=true; s_nr=1509765213941-Repeat; s_lv=1509765213944; s_lv_s=Less%20than%207%20days; s_sq=eltiempometrocuadradoprod%2Celtiempoglobal%3D%2526pid%253Dmetrocuadrado%25253A%252520buscar%25253A%252520resultados%252520inmuebles%25253A%252520nuevo%252520y%252520usado%2526pidt%253D1%2526oid%253Dhttp%25253A%25252F%25252Fwww.metrocuadrado.com%25252Fventa%25252F%252523%2526ot%253DA; madicionales=; mbarrio=; mciudad=; mgrupo=; mgrupoid=; mnrobanos=; mnrocuartos=; mnrogarajes=; msector=; mubicacion=; mvalorarriendo=; mzona=; orderBy=; selectedLocationCategory=; selectedLocationFilter=; sortType=; "
            "writtenFilters=mnrogarajes%3Bmnrobanos%3Bmnrocuartos%3Bmtiempoconstruido%3Bmarea%3Bmvalorarriendo%3Bmvalorventa%3Bmciudad%3Bmubicacion%3Bmtiponegocio%3Bmtipoinmueble%3Bmzona%3Bmsector%3Bmbarrio%3BselectedLocationCategory%3BselectedLocationFilter%3Bmestadoinmueble%3Bmadicionales%3BorderBy%3BsortType%3Bmestadoinmueble%3BcompanyType%3BcompanyName%3Bmidempresa%3Bmgrupo%3Bmgrupoid%3B; m2-srv=ffffffff0975c82e45525d5f4f58455e445a4a4229a2; mtiponegocio=venta; mtipoinmueble=; mvalorventa=; marea=; mtiempoconstruido=; companyType=; companyName=; midempresa=; mestadoinmueble="
        ),
        'cache-control': "no-cache",
        'postman-token': "2e5f00b9-7c7c-32ed-1bdd-63cf2fed3cd8",
    }

    # Search-form fields for the listing endpoint.  "currentPage" selects the
    # result page; the spider overrides it for each page it requests.
    querystring = {
        "": "",
        "mnrogarajes": "", "mnrobanos": "", "mnrocuartos": "",
        "mtiempoconstruido": "", "marea": "",
        "mvalorarriendo": "", "mvalorventa": "",
        "mciudad": "", "mubicacion": "",
        "mtiponegocio": "venta", "mtipoinmueble": "",
        "mzona": "", "msector": "", "mbarrio": "",
        "selectedLocationCategory": "", "selectedLocationFilter": "",
        "mestadoinmueble": "", "madicionales": "",
        "orderBy": "", "sortType": "",
        "companyType": "", "companyName": "", "midempresa": "",
        "mgrupo": "", "mgrupoid": "",
        "currentPage": "2",
        "totalPropertiesCount": "115747",
        "totalUsedPropertiesCount": "113926",
        "totalNewPropertiesCount": "1821",
        "sfh": "1",
    }

    # NOTE(review): neither `url` nor `num` is read by the spider class visible
    # in this file; they are kept so any other code that refers to them still
    # works.
    url = 'http://www.*******.com/search/list/ajax'
    num = 0

    class HouseseSpider(scrapy.Spider):
        """Spider that walks paginated listing results and scrapes each listing.

        The start URL is treated as result page 1.  Every listing page yields
        one request per detail link plus a single POST for the following page,
        until ``last_page`` has been requested.
        """

        name = "hoimom"
        start_urls = ['http://www.********.com/venta/']

        # URL the pagination POST is sent to.
        ajax_url = 'http://www.*********.com/search/list/ajax'
        # Last page to request; the original loop was ``range(2, 100)``.
        last_page = 99

        def parse(self, response):
            """Yield detail-page requests for this page, then the next-page POST.

            The page number travels in ``response.meta['page']``; a response
            without it (the start URL) is counted as page 1.
            """
            links = response.xpath(
                './/a[@class="data-details-id" and @itemprop="url"]/@href'
            ).extract()
            for href in links:
                # urljoin leaves absolute URLs untouched and resolves relative
                # ones against the current page.
                yield scrapy.Request(url=response.urljoin(href),
                                     callback=self.parsei)

            next_page = response.meta.get('page', 1) + 1
            if next_page > self.last_page:
                return

            # Copy the shared form fields rather than mutating the module-level
            # dict, then send them as the POST body.  Passing them via ``meta``
            # never transmits them, which left every POST identical and let the
            # duplicate filter discard all but the first.
            formdata = dict(querystring)
            formdata["currentPage"] = str(next_page)
            yield scrapy.FormRequest(
                url=self.ajax_url,
                method="POST",
                headers=headers,
                formdata=formdata,
                meta={'page': next_page},
                callback=self.parse,
            )

        def parsei(self, response):
            """Yield one item with the fields scraped from a listing detail page.

            ``latitude`` and ``longitud`` are lists (``extract()``); every other
            field is a single string or ``None`` (``extract_first()``).
            """
            def detail(label):
                # Text of the <h4> in the <dd> right after the <dt> whose <h3>
                # text equals ``label``.
                return response.xpath(
                    './/dl/dt[h3/text()="{}"]/following-sibling::dd[1]/h4/text()'
                    .format(label)
                ).extract_first()

            yield {
                'latitude': response.xpath('//input[@id="latitude"]/@value').extract(),
                'longitud': response.xpath('//input[@id="longitude"]/@value').extract(),
                'precio de arriendo': detail("Valor de arriendo"),
                'precio de venta': detail("Valor de venta"),
                # The trailing space in this label is part of the original
                # XPath and is kept as-is.
                'Barrio_com': detail("Nombre común del barrio "),
                'Barrio_cat': detail("Nombre del barrio catastral"),
                'Estrato': detail("Estrato"),
                'id': response.xpath('//input[@id="propertyId"]/@value').extract_first(),
                'Habitaciones': detail("Habitaciones"),
                'Parqueadero': detail("Parqueadero"),
                'Tipo de calentador': detail("Tipo de calentador"),
                'Cuarto de servicio': detail("Cuarto de servicio"),
                'Tipo de acabado piso': detail("Tipo de acabado piso"),
                'Area_Cons': detail("Área construida"),
            }

答えて

0

このようにします。

# dont_filter=True tells Scrapy not to drop this request as a duplicate of an
# earlier request to the same URL.
# NOTE(review): `meta` only travels with the request inside Scrapy; it is
# presumably not sent to the server as the POST body -- confirm whether the
# form fields should instead go in a FormRequest's `formdata`.
yield scrapy.Request(url = 'http://www.*********.com/search/list/ajax',method="POST",headers=headers,meta=querystring, dont_filter=True) 
+0

ありがとうございます。ループするようになりましたが、3 回繰り返した後は情報が抽出されなくなります。なぜそうなるのでしょうか？ –

+0

href変数の値は常に異なっていますか?そうでない場合は、リクエストでdont_filter = True引数を使用してみてください。または、hrefのセレクタに何らかのエラーがあります。 – Fidan

+0

理由は分かりませんが、3 ページ目以降は応答の内容が変わってしまいます。クラスを定義した後に start_urls を設定すると、なぜか動作します。dont_filter は自分では思いつきませんでした。 –