2017-03-27 21 views
0

ウェブサイトをクロールしようとしています。私は別の日にそれをしたい。だから私はリストに日付を格納しています。しかし、リストの項目にアクセスしようとすると、クローラはリストの最初の値に対してのみ機能します。助けてください。以下は私のコードです:すべての治療中のループ

class SpidyQuotesViewStateSpider(scrapy.Spider): 
    name = 'retail_price' 

    def start_requests(self): 
     print "start request" 
     urls = "http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx" 

     yield scrapy.Request(url=urls, callback=self.parse) 

    def parse(self, response): 
     dated = ["05/03/2017","04/03/2017"] 
     urls = "http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx" 
     #frmdata = 
     cookies1 ={} 
     val = response.headers.getlist('Set-Cookie') 
     print "login session values" ,response.headers.getlist('Set-Cookie') 
     if(len(val) != 0): 
      cookies1['ASP.NET_SessionId'] = str(response.headers.getlist('Set-Cookie')[0].split(";")[0].split("=")[1]) 
      cookies1['path'] = str(response.headers.getlist('Set-Cookie')[0].split(";")[1].split("=")[1]) 
     print cookies1; 
     for i in range(len(dated)): 
      yield scrapy.FormRequest(url=urls, callback=self.parse1, formdata={'ctl00$MainContent$btn_getdata1':"Get Data", 
      'ctl00$MainContent$Txt_FrmDate':dated[i], 
      'ctl00$MainContent$Ddl_Rpt_Option0':"Daily Prices", 
      'ctl00$MainContent$Rbl_Rpt_type':"Price report", 
      'ctl00$MainContent$ddl_Language':"English", 
      'ctl00$MainContent$Ddl_Rpt_type':"Retail", 
      '__EVENTVALIDATION':"IwZyKgfTXVzxiHxiPXGk/W8XQZBDb0EOPxJh6s8hofq0ffqOpiHSH77CafcxySF3PbkYgSMNFCJhLM2cGnL6SxT0PJuGDCJtV0V8Y4a94UErUCiSANiin+4uKckk9v9Ux8JqTVeaipppmlH+wyks2U9SgPfkNUsqw4eHCkDyB5akNNZImRIixOHHVY3JSXGkwXn7ueK9w+AgnqJzpXaWdMr9J1++M4VAFImSNF8brFSfPHe5kb/qzkGIwUr/KRouaRYK8WLWZh/Mbl9xwREwhDSxWJSOdihSE0WWoaqSMtpaR99rDDCsD3mdJqfu0aPIlREupTZRzlrmztXU0eS3949YW+ywdTRvykaMNgOW2Q4saYP5j/niKbRW6GiDnaLV2A38X/HW80+trrsjwJr9tjTKVFyikf6s/3gzyiTp11ivSkwIY2b3hutjYn7OfTDo", 
      #'__EVENTVALIDATION':"HqVo2xHk04clYwnBposXbZGhbIr181A7RbyeZv74Cia7rXSKmpOpbeSnn3XXnoDJKRxMK0W9nxKZFfkNje+P/K7gE5HVjHJr9Gr0Gs46TntzKDsvzyii8jZ7e0fdZgQCJKoXxQNgR2vNkWqChKcEldBuMHCOgJRqCNCF/JPFKpdKZoIWr7GU8rhzwLijf/Gkm+FuTULs/fl2HHK6Z1QQEozzEHFsDwzl0G4IiN//eNYfHuUBXKZ3wdZzPqG0s53WHEuSBzhqBC9AtCJOs4ZZhdtwFh8iyTJ4PlsLP9DLHYHRCOAd72UO0UH8gT7gAkKVo1I4L540DilowOR9SttH7MM/oOs9qhKlnG61FgqkYGW8zGzF/yNEXO+beVAK1RVvuO+FDnuq/g36TRnUieei5GpAZ+96CSoCIxykdvHx8R+smTNF/5erlowV4ci+tcI7", 
      '__VIEWSTATEENCRYPTED':"", 
      '__VIEWSTATEGENERATOR':"85862B00", 
      '__VIEWSTATE':"+a+3jrBEKxDdkPOzx2wXwKaTMWvCB60WPaHRfJUAZQrdFIpxSqFr5VseTclpGzeHXdxaFnxJe/PkxDKYa7sj3Wiv/os1bNeX0IEB3s45eFsHYWGiU8cvsXCGa5z7rrGRDL5hotg7k/MuUWj8w27xXZO423MN5OsHS+wh+tC/5/Xix+w3zxuQhi8jR5DnreimHbhGZn1sYaKYIGCc8mDIDRNl+w1OZ058F+3LAx96QUu5BYiMYOmrlyxrb9b2yPTmmIrI4NtC4ClBQlxuST5wMDP3vUqqWMhn4auk8ev5gHyPestCRrsAXWs07wDNnikemMwo/4wPiTEbnZQV6SLcDUw0gZpXjXwLI7mhsVjEyVNaQnJp6+Wi6FLsAEEMlFYmQut3JecpVIUkjF9uYSN2GLIbXHPs37AiEXPeQ8E/GyBMx3z1X5l8sw/xSNmFgYQC3riajn8V0+SdkuV2PbNbYKtc+uoSCNLppLYCqiOv5eWanGvAQro2Q67FBA4w2xY+V/K8mzHaGMLoDBxJxLslWyJpL5cX0C6qoXVUu8B028auAQM4eVzH1YPF5qrJiCDo", 
      #'__VIEWSTATE':"W+m8kNAS6QHiRPo+zFj00EDs/Dbq+y/XvtCmSNwOIkGKlikAlphT8HBAWQDskSm1vdNterBuo0Hy7m4xPbXMOnyEm6IlseXO3jPw+ofnI2WHAKknLil+GeS0IfMWGeoD5aNyiz3zh1jZkKU7R7hQsxwARoHRyjhf8UCooFbkVvL6ddHVYZbH5LcocmCF1BTOCqYN5y5yzfDfYbp3KNW9kH53pdmwCsjiEirdxxUGDoG1Ke3JBEXfSl+4XubirHSR8z+VlFmPPXZGU8mMogwq9Eg822RYjvbwvZG74djcf7kdfB9KXCPO9u6cWIjLiW+cfXHSXD+1XYFVf9ATU2/NV4YbUzsI4PJRwoGD4BryUNIm2JFeT4c8F4REYTA16shxz5mDTFQ6rbmg6SmqP8G9gAc2Hr9ABD8+2BUNabGhNZ8wDIZArfYS4pl5DNrlPlpqeCjhmvv0znKAJSOac3pCUej8G90ZGwQKOPORWbNVzQShoH7QvrXV8pCklcia6psuAGO+Oj72oDWPxedE4DjdjX5TbLoW4bzsk/YNfUv4JpjGR8DWpG8IFYJG9CCjMEYb", 
      '__LASTFOCUS':"", 
      '__EVENTARGUMENT':"", 
      '__EVENTTARGET':"", 
      'ctl00_MainContent_ToolkitScriptManager1_HiddenField':";;AjaxControlToolkit,+Version=4.1.51116.0,+Culture=neutral,+PublicKeyToken=28f01b0e84b6d53e:en-US:fd384f95-1b49-47cf-9b47-2fa2a921a36a:475a4ef5:addc6819:5546a2b:d2e10b12:effe2a26:37e2e5c9:5a682656:c7029a2:e9e598a9"},method='POST',cookies = cookies1) 

    def parse1(self, response): 
     path1 = "id('Panel1')" 
     value1 = response.xpath(path1).extract_first() 
     print value1 
+0

スパイダーログを共有してください – eLRuLL

答えて

0

まず、あなたは別のフォームパラメータを持つものの、同じ敷地内にクモより多くの時間を送っています。したがって、要求にdont_filter=Trueを使用する必要があります。そうでない場合は、Scrapyブロックが重複呼び出しをブロックします。

あなたが掻いているサイトでは、同じセッション中に複数のリクエストを行うことはできません。たとえば、ブラウザでhttp://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspxにアクセスし、フォームをコンパイルしてデータを取得し、最初のページに戻ってみてください。不可能です。だからあなたはあなたのクモを修正する必要があります。アイデアを与えるだけの非常に大まかなコードがあります。それは私のために働くが、生産には使用しないでください!

class SpidyQuotesViewStateSpider(scrapy.Spider): 
    name = 'retail_price' 
    urls = "http://fcainfoweb.nic.in/PMSver2/Reports/Report_Menu_web.aspx" 

    def start_requests(self): 
     dated = ["01/03/2017","05/03/2017","04/03/2017"] 
     for i in dated: 
      request = scrapy.Request(url=self.urls, dont_filter=True, callback=self.parse) 
      request.meta['question'] = i 
      yield request 

    def parse(self, response): 
     thedate = response.meta['question'] 
     cookies1 ={} 
     val = response.headers.getlist('Set-Cookie') 
     print("login session values" ,response.headers.getlist('Set-Cookie')) 
     if(len(val) != 0): 
      cookies1['ASP.NET_SessionId'] = str(str(response.headers.getlist('Set-Cookie')[0]).split(";")[0].split("=")[1]) 
      cookies1['path'] = str(str(response.headers.getlist('Set-Cookie')[0]).split(";")[1].split("=")[1]) 

     yield scrapy.FormRequest(url=self.urls, dont_filter=True, callback=self.parse1, formdata={'ctl00$MainContent$btn_getdata1':"Get Data", 
     'ctl00$MainContent$Txt_FrmDate': thedate, 
     'ctl00$MainContent$Ddl_Rpt_Option0':"Daily Prices", 
     'ctl00$MainContent$Rbl_Rpt_type':"Price report", 
     'ctl00$MainContent$ddl_Language':"English", 
     'ctl00$MainContent$Ddl_Rpt_type':"Retail", 
     '__EVENTVALIDATION':"IwZyKgfTXVzxiHxiPXGk/W8XQZBDb0EOPxJh6s8hofq0ffqOpiHSH77CafcxySF3PbkYgSMNFCJhLM2cGnL6SxT0PJuGDCJtV0V8Y4a94UErUCiSANiin+4uKckk9v9Ux8JqTVeaipppmlH+wyks2U9SgPfkNUsqw4eHCkDyB5akNNZImRIixOHHVY3JSXGkwXn7ueK9w+AgnqJzpXaWdMr9J1++M4VAFImSNF8brFSfPHe5kb/qzkGIwUr/KRouaRYK8WLWZh/Mbl9xwREwhDSxWJSOdihSE0WWoaqSMtpaR99rDDCsD3mdJqfu0aPIlREupTZRzlrmztXU0eS3949YW+ywdTRvykaMNgOW2Q4saYP5j/niKbRW6GiDnaLV2A38X/HW80+trrsjwJr9tjTKVFyikf6s/3gzyiTp11ivSkwIY2b3hutjYn7OfTDo", 
     #'__EVENTVALIDATION':"HqVo2xHk04clYwnBposXbZGhbIr181A7RbyeZv74Cia7rXSKmpOpbeSnn3XXnoDJKRxMK0W9nxKZFfkNje+P/K7gE5HVjHJr9Gr0Gs46TntzKDsvzyii8jZ7e0fdZgQCJKoXxQNgR2vNkWqChKcEldBuMHCOgJRqCNCF/JPFKpdKZoIWr7GU8rhzwLijf/Gkm+FuTULs/fl2HHK6Z1QQEozzEHFsDwzl0G4IiN//eNYfHuUBXKZ3wdZzPqG0s53WHEuSBzhqBC9AtCJOs4ZZhdtwFh8iyTJ4PlsLP9DLHYHRCOAd72UO0UH8gT7gAkKVo1I4L540DilowOR9SttH7MM/oOs9qhKlnG61FgqkYGW8zGzF/yNEXO+beVAK1RVvuO+FDnuq/g36TRnUieei5GpAZ+96CSoCIxykdvHx8R+smTNF/5erlowV4ci+tcI7", 
     '__VIEWSTATEENCRYPTED':"", 
     '__VIEWSTATEGENERATOR':"85862B00", 
     '__VIEWSTATE':"+a+3jrBEKxDdkPOzx2wXwKaTMWvCB60WPaHRfJUAZQrdFIpxSqFr5VseTclpGzeHXdxaFnxJe/PkxDKYa7sj3Wiv/os1bNeX0IEB3s45eFsHYWGiU8cvsXCGa5z7rrGRDL5hotg7k/MuUWj8w27xXZO423MN5OsHS+wh+tC/5/Xix+w3zxuQhi8jR5DnreimHbhGZn1sYaKYIGCc8mDIDRNl+w1OZ058F+3LAx96QUu5BYiMYOmrlyxrb9b2yPTmmIrI4NtC4ClBQlxuST5wMDP3vUqqWMhn4auk8ev5gHyPestCRrsAXWs07wDNnikemMwo/4wPiTEbnZQV6SLcDUw0gZpXjXwLI7mhsVjEyVNaQnJp6+Wi6FLsAEEMlFYmQut3JecpVIUkjF9uYSN2GLIbXHPs37AiEXPeQ8E/GyBMx3z1X5l8sw/xSNmFgYQC3riajn8V0+SdkuV2PbNbYKtc+uoSCNLppLYCqiOv5eWanGvAQro2Q67FBA4w2xY+V/K8mzHaGMLoDBxJxLslWyJpL5cX0C6qoXVUu8B028auAQM4eVzH1YPF5qrJiCDo", 
     #'__VIEWSTATE':"W+m8kNAS6QHiRPo+zFj00EDs/Dbq+y/XvtCmSNwOIkGKlikAlphT8HBAWQDskSm1vdNterBuo0Hy7m4xPbXMOnyEm6IlseXO3jPw+ofnI2WHAKknLil+GeS0IfMWGeoD5aNyiz3zh1jZkKU7R7hQsxwARoHRyjhf8UCooFbkVvL6ddHVYZbH5LcocmCF1BTOCqYN5y5yzfDfYbp3KNW9kH53pdmwCsjiEirdxxUGDoG1Ke3JBEXfSl+4XubirHSR8z+VlFmPPXZGU8mMogwq9Eg822RYjvbwvZG74djcf7kdfB9KXCPO9u6cWIjLiW+cfXHSXD+1XYFVf9ATU2/NV4YbUzsI4PJRwoGD4BryUNIm2JFeT4c8F4REYTA16shxz5mDTFQ6rbmg6SmqP8G9gAc2Hr9ABD8+2BUNabGhNZ8wDIZArfYS4pl5DNrlPlpqeCjhmvv0znKAJSOac3pCUej8G90ZGwQKOPORWbNVzQShoH7QvrXV8pCklcia6psuAGO+Oj72oDWPxedE4DjdjX5TbLoW4bzsk/YNfUv4JpjGR8DWpG8IFYJG9CCjMEYb", 
     '__LASTFOCUS':"", 
     '__EVENTARGUMENT':"", 
     '__EVENTTARGET':"", 
     'ctl00_MainContent_ToolkitScriptManager1_HiddenField':";;AjaxControlToolkit,+Version=4.1.51116.0,+Culture=neutral,+PublicKeyToken=28f01b0e84b6d53e:en-US:fd384f95-1b49-47cf-9b47-2fa2a921a36a:475a4ef5:addc6819:5546a2b:d2e10b12:effe2a26:37e2e5c9:5a682656:c7029a2:e9e598a9"},method='POST',cookies = cookies1) 

    def parse1(self, response): 
     path1 = "id('Panel1')" 
     value1 = response.xpath(path1).extract_first()[:574] 
     print(value1)