2016-05-02 119 views
0

私はすべてを試しましたが、parse_categoryの次のページを呼び出す方法がわかりません。私の解析で次のページを呼び出す方法 - Scrapy

Categoryページに直接アクセスし、LinkExtractorも試しましたが、うまくいきませんでした。

import scrapy.selector 
import urlparse 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.http import Request 
from msh_final.items import CrawlerMshFinalItem 


def complete_url(string):
    """Prefix a site-relative path with the MediaMarkt.be base URL."""
    base = "http://www.mediamarkt.be"
    return base + string


def get_base_url(url):
    """Return the "scheme://netloc" portion of *url*, or "" for an empty URL."""
    if url == "":
        return ""
    # Keep only scheme and network location; path/query are dropped.
    parsed = urlparse.urlparse(url)
    return "%s://%s" % (parsed.scheme, parsed.netloc)


def encode(text):
    """Encode *text* as UTF-8, ignoring characters that cannot be encoded.

    Renamed the parameter from ``str`` to ``text`` — the original name
    shadowed the ``str`` builtin.
    """
    return text.encode('utf8', 'ignore')


class msh_finalSpider(CrawlSpider):
    """Scrapes the MediaMarkt.be phone/navigation category listings."""

    name = 'msh_final'
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        # Follow each sub-category link found on the landing page.
        for href in response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href'):
            yield Request(complete_url(href.extract()), callback=self.parse_category)

    def parse_category(self, response):
        # One item per product entry on the category page.
        for product in response.xpath("//ul[@class='products-list']/li/div"):
            msh = CrawlerMshFinalItem()
            msh['item_price'] = encode(product.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(product.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh

答えて

0

あなたのスパイダーはCrawlSpiderの代わりにSpiderを継承し、次のコードを使用する必要があります。

class msh_finalSpider(Spider):
    """Scrapes MediaMarkt.be category listings, following pagination.

    Fix: the original did ``extract()[0]`` on the pagination xpath, which
    raises IndexError on the last page of a category (no "next" link).
    The pagination request is now yielded only when a link exists.
    """

    name = 'msh_final'
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        # Follow each sub-category link found on the landing page.
        items = response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href')
        for item in items:
            yield Request(complete_url(item.extract()), callback=self.parse_category)

    def parse_category(self, response):
        # Yield one item per product entry, then follow pagination if present.
        items = response.xpath("//ul[@class='products-list']/li/div")
        for item in items:
            msh = CrawlerMshFinalItem()
            msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh

        # Guard: on the last page there is no "next" link, so indexing [0]
        # unconditionally would raise IndexError and abort the callback.
        next_links = response.xpath('//li[@class="pagination-next"]/a/@href').extract()
        if next_links:
            yield Request(
                complete_url(next_links[0]),
                callback=self.parse_category
            )
関連する問題