2016-07-22 9 views
1

私は治療に慣れていないので、問題がおそらくどこにあるのか分かりません。私は解決策を見つけることを望む。前もって感謝します。私はutnutu 14.04使用しています私の治療が何かをこすらないのはなぜですか?

、パイソン3.4

マイスパイダー:

import scrapy 
from scrapy.linkextractors import LinkExtractor 
from name.items import Actress 

class ActressSpider(scrapy.Spider): 
name = "name_list" 
allowed_domains = ["dmm.co.jp"] 
start_urls = ["http://actress.dmm.co.jp/-/list/=/keyword=%s/" % c for c in ['a', 'i', 'u', 'e', 'o', 'ka', 'ki', 'ku', 'ke', 'ko', 'sa', 'si', 'su', 'se', 'so', 'ta', 'ti', 'tu', 'te', 'to', 'na', 'ni', 'nu', 'ne', 'no', 'ha', 'hi', 'hu', 'he', 'ho', 'ma', 'mi', 'mu', 'me', 'mo', 'ya', 'yu', 'yo', 'ra', 'ri', 'ru', 're', 'ro', 'wa']] 

def parse(self, response): 
    for sel in response.xpath('//*[@id="mu"]/table[2]/tr/td[2]/a/@href'): 
     url = response.urljoin(sel.extract()) 
     yield scrapy.Request(url, callback = self.parse_actress_detail) 

    next_page = response.xpath('//*[@id="mu"]/table[1]/tr[2]/td[2]/a/@href') 
    for urlnext in next_page: 
     if urlnext: 
      pagination = response.urljoin(urlnext.extract()) 
     yield scrapy.Request(pagination, callback = self.parse) 


def parse_actress_detail(self, response): 
    for sel in response.xpath('//*[@id="mu"]/table[1]'): 
     item = Actress() 
     url = resposne.url 
     name = sel.xpath('tr[3]/td/table/tr/td[1]/img/@alt').extract() 
     item['name'] = name[0].encode('utf-8') 
     item['name_en'] = sel.xpath('tr[3]/td/table/tr/td[1]/img/@src').extract() 
     birth = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[1]/td[2]/text()').extract() 
     item['birth'] = birth[0].encode('utf-8') 
     starsign = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[2]/td[2]/text()').extract() 
     item['starsign'] = starsign[0].encode('utf-8') 
     bloodtype = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[3]/td[2]/text()').extract() 
     item['bloodtype'] = bloodtype[0].encode('utf-8') 
     boobs = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[4]/td[2]/text()').extract() 
     item['boobs'] = boobs[0].encode('utf-8') 
     home = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[5]/td[2]/text()').extract() 
     item['home'] = home[0].encode('utf-8') 
     hobby = sel.xpath('tr[3]/td/table/tr/td[2]/table/tr[6]/td[2]/text()').extract() 
     item['hobby'] = hobby[0].encode('utf-8') 
     item['image_urls'] = sel.xpath('tr[3]/td/table/tr/td[1]/img/@src').extract() 
     request = scrapy.Request(url, callback=self.parse_actress_detail2, meta={'item':item}) 
     yield request 

# another link section of parse's request url 
def parse_actress_detail2(self, response): 
    for sel in response.xpath('//*[@id="mu"]/table[4]/tr/td[1]/a/@href'): 
     url = response.urljoin(sel.extract()) 
     request = scrapy.Request(url, callback = self.parse_movie_detail, meta={'item':item}) 
     yield request 

    next_page = response.xpath('//*[@id="mu"]/table[5]/tr/td/a/@href') 
    for urlnext in next_page: 
     if urlnext: 
      pagination = response.urljoin(urlnext.extract()) 
     yield scrapy.Request(pagination, callback = self.parse_actress_detail2) 


def parse_movie_detail(self, response): 
    for sel in response.xpath('//*[@id="content"]/tr[1]/td[1]'): 
     item = response.meta['item'] 
     release_date = sel.xpath('table/tr[1]/td[2]/text()').extract() 
     item['release_date'] = release_date[0].encode('utf-8') 
     running_time = sel.xpath('table/tr[2]/td[2]/text()').extract() 
     item['running_time'] = running_time[0].encode('utf-8') 
     cast = sel.xpath('table/tr[3]/td[2]/a/text()').extract() 
     castjoin = [n.encode('utf-8') for n in cast] 
     item['cast'] = b', '.join(castjoin) 
     series = sel.xpath('table/tr[4]/td[2]/text()').extract() 
     item['series'] = series[0].encode('utf-8') 
     manufacturer = sel.xpath('table/tr[5]/td[2]/text()').extract() 
     item['manufacturer'] = manufacturer[0].encode('utf-8') 
     label = sel.xpath('table/tr[6]/td[2]/text()').extract() 
     item['label'] = label[0].encode('utf-8') 
     number = sel.xpath('//*[@id="cid_block"]/text()').extract() 
     item['number'] = number[0].encode('utf-8') 
     yield item 

ログ:

'downloader/request_bytes': 4350197, 
'downloader/request_count': 10107, 
'downloader/request_method_count/GET': 10107, 
'downloader/response_bytes': 169329414, 
'downloader/response_count': 10107, 
'downloader/response_status_count/200': 9905, 
'downloader/response_status_count/301': 202, 
'dupefilter/filtered': 3212, 
'finish_reason': 'finished', 
'finish_time': datetime.datetime(2016, 7, 22, 5, 41, 0, 920779), 
'log_count/DEBUG': 203, 
'log_count/INFO': 13, 
'request_depth_max': 5, 
'response_received_count': 9905, 
'scheduler/dequeued': 10107, 
'scheduler/dequeued/memory': 10107, 
'scheduler/enqueued': 10107, 
'scheduler/enqueued/memory': 10107, 
'spider_exceptions/NameError': 9659, 
'start_time': datetime.datetime(2016, 7, 22, 5, 28, 25, 342801) 

すべてのヘルプは大歓迎です。

答えて

1

あなたの統計では、'spider_exceptions/NameError': 9659,は疑わしいと思われます。

私はこの問題があなたのparse_actress_detail2コールバックにあると考えています。最初のループでは、itemが定義されていません。

def parse_actress_detail2(self, response): 
    for sel in response.xpath('//*[@id="mu"]/table[4]/tr/td[1]/a/@href'): 
     url = response.urljoin(sel.extract()) 

     request = scrapy.Request(url, 
           callback = self.parse_movie_detail, 
           meta={'item':item}) 
     #         ^
     #          | 
     #          here    
     yield request 

あなたはおそらく、あなたのようなmeta={'item': response.meta['item']}parse_movie_detailでやる意味しました。

関連する問題