
Currently, my crawlspider only crawls roughly 20,000 of the more than 6.5M products available. Each category appears to be scraped, but only the first 5 pages of each category. I believe it is something with my LinkExtractor, but I am not sure. In short: the Scrapy CrawlSpider is only crawling the first 5 pages of each category.

CrawlSpider:

import scrapy 
from scrapy.spiders import CrawlSpider, Rule 
from scrapy.linkextractors import LinkExtractor 

from scrapy.selector import Selector 

class DigikeyItem(scrapy.Item): 
    # define the fields for your item here like: 
    # name = scrapy.Field() 
    partnumber = scrapy.Field() 
    manufacturer = scrapy.Field() 
    description = scrapy.Field() 
    quanity= scrapy.Field() 
    minimumquanity = scrapy.Field() 
    price = scrapy.Field() 

class DigikeySpider(CrawlSpider): 
    name = 'digikey' 
    allowed_domains = ['digikey.com'] 
    start_urls = ['https://www.digikey.com/products/en'] 

    rules = (
        Rule(LinkExtractor(allow=('products',)), callback='parse_item'),
    )

    def parse_item(self, response):
        for row in response.css('table#productTable tbody tr'):
            item = DigikeyItem()
            item['partnumber'] = row.css('.tr-mfgPartNumber [itemprop="name"]::text').extract_first()
            item['manufacturer'] = row.css('[itemprop="manufacture"] [itemprop="name"]::text').extract_first()
            item['description'] = row.css('.tr-description::text').extract_first()
            item['quanity'] = row.css('.tr-qtyAvailable::text').extract_first()
            item['price'] = row.css('.tr-unitPrice::text').extract_first()
            item['minimumquanity'] = row.css('.tr-minQty::text').extract_first()
            yield item
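
To check whether the LinkExtractor is actually the problem, a quick diagnostic (only a rough sketch run by hand, not part of the spider) is to open one category listing page in scrapy shell and see which URLs the same extractor returns, in particular whether the pagination links show up at all:

from scrapy.linkextractors import LinkExtractor

# Run inside "scrapy shell <category page URL>"; `response` is provided by the shell.
le = LinkExtractor(allow=('products',))
for link in le.extract_links(response)[:20]:
    print(link.url)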

Settings:

BOT_NAME = 'digikey' 

SPIDER_MODULES = ['digikey.spiders'] 
NEWSPIDER_MODULE = 'digikey.spiders' 



ROBOTSTXT_OBEY = False 

DOWNLOADER_MIDDLEWARES = { 
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 
    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400, 
} 

Output:

2017-11-01 10:53:11 [scrapy.core.engine] INFO: Closing spider (finished) 
2017-11-01 10:53:11 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 
{'downloader/exception_count': 6, 
'downloader/exception_type_count/twisted.web._newclient.ResponseNeverReceived': 6, 
'downloader/request_bytes': 1198612, 
'downloader/request_count': 988, 
'downloader/request_method_count/GET': 988, 
'downloader/response_bytes': 23932614, 
'downloader/response_count': 982, 
'downloader/response_status_count/200': 982, 
'dupefilter/filtered': 46, 
'finish_reason': 'finished', 
'finish_time': datetime.datetime(2017, 11, 1, 17, 53, 11, 421641), 
'item_scraped_count': 21783, 
'log_count/DEBUG': 22773, 
'log_count/ERROR': 2, 
'log_count/INFO': 10, 
'request_depth_max': 1, 
'response_received_count': 982, 
'retry/count': 4, 
'retry/max_reached': 2, 
'retry/reason_count/twisted.web._newclient.ResponseNeverReceived': 4, 
'scheduler/dequeued': 988, 
'scheduler/dequeued/memory': 988, 
'scheduler/enqueued': 988, 
'scheduler/enqueued/memory': 988, 
'start_time': datetime.datetime(2017, 11, 1, 17, 49, 38, 427669)} 
2017-11-01 10:53:11 [scrapy.core.engine] INFO: Spider closed (finished) 
PS C:\Users\dalla_000\digikey> 

Answer


With this particular site, it might make sense to do a two-stage crawl:

  1. get a list of all the category URLs
  2. iterate over all of those URLs

One approach could be to use two spiders and a message queue. The first spider might look like this:

import scrapy 
from bs4 import BeautifulSoup 
import re 
import math 
import urllib 
from kafka import KafkaClient, SimpleProducer 

ITEMS_PER_PAGE = 500 

class CreateXxxxxxxUrlListSpider(scrapy.Spider):

    kafka = KafkaClient('10.0.1.12:9092')
    producer = SimpleProducer(kafka)

    name = "create_xxxxxxx_url_list"
    allowed_domains = ["xxxxxxx.com"]
    start_urls = [
        "http://www.xxxxxxx.com/product-search/en?stock=1"
    ]

    def parse(self, response):

        soup = BeautifulSoup(response.body)

        catfilterlinks = soup.find_all('a', {'class':'catfilterlink'})

        for catfilterlink in catfilterlinks:
            location = catfilterlink['href'].split("?")[0]
            items = re.match(".*\(([0-9]+) items\).*", catfilterlink.next_sibling).group(1)

            for page in range(int(math.ceil(float(items)/ITEMS_PER_PAGE))):
                if page == 0:
                    url = "http://www.xxxxxxx.com" + location + "?" + urllib.urlencode({"stock":1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)
                else:
                    url = "http://www.xxxxxxx.com" + location + "/page/" + str(page + 1) + "?" + urllib.urlencode({"stock":1})
                    self.producer.send_messages("xxxxxxx_search_page_urls", url)

This first spider gets a list of all the pages to crawl and writes them to a message queue (e.g., Kafka).
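
To confirm that the URLs actually land on the topic, a minimal sketch (assuming a reasonably recent kafka-python, which provides KafkaConsumer) is to read a few messages back from the same broker:

from kafka import KafkaConsumer

# Read back the URLs the first spider produced (same broker and topic as above).
consumer = KafkaConsumer('xxxxxxx_search_page_urls',
                         bootstrap_servers='10.0.1.12:9092',
                         auto_offset_reset='earliest',
                         consumer_timeout_ms=5000)  # give up after 5s of silence
for message in consumer:
    print(message.value)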

The second spider consumes URLs from the Kafka topic and crawls them. It might look like this:

from scrapy_kafka.spiders import ListeningKafkaSpider
from ..items import PageHtml
from calendar import timegm
import time

class CrawlXxxxxxxUrlsSpider(ListeningKafkaSpider):
    name = 'crawl_xxxxxxx_urls_spider'
    allowed_domains = ["xxxxxxx.com"]
    topic = "xxxxxxx_search_page_urls"

    def parse(self, response):
        item = PageHtml()
        item['url'] = response.url
        item['html'] = response.body_as_unicode()
        item['ts'] = timegm(time.gmtime())
        return item
        # .... or whatever
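
The PageHtml item imported by the second spider is not shown here; a minimal sketch for items.py, with the field names inferred from parse() above, might be:

import scrapy

class PageHtml(scrapy.Item):
    # Only the fields used in parse() above; inferred, not from the original answer.
    url = scrapy.Field()
    html = scrapy.Field()
    ts = scrapy.Field()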