2017-08-08 6 views
0

私はすでに同様の質問をしていますが、それは新しいスパイダーですが、私は同じ問題があります(Crawling data successfully but cannot scraped or write it into csv)...私は他のスパイダーをここに置いておきます。私は通常、出力ファイルを取得する必要があるすべての情報...私を助けることができる人は誰ですか?私は金曜日にこのクモを終了しなければならない...だから、私は急いでいる!クロール中に空の出力ファイル

奇妙なことは、私のFnac.csvが作成されていても常に空であるということです。だから私は、私が必要とするすべての情報を持っているページの例で自分のスパイダーを直接実行しようとしました...だから、私は理解していない...多分問題は私のRulesまたは何かから来る?

マイクモ:

# -*- coding: utf-8 -*- 
# Every import is done for a specific use 
import scrapy          # Once you downloaded scrapy, you have to import it in your code to use it. 
import re           # To use the .re() function, which extracts just a part of the text you crawl. It's using regex (regular expressions) 
import numbers          # To use mathematics things, in this case : numbers. 
from fnac.items import FnacItem      # To return the items you want. Each item has a space allocated in the momery, created in the items.py file, which is in the second cdiscount_test directory. 
from urllib.request import urlopen     # To use urlopen, which allow the spider to find the links in a page that is in the actual page. 
from scrapy.spiders import CrawlSpider, Rule  # To use rules and LinkExtractor, which allowed the spider to follow every url on the page you crawl. 
from scrapy.linkextractors import LinkExtractor  # Look above. 
from bs4 import BeautifulSoup      # To crawl an iframe, which is a page in a page in web prgrammation. 

# Your spider 
class Fnac(CrawlSpider): 
    name = 'FnacCom'        # Name of your spider. You call it in the anaconda prompt. 
    allowed_domains = ['fnac.com']    # Web domains allowed by you, your spider cannot enter on a page which is not in that domain. 
    start_urls = ['https://www.fnac.com/Index-Vendeurs-MarketPlace/A/']  # The first link you crawl. 

    # To allow your spider to follow the urls that are on the actual page. 
    rules = (
     Rule(LinkExtractor(), callback='parse_start_url'), 
    ) 

    # Your function that crawl the actual page you're on. 
    def parse_start_url(self, response): 
     item = FnacItem() # The spider now knowws that the items you want have to be stored in the item variable. 

     # First data you want which are on the actual page. 
     nb_sales = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').re(r'([\d]*) ventes') 
     country = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').re(r'([A-Z].*)') 

     # To store the data in their right places. 
     item['nb_sales'] = ''.join(nb_sales).strip() 
     item['country'] = ''.join(country).strip() 

     # Find a specific link on the actual page and launch this function on it. It's the place where you will find your two first data. 
     test_list = response.xpath('//a/@href') 
     for test_list in response.xpath('.//div[@class="ProductPriceBox-item detail"]'): 
      temporary = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract() 
      for i in range(len(temporary)): 
       scrapy.Request(temporary[i], callback=self.parse_start_url, meta={'dont_redirect': True, 'item': item}) 

     # To find the iframe on a page, launch the next function. 
     yield scrapy.Request(response.url, callback=self.parse_iframe, meta={'dont_redirect': True, 'item': item}) 

    # Your function that crawl the iframe on a page 
    def parse_iframe(self, response): 
     f_item1 = response.meta['item'] # Just to use the same item location you used above. 

     # Find all the iframe on a page. 
     soup = BeautifulSoup(urlopen(response.url), "lxml") 
     iframexx = soup.find_all('iframe') 

     # If there's at least one iframe, launch the next function on it 
     if (len(iframexx) != 0): 
      for iframe in iframexx: 
       yield scrapy.Request(iframe.attrs['src'], callback=self.extract_or_loop, meta={'dont_redirect': True, 'item': f_item1}) 

     # If there's no iframe, launch the next function on the link of the page where you looked after the potential iframe. 
     else: 
      yield scrapy.Request(response.url, callback=self.extract_or_loop, meta={'dont_redirect': True, 'item': f_item1}) 

    # Function to find the other data. 
    def extract_or_loop(self, response): 
     f_item2 = response.meta['item'] # Just to use the same item location you used above. 

     # The rest of the data you want. 
     address = response.xpath('//body//div/p/text()').re(r'.*Adresse \: (.*)\n?.*') 
     email = response.xpath('//body//div/ul/li[contains(text(),"@")]/text()').extract() 
     name = response.xpath('//body//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*\s*)') 
     phone = response.xpath('//body//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*') 
     siret = response.xpath('//body//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*') 
     vat = response.xpath('//body//div/text()').re(r'.*TVA \: (.*)') 

     # If the name of the seller exist, then return the data. 
     if (len(name) != 0): 
      f_item2['name'] = ''.join(name).strip() 
      f_item2['address'] = ''.join(address).strip() 
      f_item2['phone'] = ''.join(phone).strip() 
      f_item2['email'] = ''.join(email).strip() 
      f_item2['vat'] = ''.join(vat).strip() 
      f_item2['siret'] = ''.join(siret).strip() 
      yield f_item2 

     # If not, there was no data on the page and you have to find all the links on your page and launch the first function on them. 
     else: 
      for sel in response.xpath('//html/body'): 
       list_urls = sel.xpath('//a/@href').extract() 
       list_iframe = response.xpath('//div[@class="ProductPriceBox-item detail"]/div/a/@href').extract() 
       if (len(list_iframe) != 0): 
        for list_iframe in list_urls: 
         yield scrapy.Request(list_iframe, callback=self.parse_start_url, meta={'dont_redirect': True}) 
       for url in list_urls: 
        yield scrapy.Request(response.urljoin(url), callback=self.parse_start_url, meta={'dont_redirect': True}) 

マイセッティング:

BOT_NAME = 'fnac' 

SPIDER_MODULES = ['fnac.spiders'] 
NEWSPIDER_MODULE = 'fnac.spiders' 
DOWNLOAD_DELAY = 2 
COOKIES_ENABLED = False 
ITEM_PIPELINES = { 
    'fnac.pipelines.FnacPipeline': 300, 
} 

私のパイプライン:

# -*- coding: utf-8 -*- 
from scrapy import signals 
from scrapy.exporters import CsvItemExporter 

# Define your item pipelines here 
# 
# Don't forget to add your pipeline to the ITEM_PIPELINES setting 
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 

# Define your output file. 
class FnacPipeline(CsvItemExporter): 
    def __init__(self): 
     self.files = {} 

    @classmethod 
    def from_crawler(cls, crawler): 
     pipeline = cls() 
     crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 
     crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 
     return pipeline 

    def spider_opened(self, spider): 
     f = open('..\\..\\..\\..\\Fnac.csv', 'w').close() 
     file = open('..\\..\\..\\..\\Fnac.csv', 'wb') 
     self.files[spider] = file 
     self.exporter = CsvItemExporter(file) 
     self.exporter.start_exporting() 

    def spider_closed(self, spider): 
     self.exporter.finish_exporting() 
     file = self.files.pop(spider) 
     file.close() 

    def process_item(self, item, spider): 
     self.exporter.export_item(item) 
     return item 

マイアイテム:

# -*- coding: utf-8 -*- 
import scrapy 

# Define here the models for your scraped items 

# See documentation in: 
# http://doc.scrapy.org/en/latest/topics/items.html 

class FnacItem(scrapy.Item): 
    # define the fields for your items : 
    # name = scrapy.Field() 
    name = scrapy.Field() 
    nb_sales = scrapy.Field() 
    country = scrapy.Field() 
    address = scrapy.Field() 
    siret = scrapy.Field() 
    vat = scrapy.Field() 
    phone = scrapy.Field() 
    email = scrapy.Field() 
私はクモを実行するために、私のプロンプトで書く

コマンドは次のとおりです。

scrapy crawl FnacCom

出力の例です:

2017-08-08 10:21:54 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Panasonic/TV-par-marque/nsh474980/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:21:56 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Philips/TV-par-marque/nsh474981/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:21:58 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Sony/TV-par-marque/nsh475001/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:01 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-LG/TV-par-marque/nsh474979/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Samsung/TV-par-marque/nsh474984/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-marque/shi474972/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-prix/shi474946/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:11 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-taille-d-ecran/shi474945/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/TV-par-Technologie/shi474944/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Smart-TV-TV-connectee/TV-par-Technologie/nsh474953/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:18 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-QLED/TV-par-Technologie/nsh474948/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:21 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-4K-UHD/TV-par-Technologie/nsh474947/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Toutes-les-TV/TV-Television/nsh474940/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:26 [scrapy.extensions.logstats] INFO: Crawled 459 pages (at 24 pages/min), scraped 0 items (at 0 items/min) 
2017-08-08 10:22:26 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-Television/shi474914/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/partner/canalplus#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Meilleures-ventes-TV/TV-Television/nsh474942/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Toutes-nos-Offres/Offres-de-remboursement/shi159784/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Offres-Adherents/Toutes-nos-Offres/nsh81745/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/labofnac#bl=MMtvh#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:44 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Lecteur-et-Enregistreur-DVD-Blu-Ray/Lecteur-DVD-Blu-Ray/shi475063/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:46 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/TV-OLED/TV-par-Technologie/nsh474949/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:49 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Lecteur-DVD-Portable/Lecteur-et-Enregistreur-DVD-Blu-Ray/nsh475064/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Home-Cinema/Home-Cinema-par-marque/shi475116/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:52 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Univers-TV/Univers-Ecran-plat/cl179/w-4#bl=MMtvh> (referer: https://www.fnac.com) 
2017-08-08 10:22:55 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.fnac.com/Casque-TV-HiFi/Casque-par-usage/nsh450507/w-4#bl=MMtvh> (referer: https://www.fnac.com) 

はあなたの助けのためにありがとうございました!!!

+0

あなたのクモは多くの問題があります。私はここでCrawlSpiderを悪用しているので、CrawlSpiderを削除し、標準のSpiderクラスを使用することをお勧めします。 – Granitosaurus

+0

ありがとう、ありがとうございます!私はそれを行うことから始めます...他の問題はどうしてください? –

+0

'LinkExtractor()'を使いたい場合、 'scrapy.Spider'の代わりに' CrawlSpider'を書く必要がありますか? –

答えて

1

私はクモがcrawlspiderを使用して、共通のscrapyイディオムを使用せずに、明示的に書くことができますどのように披露する小さなコードリファクタリングを書いた:

class Fnac(Spider): 
    name = 'fnac.com' 
    allowed_domains = ['fnac.com'] 
    start_urls = ['https://www.fnac.com/Index-Vendeurs-MarketPlace/0/'] # The first link you crawl. 

    def parse(self, response): 
     # parse sellers 
     sellers = response.xpath("//h1[contains(selftext(),'MarketPlace')]/following-sibling::ul/li/a/@href").extract() 
     for url in sellers: 
      yield Request(url, callback=self.parse_seller) 

     # parse other pages A-Z 
     pages = response.css('.pagerletter a::attr(href)').extract() 
     for url in pages: 
      yield Request(url, callback=self.parse) 

    def parse_seller(self, response): 
     nb_sales = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').re(r'([\d]*) ventes') 
     country = response.xpath('//body//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').re(r'([A-Z].*)') 
     item = FnacItem() 
     # To store the data in their right places. 
     item['nb_sales'] = ''.join(nb_sales).strip() 
     item['country'] = ''.join(country).strip() 
     # go to details page now 
     details_url = response.xpath("//iframe/@src[contains(.,'retour')]").extract_first() 
     yield Request(details_url, self.parse_seller_details, 
         meta={'item': item}) # carry over our item to next response 

    def parse_seller_details(self, response): 
     item = response.meta['item'] # get item that's got filled in `parse_seller` 
     address = response.xpath('//body//div/p/text()').re(r'.*Adresse \: (.*)\n?.*') 
     email = response.xpath('//body//div/ul/li[contains(text(),"@")]/text()').extract() 
     # parse here 
     yield item 
+0

私はちょうど5分で私のスパイダーをした天才に会った...ありがとう! 'response.css'と' response.xpath'の違いを教えてください。両方を使用しているため –

+0

リダイレクトされたリンクをクロールする方法は分かりますか?つまり、正しいページにリダイレクトされると、正しいページをクロールすることができるのでしょうか? –

+0

@P.Postrique '.css'はCSSセレクタを使用し、' .xpath'はxpathセレクタを使用しています:D両方ともhtmlページを解析するのに有効です。一般に、CSSセレクタはあまり強力ではありませんが、書くのは簡単ではありませんが、xpathセレクタは複雑な面ではもう少しですが、あらゆる種類のクレイジーなトリックを可能にする、非常に強力です。リダイレクトについて:リダイレクトミドルウェアを介して自動的にリダイレクトが処理されるため、心配する必要はありません。 – Granitosaurus

関連する問題