次クモを行った後、フィード出力が得られない:Scrapyスパイダーは、固定<code>start_urls</code>作品で「start_urls」変数
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from funda.items import FundaItem
class PropertyLinksSimpleSpider(CrawlSpider):
name = "property_links_simple"
allowed_domains = ["funda.nl"]
# def __init__(self, place='amsterdam', page='1'):
# self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
# self.le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % self.start_urls[0])
start_urls = ["http://www.funda.nl/koop/amsterdam/"]
le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % start_urls[0])
# rules = (Rule(le1, callback='parse_item'),)
def parse(self, response):
links = self.le1.extract_links(response)
for link in links:
if link.url.count('/') == 6 and link.url.endswith('/'):
item = FundaItem()
item['url'] = link.url
yield item
私はコマンドscrapy crawl property_links_simple -o property_links.json
によってフィード出力でそれを実行すると、結果のファイルにリンクが含まれている予想通り:
[
{"url": "http://www.funda.nl/koop/amsterdam/huis-49708477-paul-schuitemahof-27/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49826458-buiksloterdijk-270/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49818887-markiespad-19/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49801910-claus-van-amsbergstraat-86/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49801593-jf-berghoefplantsoen-2/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49800159-breezandpad-8/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49805292-nieuwendammerdijk-21/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49890140-talbotstraat-9/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49879212-henri-berssenbruggehof-15/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49728947-emmy-andriessestraat-374/"},
{"url": "http://www.funda.nl/koop/amsterdam/huis-49713458-jan-vrijmanstraat-29/"}
]
しかし、私はそのようなhttp://www.funda.nl/koop/rotterdam/p2/として、クモに異なるstart_urls
を渡すことができるようにしたいと思います。
[
[
はなぜもはやクモではありません:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from funda.items import FundaItem
class PropertyLinksSimpleSpider(CrawlSpider):
name = "property_links_simple"
allowed_domains = ["funda.nl"]
def __init__(self, place='amsterdam', page='1'):
self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
self.le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % self.start_urls[0])
# start_urls = ["http://www.funda.nl/koop/amsterdam/"]
# le1 = LinkExtractor(allow=r'%s+huis-\d{8}' % start_urls[0])
# rules = (Rule(le1, callback='parse_item'),)
def parse(self, response):
links = self.le1.extract_links(response)
for link in links:
if link.url.count('/') == 6 and link.url.endswith('/'):
item = FundaItem()
item['url'] = link.url
yield item
をしかし、私は、このコマンドscrapy crawl property_links_simple -a place=amsterdam -a page=1 -o property_links2.json
を使用して実行する場合、私は空の.jsonファイルを取得するには、次のようにこの目的を達成するために、私はそれに適応しようとしましたどんな出力も得られますか?