0
私の開始URLはhttp://www.geographic.org/streetview/usa/index.htmlです。geographic.org/streetviewをクロールするときにScrapyを使用してエラーを繰り返すことは何もない
私は、次のコードを使用しています:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urlparse import urljoin
class StreetViewSpider(CrawlSpider):
name = "streetview"
allowed_domains = ["geographic.org"]
start_urls = ["http://www.geographic.org/streetview/usa/index.html"]
rules = (
Rule(LinkExtractor(restrict_xpaths="*.html"), follow=True),
Rule(LinkExtractor(allow=('*.html',)), callback='parse_item')
)
def parse_item(self, response):
self.logger.info('Hi, this is an item page! %s', response.url)
item = scrapy.Item()
sub_urls = response.xpath("descendant-or-self::li/descendant-or-self::*/a/@href").extract()
item['urls'] = map(lambda x: urljoin(response.url, x), sub_urls)
return item
私は彼らのURLで*.html
を持っており、ドメインwww.geographic.org
内にあるすべてのa
タグのリンクやテキストを必要としています。クローラは、view.php
がhref
に抽出されたときに実行されます。
私はscrapy crawl streetview
を使用してクローラを実行して、私が取得:
> scrapy crawl streetview
Traceback (most recent call last):
File "e:\miniconda2\lib\runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "e:\miniconda2\lib\runpy.py", line 72, in _run_code
exec code in run_globals
File "E:\Miniconda2\Scripts\scrapy.exe\__main__.py", line 9, in <module>
File "e:\miniconda2\lib\site-packages\scrapy\cmdline.py", line 148, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 243, in __init__
super(CrawlerProcess, self).__init__(settings)
File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 134, in __init__
self.spider_loader = _get_spider_loader(settings)
File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 330, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 61, in from_settings
return cls(settings)
File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 25, in __init__
self._load_all_spiders()
File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 47, in _load_all_spiders
for module in walk_modules(name):
File "e:\miniconda2\lib\site-packages\scrapy\utils\misc.py", line 71, in walk_modules
submod = import_module(fullpath)
File "e:\miniconda2\lib\importlib\__init__.py", line 37, in import_module
__import__(name)
File "F:\PyCharmProjects\streetview\streetview\spiders\collector.py", line 7, in <module>
class StreetViewSpider(CrawlSpider):
File "F:\PyCharmProjects\streetview\streetview\spiders\collector.py", line 13, in StreetViewSpider
Rule(LinkExtractor(allow=('*.html',)), callback='parse_item')
File "e:\miniconda2\lib\site-packages\scrapy\linkextractors\lxmlhtml.py", line 116, in __init__
canonicalize=canonicalize, deny_extensions=deny_extensions)
File "e:\miniconda2\lib\site-packages\scrapy\linkextractors\__init__.py", line 57, in __init__
for x in arg_to_iter(allow)]
File "e:\miniconda2\lib\re.py", line 194, in compile
return _compile(pattern, flags)
File "e:\miniconda2\lib\re.py", line 251, in _compile
raise error, v # invalid expression
sre_constants.error: nothing to repeat