0
私は、mongodbを使うための簡単なscrapyモジュールを作成し始めました。私のpythonに新しいですし、私は私が書いたコードの問題となっている:mongodbを使ったシンプルなpython scrapyクローラの作成
congress.py
import scrapy
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
from congress.items import CongressItem
class CongressSpider(CrawlSpider):
name = "congres"
allowed_domains = ["www.congress.gov"]
start_urls = [
'https://www.congress.gov/members',
]
#creating a rule for my crawler. I only want it to continue to the next page, don't follow any other links.
rules = (Rule(LinkExtractor(allow=(),restrict_xpaths=("//a[@class='next']",)), callback="parse_page", follow=True),)
def parse_page(self, response):
for search in response.selector.xpath(".//li[@class='compact']"):
yield {'member' : ' '.join(search.xpath("normalize-space(span/a/text())").extract()).strip(),
'state' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item']/span/text())").extract()).strip(),
'District' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][2]/span/text())").extract()).strip(),
'party' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][3]/span/text())").extract()).strip(),
'Served' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][4]/span//li/text())").extract()).strip(),
}
items.py
import scrapy
class CongressItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
member = scrapy.Field()
state = scrapy.Field()
District = scrapy.Field()
party = scrapy.Field()
served = scrapy.Field()
pipelines.py
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class CongressPipeline(object):
collection_name= 'members'
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
)
def open_spider(self,spider):
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
self.client.close()
def process_item(self, item, spider):
self.db[self.collection_name].insert(dict(item))
return item
settings.py
BOT_NAME = 'congres'
SPIDER_MODULES = ['congres.spiders']
NEWSPIDER_MODULE = 'congres.spiders'
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'congres'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
'congress.pipelines.CongresPipeline': 300,
}
それが表示されたエラーは、インポートされ
Unhandled error in Deferred:
2017-07-09 11:15:33 [twisted] CRITICAL: Unhandled error in Deferred:
2017-07-09 11:15:34 [twisted] CRITICAL:
Traceback (most recent call last):
File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 1386,
in _inlineCallbacks
result = g.send(result)
File "c:\python27\lib\site-packages\scrapy\crawler.py", line 95, in crawl
six.reraise(*exc_info)
File "c:\python27\lib\site-packages\scrapy\crawler.py", line 79, in crawl
yield self.engine.open_spider(self.spider, start_requests)
NameError: global name 'pymongo' is not defined
に変更してください。リンクとしてではありません –
コードとエラーが接続されていません。画像はblackberry_spiderに 'インデント 'エラーがあることを示していますが、ここでは' congress'スパイダーのコードを挙げています。 – Mani
@Maniがエラーを更新しました – Emonwoods