0
main関数からScrapyクローラを実行する方法：Scrapyでクローラを書きましたが、mainメソッド（main関数）からクロールを開始したいと考えています。
import sys, getopt
import scrapy
from scrapy.spiders import Spider
from scrapy.http import Request
import re
class TutsplusItem(scrapy.Item):
    """Scraped record holding a single ``title`` field."""

    # The only field populated by the spider: the text of a matched link.
    title = scrapy.Field()
class MySpider(Spider):
    """Spider that starts at the BBC home page and follows every link it finds.

    For each downloaded page it schedules a request for every distinct
    http(s) link on that page and yields one ``TutsplusItem`` per anchor
    whose ``class`` attribute contains ``media__link``.
    """

    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]

    def __init__(self, *args, **kwargs):
        """Parse getopt-style options and initialise the underlying Spider.

        Args:
            *args: Command-line style tokens, e.g. ``"-i", "in.txt"``.
                Recognised options are ``-h``, ``-i/--ifile`` and
                ``-o/--ofile``.
            **kwargs: Keyword arguments forwarded to ``Spider.__init__``.

        Raises:
            SystemExit: With status 2 when the options cannot be parsed.
        """
        try:
            cli_options, cli_args = getopt.getopt(
                list(args), "hi:o:", ["ifile=", "ofile="]
            )
        except getopt.GetoptError:
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit(2)
        # Do not pass ``self`` (or leftover positionals) again: the first
        # positional parameter of ``Spider.__init__`` is the spider name, so
        # doing that would overwrite ``name`` with an unrelated object.
        super(MySpider, self).__init__(**kwargs)
        # Keep the parsed command line available instead of discarding it.
        self.cli_options = cli_options
        self.cli_args = cli_args

    def parse(self, response):
        """Follow the links on ``response`` and yield the titles found on it.

        Args:
            response: The downloaded page.

        Yields:
            ``Request`` objects (handled by this same method) for each distinct
            http(s) link on the page, followed by one ``TutsplusItem`` per
            matching title.
        """
        # Links already scheduled from *this* page. The check and the stored
        # value both use the resolved absolute URL so duplicates are caught.
        seen_urls = set()
        for href in response.xpath('//a/@href').extract():
            # Resolve against the page URL so relative and absolute hrefs both
            # produce a valid address (plain prefixing breaks absolute ones).
            url = response.urljoin(href)
            # Skip non-web schemes such as mailto: or javascript:.
            if not url.startswith(("http://", "https://")):
                continue
            if url in seen_urls:
                continue
            seen_urls.add(url)
            yield Request(url, callback=self.parse)

        title_xpath = '//a[contains(@class, "media__link")]/text()'
        for title in response.xpath(title_xpath).extract():
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            yield item
`scrapy runspider Crawler.py ARG1 ARG2` を使ってクローリングを開始する代わりに、main関数を持つ別のクラスを用意し、そこからScrapyを起動したいと考えています。どのようにすればよいでしょうか？