I wrote a small Scrapy spider whose item pipeline is never invoked when I run it. After debugging for a while, I have narrowed down the region of code where the bug seems to be.
The cookie/CAPTCHA logic works like this: I crawl the first URL to obtain the session cookie, then use that cookie to download the CAPTCHA image from the second URL, and finally POST the form data (including the recognized CAPTCHA text) to the third URL. If the text I recognize from the image turns out to be wrong, I download a fresh CAPTCHA image and POST to the third URL again, until I get the correct text.
Let me show you my code:
# -*- coding: gbk -*-
import scrapy
from scrapy.http import FormRequest
import json
import os
from datetime import datetime
from scrapy.selector import Selector
from teacherCourse.handlePic import handle
from teacherCourse.items import DetailProfItem
from teacherCourse.items import DetailProfCourseItem
from teacherCourse.items import containItem
class GetTeacherCourseSpider(scrapy.Spider):
name = 'TeacherCourse'
# custom_settings = {
# 'ITEM_PIPELINES': {
# 'teacherCourse.pipelines.TeacherCoursePipeline': 300,
# }
# }
def __init__(self, selXNXQ='', titleCode=''):
self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' # first
self.vcodeUrl = 'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second
self.postUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third
self.findSessionId = None # to save the cookies
self.XNXQ = selXNXQ
self.titleCode = titleCode
def start_requests(self):
request = scrapy.Request(self.getUrl,
callback = self.downloadPic)
yield request
def downloadPic(self, response):
# download the picture
# find the session id
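        # Set-Cookie looks something like 'ASP.NET_SessionId=xyz123; path=/' (the
        # name/value here are just a hypothetical example), so the two splits below
        # produce a pair like ['ASP.NET_SessionId', 'xyz123']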
self.findSessionId = response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=")
request = scrapy.Request(self.vcodeUrl,
cookies= {self.findSessionId[0]: self.findSessionId[1]},
callback = self.getAndHandleYzm)
yield request
def getAndHandleYzm(self, response):
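        # handle() is my own OCR helper in teacherCourse/handlePic.py: it takes the
        # raw CAPTCHA image bytes and returns the recognized code text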
yzm = handle(response.body)
yield FormRequest(self.postUrl,
formdata={'Sel_XNXQ': '20151',
'sel_zc': '011',
'txt_yzm': yzm,
'type': '2'},
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0] + '=' + self.findSessionId[1],
},
callback=self.parse)
def parse(self, response):
body = response.body.decode('gbk')
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
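            # the extra ?t=... timestamp query parameter acts as a cache-buster,
            # so a fresh CAPTCHA image is served instead of a cached one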
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond/1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm)
else:
# parse data
self.parseData(body)
# item = containItem()
# item['first'] = len(body)
# return item
            # the parseData part is a little bit long, but that shouldn't matter:
            # at its last line, I do yield an item
def parseData(self, body):
# parse body data
sel = Selector(text=body)
# get all the note text data
noteTables = sel.xpath('//table[@style="border:0px;"]').extract()
noteList = [] # to store all the note text
for noteTable in noteTables:
if '<b>' in noteTable:
sele = Selector(text = noteTable)
note = (sele.xpath('//table/tr/td/b/text()').extract())
noteText = (sele.xpath('//table/tr/td/text()').extract())
                # pad noteText so that it always has exactly two entries
if not noteText:
noteText.append('')
noteText.append('')
else:
if len(noteText) == 1:
noteText.append('')
noteList.append(noteText)
# get all the course data
courseTables = sel.xpath('//table[@class="page_table"]/tbody').extract()
AllDetailCourse = [] # all the teachers' course
for table in courseTables:
everyTeacherC = [] # every teacher's course
s = Selector(text = table)
trs = s.xpath('//tr').extract()
for tr in trs:
sel = Selector(text = tr)
snum = (sel.xpath('//td[1]/text()').extract())
course = (sel.xpath('//td[2]/text()').extract())
credit = (sel.xpath('//td[3]/text()').extract())
teachWay = (sel.xpath('//td[4]/text()').extract())
courseType = (sel.xpath('//td[5]/text()').extract())
classNum = (sel.xpath('//td[6]/text()').extract())
className = (sel.xpath('//td[7]/text()').extract())
stuNum = (sel.xpath('//td[8]/text()').extract())
week = (sel.xpath('//td[9]/text()').extract())
section = (sel.xpath('//td[10]/text()').extract())
location = (sel.xpath('//td[11]/text()').extract())
tmpList = []
tmpList.append(snum)
tmpList.append(course)
tmpList.append(credit)
tmpList.append(teachWay)
tmpList.append(courseType)
tmpList.append(classNum)
tmpList.append(className)
tmpList.append(stuNum)
tmpList.append(week)
tmpList.append(section)
tmpList.append(location)
                # normalize each field: an empty extract() result becomes '',
                # otherwise take the first (and only) element of the list
detailCourse = []
for each in tmpList:
if not each:
each = ''
else:
each = each[0]
detailCourse.append(each)
everyTeacherC.append(detailCourse)
AllDetailCourse.append(everyTeacherC)
# get department, teacher, gender and title
sel = Selector(text = body)
temp1 = sel.xpath('//*[@group="group"]/table/tr/td/text()').extract()
        # fill two tables, which will be stored in the database
i = 0
# every professor
for each in temp1:
tables = containItem() # all the data in every for loop to send to the pipeline
each = each.replace(u'\xa0', u' ')
each = each.split(' ')
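            # each field has the form 'label：value' with a fullwidth colon,
            # i.e. department, teacher, gender and title in that order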
            depart = each[0].split('：')
            teacher = each[1].split('：')
            gender = each[2].split('：')
            title = each[3].split('：')
# first table
profItem = DetailProfItem()
profItem['XNXQ'] = self.XNXQ
profItem['department'] = depart[1] # department
profItem['teacher'] = teacher[1] # teacher
profItem['gender'] = gender[1]
profItem['title'] = title[1]
profItem['note1'] = noteList[i][0]
profItem['note2'] = noteList[i][1]
tables['first'] = profItem # add the first table
# second table
# every professor's courses
profCourses = []
for j in range(len(AllDetailCourse[i])): # how many course for every professor
profCourseItem = DetailProfCourseItem() # every course for every professor
                profCourseItem['snum'] = AllDetailCourse[i][j][0] # i = i-th professor, j = j-th course, last index = field position within the course
profCourseItem['course'] = AllDetailCourse[i][j][1]
profCourseItem['credit'] = AllDetailCourse[i][j][2]
profCourseItem['teachWay'] = AllDetailCourse[i][j][3]
profCourseItem['courseType'] = AllDetailCourse[i][j][4]
profCourseItem['classNum'] = AllDetailCourse[i][j][5]
profCourseItem['className'] = AllDetailCourse[i][j][6]
profCourseItem['stuNum'] = AllDetailCourse[i][j][7]
profCourseItem['week'] = AllDetailCourse[i][j][8]
profCourseItem['section'] = AllDetailCourse[i][j][9]
profCourseItem['location'] = AllDetailCourse[i][j][10]
profCourses.append(profCourseItem) # every professor's courses
tables['second'] = profCourseItem # add the second table
i += 1
yield tables
Any suggestions would be highly appreciated!
settings.py (the pipelines part):
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'teacherCourse.pipelines.TeacherCoursePipeline': 300,
}
items.py (I don't think this is where the problem is):
# detailed course information for each professor
class DetailProfCourseItem(scrapy.Item):
snum = scrapy.Field() # serial number
course = scrapy.Field()
credit = scrapy.Field()
teachWay = scrapy.Field()
courseType = scrapy.Field()
classNum = scrapy.Field()
className = scrapy.Field()
stuNum = scrapy.Field()
week = scrapy.Field()
section = scrapy.Field()
location = scrapy.Field()
# the third item, which contains the first and second items
class containItem(scrapy.Item):
    first = scrapy.Field() # for the first table
    second = scrapy.Field() # for the second table
The pipeline code:
class TeacherCoursePipeline(object):
def process_item(self, item, spider):
print('I am called!!!!!')
print(item)
return item
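As far as I understand the Scrapy docs, process_item should be called once for every item that a spider callback yields or returns. For comparison, here is a minimal sketch of the kind of spider I would expect to trigger this pipeline (an untested illustration only; example.com is a made-up URL, not part of my project):

import scrapy

class MinimalSpider(scrapy.Spider):
    name = 'Minimal'
    start_urls = ['http://example.com/']  # made-up URL, for illustration only

    def parse(self, response):
        # any item (even a plain dict) yielded from a callback should be routed
        # through the enabled item pipelines by the Scrapy engine
        yield {'length': len(response.body)}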
And when I run the spider with scrapy crawl TeacherCourse, it outputs:
2016-07-19 17:39:18 [scrapy] INFO: Scrapy 1.1.0rc1 started (bot: teacherCourse)
2016-07-19 17:39:18 [scrapy] INFO: Overridden settings: {'BOT_NAME': 'teacherCourse', 'NEWSPIDER_MODULE': 'teacherCourse.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['teacherCourse.spiders']}
2016-07-19 17:39:18 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.logstats.LogStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-07-19 17:39:18 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-07-19 17:39:18 [scrapy] INFO: Enabled item pipelines:
['teacherCourse.pipelines.TeacherCoursePipeline']
2016-07-19 17:39:18 [scrapy] INFO: Spider opened
2016-07-19 17:39:18 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (404) <GET http://jwxt.dgut.edu.cn/robots.txt> (referer: None)
2016-07-19 17:39:18 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx> (referer: None)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <GET http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] DEBUG: Crawled (200) <POST http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx> (referer: http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx)
2016-07-19 17:39:19 [scrapy] INFO: Closing spider (finished)
2016-07-19 17:39:19 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1330,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 3,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 230886,
'downloader/response_count': 4,
'downloader/response_status_count/200': 3,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 19, 9, 39, 19, 861620),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 2,
'response_received_count': 4,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2016, 7, 19, 9, 39, 18, 774293)}
2016-07-19 17:39:19 [scrapy] INFO: Spider closed (finished)
Please share your pipeline code, settings and logs. Reading only your spider code does not tell us what the problem is. Please read https://stackoverflow.com/help/mcve. Explaining what you are trying to do, what you get and what you expect will help others help you. –
@paultrmbrth I think there is something wrong with my code, but not in the logic part, because I am able to print the items to the terminal. –
It's not clear what you mean by "I run it and the pipeline cannot be called". How do you see that? Why do you think your pipeline is "not being called"? You say you can "print the items to the terminal" — could you share the log of what you see, and explain what you would like to see in the log instead? Also, you haven't shared the TeacherCoursePipeline code, so I don't know what you are trying to do in this pipeline. –