以下のコードは基本的にAmazon Spiderのサンプルです。
Amazonのサーバー(または他のサーバー)は、私たちがScrapyのRequest.metaで渡しているデータの内容を知ることができるのでしょうか。Request.metaがリクエストと一緒に送信されないのであれば、そのメタデータはどのようにしてresponse.metaで受け取れるのでしょうか?
Scrapyのrequest.metaとresponse.metaがどのように動作するのか、どなたか説明していただけますか?サーバーは、Scrapyが送信したRequest.metaのデータを読み取ることができるのでしょうか?
import random
from HTMLParser import HTMLParser
import scrapy
from scrapy.crawler import CrawlerProcess
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from amazon.items import AmazonItem
from amazon.user_agents import user_agent_list
class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects the text it is handed.

    Feed it HTML with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base initializer instead of calling reset() directly, so any
        # state the parser sets up in __init__ (e.g. ``convert_charrefs`` on
        # Python 3) exists before feed() is used. The explicit base-class call
        # is used rather than super() because Python 2's HTMLParser is an
        # old-style class.
        HTMLParser.__init__(self)
        # Text chunks in the order handle_data() received them.
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text chunks joined into a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text of *html* with the markup removed.

    Args:
        html: HTML fragment as a string. ``None`` and ``''`` are accepted.

    Returns:
        The text collected by ``MLStripper``, or ``''`` when there is nothing
        to parse.
    """
    if not html:
        # Selector lookups such as extract_first() give None when nothing
        # matched; feeding None to the parser would raise a TypeError.
        return ''
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
class Amazon(scrapy.Spider):
    """Crawl amazon.in listing pages and emit one ``AmazonItem`` per product.

    Callback chain: ``parse`` (listing page) -> ``parse_product`` (detail
    page) -> ``offer_page`` (offer listing). Values needed by a later callback
    are stored in ``Request.meta`` and read back from ``response.meta``.
    """

    allowed_domains = ['amazon.in']
    start_urls = ['http://www.amazon.in']
    name = 'amazon'

    @staticmethod
    def _first_text(response, xpath):
        """Return the stripped first match of *xpath*, or ``''`` if none."""
        value = response.xpath(xpath).extract_first()
        return value.strip() if value else ''

    @staticmethod
    def _section_text(response, xpath):
        """Return the tag-free, stripped text of the first *xpath* match.

        Returns ``''`` when the XPath matches nothing.
        """
        html = response.xpath(xpath).extract_first()
        # The section may be absent, in which case extract_first() gives None
        # and the HTML parser cannot be fed None.
        return strip_tags(html).strip() if html else ''

    def parse(self, response):
        """Yield a product request per listed result, then follow pagination.

        Each product request carries its ASIN in ``meta['asin']`` for
        ``parse_product``. Results whose brand text is ``'Azani'`` are skipped.
        """
        for product in response.xpath('//li[@class="s-result-item celwidget "]'):
            asin = product.xpath('@data-asin').extract_first()
            if not asin:
                # Without an ASIN no product URL can be built; skip the entry
                # instead of crashing on None.
                continue
            # Drop any non-ASCII characters before using the ASIN in a URL.
            asin = asin.encode('ascii', 'ignore')
            brand = product.xpath('div/div/div/span[2]/text()').extract_first()
            if brand == 'Azani':
                continue
            yield scrapy.Request(
                'http://www.amazon.in/dp/' + asin,
                callback=self.parse_product,
                meta={'asin': asin},
            )

        next_page = response.xpath('//a[@id="pagnNextLink"]/@href').extract_first()
        if next_page:
            # urljoin resolves relative hrefs against the current page and
            # leaves absolute hrefs intact, unlike plain concatenation.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def offer_page(self, response):
        """Complete the item from ``meta['item']`` with offer data and yield it.

        Fills ``brand``, ``price`` (string form of the (seller, price) pairs)
        and ``no_of_seller`` (number of such pairs).
        """
        item = response.meta['item']
        offer_row = '//div[@class="a-row a-spacing-mini olpOffer"]'
        seller = response.xpath(offer_row + '/div/h3/span/a/text()').extract()
        price = response.xpath(offer_row + '/div/span/span/text()').extract()
        # Materialize the pairs: they are both formatted and counted below.
        seller_price = list(zip(seller, price))

        brand = self._first_text(response, '//div[@id="olpProductByline"]/text()')
        # Remove only the leading "by " of the byline; replacing every
        # occurrence would also cut "by " out of the brand name itself.
        if brand.startswith('by '):
            brand = brand[len('by '):]

        item['brand'] = brand
        item['price'] = '{}'.format(seller_price)
        item['no_of_seller'] = len(seller_price)
        yield item

    def parse_product(self, response):
        """Build an ``AmazonItem`` from a product page and request its offers.

        Reads the ASIN from ``response.meta['asin']``. Text fields whose
        element is missing from the page are stored as ``''``. The item is
        handed to ``offer_page`` through ``meta['item']``.
        """
        asin = response.meta['asin']
        item = AmazonItem()
        item['asin'] = asin
        item['product_name'] = self._first_text(response, '//*[@id="productTitle"]/text()')
        item['bullet_point'] = self._section_text(response, '//*[@id="feature-bullets"]')
        item['description'] = self._section_text(response, '//*[@id="productDescription"]')

        # Each dropdown value is comma-separated; the last field is kept.
        values = response.xpath('//*[@class="dropdownAvailable"]/@value').extract()
        child_asins = ','.join(value.split(',')[-1] for value in values)
        item['child_asin'] = child_asins.encode('utf-8', 'ignore')

        yield scrapy.Request(
            'http://www.amazon.in/gp/offer-listing/' + asin,
            callback=self.offer_page,
            meta={'item': item},
        )