From this URL I need all the product URLs along with their respective TYPE. Since I am taking the type category from the URL, the output should look like this:
Product_URL1 Blouse
Product_URL2 Crop Top
Product_URL3 Tank Top
Product_URL4 Strappy Top
Product_URL5 Tube Top
Below is my code:
from scrapy.spiders import CrawlSpider
import scrapy
from scrapy.http.request import Request


class JabongItem(scrapy.Item):
    base_link = scrapy.Field()
    type = scrapy.Field()
    count = scrapy.Field()
    product_name = scrapy.Field()
    product_link = scrapy.Field()


class JabongScrape(CrawlSpider):
    name = "jabong"
    allowed_domains = ["jabong.com"]  # must be a list, not a plain string
    start_urls = ["http://www.jabong.com/women/clothing/tops-tees-shirts/tops",
                  "http://www.jabong.com/women/clothing/tops-tees-shirts/tees"]

    def parse(self, response):
        # Build one request per type category listed in the filter options
        item = JabongItem()
        try:
            for idx in range(0, 20):
                item['type'] = response.xpath("//div[contains(@class, 'options')]/label/a/text()").extract()[idx]
                item['base_link'] = (
                    response.url
                    + response.xpath("//div[contains(@class, 'options')]/label/a/@href").extract()[idx]
                    + "?ax=1&page=1&limit="
                    + response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx].replace("[", "").replace("]", "")
                    + "&sortField=popularity&sortBy=desc"
                )
                item['count'] = response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx].replace("[", "").replace("]", "")
                yield Request(item['base_link'], callback=self.parse_product_link,
                              meta={'item': item, 'count': int(item['count'])}, dont_filter=True)
        except:
            # bare except silently swallows IndexError when fewer than 20 options exist
            pass

    def parse_product_link(self, response):
        # Yield one item per product tile on the category listing page
        item = response.meta['item']
        try:
            for i in range(0, response.meta['count']):
                item['product_link'] = response.xpath("//div[contains(@class, 'col-xxs-6 col-xs-4 col-sm-4 col-md-3 col-lg-3 product-tile img-responsive')]/a/@href").extract()[i]
                # item['original_price'] = response.xpath("section.row > div:nth-child(1) > a:nth-child(1) > div:nth-child(2) > div:nth-child(2) > span:nth-child(1) > span:nth-child(1)::text").extract()[idx]
                print(i)
                yield item
        except:
            pass
And I think jbng_base_links.txt and all the item XPaths (item['type'] etc.) are right for "http://www.jabong.com/women/clothing/tops-tees-shirts/tops".
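For reference, those selectors can be sanity-checked interactively with Scrapy's shell before running the spider; a minimal sketch, reusing the XPaths from the code above:

# Open an interactive shell against the listing page:
#   scrapy shell "http://www.jabong.com/women/clothing/tops-tees-shirts/tops"
# Then evaluate the same selectors the spider relies on ("response" is provided by the shell):
response.xpath("//div[contains(@class, 'options')]/label/a/text()").extract()      # type names
response.xpath("//div[contains(@class, 'options')]/label/a/@href").extract()       # relative category links
response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()  # bracketed counts

If these return empty lists, the page markup differs from what the XPaths assume.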
My suggestion would be to scrape with 'Spider' instead of 'CrawlSpider' and to scrape each type separately; for example, for blouses there is a link like http://www.jabong.com/women/clothing/tops-tees-shirts/tops/blouses/ –
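To illustrate that suggestion, here is a minimal sketch using a plain scrapy.Spider with one hard-coded listing URL per type. The spider name and the type_urls mapping are hypothetical (only the blouses URL comes from the comment), and the product-tile XPath is adapted from the code in the question:

import scrapy


class JabongTypeSpider(scrapy.Spider):
    name = "jabong_types"  # hypothetical name for this sketch
    allowed_domains = ["jabong.com"]

    # One listing URL per type; only the blouses link appears in the comment,
    # the remaining categories are assumed to follow the same URL pattern.
    type_urls = {
        "Blouse": "http://www.jabong.com/women/clothing/tops-tees-shirts/tops/blouses/",
    }

    def start_requests(self):
        for type_name, url in self.type_urls.items():
            yield scrapy.Request(url, callback=self.parse,
                                 meta={'type': type_name}, dont_filter=True)

    def parse(self, response):
        # Product-tile XPath shortened from the question's version
        for href in response.xpath("//div[contains(@class, 'product-tile')]/a/@href").extract():
            yield {
                'type': response.meta['type'],
                'product_link': response.urljoin(href),
            }

This keeps the type-to-URL mapping explicit instead of deriving it from the filter options, which is essentially what "scrape each type separately" amounts to.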