class SomewebsiteProductSpider(scrapy.Spider):
    """Spider that extracts product details from somewebsite.com pages.

    For every response it builds one ``somewebsiteItem`` holding the product
    name, sale price, category and availability, writes a short summary to a
    text file, prints the item and yields it.
    """

    name = "somewebsite"
    allowed_domains = ["somewebsite.com"]
    # Left empty in this class definition.
    start_urls = [
    ]

    def parse(self, response):
        """Build a product item from ``response`` and yield it.

        Args:
            response: The downloaded page; queried with XPath expressions.

        Yields:
            A ``somewebsiteItem`` with the keys ``product_name``,
            ``product_sale_price``, ``product_category`` and
            ``product_availability`` filled in.
        """
        items = somewebsiteItem()

        title = response.xpath('//h1[@id="title"]/span/text()').extract()
        sale_price = response.xpath('//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()').extract()
        category = response.xpath('//a[@class="a-link-normal a-color-tertiary"]/text()').extract()
        availability = response.xpath('//div[@id="availability"]//text()').extract()

        # Each XPath query returns a list of text fragments; collapse every
        # list into a single trimmed string.
        items['product_name'] = ''.join(title).strip()
        items['product_sale_price'] = ''.join(sale_price).strip()
        items['product_category'] = ','.join(part.strip() for part in category).strip()
        items['product_availability'] = ''.join(availability).strip()

        # Raw string: same path as before, without the invalid "\P" escape.
        # The context manager closes the file even if the write fails.
        # Mode "w" truncates, so the file only holds the latest parsed page.
        with open(r"C:\Users\user1\PycharmProjects\test.txt", "w") as fo:
            fo.write("%s \n%s \n%s" % (items['product_name'], items['product_sale_price'], self.start_urls))

        print(items)
        yield items
test.py(動的な開始URLのリストをScrapyでクロールする):
# Settings handed to the crawler process: only the user agent is overridden.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
}

# Create the process, register the spider class, then start crawling.
process = CrawlerProcess(crawler_settings)
process.crawl(SomewebsiteProductSpider)
process.start()
クロールプロセスを開始する前に、test.py から「SomewebsiteProductSpider」オブジェクトへ動的な start_urls のリストを渡すには、どうすればよいでしょうか?どんな助けでもありがたいです。よろしくお願いします。
https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.start_requests – BlackBear