2017-12-22 18 views
0

XPathを使ってAmazonから画像のリンクを取得して出力しようと、いろいろな方法を試しましたが、うまくいきません（BeautifulSoupを使えば取得できます）。以下が私のPythonスクリプトです。

from lxml import html 
import csv,os,json 
import requests 
from exceptions import ValueError 
from time import sleep 

def AmzonParser(url, max_retries=5):
    """Fetch an Amazon product page and extract product details via XPath.

    The page is requested again on every attempt, so a transient failure
    (non-200 response, unparsable body) can actually recover on retry.

    Args:
        url: Product page URL to fetch.
        max_retries: Maximum number of fetch/parse attempts before giving up.

    Returns:
        A dict with the keys NAME, SALE_PRICE, CATEGORY, ORIGINAL_PRICE,
        AVAILABILITY, URL, DESCRIPTION and IMAGE (a value is None when the
        corresponding XPath matched nothing), or None if every attempt
        failed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }

    XPATH_NAME = '//h1[@id="title"]//text()'
    XPATH_SALE_PRICE = ('//span[contains(@id,"ourprice") or '
                        'contains(@id,"saleprice")]/text()')
    XPATH_ORIGINAL_PRICE = ('//td[contains(text(),"List Price") or '
                            'contains(text(),"M.R.P") or '
                            'contains(text(),"Price")]'
                            '/following-sibling::td/text()')
    XPATH_CATEGORY = '//a[@class="a-link-normal a-color-tertiary"]//text()'
    XPATH_AVAILABILITY = '//div[@id="availability"]/span/text()'
    XPATH_DESCRIPTION = '//*[@id="productDescription"]/p/text()'
    # Attributes are selected with "@src"; a bare "/src" step looks for a
    # child *element* named src and never matches. Candidates are tried in
    # order of preference.
    # NOTE(review): the main image may be injected by JavaScript, in which
    # case only the thumbnail strip exists in the static HTML -- confirm
    # against live pages.
    XPATH_IMAGE_CANDIDATES = (
        '//div[contains(@class,"imgTagWrapper")]//img/@src',
        '//*[@id="main-image-container"]//img/@src',
        '//div[@id="altImages"]//img/@src',
    )

    for _ in range(max_retries):
        sleep(3)
        try:
            page = requests.get(url, headers=headers)
            # Check the status before parsing: an error page has none of the
            # expected elements and would only yield an all-None record.
            if page.status_code != 200:
                raise ValueError('captha')

            doc = html.fromstring(page.content)

            RAW_NAME = doc.xpath(XPATH_NAME)
            RAW_SALE_PRICE = doc.xpath(XPATH_SALE_PRICE)
            RAW_CATEGORY = doc.xpath(XPATH_CATEGORY)
            RAW_ORIGINAL_PRICE = doc.xpath(XPATH_ORIGINAL_PRICE)
            RAW_AVAILABILITY = doc.xpath(XPATH_AVAILABILITY)
            RAW_DESCRIPTION = doc.xpath(XPATH_DESCRIPTION)

            RAW_IMAGE = []
            for xpath_image in XPATH_IMAGE_CANDIDATES:
                RAW_IMAGE = [src.strip() for src in doc.xpath(xpath_image)
                             if src.strip()]
                if RAW_IMAGE:
                    break

            NAME = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
            SALE_PRICE = (' '.join(''.join(RAW_SALE_PRICE).split()).strip()
                          if RAW_SALE_PRICE else None)
            CATEGORY = (' > '.join([i.strip() for i in RAW_CATEGORY])
                        if RAW_CATEGORY else None)
            ORIGINAL_PRICE = (''.join(RAW_ORIGINAL_PRICE).strip()
                              if RAW_ORIGINAL_PRICE else None)
            AVAILABILITY = (''.join(RAW_AVAILABILITY).strip()
                            if RAW_AVAILABILITY else None)
            DESCRIPTION = (''.join(RAW_DESCRIPTION).strip()
                           if RAW_DESCRIPTION else None)
            # Several <img> nodes may match; keep the first URL rather than
            # concatenating them into one unusable string.
            IMAGE = RAW_IMAGE[0] if RAW_IMAGE else None

            # No separate list price on the page: fall back to the sale price.
            if not ORIGINAL_PRICE:
                ORIGINAL_PRICE = SALE_PRICE

            return {
                'NAME': NAME,
                'SALE_PRICE': SALE_PRICE,
                'CATEGORY': CATEGORY,
                'ORIGINAL_PRICE': ORIGINAL_PRICE,
                'AVAILABILITY': AVAILABILITY,
                'URL': url,
                'DESCRIPTION': DESCRIPTION,
                'IMAGE': IMAGE,
            }
        except Exception as e:
            # Best-effort scraper: report the failure and try again.
            print(e)

    # Every attempt failed; give up instead of looping forever.
    return None

def ReadAsin():
    """Scrape every ASIN in the built-in list and save results to data.json.

    For each ASIN the product URL is built, passed to AmzonParser, and the
    accumulated results are rewritten to 'data.json' in the current working
    directory, so partial results survive an interrupted run.
    """
    # Alternative input, kept disabled: read the ASINs from a CSV feed.
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__),"Asinfeed.csv")))
    AsinList = ['B008HDREZ6',]
    extracted_data = []
    for i in AsinList:
        url = "http://www.amazon.com/dp/"+i
        print("Processing: "+url)
        extracted_data.append(AmzonParser(url))
        # The context manager closes (and flushes) the file on every pass.
        with open('data.json','w') as f:
            json.dump(extracted_data,f,indent=4)
        # Pause between products to avoid hammering the site.
        sleep(5)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    ReadAsin()

XPathは初めてで、うまく動作しません。

画像のリンクを取得したいHTMLは次のとおりです。

<div class="imgTagWrapper" style="height: 296px;"> 
    <img src="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SY450_.jpg" class="a-dynamic-image a-stretch-vertical" id="" style="max-height: 296px; max-width: 204.282px;" data-old-hires="https://images-na.ssl-images-amazon.com/images/I/91CtQU45qKL._SL1500_.jpg" data-a-manual-replacement="true"> 
</div> 
+1

Amazonのスクレイピングは利用規約に違反します。その代わりにAPIを使用してください。 – kjhughes

+0

この質問には、python-requestsやamazonよりもlxmlのタグのほうが適しています。 – Galen

+0

このページではJavaScriptを使用してこのHTMLに画像を配置しているため、`lxml`や`beautifulsoup`では見つけることができません。`lxml`／`beautifulsoup`を使うと、`//div[@id="altImages"]//img/@src` で左側にある小さな画像しか取得できません。あなたは '

関連する問題