2017-07-26 4 views
2

私はこのPythonスクリプトを過去1〜2日間使っていましたが、Firefox webdriverを使用するとすべてうまくいきますが、私はヘッドレスブラウザPhantomJS setNameが空であるため、setNumber = parseSetNumber(setName[0])の行でエラーError: list index out of rangeが失敗します。Python:Webdriver FirefoxとPhantomJSの違いによる問題

setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")の行は、Firefox Webdriverを使用している場合、PhantomJS Webdriverを使用している場合は何も返しません。

エラーは、WebdriverをFirefoxからPhantomJSに切り替えると発生します。スクリプトがLinuxサーバー上で実行されるので、PhantomJSを使用します。

import time 
import os.path 
import lxml.html as LH 
import re 
import sys 
from selenium import webdriver 
from random import randint 
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 

PARAMS = sys.argv 
URL = PARAMS[1] 
BASEURL = URL[:URL.rfind('/')+1] 

# Parses the set name for the set number 
def parseSetNumber(string): 
    string = string.split(' ') 
    stringLength = len(string) 
    string = string[(stringLength - 1)] 
    if string.replace('.','').isdigit(): 
     return string 
    else: 
     return "" 

# Returns set reference for this site 
def parseRefId(string): 
    string = string.split('_') 
    return str(string[2]) 

try: 
    PAGE_NUMBER = 1 

    #-------------------------------------------------- 
    ## Get initial page 

    driver = webdriver.PhantomJS() 
    driver.get(PARAMS[1]) 

    #-------------------------------------------------- 
    ## Get page count 

    # Give page time to load 
    time.sleep(2) 

    PAGE_RAW = driver.page_source 
    PAGE_RAW = LH.fromstring(PAGE_RAW) 
    PAGE_COUNT_RAW = PAGE_RAW.xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li") 
    PAGE_COUNT = len(PAGE_COUNT_RAW) - 2 

    #-------------------------------------------------- 
    ## Get page if its not page one 

    while PAGE_NUMBER <= PAGE_COUNT: 
     #-------------------------------------------------- 
     ## Create empty file 
     FILE_NAME = PARAMS[3] + 'json/' + time.strftime("%Y%m%d%H") + '_' + str(PARAMS[2]) + '_' + str(PAGE_NUMBER) + '.json' 
     #-------------------------------------------------- 
     ## Create JSON file if it doesnt exist 
     if os.path.exists(FILE_NAME)==False: 
      JSON_FILE = open(FILE_NAME, "a+", encoding="utf-8") 
     else: 
      JSON_FILE = open(FILE_NAME, "w", encoding="utf-8") 
     JSON_FILE.write("{") 
     #-------------------------------------------------- 
     # Click page for next page if not page 1 
     if PAGE_NUMBER > 1: 
      index = 0 
      for atag in PAGE_COUNT_RAW: 
       if index == PAGE_NUMBER: 
        elements = driver.find_elements_by_xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li") 
        if elements: 
         element = elements[index].find_elements_by_xpath("./a") 
         if element: 
          element[0].click() 
          time.sleep(randint(3,5)) 
       index += 1 
     #-------------------------------------------------- 
     ## Remove survey box if it pops up and log 
     try: 
      surveyBox = driver.find_element_by_link_text("No, thanks") 
      if surveyBox: 
       surveyBox.click() 
       print("Store[" + str(PARAMS[2]) + "]: Survey box found on page - " + str(PAGE_NUMBER)) 
     except: 
      print("Store[" + str(PARAMS[2]) + "]: No survey box on page - " + str(PAGE_NUMBER)) 
     #-------------------------------------------------- 
     ## Proces page 
     # If page is greater then 1 then get the page source of the new page. 
     if PAGE_NUMBER > 1: 
      PAGE_RAW = driver.page_source 
      PAGE_RAW = LH.fromstring(PAGE_RAW) 
     PAGE_RAW = PAGE_RAW.xpath("//div[contains(@class, 'estore_product_container')]") 
     index = 0 
     size = len(PAGE_RAW) 
     for atag in PAGE_RAW: 
      if PAGE_NUMBER > 1 and index == 0: 
       WebDriverWait(driver,10).until(EC.presence_of_element_located((By.XPATH, "./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a"))) 
      setStore = PARAMS[2] 
      setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()") 
      setNumber = parseSetNumber(setName[0]) 
      setPrice = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/text()") 
      setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href") 
      setRef = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/@id") 
      if setRef: 
       setRef = parseRefId(setRef[0]) 
      if re.search('[0-9\.]+', setPrice[0]) is not None: 
       JSON_FILE.write("\"" + str(index) + "\":{\"store\":\"" + str(setStore) + "\",\"name\":\"" + str(setName[0]) + "\",\"number\":\"" + str(setNumber) + "\",\"price\":\"" + re.search('[0-9\.]+', setPrice[0]).group() + "\",\"ref\":\"" + str(setRef) + "\",\"link\":\"" + str(setLink[0]) + "\"}") 
       if index+1 < size: 
        JSON_FILE.write(",") 
      index += 1 
     #-------------------------------------------------- 
     ## Close JSON file 
     JSON_FILE.write("}") 
     JSON_FILE.close() 
     #-------------------------------------------------- 
     ## Increment page number 
     PAGE_NUMBER += 1 
     #-------------------------------------------------- 

    #-------------------------------------------------- 
    ## Close webdriver 
    driver.quit() 
    #-------------------------------------------------- 

except Exception as e: 
    print('Error: ' + str(e.args[0])) 

# Remove gecodriver.log file 
GHOSTDRIVER_FILE = str(PARAMS[3]) + 'jobs/ghostdriver.log' 
if os.path.exists(GHOSTDRIVER_FILE)==True: 
    os.remove(GHOSTDRIVER_FILE) 

更新これらはPhantomJSで作業していないだけの2行があるように見えます

は、彼らの両方が空の値を返します。 [OK]を

setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()") 
setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href") 

答えて

0

は、私はこの問題を解決してきたように、私はPhantomJSを使用している場合webdriverをためset_windows_sizeオプションを追加しなければならなかったに見えます。

もともと:

driver = webdriver.PhantomJS() 
driver.get(PARAMS[1]) 

ソリューション:Firefoxはwebdriverを作品と同じように期待されているように動作webdriverを

driver = webdriver.PhantomJS() 
driver.set_window_size(1024, 768) 
driver.get(PARAMS[1]) 

今PhantomJS。