私はこのPythonスクリプトを過去1〜2日間使っていましたが、Firefox webdriverを使用するとすべてうまくいきますが、私はヘッドレスブラウザPhantomJS setName
が空であるため、setNumber = parseSetNumber(setName[0])
の行でエラーError: list index out of range
が失敗します。Python:Webdriver FirefoxとPhantomJSの違いによる問題
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
の行は、Firefox Webdriverを使用している場合、PhantomJS Webdriverを使用している場合は何も返しません。
エラーは、WebdriverをFirefoxからPhantomJSに切り替えると発生します。スクリプトがLinuxサーバー上で実行されるので、PhantomJSを使用します。
import time
import os.path
import lxml.html as LH
import re
import sys
from selenium import webdriver
from random import randint
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
PARAMS = sys.argv
URL = PARAMS[1]
BASEURL = URL[:URL.rfind('/')+1]
# Parses the set name for the set number
def parseSetNumber(string):
string = string.split(' ')
stringLength = len(string)
string = string[(stringLength - 1)]
if string.replace('.','').isdigit():
return string
else:
return ""
# Returns set reference for this site
def parseRefId(string):
string = string.split('_')
return str(string[2])
try:
PAGE_NUMBER = 1
#--------------------------------------------------
## Get initial page
driver = webdriver.PhantomJS()
driver.get(PARAMS[1])
#--------------------------------------------------
## Get page count
# Give page time to load
time.sleep(2)
PAGE_RAW = driver.page_source
PAGE_RAW = LH.fromstring(PAGE_RAW)
PAGE_COUNT_RAW = PAGE_RAW.xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
PAGE_COUNT = len(PAGE_COUNT_RAW) - 2
#--------------------------------------------------
## Get page if its not page one
while PAGE_NUMBER <= PAGE_COUNT:
#--------------------------------------------------
## Create empty file
FILE_NAME = PARAMS[3] + 'json/' + time.strftime("%Y%m%d%H") + '_' + str(PARAMS[2]) + '_' + str(PAGE_NUMBER) + '.json'
#--------------------------------------------------
## Create JSON file if it doesnt exist
if os.path.exists(FILE_NAME)==False:
JSON_FILE = open(FILE_NAME, "a+", encoding="utf-8")
else:
JSON_FILE = open(FILE_NAME, "w", encoding="utf-8")
JSON_FILE.write("{")
#--------------------------------------------------
# Click page for next page if not page 1
if PAGE_NUMBER > 1:
index = 0
for atag in PAGE_COUNT_RAW:
if index == PAGE_NUMBER:
elements = driver.find_elements_by_xpath("//div[contains(@class, 'pageControlMenu')]/div/ul/li")
if elements:
element = elements[index].find_elements_by_xpath("./a")
if element:
element[0].click()
time.sleep(randint(3,5))
index += 1
#--------------------------------------------------
## Remove survey box if it pops up and log
try:
surveyBox = driver.find_element_by_link_text("No, thanks")
if surveyBox:
surveyBox.click()
print("Store[" + str(PARAMS[2]) + "]: Survey box found on page - " + str(PAGE_NUMBER))
except:
print("Store[" + str(PARAMS[2]) + "]: No survey box on page - " + str(PAGE_NUMBER))
#--------------------------------------------------
## Proces page
# If page is greater then 1 then get the page source of the new page.
if PAGE_NUMBER > 1:
PAGE_RAW = driver.page_source
PAGE_RAW = LH.fromstring(PAGE_RAW)
PAGE_RAW = PAGE_RAW.xpath("//div[contains(@class, 'estore_product_container')]")
index = 0
size = len(PAGE_RAW)
for atag in PAGE_RAW:
if PAGE_NUMBER > 1 and index == 0:
WebDriverWait(driver,10).until(EC.presence_of_element_located((By.XPATH, "./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a")))
setStore = PARAMS[2]
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
setNumber = parseSetNumber(setName[0])
setPrice = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/text()")
setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")
setRef = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_price')]/@id")
if setRef:
setRef = parseRefId(setRef[0])
if re.search('[0-9\.]+', setPrice[0]) is not None:
JSON_FILE.write("\"" + str(index) + "\":{\"store\":\"" + str(setStore) + "\",\"name\":\"" + str(setName[0]) + "\",\"number\":\"" + str(setNumber) + "\",\"price\":\"" + re.search('[0-9\.]+', setPrice[0]).group() + "\",\"ref\":\"" + str(setRef) + "\",\"link\":\"" + str(setLink[0]) + "\"}")
if index+1 < size:
JSON_FILE.write(",")
index += 1
#--------------------------------------------------
## Close JSON file
JSON_FILE.write("}")
JSON_FILE.close()
#--------------------------------------------------
## Increment page number
PAGE_NUMBER += 1
#--------------------------------------------------
#--------------------------------------------------
## Close webdriver
driver.quit()
#--------------------------------------------------
except Exception as e:
print('Error: ' + str(e.args[0]))
# Remove gecodriver.log file
GHOSTDRIVER_FILE = str(PARAMS[3]) + 'jobs/ghostdriver.log'
if os.path.exists(GHOSTDRIVER_FILE)==True:
os.remove(GHOSTDRIVER_FILE)
更新これらはPhantomJSで作業していないだけの2行があるように見えます
は、彼らの両方が空の値を返します。 [OK]を
setName = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/text()")
setLink = atag.xpath("./div[contains(@class, 'product_info')]/div[contains(@class, 'product_name')]/a/@href")