2017-09-21 18 views
0

を使ってこします。 Sys.time()関数を複数回使用しているため、コードが遅い(1つのNIBのcca 12秒)。私は200.000のNIB番号をスクラップして、30日間の掻き取りをする必要があります。スピードアップは、Web、私は以下のWebサイトをスクラップするRseleniumを使用していますmultiplie Rseleniumブラウザ

私は、複数のブラウザをローカルに、あるいは何らかの形でクラウドで開き、スクレイピングスクリプトを高速化することができれば興味があります。

この問題を解決するために並列計算を使用することはできますか? ご意見はありますか?

EDIT:

library(XML) 
library(RCurl) 
library(RSelenium) 
library(png) 
library(imager) 
library(RMySQL) 
library(htmltab) 
library(jsonlite) 
library(rvest) 

# function for waiting instead Sys.sleep() 
waitLoad <- function (xpath_check = "//input[@id = 'ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", 
         iterations = 5){ 
    counter <- 0 
    chk <- FALSE 
    while(!chk & counter <= iterations){ 
    wait <- tryCatch(
     remDr$findElement(using = "xpath", 
         xpath_check)$getElementText(), 
     # remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_Img1']")$clearElement(), 
     error = function(e) print(paste0("Trazi dalje")) 
    ) 
    if(wait == "Trazi dalje"){ 
     Sys.sleep(1L) 
     counter <- sum(counter, 1) 
    }else{ 
     chk <- TRUE 
    } 
    } 
} 

# Start Selenium Server 
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.0 
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome") 
remDr$open() 

# Simulate browser session and fill out form 
remDr$navigate("http://plovila.pomorstvo.hr/") 
remDr$findElement(using = "xpath", "//select[@id = 'ctl00_Content_FormContent_uiTipObjektaDropDown']/option[@value = '1']")$clickElement() 
remDr$screenshot(display = TRUE) 

# Scrap ! 
df <- list() 
Porivni_uredjaji <- list() 
Clanovi_posade <- list() 
Vlasnici <- list() 
Korisnici <- list() 
df_2 <- list() 
Tereti <- list() 
pocetak <- 100000 
kraj <- 100003 
system.time(
for (i in pocetak:kraj){ 
    remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$clearElement() 
    Sys.sleep(1L) 
    remDr$findElement(using = "xpath", 
        "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$sendKeysToElement(list(as.character(i), 
                             key = "enter")) 
    waitLoad() 
    remDr$screenshot(display = TRUE) 
    doc <- htmlParse(remDr$getPageSource()[[1]]) 
    Sys.sleep(1L) 
    Ime <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[1]", fun = xmlValue) 
    Oznaka <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", fun = xmlValue) 
    NIB <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[3]", fun = xmlValue) 
    Vlasnik <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[4]", fun = xmlValue) 
    LK_LI <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[5]", fun = xmlValue) 
    br1 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[6]", fun = xmlValue) 
    br2 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[7]", fun = xmlValue) 
    x <- i-pocetak + 1 
    if (length(NIB)==0){ 
    Pozivni_znak <- NA 
    df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Pozivni_znak) 
    df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE) 
    }else{ 
    remDr$findElement(using = "xpath", "//input[@title = 'Detalji']")$clickElement() 
    waitLoad("//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']", 5) 
    doc <- htmlParse(remDr$getPageSource()[[1]], encoding = "UTF-8") 
    Sys.sleep(1L) 
    list_a <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/fieldset/h3[1]", fun = xmlValue) 
    if (length(list_a) >= 1){ 

     Namjena <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']/@value") 
     json <- paste0("[", '"', Namjena, '"', "]") 
     Namjena <- fromJSON(json) 
     Namjena <- as.data.frame(Namjena, stringsAsFactors = FALSE) 
     colnames(Namjena) <- "Namjena" 
     Vrsta_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVrstaPlovilaText']/@value") 
     json <- paste0("[", '"', Vrsta_plovila, '"', "]") 
     Vrsta_plovila <- fromJSON(json) 
     Vrsta_plovila <- as.data.frame(Vrsta_plovila, stringsAsFactors = FALSE) 
     colnames(Vrsta_plovila) <- "Vrsta_plovila" 
     Model_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiModelPlovilaText']/@value") 
     json <- paste0("[", '"', Model_plovila, '"', "]") 
     Model_plovila <- fromJSON(json) 
     Model_plovila <- as.data.frame(Model_plovila, stringsAsFactors = FALSE) 
     colnames(Model_plovila) <- "Model_plovila" 
     Duljina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiDuljinaTrupaText']/@value") 
     json <- paste0("[", '"', Duljina_trupa, '"', "]") 
     Duljina_trupa <- fromJSON(json) 
     Duljina_trupa <- as.data.frame(Duljina_trupa, stringsAsFactors = FALSE) 
     colnames(Duljina_trupa) <- "Duljina_trupa" 
     Sirina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiSirinaText']/@value") 
     json <- paste0("[", '"', Sirina_trupa, '"', "]") 
     Sirina_trupa <- fromJSON(json) 
     Sirina_trupa <- as.data.frame(Sirina_trupa, stringsAsFactors = FALSE) 
     colnames(Sirina_trupa) <- "Sirina_trupa" 
     Visina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVisinaText']/@value") 
     json <- paste0("[", '"', Visina_trupa, '"', "]") 
     Visina_trupa <- fromJSON(json) 
     Visina_trupa <- as.data.frame(Visina_trupa, stringsAsFactors = FALSE) 
     colnames(Visina_trupa) <- "Visina_trupa" 
     Gaz <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGazText']/@value") 
     json <- paste0("[", '"', Gaz, '"', "]") 
     Gaz <- fromJSON(json) 
     Gaz <- as.data.frame(Gaz, stringsAsFactors = FALSE) 
     colnames(Gaz) <- "Gaz" 
     Nosivost <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNosivostText']/@value") 
     json <- paste0("[", '"', Nosivost, '"', "]") 
     Nosivost <- fromJSON(json) 
     Nosivost <- as.data.frame(Nosivost, stringsAsFactors = FALSE) 
     colnames(Nosivost) <- "Nosivost" 
     GT <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGtText']/@value") 
     json <- paste0("[", '"', GT, '"', "]") 
     GT <- fromJSON(json) 
     GT <- as.data.frame(GT, stringsAsFactors = FALSE) 
     colnames(GT) <- "GT" 
     Snaga_motora <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiUkupnaSnagaText']/@value") 
     json <- paste0("[", '"', Snaga_motora, '"', "]") 
     Snaga_motora <- fromJSON(json) 
     Snaga_motora <- as.data.frame(Snaga_motora, stringsAsFactors = FALSE) 
     colnames(Snaga_motora) <- "Snaga_motora" 
     Brodogradiliste <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiBrodogradilisteText']/@value") 
     Brodogradiliste <- gsub("\"", "'", Brodogradiliste) 
     json <- paste0("[", '"', Brodogradiliste, '"', "]") 
     Brodogradiliste <- fromJSON(json) 
     Brodogradiliste <- as.data.frame(Brodogradiliste, stringsAsFactors = FALSE) 
     Encoding(Brodogradiliste[,c(1)]) <- "UTF-8" 
     colnames(Brodogradiliste) <- "Brodogradiliste" 
     Godina_gradnje <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGodGradnjeText']/@value") 
     json <- paste0("[", '"', Godina_gradnje, '"', "]") 
     Godina_gradnje <- fromJSON(json) 
     Godina_gradnje <- as.data.frame(Godina_gradnje, stringsAsFactors = FALSE) 
     colnames(Godina_gradnje) <- "Godina_gradnje" 
     Materijal <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaterijalGradnjeText']/@value") 
     json <- paste0("[", '"', Materijal, '"', "]") 
     Materijal <- fromJSON(json) 
     Materijal <- as.data.frame(Materijal, stringsAsFactors = FALSE) 
     colnames(Materijal) <- "Materijal" 
     Najveci_broj_osoba <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojOsobaText']/@value") 
     json <- paste0("[", '"', Najveci_broj_osoba, '"', "]") 
     Najveci_broj_osoba <- fromJSON(json) 
     Najveci_broj_osoba <- as.data.frame(Najveci_broj_osoba, stringsAsFactors = FALSE) 
     colnames(Najveci_broj_osoba) <- "Najveci_broj_osoba" 
     Najveci_broj_putnika <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojPutnikaText']/@value") 
     json <- paste0("[", '"', Najveci_broj_putnika, '"', "]") 
     Najveci_broj_putnika <- fromJSON(json) 
     Najveci_broj_putnika <- as.data.frame(Najveci_broj_putnika, stringsAsFactors = FALSE) 
     colnames(Najveci_broj_putnika) <- "Najveci_broj_putnika" 
     Najmanji_broj_posade <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMinBrojPosade']/@value") 
     json <- paste0("[", '"', Najmanji_broj_posade, '"', "]") 
     Najmanji_broj_posade <- fromJSON(json) 
     Najmanji_broj_posade <- as.data.frame(Najmanji_broj_posade, stringsAsFactors = FALSE) 
     colnames(Najmanji_broj_posade) <- "Najmanji_broj_posade" 
     Prethodna_oznaka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaOznakaText']/@value") 
     json <- paste0("[", '"', Prethodna_oznaka, '"', "]") 
     Prethodna_oznaka <- fromJSON(json) 
     Prethodna_oznaka <- as.data.frame(Prethodna_oznaka, stringsAsFactors = FALSE) 
     colnames(Prethodna_oznaka) <- "Prethodna_oznaka" 
     Prethodna_luka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaLukaUpisaText']/@value") 
     Prethodna_luka <- gsub("\"", "'", Prethodna_luka) 
     json <- paste0("[", '"', Prethodna_luka, '"', "]") 
     Prethodna_luka <- fromJSON(json) 
     Prethodna_luka <- as.data.frame(Prethodna_luka, stringsAsFactors = FALSE) 
     colnames(Prethodna_luka) <- "Prethodna_luka" 
     Prethodna_drĹľava <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaDrzavaUpisaText']/@value") 
     json <- paste0("[", '"', Prethodna_drĹľava, '"', "]") 
     Prethodna_drĹľava <- fromJSON(json) 
     Prethodna_drĹľava <- as.data.frame(Prethodna_drĹľava, stringsAsFactors = FALSE) 
     colnames(Prethodna_drĹľava) <- "Prethodna_drĹľava" 

     df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Namjena, Vrsta_plovila, 
         Model_plovila, Duljina_trupa, Sirina_trupa, Visina_trupa, Gaz, Nosivost, GT, 
         Snaga_motora, Brodogradiliste, Godina_gradnje, Materijal, Najveci_broj_osoba, 
         Najveci_broj_putnika, Najmanji_broj_posade, Prethodna_oznaka, 
         Prethodna_luka, Prethodna_drĹľava) 
     df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE) 

     df_2 <- readHTMLTable(doc) 
     Sys.sleep(2L) 

     Porivni_uredjaji[[x]] <- tryCatch(as.data.frame(cbind(df_2[[2]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Clanovi_posade[[x]] <- tryCatch(as.data.frame(cbind(df_2[[3]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Vlasnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[4]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Korisnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[5]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB))) 
     Tereti[[x]] <- cbind(remDr$findElement(using = "xpath", "//*/span[@id='ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiTeretiLabel']")$getElementText(), NIB) 
    }} 
} 
) 

# manipulate data after scraping 
for (i in 1:length(df)){ 
    if (length(df[[i]]) < 13){ 
    df[[i]] <- matrix(data = rep(NA, 26), nrow = 1, ncol = 26) 
    df[[i]] <- as.data.frame(df[[i]]) 
    colnames(df[[i]]) <- c("Ime", "Oznaka", "NIB", "Vlasnik", "LK_LI", "br1", "br2","Namjena", 
          "Vrsta_plovila", "Model_plovila", "Duljina_trupa", "Sirina_trupa", "Visina_trupa", 
          "Gaz", "Nosivost", "GT", "Snaga_motora", "Brodogradiliste", "Godina_gradnje", 
          "Materijal", "Najveci_broj_osoba", "Najveci_broj_putnika", "Najmanji_broj_posade", 
          "Prethodna_oznaka", "Prethodna_luka", "Prethodna_drĹľava") 
    } 
} 

df_final <- do.call(rbind, df) 
df_final_1 <- df_final[!is.na(df_final$NIB), ] 

EDIT 2:私は、コード追加してい を、私はあなたが投稿上記のコードに問題があります。私が実行した場合:3.5.0ドッキングウィンドウの画像:

(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel 
# open a remoteDriver for each node on the cluster 
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3 
clusterEvalQ(cl, { 
    library(RSelenium) 
    remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome") 
    remDr$open() 
}) 
myTitles <- c() 
ws <- foreach(x = 1:length(urls), 
       .packages = c("rvest", "magrittr", "RSelenium", "jsonlite", "htmltab", "XML", "RCurl")) %dopar% { 
    remDr$navigate(urls[x]) 
    Sys.sleep(3L) 
    remDr$getTitle()[[1]] 
       } 

それはクロムとたぶん問題エラー

Error in { : task 1 failed - " Summary: UnknownError 
    Detail: An unknown server-side error occurred while processing the command. 
    Further Details: run errorDetails method" 
+0

最も簡単な解決策は、入力をバッチに分割し、別の端末で8つの並列スクリプトを実行することです。これはコードの変更が最小限であることを意味します。これらはUIベースのブラウザなので、特定の数字を超えるとシステムパフォーマンスが低下します。 –

+0

@tarun Lalwani Bachesに入力するとどういう意味ですか?私はループを計算するためにoparallelとforeachパッケージを使用しています。しかし、私はどのようにRseleniumでそれを行うか分からない。それとも可能なのか? – Mislav

+0

あなたのコードを投稿してください、このようにコメントすることはできません –

答えて

0

を返します。

library(RSelenium) 
library(rvest) 
library(magrittr) 
library(foreach) 
library(doParallel) 

# using docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3 
# in windows 
URLsPar <- c("https://stackoverflow.com/", "https://github.com/", 
      "http://www.bbc.com/", "http://www.google.com", 
      "https://www.r-project.org/", "https://cran.r-project.org", 
      "https://twitter.com/", "https://www.facebook.com/") 

appHTML <- c() 

(cl <- (detectCores() - 1) %>% makeCluster) %>% registerDoParallel 
# open a remoteDriver for each node on the cluster 
clusterEvalQ(cl, { 
    library(RSelenium) 
    remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, 
         browserName = "chrome") 
    remDr$open() 
}) 
ws <- foreach(x = 1:length(URLsPar), 
       .packages = c("rvest", "magrittr", "RSelenium")) %dopar% { 
       print(URLsPar[x]) 
       remDr$navigate(URLsPar[x]) 
       remDr$getTitle()[[1]] 
       } 
> ws 
[[1]] 
[1] "Stack Overflow - Where Developers Learn, Share, & Build Careers" 

[[2]] 
[1] "The world's leading software development platform · GitHub" 

[[3]] 
[1] "BBC - Homepage" 

[[4]] 
[1] "Google" 

[[5]] 
[1] "R: The R Project for Statistical Computing" 

[[6]] 
[1] "The Comprehensive R Archive Network" 

[[7]] 
[1] "Twitter. It's what's happening." 

[[8]] 
[1] "Facebook - Log In or Sign Up"  


# close browser on each node 
clusterEvalQ(cl, { 
    remDr$close() 
}) 

stopImplicitCluster() 
+0

7 'URLsPar'ベクター内の同じURLです。多分それが問題です。また、私は3.5.0ではなく3.5.3ドライバを持っています。私が通常の方法でコードを実行すると、完全に(パラレルのないremoteDriver) – Mislav

+0

変数名の 'URLPar'という名前の 'urls' instadが最初に変更されました。今は 'URLsPar'に変更して出力します。変数名はなぜ重要なのですか? [1] "スタックオーバフロー - デベロッパーが習得し、共有し、キャリアを築く場所"と[2]と[3] '"到達不可能な\ n(セッション情報:chrome = 61.0.3163.91)\ nドライバの情報:chromedriver = 2.32.498513(2c63aa53b2c658de596ed550eb5267ec5967b351)、platform = Linux 4.4.86-boot2docker x86_64) "'なぜ今クロムに到達できないのか? – Mislav

+1

例を8つのURLに変更しました。私はあなたが持っている問題は表示されません。 https://github.com/SeleniumHQ/docker-selenium/issues/547およびhttps:// githubに関連している可能性があります。com/SeleniumHQ/docker-selenium/issues/554 – jdharrison

関連する問題