移動中のオブジェクト

-1

私はウェブスパイダーを書くことによって行くことを学んでいます。私はすべてのビジネスカテゴリのリストをallpages.comから取得しようとしています。移動中のオブジェクト

以下は私のプログラム全体です。残念ながら私は問題を切り分けることができないので、すべて貼り付けました。

このプログラムを実行すると、まず最初のページが正しくダウンロードされ、抽出されたすべてのカテゴリがカテゴリのリストに追加されます。

しかし、その後、後続のページをダウンロードすると、親カテゴリへの参照が混乱するようです。例えば。実際にはpolitical-ideological-organizations/がtravel-tourism/のサブカテゴリではない場合、URL http://www.allpages.com/travel-tourism/political-ideological-organizations/が誤って計算されます。ログを調べると、parentオブジェクトのデータが上書きされているようです。このエラーは、より多くの労働者がいるほど顕著である。

これは、私がゴルーチンへの参照によってデータを渡す前に少し上手く機能していましたが、私は本質的に同じ問題を抱えていました。

私が持っているいくつかの質問：

がどのようにログ線を介して、ピッキングに頼ることなく、これをデバッグすることができますか？

何が間違っているのですか/なぜ動作しないのですか？どのように修正できますか？

package main 

import (
     "fmt" 
     "github.com/PuerkitoBio/goquery" 
     "log" 
     "strconv" 
     "strings" 
     "regexp" 
) 

const domain = "http://www.allpages.com/" 
const categoryPage = "category.html" 

type Category struct { 
     url string 
     level uint 
     name string 
     entries int 
     parent *Category 
} 

type DownloadResult struct { 
     doc *goquery.Document 
     category *Category 
} 

const WORKERS = 2 
const SEPARATOR = "§§§" 

func main() { 

     allCategories := make([]Category, 0) 

     downloadChannel := make(chan *Category) 
     resultsChannel := make(chan *DownloadResult, 100) 

     for w := 1; w <= WORKERS; w++ { 
       go worker(downloadChannel, resultsChannel) 
     } 

     numRequests := 1 
     downloadChannel <- &Category{ domain + categoryPage, 0, "root", 0, nil } 

     for result := range resultsChannel { 
       var extractor func(doc *goquery.Document) []string 

       if result.category.level == 0 { 
         extractor = topLevelExtractor 
       } else if result.category.level == 1 { 
         extractor = secondLevelExtractor 
       } else { 
         extractor = thirdLevelExtractor 
       } 

       categories := extractCategories(result.doc, result.category, extractor) 
       allCategories = append(allCategories, *categories...) 

       //fmt.Printf("Appending categories: %v", *categories) 

       fmt.Printf("total categories = %d, total requests = %d\n", len(allCategories), numRequests) 

       for _, category := range *categories { 
         numRequests += 1 
         downloadChannel <- &category 
       } 

       // close the channels when there are no more jobs 
       if len(allCategories) > numRequests { 
         close(downloadChannel) 
         close(resultsChannel) 
       } 
     } 

     fmt.Println("Done") 
} 

func worker(downloadChannel <-chan *Category, results chan<- *DownloadResult) { 
     for target := range downloadChannel { 
       fmt.Printf("Downloading %v (addr %p) ...", target, &target) 

       doc, err := goquery.NewDocument(target.url) 
       if err != nil { 
         log.Fatal(err) 
         panic(err) 
       } 

       fmt.Print("done \n") 

       results <- &DownloadResult{doc, target} 
     } 
} 

func extractCategories(doc *goquery.Document, parent *Category, extractor func(doc *goquery.Document) []string) *[]Category { 

     numberRegex, _ := regexp.Compile("[0-9,]+") 

     log.Printf("Extracting subcategories for page %s\n", parent) 

     subCategories := extractor(doc) 

     categories := make([]Category, 0) 

     for _, subCategory := range subCategories { 
       log.Printf("Got subcategory=%s from parent=%s", subCategory, parent) 
       extracted := strings.Split(subCategory, SEPARATOR) 

       numberWithComma := numberRegex.FindString(extracted[2]) 
       number := strings.Replace(numberWithComma, ",", "", -1) 

       numRecords, err := strconv.Atoi(number) 
       if err != nil { 
         log.Fatal(err) 
         panic(err) 
       } 

       var category Category 

       level := parent.level + 1 

       if parent.level == 0 { 
         category = Category{ domain + extracted[1], level, extracted[0], numRecords, parent } 
       } else { 
         log.Printf("category URL=%s, parent=%s, parent=%v", extracted[1], parent.url, parent) 
         category = Category{ parent.url + extracted[1], level, extracted[0], numRecords, parent } 
       } 

       log.Printf("Appending category=%v (pointer=%p)", category, &category) 

       categories = append(categories, category) 
     } 

     return &categories 
} 

func topLevelExtractor(doc *goquery.Document) []string { 
     return doc.Find(".cat-listings-td .c-1s-2m-1-td1").Map(func(i int, s *goquery.Selection) string { 
       title := s.Find("a").Text() 
       url := s.Find("a").Map(func(x int, a *goquery.Selection) string { 
         v, _ := a.Attr("href") 
         return v 
       }) 
       records := s.Clone().Children().Remove().End().Text() 

       //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url) 

       res := []string{title, url[0], records} 
       return strings.Join(res, SEPARATOR) 
     }) 
} 

func secondLevelExtractor(doc *goquery.Document) []string { 
     return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string { 
       title := s.Find("a").Text() 
       url := s.Find("a").Map(func(x int, a *goquery.Selection) string { 
         v, _ := a.Attr("href") 
         return v 
       }) 
       records := s.Clone().Children().Remove().End().Text() 

       //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url) 

       res := []string{title, url[0], records} 
       return strings.Join(res, SEPARATOR) 
     }) 
} 

func thirdLevelExtractor(doc *goquery.Document) []string { 
     return doc.Find(".c-2m-3c-1-table .c-2m-3c-1-td1").Map(func(i int, s *goquery.Selection) string { 
       title := s.Find("a").Text() 
       url := s.Find("a").Map(func(x int, a *goquery.Selection) string { 
         v, _ := a.Attr("href") 
         return v 
       }) 
       records := s.Clone().Children().Remove().End().Text() 

       //log.Printf("Item %d: %s, %s - %s\n", i, title, records, url) 

       res := []string{title, url[0], records} 
       return strings.Join(res, SEPARATOR) 
     }) 
}

更新固定 - 以下のコメントを参照。

出典

2017-02-08 jbrown

あなたが問題を示してずっと* *小さく、スタンドアロンスニペットを提供してもらえますか？ほとんどの場合、問題は多くのノイズに隠されています。 – Volker

@Volker残念ながら私が説明したように、何が起こっているのかわからないので、私はそれを凝縮することはできません。 – jbrown

あなたはこれが良い質問だと思いますか？答えは他の人にも役立つでしょうか？ – Volker

をループ：

  for _, category := range *categories { 
        numRequests += 1 
        downloadChannel <- &category 
      }

Iはなく、その値の実際のメモリアドレス、チャネルに一時変数categoryへの参照を送信した意味します。

私は別のループを使用してこれを修正しました：

for i := 0; i < len(*categories); i++ { 
     fmt.Printf("Queuing category: %v (%p)", categoriesValues[i], categoriesValues[i]) 

     downloadChannel <- &categoriesValues[i] 
    }

出典

2017-02-08 12:49:03 jbrown

答えて

関連する問題