
bs4 parser keeps an incomplete list

I am writing some scraping code. The first part collects everything it should (90 values), but the second part only ever ends up with an incomplete list of prices (30 values). The loop does not seem to be working for that segment.
How do I need to change this code so that it keeps the full list?
Thank you!

import re
import requests
from bs4 import BeautifulSoup

url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"

DATA_CONTAINER = list()
DATA = list()

def collectData():
    global DATA_CONTAINER
    global DATA

    for i in range(1, 5):
        newUrl = url + "&sort=20a&page=" + str(i)
        r = requests.get(newUrl)
        soup = BeautifulSoup(r.content, "lxml")

        # collect the product names
        g_data_odd = soup.find_all("td", {"class": "productListing-data"})
        for item in g_data_odd:
            t = item.find_all("div", {"class": "product_name"})
            for name in t:
                piece = name.find('a').text
                DATA_CONTAINER.append(piece)

        # collect the prices from the element texts using a regex
        spans = soup.find_all('span', {"class": "productSalePrice"})
        lines = [span.get_text() for span in spans]
        found_prices = []
        for line in lines:
            m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
            if m:
                found_prices.append(m.group(0))

        DATA = list(zip(DATA_CONTAINER, found_prices))
        print(DATA)

def serializeToCSV(fileName):
    with open(fileName, "w", encoding="utf-8") as fd:
        for item in DATA:
            fd.write(' '.join(item) + "\n")

collectData()
print(len(DATA))
serializeToCSV('csv.csv')

1 Answer

Try this code, with two changes to the original (explained below):

  • call the serializeToCSV method inside the loop (line 17)
  • use the "a" (append) option when opening the file at line 53
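
The underlying problem is that zip stops at the shorter of its two inputs: DATA_CONTAINER keeps growing across all pages, while found_prices only ever holds the current page's prices, so most of the names are silently dropped. A minimal illustration of the truncation (the counts 90 and 30 are simply the ones from the question):

    names = ["name"] * 90    # names accumulated across all pages
    prices = ["price"] * 30  # prices from the current page only
    print(len(list(zip(names, prices))))  # 30 -- zip truncates to the shorter input

Pairing each page's names with that same page's prices, and appending the result to the file as each page is scraped, keeps everything aligned. With that in mind, the adjusted code: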

    import re
    import requests
    from bs4 import BeautifulSoup

    url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"

    DATA_CONTAINER = list()
    DATA = list()

    def serializeToCSV(fileName):
        # "a" appends, so the rows written for earlier pages are kept
        with open(fileName, "a", encoding="utf-8") as fd:
            for item in DATA:
                fd.write(' '.join(item) + "\n")

    def collectData():
        global DATA_CONTAINER
        global DATA

        for i in range(1, 5):
            newUrl = url + "&sort=20a&page=" + str(i)
            r = requests.get(newUrl)
            soup = BeautifulSoup(r.content, "lxml")

            # names from this page only, so they line up with this page's prices
            page_names = []
            g_data_odd = soup.find_all("td", {"class": "productListing-data"})
            for item in g_data_odd:
                t = item.find_all("div", {"class": "product_name"})
                for name in t:
                    piece = name.find('a').text
                    page_names.append(piece)
                    DATA_CONTAINER.append(piece)

            # collect the prices; 'USD' is matched literally and the dot is escaped
            spans = soup.find_all('span', {"class": "productSalePrice"})
            lines = [span.get_text() for span in spans]
            found_prices = []
            for line in lines:
                m = re.search(r'USD\d{2,3}\.\d{2}', line)
                if m:
                    found_prices.append(m.group(0))

            # pair this page's names with this page's prices, then append them to the file
            DATA = list(zip(page_names, found_prices))
            print(DATA)
            print(len(DATA))
            serializeToCSV('csv.csv')

    collectData()
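
As a side note, joining fields with spaces becomes ambiguous once a product name itself contains spaces; the csv module in the standard library quotes fields properly. A small sketch of an alternative writer, assuming the rows are (name, price) tuples as above:

    import csv

    def serializeToCSV(fileName, rows):
        # "a" keeps rows from earlier pages; newline="" avoids blank lines on Windows
        with open(fileName, "a", newline="", encoding="utf-8") as fd:
            csv.writer(fd).writerows(rows)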
    