2016-03-29 16 views
-1

以下のコードでは、resultsBlockFooterの "going Allowance"とは別に、すべてのデータをスクラップから取得できます。ソース内のほとんどのデータはList(li)行く余裕はスパンで囲まれています。私はさまざまなバリエーションを試してみましたが、それを抽出するためにちょうどカンをセックスしてください。BSはスパンタグでデータを取得できません

 import csv 
from bs4 import BeautifulSoup 
import requests 



html = requests.get("http://www.sportinglife.com=156432).text 

soup = BeautifulSoup(html,'lxml') 

rows = [] 
for header in soup.find_all("div", class_="resultsBlockHeader"): 
    track = header.find("div",  class_="track").get_text(strip=True).encode('ascii', 'ignore').strip("|") 
    date = header.find("div", class_="date").get_text(strip=True).encode('ascii', 'ignore').strip("|") 
    datetime = header.find("div", class_="datetime").get_text(strip=True).encode('ascii', 'ignore').strip("|") 
    grade = header.find("div", class_="grade").get_text(strip=True).encode('ascii', 'ignore').strip("|") 
    distance = header.find("div", class_="distance").get_text(strip=True).encode('ascii', 'ignore').strip("|") 
prizes = header.find("div", class_="prizes").get_text(strip=True).encode('ascii', 'ignore').strip("|") 

results = header.find_next_sibling("div", class_="resultsBlock").find_all("ul", class_="line1") 
details = [] 
for result in results: 
    fin = result.find("li", class_="fin").get_text(strip=True) 
    greyhound = result.find("li", class_="greyhound").get_text(strip=True) 
    trap = result.find("li", class_="trap").get_text(strip=True) 
    sp = result.find("li", class_="sp").get_text(strip=True) 
    timeSec = result.find("li", class_="timeSec").get_text(strip=True) 
    timeDistance = result.find("li", class_="timeDistance").get_text(strip=True) 

    details.append({"greyhound": greyhound, "sp": sp, "fin": fin, "timeSec": timeSec, "timeDistance": timeDistance, "trap": trap }) 


results = header.find_next_sibling("div", class_="resultsBlock").find_all("ul", class_="line2") 
for index, result in enumerate(results): 
    trainer = result.find("li", class_="trainer").get_text(strip=True) 
    details[index]["trainer"] = trainer 

results = header.find_next_sibling("div", class_="resultsBlock").find_all("ul", class_="line3") 
for index, result in enumerate(results): 
    comment = result.find("li", class_="comment").get_text(strip=True) 
    details[index]["comment"] = comment 

results = header.find_next_sibling("div", class_="resultsBlock").find_all("ul", class_="line2") 
for index, result in enumerate(results): 
    firstessential = result.find("li", class_="first essential").get_text(strip=True) 
    details[index]["first essential"] = firstessential 

results = header.find_next_sibling("div", class_="resultsBlockFooter").find_all("ul", class_="line3") 
for index, result in enumerate(results): 
    goingAllowance = result.find("div", class_="Going Allowance").get_text(strip=True) 
    details[index]["Going Allowance"] = goingAllowance 

for detail in details: 
    detail.update({"track": track, "date": date, "datetime": datetime, "grade": grade, "prizes": prizes}) 
    rows.append(detail) 
with open("abc.csv","a") as f: 
    writer = csv.DictWriter(f,   [track","date","trap","fin","greyhound","datetime","sp","grade","distance"," prizes","timeSec","timeDistance","trainer","comment","first essential","going Allowance"]) 

    for row in rows: 
     writer.writerow(row) 

答えて

0

今後すべてのコードを転記するのではなく、関連する部分を含めてください。また、キャプチャに問題があるウェブサイトのHTMLまたはセクションも含めてください。私はウェブサイトを見て、私はあなたが意味すると思いますか?

test = soup.find("div", {"class":"resultsBlockFooter"}) 
'<div class="resultsBlockFooter"> 
<div><span>Going Allowance:</span> -10</div> 
<div><span>Forecast:</span> (3-4) £20.36 | <span>Tricast:</span> (3-4-2) £61.61</div> 
</div>' 

あなたは<div><span>Going Allowance:</span> -10</div>が欲しいですか?

allowance = test.content[1].text #.content can be a helpful list of the tags 
"Going Allowance: -10" 
forecast, tricast = test.content[3].text.split("|") #the rest of useful text 
関連する問題