2016-11-23 5 views
0

次の2つの方法を試しましたが、どちらも動作しません。次のウェブページの「TEAM STATS」内にある統計情報をスクレイピングしようとしています:http://www.cbssports.com/nfl/gametracker/boxscore/[email protected]/ 。特定の統計カテゴリ(例:「NET YARDS RUSHING」)の後に続く数値が必要です。以下は、私が試してうまくいかなかったコードです。(タイトル:Python 2.7を使用してテーブルからデータをスクレイピングする)

最初の方法:

import pickle 
import math 
import os 
import urllib2 
from lxml import etree 
from bs4 import BeautifulSoup 
from urllib import urlopen 
from openpyxl import load_workbook 
from openpyxl import Workbook 
from openpyxl.styles import Color, PatternFill, Font, Border 
from openpyxl.styles import colors 
from openpyxl.cell import Cell 

#Last Two Game info Home [H] or Away [A] 
# First attempt (Python 2): download the box-score page, parse it with lxml,
# and read a "Net Yards Rushing" value with an XPath query.
# favLastGM selects which branch below runs; favLastGM2 is not read anywhere
# in this snippet.
favLastGM = 'H' #Higher week number 2   
favLastGM2 = 'A' #Lower week number 1 

#Game Info (Favorite) Last Game Played - CBS Sports (Change Every Week) 
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/[email protected]/'  

# Fetch the page and parse the response with lxml's HTML parser.
response8 = urllib2.urlopen(favPrevGMInfoUrl) 
htmlparser8 = etree.HTMLParser() 
tree8 = etree.parse(response8,htmlparser8) 

#FAVORITE 
if favLastGM == 'A': #This Gives Opposite of Away Team Net Rushing Yards - SO HOME Net Rushing Yards 

    # NOTE(review): "parent::td" keeps the parent only if it is itself a <td>.
    # The parent of a table cell is normally a <tr>, so this path presumably
    # selects nothing and `text` comes back empty -- confirm against the page.
    # NOTE(review): contains() is case-sensitive; verify the label's casing in
    # the raw HTML.
    text = tree8.xpath('//td[contains(text(),"Net Yards Rushing")]/parent::td/following-sibling::td[1]/text()') 
    if text: 
     # Convert the first matched text node to an integer.
     favDef_rushYards_L2_1 = int(text[0].strip()) 
     print("test"), 
     print favDef_rushYards_L2_1 

    # NOTE(review): favDef_rushYards_L2_1 is assigned only inside "if text:"
    # above, so the second print raises NameError when the query matched
    # nothing.
    print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "), 
    print favDef_rushYards_L2_1 
elif favLastGM == 'H': #This Gives Opposite of Home Team Net Rushing Yards - SO AWAY Net Rushing Yards 

    # NOTE(review): XPath positions start at 1, so the predicate [0] is never
    # true; this query returns an empty list regardless of the page content.
    # The "parent::td" concern from the branch above applies here as well.
    text = tree8.xpath('//td[contains(text(),"Net Yards Rushing")]/parent::td/following-sibling::td[0]/text()') 
    if text: 
     # Convert the first matched text node to an integer.
     favDef_rushYards_L2_1 = int(text[0].strip()) 
     print("test"), 
     print favDef_rushYards_L2_1 


    # NOTE(review): same unbound-name risk as in the 'A' branch when `text`
    # is empty.
    print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "), 
    print favDef_rushYards_L2_1 
else: 
    # favLastGM was neither 'A' nor 'H'.
    print("***************************************************") 
    print("NOT A VALID ENTRY - favLastGM !") 
    print("***************************************************") 

第二の方法:

import pickle 
import math 
import os 
import urllib2 
from lxml import etree 
from bs4 import BeautifulSoup 
from urllib import urlopen 
from openpyxl import load_workbook 
from openpyxl import Workbook 
from openpyxl.styles import Color, PatternFill, Font, Border 
from openpyxl.styles import colors 
from openpyxl.cell import Cell 

# Second attempt (Python 2): download the box-score page, parse it with
# BeautifulSoup, and walk the rows of the "team-stats" table looking for the
# "Net Yards Rushing" row.
#Last Two Game info Home [H] or Away [A] 
# favLastGM selects which branch below runs; favLastGM2 is not read anywhere
# in this snippet.
favLastGM = 'H' #Higher week number 2   
favLastGM2 = 'A' #Lower week number 1   

#Game Info (Favorite) Last Game Played - CBS Sports (Change Every Week) 
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/[email protected]/' 

# Read the raw HTML and build the soup; no parser name is passed to
# BeautifulSoup here.
favPrevGMhtml2 = urlopen(favPrevGMInfoUrl).read() 
favPrevGMsoup2 = BeautifulSoup(favPrevGMhtml2) 
# First <table> whose class is "team-stats".
# NOTE(review): find() presumably returns None when no such table exists,
# which would make the findAll() calls below fail -- confirm.
favPrevGM2Reg = favPrevGMsoup2.find("table", { "class" : "team-stats" }) 
# Empty list that is never used below.
# NOTE(review): presumably this was meant to collect the cell texts.
favPrevGM2Reg2 = [] 

if favLastGM == 'A': #This Gives Opposite of Away Team Net Rushing Yards - SO HOME Net Rushing Yards 

    rush = 'Net Yards Rushing' 
    for row in favPrevGM2Reg.findAll("tr"): 
     # NOTE(review): `in` is applied to the Tag returned by findNext('td'),
     # not to its text, so this is presumably a membership test on the cell's
     # children rather than a substring match -- confirm it behaves as
     # intended.
     if rush in row.findNext('td'): #Change Year for every new season 
      for item in row.findAll("td"): 
       # NOTE(review): this appends to favPrevGM2Reg (the table Tag found
       # above), not to the list favPrevGM2Reg2.
       favPrevGM2Reg.append(item.text) 
    # NOTE(review): favPrevGM2Reg is still the table Tag here, so the integer
    # index presumably does not return a collected cell value -- verify.
    favDef_rushYards_L2_1 = float(favPrevGM2Reg[1]) 

    print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "), 
    print favDef_rushYards_L2_1 

elif favLastGM == 'H': #This Gives Opposite of Home Team Net Rushing Yards - SO AWAY Net Rushing Yards 

    rush = 'Net Yards Rushing' 
    for row in favPrevGM2Reg.findAll("tr"): 
     # NOTE(review): same Tag-membership concern as in the 'A' branch.
     if rush in row.findNext('td'): #Change Year for every new season 
      for item in row.findAll("td"): 
       # NOTE(review): appends to the table Tag, not to favPrevGM2Reg2.
       favPrevGM2Reg.append(item.text) 
    # NOTE(review): integer index applied to the table Tag -- see the 'A'
    # branch.
    favDef_rushYards_L2_1 = float(favPrevGM2Reg[0]) 

    print ("Enter: Total Rushing Yards Allowed from Favored Team Defense for last game played: "), 
    print favDef_rushYards_L2_1 
else: 
    # favLastGM was neither 'A' nor 'H'.
    print("***************************************************") 
    print("NOT A VALID ENTRY - favLastGM !") 
    print("***************************************************") 

答えて

1

探しているXPathは次のとおりです:

//td[contains(text(),"Net Yards Rushing")]/following-sibling::td 

探している文字列を含むtdを選択するところまでは、あなたのXPathで合っています。ただし、必要なのは親の兄弟ではなく、そのtd自身の兄弟です。そのため、tdの直後に following-sibling::td を付ける必要があります。これにより、テーブルでの出現順に2つの結果が得られます。

1

コード全体は書いていませんが、以下のコードの最後の2行で、ホームチームとアウェイチームのラッシングヤードを取得できます。

import urllib2 
from lxml import etree 

# Box-score page to read the rushing yards from.
favPrevGMInfoUrl = 'http://www.cbssports.com/nfl/gametracker/boxscore/[email protected]/'

# Build the HTML parser, open the URL, and parse the response into a tree.
htmlparser8 = etree.HTMLParser()
response8 = urllib2.urlopen(favPrevGMInfoUrl)
tree8 = etree.parse(response8, htmlparser8)

# Each query returns the text nodes of the <td> cells with the given class
# that sit under a <tr> whose data-category attribute is "rushing_yards".
away = tree8.xpath(
    '//tr[@data-category="rushing_yards"]'
    '//td[@class="stat-value away"]/text()'
)
home = tree8.xpath(
    '//tr[@data-category="rushing_yards"]'
    '//td[@class="stat-value home"]/text()'
)
関連する問題