2017-10-30 19 views
0

デシジョンツリー分類器の精度について:データセットの最後の列の値(「made」または「missed」)を予測するプログラムを数回実行しましたが、精度は常に100%になります。95〜100%程度になると期待していたのですが、何が原因でしょうか?以下はデータセットのスニペットです(元のデータは74,000行以上あります):

A,L,-5,8,3,475,11.8,1,1.6,6.1,2,2.7,made 
A,L,-39,10,2,30,18.5,6,5.4,24.3,3,3.1,missed 
A,L,-20,8,3,327,6.2,0,1.8,2.3,2,0,missed 
A,W,16,5,1,504,11.7,0,1,18,2,7.3,missed 
A,L,-5,3,2,547,19.9,0,1.2,23.9,3,7.5,made 
H,W,14,4,2,600,17.6,0,0.5,5.5,2,3.8,made 
H,L,-8,6,3,692,23,1,1.9,4.4,2,4.1,made 
H,L,-10,11,3,171,14.4,0,0.9,25.2,3,5.8,missed 

分類器のコードは次のとおりです。

from math import log
import operator

import pandas as pd

def load_csv(filename):
    """Read the basketball shot CSV into a list of 13-element rows.

    Args:
        filename: path to a header-less CSV whose 13 columns match
            the `headers` list below ("?" is treated as missing).

    Returns:
        (new_list, labels): new_list is a list of rows, each
        [location, w, ..., close_def_dist, target]; labels is the list
        of the 12 feature column names (the target column excluded).
    """
    headers = ["location", "w", "final_margin", "shot_number", "period",
               "game_clock", "shot_clock", "dribbles", "touch_time",
               "shot_dist", "pts_type", "close_def_dist", "target"]
    df = pd.read_csv(filename, header=None, names=headers, na_values="?")

    # One Python list per sample.  This replaces the original
    # flatten-then-rechunk loop, whose condition `while i < len(dataset)`
    # referenced an undefined name and raised a NameError.
    new_list = df.values.tolist()

    # The feature labels are simply the headers minus the target column.
    labels = headers[:-1]
    return new_list, labels

def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels.

    The class label is assumed to be the last element of each row.
    """
    total = len(dataSet)
    # Tally occurrences of each distinct class label.
    counts = {}
    for row in dataSet:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # H = -sum(p * log2(p)) over the label distribution.
    return -sum((c / float(total)) * log(c / float(total), 2)
                for c in counts.values())


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that
    column removed (so the split feature is not reused downstream)."""
    return [row[:axis] + row[axis + 1:]
            for row in dataSet
            if row[axis] == value]


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split maximizes information
    gain, or -1 if no split improves on the base entropy."""
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for feat in range(numFeatures):
        # Distinct values this feature takes across the data set.
        values = {example[feat] for example in dataSet}
        # Weighted entropy of the partitions induced by this feature.
        newEntropy = 0.0
        for value in values:
            subset = splitDataSet(dataSet, feat, value)
            newEntropy += (len(subset) / total) * calcShannonEnt(subset)
        # Information gain = reduction in entropy after splitting.
        gain = baseEntropy - newEntropy
        if gain > bestInfoGain:
            bestInfoGain = gain
            bestFeature = feat
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Ties are broken by the stable sort: the first-counted label among
    the tied counts wins.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # dict.iteritems() is Python 2 only and raises AttributeError on
    # Python 3 (which the rest of this script targets); .items() works
    # on both.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree.

    Args:
        dataSet: list of rows; the last element of each row is the class
            label.
        labels: feature names positionally aligned with the row columns.
            NOTE: this list is mutated (the chosen feature is deleted
            below) — callers must pass a copy if they need it afterwards.

    Returns:
        A nested dict {feature_label: {feature_value: subtree_or_label}},
        or a bare class label for a leaf node.
    """
    # extracting data
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
     return classList[0] # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1: # stop splitting when there are no more features in dataSet
     return majorityCnt(classList)
    # use Information Gain
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]

    #build a tree recursively
    myTree = {bestFeatLabel: {}}
    #print("myTree : "+labels[bestFeat])
    # Remove the chosen feature so recursive calls index into the reduced
    # rows produced by splitDataSet; this mutates the caller's list.
    del (labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    #print("featValues: "+str(featValues))
    uniqueVals = set(featValues)
    #print("uniqueVals: " + str(uniqueVals))
    for value in uniqueVals:
     subLabels = labels[:] # copy all of labels, so trees don't mess up existing labels
     #print("subLabels"+str(subLabels))
     myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
     #print("myTree : " + str(myTree))
    return myTree


def classify(inputTree, featLabels, testVec):
    """Walk the decision tree and return the predicted class label.

    Args:
        inputTree: nested dict {feature_label: {feature_value: subtree_or_label}}.
        featLabels: feature names, positionally aligned with testVec.
        testVec: one sample's feature values.

    Raises:
        KeyError: if testVec has a feature value never seen at the
            corresponding node during training.
    """
    rootFeature = list(inputTree)[0]
    branches = inputTree[rootFeature]
    sampleValue = testVec[featLabels.index(rootFeature)]
    subtree = branches[sampleValue]
    # Internal nodes are dicts; anything else is a leaf class label.
    if isinstance(subtree, dict):
        return classify(subtree, featLabels, testVec)
    return subtree


def storeTree(inputTree, filename):
    """Pickle the decision tree to `filename`.

    The file must be opened in binary mode: pickle writes bytes, and a
    text-mode ('w') handle raises TypeError on Python 3.  The with-block
    also guarantees the handle is closed even if dump() fails.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Unpickle and return a decision tree previously saved by storeTree.

    Binary mode is required (pickle data is bytes; text-mode reads fail
    on Python 3), and the with-block closes the handle the original
    version leaked.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

def accuracy_metric(actual, predicted):
    """Return the percentage (0.0-100.0) of positions where predicted
    agrees with actual.  Indexing is driven by `actual`, so a shorter
    `predicted` raises IndexError, as before."""
    hits = sum(1 for i, label in enumerate(actual) if predicted[i] == label)
    return hits / float(len(actual)) * 100.0

# collect data
# NOTE(review): the very same rows used to build the tree are re-classified
# below, so every sample can be answered from memorized training data — this
# is why the reported accuracy is always 100%.  Hold out a test split instead.
myDat, labels = load_csv('data/basketball.train.csv')
#print(myDat)
#build a tree
# createTree mutates `labels` (deletes chosen features), which is why a
# fresh, fully ordered feature-name list is passed to classify() below.
mytree = createTree(myDat, labels)
#print(mytree)

#run test

predictions=[]
for row in myDat:
    # row[0..11] are the 12 feature values; row[-1] is the target class.
    prediction = classify(mytree, ["location","w","final_margin","shot_number","period","game_clock","shot_clock","dribbles","touch_time",
      "shot_dist","pts_type","close_def_dist"], [row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8],
                 row[9],row[10],row[11]])
    #print('Expected=%s, Got=%s' % (row[-1], prediction))
    predictions.append(prediction)
actual = [row[-1] for row in myDat]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)
+2

あなたの投稿を破壊(vandalize)しないでください。Stack Exchangeネットワークに投稿することで、あなたはSEにそのコンテンツを配布する権利(CC BY-SA 3.0ライセンス:https://creativecommons.org/licenses/by-sa/3.0/)を付与しています。SEのポリシーにより、いかなる破壊行為も元に戻されます。この投稿とアカウントとの関連付けを解除したい場合は、[解約リクエストの正しいルートは何ですか?](https://meta.stackoverflow.com/q/323395)を参照してください。 – adiga

答えて

2

データセットを分割していないようです。訓練用データセットとテスト用データセットに分ける必要があります。現状では分類器が訓練データにオーバーフィットしており、データセット外のサンプルに対してはうまく機能しない可能性があります。

トレーニング用にデータの75%をランダムに選択し、残りの25%で精度をテストしてみてください。たとえば、コードの最後の部分を次のように置き換えます:

import random

# Hold out 25% of the rows so accuracy is measured on unseen samples.
dataset, labels = load_csv('data/basketball.train.csv')
random.shuffle(dataset)
split_index = int(len(dataset) * 0.75)

train_dataset = dataset[:split_index]
test_dataset = dataset[split_index:]

mytree = createTree(train_dataset, labels)

# createTree consumed entries of `labels`, so classification uses a fresh,
# fully ordered feature-name list.
feature_names = ["location", "w", "final_margin", "shot_number", "period",
                 "game_clock", "shot_clock", "dribbles", "touch_time",
                 "shot_dist", "pts_type", "close_def_dist"]

predictions = []
for row in test_dataset:
    # row[:12] holds the 12 feature values; row[12] is the target class.
    predictions.append(classify(mytree, feature_names, row[:12]))
actual = [row[-1] for row in test_dataset]
accuracy = accuracy_metric(actual, predictions)
print(accuracy)

(注:上記のコードは未テストです)