私はこのデータセット(Wealth Based on Age)を使用しており、その説明には精度が約84%になると書かれています。
タイトル:scikit-learnの分類器(Naive Bayes、DecisionTreeClassifier)で精度が非常に低い
1. Loaded the .txt data file and converted it to a .csv
2. Removed data with missing values
3. Extracted the class values (<=50K and >50K) and converted them to 0 and 1 respectively
4. For each attribute and for each string value of that attribute I
mapped it to an integer value. Example att1{'cs':0, 'cs2':1},
att2{'usa':0, 'greece':1} ... and so on
5. Called naive bayes on the new integer data set
Pythonコード:
import load_csv as load #my functions to do [1..5] of the list
import numpy as np
# NOTE(review): `dt` (the structured dtype passed to genfromtxt) is not defined
# in the visible part of this script -- confirm it is set up before this point.

# Training split: read the CSV, drop incomplete rows, then derive the labels
# and the integer-encoded feature matrix from the cleaned rows.
my_data = np.genfromtxt('out.csv', dtype=dt, delimiter=',', skip_header=1)
data = np.array(load.remove_missing_values(my_data))
label_train = np.array(load.create_labels(data))
features_train = np.array(
    load.convert_to_int(
        # The class value is the last field of each row; strip it from the features.
        np.array(load.remove_field_num(data, len(data[0]) - 1))
    )
)

# Test split: exactly the same pipeline applied to the held-out file.
my_data = np.genfromtxt('test.csv', dtype=dt, delimiter=',', skip_header=1)
data = np.array(load.remove_missing_values(my_data))
label_test = np.array(load.create_labels(data))
features_test = np.array(
    load.convert_to_int(
        np.array(load.remove_field_num(data, len(data[0]) - 1))
    )
)
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import accuracy_score
# Fit a decision tree on the training split and evaluate it on the test split.
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, label_train)
predict = clf.predict(features_test)
# accuracy_score is documented as accuracy_score(y_true, y_pred): ground truth
# first, predictions second. Accuracy itself is symmetric, but keeping the
# documented order avoids wrong results if another metric is swapped in.
score = accuracy_score(label_test, predict)  # reported by the author as unexpectedly low
load_csvモジュール:
import numpy as np
# Flat lookup table mapping every categorical string value to an integer code.
# Codes restart at 0 within each group of values, so a code is only meaningful
# together with the column it came from.
# NOTE(review): the group headings below are the presumed UCI Adult attribute
# names; the file itself does not name them -- confirm.
attributes = {
    # workclass
    'Private': 0, 'Self-emp-not-inc': 1, 'Self-emp-inc': 2, 'Federal-gov': 3,
    'Local-gov': 4, 'State-gov': 5, 'Without-pay': 6, 'Never-worked': 7,
    # education
    'Bachelors': 0, 'Some-college': 1, '11th': 2, 'HS-grad': 3,
    'Prof-school': 4, 'Assoc-acdm': 5, 'Assoc-voc': 6, '9th': 7,
    '7th-8th': 8, '12th': 9, 'Masters': 10, '1st-4th': 11, '10th': 12,
    'Doctorate': 13, '5th-6th': 14, 'Preschool': 15,
    # marital status
    'Married-civ-spouse': 0, 'Divorced': 1, 'Never-married': 2,
    'Separated': 3, 'Widowed': 4, 'Married-spouse-absent': 5,
    'Married-AF-spouse': 6,
    # occupation
    'Tech-support': 0, 'Craft-repair': 1, 'Other-service': 2, 'Sales': 3,
    'Exec-managerial': 4, 'Prof-specialty': 5, 'Handlers-cleaners': 6,
    'Machine-op-inspct': 7, 'Adm-clerical': 8, 'Farming-fishing': 9,
    'Transport-moving': 10, 'Priv-house-serv': 11, 'Protective-serv': 12,
    'Armed-Forces': 13,
    # relationship
    # Previously 'Not-in-family' was 4 and both 'Other-relative' and
    # 'Unmarried' were 5, which made two distinct categories indistinguishable
    # after encoding. Every value now has its own consecutive code.
    'Wife': 0, 'Own-child': 1, 'Husband': 2, 'Not-in-family': 3,
    'Other-relative': 4, 'Unmarried': 5,
    # race
    'White': 0, 'Asian-Pac-Islander': 1, 'Amer-Indian-Eskimo': 2, 'Other': 3,
    'Black': 4,
    # sex
    'Female': 0, 'Male': 1,
    # native country
    'United-States': 0, 'Cambodia': 1, 'England': 2, 'Puerto-Rico': 3,
    'Canada': 4, 'Germany': 5, 'Outlying-US(Guam-USVI-etc)': 6, 'India': 7,
    'Japan': 8, 'Greece': 9, 'South': 10, 'China': 11, 'Cuba': 12, 'Iran': 13,
    'Honduras': 14, 'Philippines': 15, 'Italy': 16, 'Poland': 17,
    'Jamaica': 18, 'Vietnam': 19, 'Mexico': 20, 'Portugal': 21, 'Ireland': 22,
    'France': 23, 'Dominican-Republic': 24, 'Laos': 25, 'Ecuador': 26,
    'Taiwan': 27, 'Haiti': 28, 'Columbia': 29, 'Hungary': 30, 'Guatemala': 31,
    'Nicaragua': 32, 'Scotland': 33, 'Thailand': 34, 'Yugoslavia': 35,
    'El-Salvador': 36, 'Trinadad&Tobago': 37, 'Peru': 38, 'Hong': 39,
    'Holand-Netherlands': 40,
}
def remove_field_num(a, i):
    """Return structured array ``a`` without the field at position ``i``.

    Args:
        a: NumPy structured array; ``a.dtype.names`` must list its fields.
        i: Position of the field to drop. Negative values count from the end,
            like ordinary sequence indexing. A position outside the field
            list drops nothing.

    Returns:
        ``a`` indexed by the remaining field names, in their original order.
    """
    names = list(a.dtype.names)
    # Normalise negative positions up front; plain slice arithmetic with a
    # negative index would otherwise duplicate field names.
    drop = i + len(names) if i < 0 else i
    kept = [name for pos, name in enumerate(names) if pos != drop]
    return a[kept]
def remove_missing_values(data, expected_fields=15):
    """Return the rows of ``data`` that are complete.

    A row is kept only when it has exactly ``expected_fields`` fields and none
    of them is the missing-value marker ``'?'``.

    Args:
        data: Sequence of rows; each row must support ``len()`` and iteration
            over its fields.
        expected_fields: Required number of fields per row (default 15, the
            width this module was originally hard-coded for).

    Returns:
        A list with the kept rows, in their original order.
    """

    def _is_missing(value):
        # Text read as bytes never compares equal to the str marker, so
        # decode it first.
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'replace')
        # Tolerate padding such as ' ?' left over from ", "-separated files.
        return isinstance(value, str) and value.strip() == '?'

    kept = []
    for row in data:
        if len(row) != expected_fields:
            continue  # malformed or truncated line
        if any(_is_missing(value) for value in row):
            continue  # at least one attribute is unknown
        kept.append(row)
    return kept
def create_labels(data):
    """Build the list of binary class labels from the last field of each row.

    The label text is normalised before comparison: surrounding whitespace and
    a trailing ``'.'`` are removed, so ``'<=50K'``, ``' <=50K'`` and
    ``'<=50K.'`` all map to 0. Without this, every row of a file that writes
    its labels with a trailing period would be labelled 1.
    NOTE(review): the UCI Adult test file is reported to use the
    trailing-period form -- confirm against the actual ``test.csv``.

    Args:
        data: Sequence of rows; the class value is each row's last field.

    Returns:
        A list of ints: 0 for ``'<=50K'``, 1 for anything else.

    Raises:
        IndexError: If a row is empty.
    """
    labels = []
    for row in data:
        raw = row[len(row) - 1]
        if isinstance(raw, bytes):
            # Text read as bytes must be decoded before the str comparison.
            raw = raw.decode('utf-8', 'replace')
        label = str(raw).strip().rstrip('.')
        labels.append(0 if label == '<=50K' else 1)
    return labels
def convert_to_int(data, categorical_columns=(1, 3, 5, 6, 7, 8, 9, 13, 14),
                   mapping=None):
    """Convert every field of every row to an integer.

    Fields in ``categorical_columns`` are looked up in ``mapping``; all other
    fields are passed through ``int()``.

    Args:
        data: Sequence of rows; each row is iterable over its fields.
        categorical_columns: Zero-based positions of the string-valued
            columns. Defaults to the positions this module was originally
            hard-coded for.
        mapping: Dict from category string to integer code. Defaults to the
            module-level ``attributes`` table.

    Returns:
        A NumPy array built from the list of encoded rows.

    Raises:
        KeyError: If a categorical value is not present in ``mapping``.
        ValueError: If a non-categorical value cannot be parsed by ``int()``.
    """
    if mapping is None:
        mapping = attributes
    categorical = frozenset(categorical_columns)

    encoded_rows = []
    for row in data:
        encoded = []
        for column, value in enumerate(row):
            if isinstance(value, bytes):
                # Text read as bytes would never match the str keys of the mapping.
                value = value.decode('utf-8', 'replace')
            if column in categorical:
                encoded.append(int(mapping[value]))
            else:
                encoded.append(int(value))
        encoded_rows.append(encoded)
    return np.array(encoded_rows)
残念ながら、私のプログラムの精度は25%です。データを処理するために、上記の手順を行いました。
決定木(tree)とNaive Bayesの
両方を試しましたが、精度は非常に低いです。何が欠けているのでしょうか?
特徴ベクトルの一部を出力してもらえますか?例えば `data[:2, :50]` – CentAu
@CentAu はい、ベクトルを確認しましたが、問題なさそうに見えます。features_test データには14個の属性があります。これは問題になりますか? – KostasRim
あなたの `features_train` と `features_test` 配列の shape はどうなっていますか?(`features_train.shape`) – CentAu