2016-08-26 22 views
1

私はこのデータセット(Wealth Based on Age)を使用しており、説明には精度が約84%になると書かれています。しかし、scikit-learnの分類器(Naive Bayes、DecisionTreeClassifier)を使うと、精度が非常に低くなってしまいます。

1. Loaded the .txt data file and converted it to a .csv 
2. Removed data with missing values 
3. Extracted the class values (<=50K and >50K) and converted them to 0 and 1 respectively 
4. For each attribute and for each string value of that attribute I 
    mapped it to an integer value. Example att1{'cs':0, 'cs2':1}, 
    att2{'usa':0, 'greece':1} ... and so on 
5. Called naive bayes on the new integer data set 

Pythonコード:

import load_csv as load #my functions to do [1..5] of the list 
import numpy as np 

def _load_split(csv_path):
    """Read one CSV split and run the shared preprocessing chain on it.

    Returns a 4-tuple: the raw parsed records, the records kept by
    ``load.remove_missing_values``, the feature matrix produced by
    ``load.convert_to_int`` and the labels from ``load.create_labels``.
    """
    # NOTE(review): `dt` is not defined anywhere in the visible code; it has
    # to be bound to the record dtype before this function is called.
    raw = np.genfromtxt(csv_path, dtype = dt, delimiter = ',', skip_header = 1)

    # Drop the rows that contain a missing-value marker.
    rows = np.array(load.remove_missing_values(raw))

    # The class label is the last field of every record, so strip that field
    # to obtain the feature columns.
    label_column = len(rows[0]) - 1
    features = np.array(load.remove_field_num(rows, label_column))

    # Labels are read from the full records; the remaining string-valued
    # attributes are then mapped to their integer codes.
    labels = np.array(load.create_labels(rows))
    features = np.array(load.convert_to_int(features))
    return raw, rows, features, labels


# Training split.
my_data, data, features_train, label_train = _load_split('out.csv')

# Test split (my_data / data are rebound to the test records, as before).
my_data, data, features_test, label_test = _load_split('test.csv')

from sklearn import tree 
from sklearn.naive_bayes import GaussianNB 
from sklearn import tree 
from sklearn.metrics import accuracy_score 

# Fit a decision tree on the training split and predict the test split.
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, label_train)
predict = clf.predict(features_test)

# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground-truth labels first and the predictions second. This is the score
# the question reports as unexpectedly low.
score = accuracy_score(label_test, predict)

load_csvモジュール:

import numpy as np 

# Integer code for every categorical string value in the data set.
#
# All attributes share this single flat dict, so a code is only meaningful
# within its own group; the same integer is reused across groups. Within a
# group every value must have a distinct code, otherwise two categories
# become indistinguishable to the classifier.
attributes = {
    # workclass
    'Private': 0, 'Self-emp-not-inc': 1, 'Self-emp-inc': 2, 'Federal-gov': 3,
    'Local-gov': 4, 'State-gov': 5, 'Without-pay': 6, 'Never-worked': 7,
    # education
    'Bachelors': 0, 'Some-college': 1, '11th': 2, 'HS-grad': 3,
    'Prof-school': 4, 'Assoc-acdm': 5, 'Assoc-voc': 6, '9th': 7,
    '7th-8th': 8, '12th': 9, 'Masters': 10, '1st-4th': 11, '10th': 12,
    'Doctorate': 13, '5th-6th': 14, 'Preschool': 15,
    # marital-status
    'Married-civ-spouse': 0, 'Divorced': 1, 'Never-married': 2,
    'Separated': 3, 'Widowed': 4, 'Married-spouse-absent': 5,
    'Married-AF-spouse': 6,
    # occupation
    'Tech-support': 0, 'Craft-repair': 1, 'Other-service': 2, 'Sales': 3,
    'Exec-managerial': 4, 'Prof-specialty': 5, 'Handlers-cleaners': 6,
    'Machine-op-inspct': 7, 'Adm-clerical': 8, 'Farming-fishing': 9,
    'Transport-moving': 10, 'Priv-house-serv': 11, 'Protective-serv': 12,
    'Armed-Forces': 13,
    # relationship (codes were 0,1,2,4,5,5: 'Other-relative' and 'Unmarried'
    # collided on 5 and 3 was unused; now a contiguous, unique 0..5)
    'Wife': 0, 'Own-child': 1, 'Husband': 2, 'Not-in-family': 3,
    'Other-relative': 4, 'Unmarried': 5,
    # race
    'White': 0, 'Asian-Pac-Islander': 1, 'Amer-Indian-Eskimo': 2, 'Other': 3,
    'Black': 4,
    # sex
    'Female': 0, 'Male': 1,
    # native-country (spellings follow the data file and must not be "fixed")
    'United-States': 0, 'Cambodia': 1, 'England': 2, 'Puerto-Rico': 3,
    'Canada': 4, 'Germany': 5, 'Outlying-US(Guam-USVI-etc)': 6, 'India': 7,
    'Japan': 8, 'Greece': 9, 'South': 10, 'China': 11, 'Cuba': 12,
    'Iran': 13, 'Honduras': 14, 'Philippines': 15, 'Italy': 16, 'Poland': 17,
    'Jamaica': 18, 'Vietnam': 19, 'Mexico': 20, 'Portugal': 21,
    'Ireland': 22, 'France': 23, 'Dominican-Republic': 24, 'Laos': 25,
    'Ecuador': 26, 'Taiwan': 27, 'Haiti': 28, 'Columbia': 29, 'Hungary': 30,
    'Guatemala': 31, 'Nicaragua': 32, 'Scotland': 33, 'Thailand': 34,
    'Yugoslavia': 35, 'El-Salvador': 36, 'Trinadad&Tobago': 37, 'Peru': 38,
    'Hong': 39, 'Holand-Netherlands': 40,
}



def remove_field_num(a, i):
    """Return structured array *a* without its field at position *i*.

    The fields are selected by name, so *a* must expose ``dtype.names``.
    """
    field_names = list(a.dtype.names)
    # Everything before position i followed by everything after it.
    kept = field_names[:i] + field_names[i + 1:]
    return a[kept]

def remove_missing_values(data):
    """Return the records of *data* that are complete.

    A record is kept only when it has exactly 15 fields and none of them
    equals the missing-value marker '?'. The result is a plain list of the
    original record objects, in their original order.
    """
    expected_width = 15
    kept = []
    for record in data:
        # Records of any other width are never kept.
        if len(record) != expected_width:
            continue
        # A single '?' anywhere disqualifies the whole record.
        if any(record[pos] == '?' for pos in range(len(record))):
            continue
        kept.append(record)
    return kept

def _normalize_label(raw):
    """Return *raw* as clean text: decoded, trimmed, trailing '.' removed.

    Non-string values are returned unchanged.
    """
    # Byte strings (e.g. from an 'S' dtype under Python 3) never compare
    # equal to text, so decode them first. Under Python 2 bytes is str and
    # this branch is skipped.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        raw = raw.decode('utf-8', 'replace')
    if isinstance(raw, str):
        return raw.strip().rstrip('.')
    return raw


def create_labels(data):
    """Build the list of binary class labels from the records in *data*.

    The label is read from the last field of each record. '<=50K' maps to 0
    and every other value maps to 1.

    Surrounding whitespace and a trailing period are ignored, so ' <=50K'
    and '<=50K.' are also recognised as class 0. Without this, a test file
    whose labels end in '.' is labelled entirely as class 1.
    """
    labels = []
    for row in data:
        label = _normalize_label(row[len(row) - 1])
        labels.append(0 if label == '<=50K' else 1)
    return labels

def convert_to_int(data):
    """Return *data* as a NumPy array of integers.

    Values in the column positions listed below are looked up in the
    module-level ``attributes`` table; values in every other position are
    passed straight to ``int()``.
    """
    # Column positions whose values are category strings.
    categorical_columns = (1, 3, 5, 6, 7, 8, 9, 13, 14)

    def encode(column, value):
        if column in categorical_columns:
            return int(attributes[value])
        return int(value)

    encoded_rows = [
        [encode(column, record[column]) for column in range(len(record))]
        for record in data
    ]
    return np.array(encoded_rows)

残念ながら、私のプログラムの精度は25%です。データを処理するために、上記の手順(1〜5)を行いました。

決定木(tree)とNaive Bayesの両方を試しましたが、どちらも精度は非常に低いです。何が欠けているのでしょうか?

+0

特徴ベクトルの一部を表示してもらえますか?例えば `data[:2, :50]` – CentAu

+0

@CentAu はい、ベクトルを確認しましたが、問題なさそうに見えます。features_test のデータには14個の属性があります。これが問題になるのでしょうか? – KostasRim

+0

あなたの `features_train` と `features_test` の配列の形状(shape)はどうなっていますか?(`features_train.shape`) – CentAu

答えて

2

問題は前処理にあると思います。カテゴリ変数は、生の整数ではなく one-hot ベクトル(該当するカテゴリに対応する要素だけが1で、それ以外はすべて0のベクトル)でエンコードする方が良いです。整数コードのままだと、カテゴリ間に実際には存在しない順序や距離を分類器が仮定してしまうためです。scikit-learn の DictVectorizer が役に立ちます。また、pandas ライブラリを使うと、分類をより効率的に行うことができます。

以下は、pandas ライブラリを使ってこれを簡単に実現する方法を示しています。pandas は scikit-learn と非常に相性良く動作します。この方法では、データ全体の20%をテストセットとした場合に、81.6%の精度が得られます。

from __future__ import division 

from sklearn.cross_validation import train_test_split 
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer 
from sklearn.linear_model.logistic import LogisticRegression 
from sklearn.metrics.classification import classification_report, accuracy_score 
from sklearn.naive_bayes import GaussianNB 
from sklearn.tree.tree import DecisionTreeClassifier 

import numpy as np 
import pandas as pd 


# Column labels for the data set, in file order; the last one is the class.
cols = np.array(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                 'capital-gain', 'capital-loss', 'hours-per-week',
                 'native-country', 'target'])

# The subset of columns that hold numbers rather than category strings.
numeric_cols = ['age', 'fnlwgt', 'education-num',
                'capital-gain', 'capital-loss', 'hours-per-week']

# Load the CSV into a dataframe and replace its column labels with `cols`.
# NOTE(review): read_csv treats the first line as a header by default, so if
# 'adult.data.csv' has no header row its first record is consumed as column
# names and then overwritten here - confirm against the file.
df = pd.read_csv('adult.data.csv')
df.columns = cols

# Recode the class strings as integers on a copy: 0 for ' <=50K' and 1 for
# ' >50K' (the leading space is part of the value being matched).
df1 = df.copy()
is_low = df1['target'] == ' <=50K'
df1.loc[is_low, 'target'] = 0
is_high = df1['target'] == ' >50K'
df1.loc[is_high, 'target'] = 1

# Hold out 20% of the rows as the test set.
features = df1.drop('target', axis=1)
labels = df1['target']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2)


# numeric attributes

# `.values` returns the underlying ndarray. It replaces `as_matrix()`, which
# was deprecated and later removed from pandas, and behaves the same here.
x_num_train = X_train[numeric_cols].values
x_num_test = X_test[numeric_cols].values

# Scale each numeric column by its maximum over the training rows. The test
# rows are divided by the *training* maximum so both splits share one scale.
# (True division relies on `from __future__ import division` on Python 2.)

max_train = np.amax(x_num_train, 0)
max_test = np.amax(x_num_test, 0)  # not really needed

x_num_train = x_num_train/max_train
x_num_test = x_num_test/max_train  # scale test by max_train

# labels or target attribute

y_train = y_train.astype(int)
y_test = y_test.astype(int)

# categorical attributes: everything that is not a numeric column

cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)

# Missing categorical values become their own 'NA' category.
cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)

# One {column: value} dict per row, in row order. The previous
# `cat.T.to_dict().values()` went through a dict keyed by row index, whose
# iteration order is arbitrary on Python 2; that could shuffle the
# categorical rows relative to the numeric rows and the labels.
x_cat_train = cat_train.to_dict(orient='records')
x_cat_test = cat_test.to_dict(orient='records')

# vectorize (encode as one hot); the vocabulary is learned on train only

vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)

# build the feature vector: numeric columns first, then the one-hot columns

x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))


# Train each classifier on the same features and report on its OWN test
# predictions. Previously the second and third models stored their output in
# `predict` while the reports kept reading `pred`, so all three reports
# showed the LogisticRegression results.
#
# print(...) with a single argument behaves the same on Python 2 and 3.
for model in (LogisticRegression(), DecisionTreeClassifier(), GaussianNB()):
    clf = model.fit(x_train, y_train.values)
    pred = clf.predict(x_test)
    print(classification_report(y_test.values, pred, digits=4))
    print(accuracy_score(y_test.values, pred))
+0

本当にありがとうございます!うまくいきました! – KostasRim

+0

問題はありません。 – CentAu

関連する問題