2016-05-03 1 views
0

私はこの分野の初心者です。他の人が同じエラーに遭遇しているのは見かけましたが、その解決策を自分のコードにどう適用すればよいのか分かりません。scikit-learn のランダム化グリッドサーチ(RandomizedSearchCV)を使って、ランダムフォレストのモデルを作成しようとしています。通常のグリッドサーチではうまく動作しますが、ランダム化グリッドサーチを使うと、scikit-learn の fit 関数の中で奇妙なエラーが発生して失敗します。対処方法について提案をいただけると助かります。

Python TypeError: range() integer end argument expected, got float(fit 関数で発生)

ここにエラーを表示する例を示します。

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

# Load the data set. As sliced further down, the first column holds the
# sample names, the last column the target value and every column in between
# a descriptor.
data = pd.read_csv("data.csv", sep=",")
# Replace the NA values with the mean of their own descriptor column.
# (The original referenced an undefined name, `SubFeAll`, which raised a
# NameError before anything else could run.) `numeric_only=True` keeps the
# non-numeric name column out of the mean computation.
data = data.fillna(data.mean(numeric_only=True))
header = data.columns.values  # Use the column headers as the descriptor labels

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
npheader = np.array(header[1:-1])  # descriptor labels only (no name/target)
datax, datay = npArray.shape  # datax = number of rows, datay = number of columns
print("Array shape X = %d, Y = %d " % (datax, datay))

# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)  # standardise every descriptor column
print(X.shape)

# Names of the output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

# Start the training log from scratch and write its header lines.
# The `with` statement closes the file on exit, so no explicit close() is
# needed afterwards.
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))

# Start the feature-importance log from scratch and write its header line.
with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Begin the K-fold cross validation over ten folds 
# (KFold is given the number of rows, datax, and yields train/test index arrays.)
kf = KFold(datax, n_folds=10) 
print "------------------- Begining Ten Fold Cross Validation -------------------" 
for train, test in kf: 
    # Select the rows belonging to this fold's training and test sets.
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test] 
    ytestdim = yTest.shape[0] 
    i = 0 
    # Append the true target values of the test set to the training log,
    # one value per line, rounded to two decimals.
    with open (train_name, 'a') as ftrain: 
     while i< ytestdim : 
       ftrain.write(str(round(yTest[i],2))+',\n') 
       i += 1 
    # NOTE: redundant - the `with` block above has already closed the file.
    ftrain.close() 

    print "\n" 
    # random forest grid search parameters 
    print "------------------- Begining Random Forest Grid Search -------------------" 
    # NOTE: this line is the cause of the TypeError discussed on this page.
    # scipy.stats.expon produces float samples, so RandomizedSearchCV passes a
    # float n_estimators to RandomForestRegressor, whose fit() then calls
    # range() on it ("range() integer end argument expected, got float").
    # n_estimators needs integer candidates (e.g. a list of ints, as in the
    # working GridSearchCV version).
    # NOTE(review): max_depth is sampled from the same float distribution and
    # presumably needs integer values as well - confirm.
    rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)} 
    rf = RandomForestRegressor(random_state=0,n_jobs=2) 
    # n_iter=20: twenty parameter settings are drawn from rfparamgrid.
    RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20) 
    start = time() 
    RfGridSearch.fit(XTrain,yTrain) 

    # Get best random forest parameters 
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_))) 
    # NOTE: because of the comma this is a 2-tuple (elapsed seconds, number of
    # candidates), and time() is sampled again, so the elapsed value differs
    # slightly from the one printed above.
    RFtime = time() - start,len(RfGridSearch.grid_scores_) 
    # NOTE(review): `report` is not defined or imported anywhere in this
    # snippet - it must come from elsewhere.
    report(RfGridSearch.grid_scores_) 
    print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators']) 
    ne = RfGridSearch.best_params_['n_estimators'] 
    print("max_features = %s " % RfGridSearch.best_params_['max_features']) 
    mf = RfGridSearch.best_params_['max_features'] 
    print("max_depth = %d " % RfGridSearch.best_params_['max_depth']) 
    md = RfGridSearch.best_params_['max_depth'] 
    # Append the search time and the best parameters to the training log.
    # The first write has no trailing newline, so "Random Forest" and the
    # "RF search time" entry end up on the same line of the file.
    with open (train_name, 'a') as ftrain: 
      ftrain.write("Random Forest") 
      ftrain.write("RF search time, %s ,\n" % (str(RFtime))) 
      ftrain.write("Number of Trees, %s ,\n" % str(ne)) 
      ftrain.write("Number of feature at split, %s ,\n" % str(mf)) 
      ftrain.write("Max depth of tree, %s ,\n" % str(md)) 
    # NOTE: redundant - the `with` block above has already closed the file.
    ftrain.close() 

最初は単にパラメータを指定し忘れているだけだと思いましたが、通常のグリッドサーチではまったく同じ方法が問題なく動作します。発生したエラーは以下のとおりです。

Traceback (most recent call last): 
    File "rgscv.py", line 81, in <module> 
    RfGridSearch.fit(XTrain,yTrain) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit 
    return self._fit(X, y, sampled_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit 
    for parameters in parameter_iterable 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__ 
    while self.dispatch_one_batch(iterator): 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch 
    self._dispatch(tasks) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch 
    job = ImmediateComputeBatch(batch) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__ 
    self.results = batch() 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score 
    estimator.fit(X_train, y_train, **fit_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit 
    for i in range(n_more_estimators): 
TypeError: range() integer end argument expected, got float. 

正常に動作する通常のグリッドサーチ版のコードは以下のとおりです。このエラーの原因について、どなたか教えていただけますか?

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

# Load the data set. As sliced further down, the first column holds the
# sample names, the last column the target value and every column in between
# a descriptor.
data = pd.read_csv("data.csv", sep=",")
# Replace the NA values with the mean of their own descriptor column.
# (The original took the means from an undefined name, `SubFeAll`, which
# raised a NameError.) `numeric_only=True` keeps the non-numeric name column
# out of the mean computation.
data = data.fillna(data.mean(numeric_only=True))
header = data.columns.values  # Use the column headers as the descriptor labels

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
npheader = np.array(header[1:-1])  # descriptor labels only (no name/target)
datax, datay = npArray.shape  # datax = number of rows, datay = number of columns
print("Array shape X = %d, Y = %d " % (datax, datay))

# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)  # standardise every descriptor column
print(X.shape)

# Names of the output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

# Start the training log from scratch and write its header lines.
# The `with` statement closes the file on exit, so no explicit close() is
# needed afterwards.
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))

# Start the feature-importance log from scratch and write its header line.
with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Begin the K-fold cross validation over ten folds
# (KFold is given the number of rows, datax, and yields train/test index arrays.)
kf = KFold(datax, n_folds=10)
print("------------------- Begining Ten Fold Cross Validation -------------------")
for train, test in kf:
    # Select the rows belonging to this fold's training and test sets.
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]

    # Append the true target values of the test set to the training log,
    # one value per line, rounded to two decimals. The `with` block closes
    # the file, so no explicit close() is needed.
    with open(train_name, 'a') as ftrain:
        for true_value in yTest:
            ftrain.write(str(round(true_value, 2)) + ',\n')

    print("\n")
    # random forest grid search parameters
    print("------------------- Begining Random Forest Grid Search -------------------")
    # Explicit candidate lists: every n_estimators / max_depth value is an
    # integer. The randomized variant kept below for reference samples floats
    # and fails inside RandomForestRegressor.fit().
    #rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
    rfparamgrid = {"n_estimators": [10, 20, 25, 50, 100, 1000], "max_features": ["auto", "sqrt", "log2"], "max_depth": [1, 2, 3, 5, 7, 10]}
    rf = RandomForestRegressor(random_state=0, n_jobs=2)
    RfGridSearch = GridSearchCV(rf, param_grid=rfparamgrid, scoring='mean_squared_error')
    start = time()
    RfGridSearch.fit(XTrain, yTrain)
    # Measure once so the printed and the logged search time agree.
    elapsed = time() - start
    n_candidates = len(RfGridSearch.grid_scores_)

    # Get best random forest parameters
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" % (elapsed, n_candidates))
    # Logged below as a (seconds, number of candidates) tuple, as before.
    RFtime = (elapsed, n_candidates)
    # NOTE(review): `report` is not defined or imported anywhere in this
    # snippet - it must come from elsewhere.
    report(RfGridSearch.grid_scores_)
    best_params = RfGridSearch.best_params_
    ne = best_params['n_estimators']
    mf = best_params['max_features']
    md = best_params['max_depth']
    print("n_estimators = %d " % ne)
    print("max_features = %s " % mf)
    print("max_depth = %d " % md)

    # Append the search time and the best parameters to the training log.
    # The first write has no trailing newline, so "Random Forest" and the
    # "RF search time" entry share one line (kept as in the original output).
    with open(train_name, 'a') as ftrain:
        ftrain.write("Random Forest")
        ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of feature at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))
+0

@coralv彼は明らかにライブラリを使用しています。サイトパッケージのディレクトリにあります。無意味な質問をしないでください。 – Natecat

+1

この部分は scikit-learn ライブラリの標準の fit 関数です。私が編集したコードではありません。同じ関数は GridSearchCV 版では正常に使われており、期待どおりに動作します。 – James

+0

[scipy.stats.expon](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html#scipy.stats.expon) は expon オブジェクトを返すようですが、2番目の例のようにリストを渡した場合は正常に動作しているようです。これをリストに変更すれば解決しますか? – Natecat

答えて

1

推定器の数(n_estimators)は整数でなければなりませんが、あなたのコードは浮動小数点数(float)を生成しています。整数のみを含む、有効な n_estimators の値のリストを作成してください。それで問題なく動作するはずです。

+0

ありがとうございました。 – James

関連する問題