0
私はエラーを取得:私は実行時に例外TypeError:データはLabeledPointのRDDにする必要がありますが、<型 'numpy.ndarrayを'>得
TypeError: data should be an RDD of LabeledPoint, but got <type 'numpy.ndarray'>
を:
import sys
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
conf = (SparkConf().setMaster("local")
.setAppName("Logistic Regression")
.set("spark.executor.memory", "1g"))
sc = SparkContext(conf = conf)
def mapper(line):
feats = line.strip().split(",")
label = feats[len(feats) - 1] # Last column is the label
feats = feats[2: len(feats) - 1] # remove id and type column
feats.insert(0,label)
features = [ float(feature) for feature in feats ] # need floats
return np.array(features)
data = sc.textFile("test.csv")
parsedData = data.map(mapper)
# Train model
model = LogisticRegressionWithSGD.train(parsedData)
私はエラーを取得しますmodel = LogisticRegressionWithSGD.train(parsedData)
行。
parsedData
をRDDとする。なぜ私はこれを得ているのか分かりません。 full source code