Fleur Fleur - 1 month ago 24
Scala Question

Multiclass Classification Evaluator field does not exist error - Apache Spark

I am new to Spark and trying a basic classifier in Scala.

I'm trying to get the accuracy, but when using MulticlassClassificationEvaluator it gives the error below:

Caused by: java.lang.IllegalArgumentException: Field "label" does not exist.
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:228)
at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:228)
at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
at scala.collection.AbstractMap.getOrElse(Map.scala:59)
at org.apache.spark.sql.types.StructType.apply(StructType.scala:227)
at org.apache.spark.ml.util.SchemaUtils$.checkNumericType(SchemaUtils.scala:71)
at org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator.evaluate(MulticlassClassificationEvaluator.scala:76)
at com.classifier.classifier_app.App$.<init>(App.scala:90)
at com.classifier.classifier_app.App$.<clinit>(App.scala)


The code is as below:

val conf = new SparkConf().setMaster("local[*]").setAppName("Classifier")
val sc = new SparkContext(conf)
val spark = SparkSession
.builder()
.appName("Email Classifier")
.config("spark.some.config.option", "some-value")
.getOrCreate()
import spark.implicits._

val spamInput = "TRAIN_00000_0.eml" //files to train model
val normalInput = "TRAIN_00002_1.eml"
val spamData = spark.read.textFile(spamInput)
val normalData = spark.read.textFile(normalInput)

case class Feature(index: Int, value: String)

val indexer = new StringIndexer()
.setInputCol("value")
.setOutputCol("label")

val regexTokenizer = new RegexTokenizer()
.setInputCol("value")
.setOutputCol("cleared")
.setPattern("\\w+").setGaps(false)

val remover = new StopWordsRemover()
.setInputCol("cleared")
.setOutputCol("filtered")

val hashingTF = new HashingTF()
.setInputCol("filtered").setOutputCol("features")
.setNumFeatures(100)

val nb = new NaiveBayes()

val indexedSpam = spamData.map(x=>Feature(0, x))
val indexedNormal = normalData.map(x=>Feature(1, x))
val trainingData = indexedSpam.union(indexedNormal)

val pipeline = new Pipeline().setStages(Array (indexer, regexTokenizer, remover, hashingTF, nb))
val model = pipeline.fit(trainingData)

model.write.overwrite().save("myNaiveBayesModel")

val spamTest = spark.read.textFile("TEST_00009_0.eml")
val normalTest = spark.read.textFile("TEST_00000_1.eml")
val sameModel = PipelineModel.load("myNaiveBayesModel")

val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("accuracy")

Console.println("Spam Test")
val predictionSpam = sameModel.transform(spamTest).select("prediction")
predictionSpam.foreach(println(_))
val accuracy = evaluator.evaluate(predictionSpam)
println("Accuracy Spam: " + accuracy)

Console.println("Normal Test")
val predictionNorm = sameModel.transform(normalTest).select("prediction")
predictionNorm.foreach(println(_))
val accuracyNorm = evaluator.evaluate(predictionNorm)
println("Accuracy Normal: " + accuracyNorm)


The error occurs when initializing the MulticlassClassificationEvaluator. How should the column names be specified? Any help is appreciated.

Answer

The error is in this line:

val predictionSpam = sameModel.transform(spamTest).select("prediction")

Your dataframe contains only prediction column and no label column.

Comments