solarenqu solarenqu - 10 days ago 9
Scala Question

Spark NaiveBayesTextClassification

I'm trying to create a text classifier app with Spark (1.6.2), but I don't know what I am doing wrong. This is my code:

import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.mllib
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}


/**
* Created by kebodev on 2016.11.29..
*
* Entry point of a local Spark application that reads JSON training data,
* tokenizes its "text" column, hashes the tokens into term-frequency feature
* vectors and then attempts to train a Naive Bayes model on the result.
*/
object PredTest {

/**
* Builds the Spark context, prepares the features and tries to train the model.
*
* @param args command-line arguments; not read by this method
*/
def main(args: Array[String]): Unit = {

// Spark configuration: master "local[*]", application name and executor memory setting.
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("IktatoSparkRunner")
.set("spark.executor.memory", "2gb")

val sc = new SparkContext(conf)

// SQLContext is the entry point used here to read the JSON file.
val sqlContext = new SQLContext(sc)

// Training data; the sample record in the question has a "text" and a "label" field.
// NOTE(review): "label" is a quoted value in that sample and the reported exception
// says the column is StringType -- confirm the schema before fitting a classifier.
val trainData = sqlContext.read.json("src/main/resources/tst.json")

// Split the "text" column into tokens, written to a new "words" column.
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordsData = tokenizer.transform(trainData)
// Hash the "words" column into a "features" column, with numFeatures set to 20.
val hashingTF = new HashingTF()
.setInputCol("words").setOutputCol("features").setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsData)


// NOTE(review): NaiveBayes is imported from org.apache.spark.ml.classification above;
// the accompanying question reports that this object has no `train` method, so this
// line is the one that does not compile. The resulting `model` is not used afterwards.
val model = NaiveBayes.train(featurizedData)

}

}


The NaiveBayes object doesn't have a train method. What should I import?

If I try to use it this way:

// Alternative attempt: create the estimator and fit it on the featurized data.
val naBa = new NaiveBayes()
// This call is the one that raises the IllegalArgumentException quoted below:
// the "label" column is StringType but the estimator requires DoubleType.
naBa.fit(featurizedData)


I get this exception:

Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: Column label must be of type DoubleType but was actually StringType.
at scala.Predef$.require(Predef.scala:224)
at org.apache.spark.ml.util.SchemaUtils$.checkColumnType(SchemaUtils.scala:42)
at org.apache.spark.ml.PredictorParams$class.validateAndTransformSchema(Predictor.scala:53)
at org.apache.spark.ml.classification.Classifier.org$apache$spark$ml$classification$ClassifierParams$$super$validateAndTransformSchema(Classifier.scala:56)
at org.apache.spark.ml.classification.ClassifierParams$class.validateAndTransformSchema(Classifier.scala:40)
at org.apache.spark.ml.classification.ProbabilisticClassifier.org$apache$spark$ml$classification$ProbabilisticClassifierParams$$super$validateAndTransformSchema(ProbabilisticClassifier.scala:53)
at org.apache.spark.ml.classification.ProbabilisticClassifierParams$class.validateAndTransformSchema(ProbabilisticClassifier.scala:37)
at org.apache.spark.ml.classification.ProbabilisticClassifier.validateAndTransformSchema(ProbabilisticClassifier.scala:53)
at org.apache.spark.ml.Predictor.transformSchema(Predictor.scala:116)
at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:68)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:89)
at PredTest$.main(PredTest.scala:37)
at PredTest.main(PredTest.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:147)


This is what my JSON file looks like:

{"text":"any text","label":"6.0"}


I'm really new to this topic. Can anyone show me how to create a model, and then how to predict a new value?

Thank you!

Answer

Labels and Feature Vectors only contain Doubles. Your label column contains a String.

See your stacktrace:

Column label must be of type DoubleType but was actually StringType.

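You can use the StringIndexer to convert the string label column into a numeric (Double) label index. CountVectorizer, by contrast, does not convert labels: it turns the tokenized words into count-based feature vectors and can be used in place of HashingTF. See http://spark.apache.org/docs/latest/ml-features.html#stringindexer for further details.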