Anonymous Anonymous - 4 months ago
353 0

we used naieve bayes in MLlib library within pyspak to classify text files
in which use tfidf representation and multiclassmetrics for evaluation
it works fine but when compute multiclassmetrics precion, recall and fmeasure it keep give us this error repeatedly

net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)

Python

text classification using pyspark

from pyspark import SparkContext
sc = SparkContext(local, appName="ArabicTextNBClassfication")
from pyspark import  StorageLevel
bk1_rdd = sc.textFile("E:/..../output1.txt").map(lambda line: (0, line.split()))
bk1_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
bk2_rdd = sc.textFile("E:/..../output2.txt").map(lambda line: (1, line.split()))
bk2_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
bk3_rdd = sc.textFile("E:/..../output3.txt").map(lambda line: (2, line.split()))
bk3_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
bk4_rdd = sc.textFile("E:/..../output4.txt").map(lambda line: (3, line.split()))
bk4_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
bk5_rdd = sc.textFile("E:/..../output5.txt").map(lambda line: (4, line.split()))
bk5_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
bk6_rdd = sc.textFile("E:/..../output6.txt").map(lambda line: (5, line.split()))
bk6_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
bk7_rdd = sc.textFile("E:/..../output7.txt").map(lambda line: (6, line.split()))
bk7_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)
bk8_rdd = sc.textFile("E:/..../output8.txt").map(lambda line: (7, line.split()))
bk8_rdd.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)

hadith_rdd = bk1_rdd.union(bk2_rdd)
hadith_rdd = hadith_rdd.union(bk3_rdd)
hadith_rdd = hadith_rdd.union(bk4_rdd)
hadith_rdd = hadith_rdd.union(bk5_rdd)
hadith_rdd = hadith_rdd.union(bk6_rdd)
hadith_rdd = hadith_rdd.union(bk7_rdd)
hadith_rdd = hadith_rdd.union(bk8_rdd)

labels = hadith_rdd.map(lambda item : item[0])
features = hadith_rdd.map(lambda item : item[1])

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
tf = HashingTF().transform(features) 
tf.persist(storageLevel=StorageLevel.MEMORY_AND_DISK)

import socket
import errno
try:
			idff = IDF().fit(tf)                
except socket.error as error:
			if error.errno == errno.WSAECONNRESET:
					reconnect()
					retry_action()
			else:
					raise

tf_idf_rdd = idff.transform(tf)
dataRDD = labels.zip(tf_idf_rdd).map(lambda x: LabeledPoint(x[0], x[1]))
training, test = dataRDD.randomSplit([0.6, 0.4], seed=0)
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel

try:
			NBmodel = NaiveBayes.train(training, 1.0)         
except socket.error as error:
			if error.errno == errno.WSAECONNRESET:
					reconnect()
					retry_action()
			else:
					raise
predictionAndLabels = test.map(lambda p: (NBmodel.predict(p.features), p.label))
correct = predictionAndLabels.filter(lambda x: x[0] == x[1]).count() 
testC = test.count()
accuracy = correct / float(testC)
print("Accuracy = "+format(round(accuracy, 3))+"\n")

from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(predictionAndLabels)
#this where the error starts
precision = metrics.precision()
print("overall Precision =  " + format(round(precision, 3)) + "\n")
recall = metrics.recall()
print("overall Recall =  " + format(round(recall, 3)) + "\n")
f1Score = metrics.fMeasure()
print("overall F1 Measure = " + format(round(f1Score, 3))+ "\n")
Comments