Lakmal Geekiyanage - 3 months ago
Python Question

Save a Python classifier with pickle

I'm using this code for suggestion prediction in my system, with the Naive Bayes algorithm. Here is the source of the code and the code I used:

http://ebiquity.umbc.edu/blogger/2010/12/07/naive-bayes-classifier-in-50-lines/

from __future__ import division
import collections
import math

class Model:
    def __init__(self, arffFile):
        self.trainingFile = arffFile
        self.features = {}  # all feature names and their possible values (including the class label)
        self.featureNameList = []  # this is to maintain the order of features as in the arff
        self.featureCounts = collections.defaultdict(lambda: 1)  # contains tuples of the form (label, feature_name, feature_value)
        self.featureVectors = []  # contains all the values and the label as the last entry
        self.labelCounts = collections.defaultdict(lambda: 0)  # these will be smoothed later

    def TrainClassifier(self):
        for fv in self.featureVectors:
            self.labelCounts[fv[len(fv)-1]] += 1  # update count of the label
            for counter in range(0, len(fv)-1):
                self.featureCounts[(fv[len(fv)-1], self.featureNameList[counter], fv[counter])] += 1

        for label in self.labelCounts:  # increase label counts (smoothing). remember that the last feature is actually the label
            for feature in self.featureNameList[:len(self.featureNameList)-1]:
                self.labelCounts[label] += len(self.features[feature])

    def Classify(self, featureVector):  # featureVector is a simple list like the ones that we use to train
        probabilityPerLabel = {}
        for label in self.labelCounts:
            logProb = 0
            for counter, featureValue in enumerate(featureVector):
                # use the position directly instead of featureVector.index(), which picks the wrong feature when a value repeats
                logProb += math.log(self.featureCounts[(label, self.featureNameList[counter], featureValue)]/self.labelCounts[label])
            probabilityPerLabel[label] = (self.labelCounts[label]/sum(self.labelCounts.values())) * math.exp(logProb)
        print(probabilityPerLabel)
        return max(probabilityPerLabel, key=lambda classLabel: probabilityPerLabel[classLabel])

    def GetValues(self):
        file = open(self.trainingFile, 'r')
        for line in file:
            if line[0] != '@':  # start of actual data
                self.featureVectors.append(line.strip().lower().split(','))
            else:  # feature definitions
                if line.strip().lower().find('@data') == -1 and (not line.lower().startswith('@relation')):
                    self.featureNameList.append(line.strip().split()[1])
                    self.features[self.featureNameList[len(self.featureNameList) - 1]] = line[line.find('{')+1: line.find('}')].strip().split(',')
                    #self.features[self.featureNameList[len(self.featureNameList) - 1]] = [featureName.strip() for featureName in line[line.find('{')+1: line.find('}')].strip().split(',')]
        file.close()

    def TestClassifier(self, arffFile):
        file = open(arffFile, 'r')
        for line in file:
            if line[0] != '@':
                vector = line.strip().lower().split(',')
                self.Classify(vector)
        file.close()

if __name__ == "__main__":
    model = Model("Dataset/Military.arff")
    model.GetValues()
    model.TrainClassifier()
    model.TestClassifier("MILITARY_FOR_PREDICTION.arff")


This is the sample data set I used to train my classifier:

@RELATION MILITARY
@ATTRIBUTE country {SriLanka, Australia, Japan, France, India, Englend}
@ATTRIBUTE ageGroup {10-20, 20-30, 30-45, 45-60, >60}
@ATTRIBUTE purpose {Education, Enjoy}
@ATTRIBUTE occupation {Accountant, Teacher, Student}
@ATTRIBUTE timeAllocated {0-30, 30-1, 1-2, 2-3, 3-4, 4-6, 6-8}
@ATTRIBUTE visited {yes, no}
@DATA
SriLanka,10-19,Enjoy,Student,0-30,yes
India,20-30,Education,Student,30-1,yes
Englend,30-45,Education,Teacher,1-2,no
Japan,45-60,Enjoy,Accountant,2-3,yes
Australia,>60,Education,Teacher,3-4,yes
France,20-30,Enjoy,Student,4-6,no
SriLanka,30-45,Education,Teacher,6-8,yes
France,45-60,Enjoy,Teacher,3-4,yes
SriLanka,10-19,Education,Student,2-3,yes
SriLanka,30-45,Education,Teacher,0-30,yes
SriLanka,10-19,Enjoy,Student,30-1,yes
SriLanka,20-30,Education,Student,1-2,yes
India,30-45,Education,Teacher,2-3,yes
Englend,45-60,Enjoy,Accountant,3-4,yes
Japan,>60,Education,Teacher,4-6,yes
Australia,>60,Enjoy,Teacher,6-8,yes
France,20-30,Education,Teacher,3-4,no
SriLanka,30-45,Enjoy,Teacher,2-3,yes
France,45-60,Education,Student,2-3,yes


I want to save my trained classifier with Python's pickle module and load it back when doing the prediction. How do I do this?

Can you explain the TrainClassifier function?

Answer

You can use pickle.

You can save your model like this:

import pickle
f = open('my_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

and for loading it back later:

import pickle
f = open('my_classifier.pickle', 'rb')
classifier = pickle.load(f)
f.close()
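
In this case the object to dump is the trained Model instance itself. One caveat: featureCounts and labelCounts in Model are defaultdicts built with lambdas, and pickle cannot serialize lambdas, so the default factories need to be swapped for module-level functions before dumping. Here is a minimal sketch, assuming it runs in the same module as the Model class above; the file name my_classifier.pickle and the one/zero helpers are my own placeholders, not part of the original code:

import collections
import pickle

def one():   # module-level functions can be pickled; the lambdas in Model.__init__ cannot
    return 1

def zero():
    return 0

# train once, then save the whole model
model = Model("Dataset/Military.arff")
model.GetValues()
model.TrainClassifier()

# swap the lambda-based defaultdicts for picklable ones (same counts, same smoothing default)
model.featureCounts = collections.defaultdict(one, model.featureCounts)
model.labelCounts = collections.defaultdict(zero, model.labelCounts)

with open('my_classifier.pickle', 'wb') as f:
    pickle.dump(model, f)

# later, in the prediction step, load it back and classify as before
with open('my_classifier.pickle', 'rb') as f:
    model = pickle.load(f)
model.TestClassifier("MILITARY_FOR_PREDICTION.arff")

Keep in mind that pickle stores class and function references by name, so the script that loads the file needs Model (and the one/zero helpers) importable from the same module path.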