Scott Scott - 2 months ago 34
Python Question

Matplotlib error: 'height' must be length 5 or scalar

I am attempting to plot the output of my script as a 2-series bar graph using matplotlib in Python 2.7.

My script prints 'msg' which results in the following output:

KNN: 90.000000 (0.322734)

LDA: 83.641395 (0.721210)

CART: 92.600996 (0.399870)

NB: 29.214167 (1.743959)

Random Forest: 92.617598 (0.323824)

After the code prints each 'msg', I attempt to plot the results as a 2-series bar graph using matplotlib, and I get the following error:

Traceback (most recent call last):
File "comparison.py", line 113, in <module>
label='mean')
File "C:\Users\Scot\Anaconda2\lib\site-packages\matplotlib\pyplot.py", line 2650, in bar
**kwargs)
File "C:\Users\Scot\Anaconda2\lib\site-packages\matplotlib\__init__.py", line 1818, in inner
return func(ax, *args, **kwargs)
File "C:\Users\Scot\Anaconda2\lib\site-packages\matplotlib\axes\_axes.py", line 2038, in bar
"must be length %d or scalar" % nbars)
ValueError: incompatible sizes: argument 'height' must be length 5 or scalar


I'm not sure how to fix this. I think it may be because the values of the results are floats? Any help would be much appreciated.
Here is my code:

# Modules
import pandas
import numpy
import os
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from matplotlib import style
plt.rcdefaults()
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from scipy.stats import ttest_ind, ttest_ind_from_stats
from scipy.special import stdtr
from sklearn.svm import SVC
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import warnings

# Load KDD dataset
data_set = "NSL-KDD/KDDTest+.arff"
import os
# Runs the shell command `cls` (clears the console on Windows).
os.system("cls")

print "Loading: ", data_set

# NOTE(review): leading indentation was lost in this paste, so it is not
# possible to tell which of the statements below were originally inside
# this `with` block.
with warnings.catch_warnings():
warnings.simplefilter("ignore")

# Column labels handed to read_csv: 41 entries in total.
# NOTE(review): 'class' sits at position 39 and 'dst_host_srv_rerror_rate'
# at position 40, so the column used as Y below (index 40) carries the label
# 'dst_host_srv_rerror_rate' -- confirm this order matches the file.
names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'su_attempted', 'num_root', 'num_file_creations',
'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'class',
'dst_host_srv_rerror_rate']

dataset = pandas.read_csv(data_set, names=names)

# Replace every column whose dtype matches the test below with the integer
# codes produced by LabelEncoder.
# NOTE(review): type(object) evaluates to `type`, not `object` -- confirm
# this comparison selects the string-valued columns as intended.
for column in dataset.columns:
if dataset[column].dtype == type(object):
le = LabelEncoder()
dataset[column] = le.fit_transform(dataset[column])

# Columns 0-39 are the features, column 40 is the target.
array = dataset.values
X = array[:, 0:40]
Y = array[:, 40]

# Split-out validation dataset
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(
X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
num_folds = 10
num_instances = len(X_train)
# `seed` is reassigned here; the split above used 7, KFold below uses 10.
seed = 10
scoring = 'accuracy'

# Algorithms
models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('Random Forest', RandomForestClassifier()))
# models.append(('LR', LogisticRegression()))

# evaluate each model in turn
# `results` collects one array of per-fold scores per model.
results = []
# `names` is rebound here: it held the column labels above and now collects
# the model names in the same order as `results`.
names = []
for name, model in models:
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
cv_results = cross_validation.cross_val_score(
model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean() * 100, cv_results.std()
* 100) # multiplying by 100 to show percentage
print(msg)
# print cv_results * 100 # plots all values that make the average

print ("\n")

# Perform T Test on each iteration of models.
# equal_var=False requests the unequal-variance (Welch) form of the test.
# NOTE(review): j starts at i, so every model is also compared with itself.
for i in range(len(results) - 1):
for j in range(i, len(results)):
t, p = ttest_ind(results[i], results[j], equal_var=False)
print("T_Test between {} & {}: T Value = {}, P Value = {}".format(
names[i], names[j], t, p))
print("\n")

plt.style.use('ggplot')
# Five bar groups, one per entry appended to `models` above.
n_groups = 5
# create plot
fig, ax = plt.subplots()
index = numpy.arange(n_groups)
bar_width = 0.35
opacity = 0.8

# NOTE(review): after the loop above, cv_results is the per-fold score array
# of the LAST model only (num_folds entries), while `index` has n_groups
# entries. This is the call the traceback points to.
rects1 = plt.bar(index, cv_results, bar_width,
alpha=opacity,
# color='b',
label='mean') # Line 113

# cv_results.std() is a single number (again for the last model only), so
# the same height is used for every bar in this series.
rects2 = plt.bar(index + bar_width, cv_results.std(), bar_width,
alpha=opacity,
color='g',
label='standard_d')

plt.xlabel('Models')
plt.ylabel('Percentage')
plt.title('All Model Performance')
plt.xticks(index + bar_width, (names))
plt.legend()

plt.tight_layout()
plt.show()


EDIT

Printing cv_results gives the following; the values have 7 or 8 decimal places:

[ 90.48146099 90.48146099 89.42999447 89.5960155 90.03873824
89.9833979 89.9833979 89.76203652 90.09407858 90.14941893]

[ 83.34255672 84.94742667 82.2910902 83.78527947 84.3386829
83.9513005 82.78915329 84.06198118 83.39789707 83.50857775]

[ 93.1931378 92.69507471 91.92030991 92.52905368 92.69507471
92.41837299 92.58439402 92.25235196 92.19701162 92.14167128]

[ 29.05368013 26.89540675 31.54399557 28.22357499 29.27504151
27.94687327 33.20420587 28.99833979 28.55561704 28.44493636]

[ 93.35915883 93.02711677 92.25235196 91.69894853 93.02711677
92.63973437 92.58439402 92.14167128 92.47371334 92.69507471]

tom tom
Answer Source

If you want to plot the means of cv_results, you need to calculate them with .mean(), just as you do with .std() in the second plot.

Also, you go through the process of appending cv_results for each model to results, but then when you come to do the plotting, it seems like you are still using cv_results, but this is probably only going to be the cv_results for the last model accessed in the loop.

It looks like your results will then be a list containing 5 numpy arrays. So, you could loop over that list, calculate the mean of each array, and use those means to draw your bar plot:

# `results` holds one array of cross-validation scores per model, so this
# yields one mean per model -- the same length as `index`.
mean_results = [res.mean() for res in results]
rects1 = plt.bar(index, mean_results,  bar_width,
                 alpha=opacity,
                 #  color='b',
                 label='mean')

Alternatively, you could just append cv_results.mean() to a list during your original loop, and use that list to make your bar plot.