Satyabrat Mishra Satyabrat Mishra - 2 months ago 25
Python Question

Passing Series with dtype= 'category' as categories for Pandas Categorical function

When I run this code, I get the following error:

import pandas as pd

# Reproduction of the problem: a Series that is itself dtype='category'
# is passed as the `categories` argument of pd.Categorical.
car_colors = pd.Series(['Blue', 'Red', 'Green'],
dtype='category')

# Construction itself does not fail; the categorical Series is accepted here.
car_data = pd.Categorical(['Yellow', 'Green', 'Red', 'Blue','Purple'],
categories= car_colors, ordered=False)
# Python 2 print statement (the traceback below shows an Anaconda2 install).
print car_colors
s = pd.Series(car_data)
# Displaying `s` calls repr() on it, which is where the ValueError in the
# traceback below is raised.
s




---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\IPython\core\formatters.pyc in __call__(self, obj)
697 type_pprinters=self.type_printers,
698 deferred_pprinters=self.deferred_printers)
--> 699 printer.pretty(obj)
700 printer.flush()
701 return stream.getvalue()

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\IPython\lib\pretty.pyc in pretty(self, obj)
381 if callable(meth):
382 return meth(obj, self, cycle)
--> 383 return _default_pprint(obj, self, cycle)
384 finally:
385 self.end_group()

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\IPython\lib\pretty.pyc in _default_pprint(obj, p, cycle)
501 if _safe_getattr(klass, '__repr__', None) not in _baseclass_reprs:
502 # A user-provided repr. Find newlines and replace them with p.break_()
--> 503 _repr_pprint(obj, p, cycle)
504 return
505 p.begin_group(1, '<')

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\IPython\lib\pretty.pyc in _repr_pprint(obj, p, cycle)
692 """A pprint that just redirects to the normal repr function."""
693 # Find newlines and replace them with p.break_()
--> 694 output = repr(obj)
695 for idx,output_line in enumerate(output.splitlines()):
696 if idx:

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\base.pyc in __repr__(self)
65 Yields Bytestring in Py2, Unicode String in py3.
66 """
---> 67 return str(self)
68
69

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\base.pyc in __str__(self)
45 if compat.PY3:
46 return self.__unicode__()
---> 47 return self.__bytes__()
48
49 def __bytes__(self):

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\base.pyc in __bytes__(self)
57
58 encoding = get_option("display.encoding")
---> 59 return self.__unicode__().encode(encoding, 'replace')
60
61 def __repr__(self):

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\series.pyc in __unicode__(self)
982
983 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 984 max_rows=max_rows)
985 result = buf.getvalue()
986

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\series.pyc in to_string(self, buf, na_rep, float_format, header, index, length, dtype, name, max_rows)
1023 the_repr = self._get_repr(float_format=float_format, na_rep=na_rep,
1024 header=header, index=index, length=length,
-> 1025 dtype=dtype, name=name, max_rows=max_rows)
1026
1027 # catch contract violations

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\series.pyc in _get_repr(self, name, header, index, length, dtype, na_rep, float_format, max_rows)
1051 float_format=float_format,
1052 max_rows=max_rows)
-> 1053 result = formatter.to_string()
1054
1055 # TODO: following check prob. not neces.

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\formats\format.pyc in to_string(self)
223
224 fmt_index, have_header = self._get_formatted_index()
--> 225 fmt_values = self._get_formatted_values()
226
227 if self.truncate_v:

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\formats\format.pyc in _get_formatted_values(self)
213 def _get_formatted_values(self):
214 return format_array(self.tr_series._values, None,
--> 215 float_format=self.float_format, na_rep=self.na_rep)
216
217 def to_string(self):

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\formats\format.pyc in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal)
2005 space=space, justify=justify, decimal=decimal)
2006
-> 2007 return fmt_obj.get_result()
2008
2009

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\formats\format.pyc in get_result(self)
2024
2025 def get_result(self):
-> 2026 fmt_values = self._format_strings()
2027 return _make_fixed_width(fmt_values, self.justify)
2028

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\formats\format.pyc in _format_strings(self)
2254
2255 def _format_strings(self):
-> 2256 fmt_values = format_array(self.values.get_values(), self.formatter,
2257 float_format=self.float_format,
2258 na_rep=self.na_rep, digits=self.digits,

D:\Users\satya1\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\categorical.pyc in get_values(self)
1169 if com.is_datetimelike(self.categories):
1170 return self.categories.take(self._codes, fill_value=np.nan)
-> 1171 return np.array(self)
1172
1173 def check_for_ordered(self, op):

ValueError: object __array__ method not producing an array


But the funny thing is, when I remove dtype='category' from the Series, the code works fine.

So, in short, pd.Categorical accepts a Series as its categories, but not a Series with dtype='category'.

Is it a bug, or am I doing something wrong?

Answer

It looks like you need to call tolist() on the Series passed as categories to Categorical:

# Build the categorical Series of known colours, then flatten it to a plain
# Python list so it can be handed to pd.Categorical as its category set.
car_colors = pd.Series(['Blue', 'Red', 'Green'], dtype='category')
allowed_colors = car_colors.tolist()

# Values that are not in the category set ('Yellow', 'Purple') come out as NaN.
car_data = pd.Categorical(
    ['Yellow', 'Green', 'Red', 'Blue', 'Purple'],
    categories=allowed_colors,
    ordered=False
)

s = pd.Series(car_data)
print(s)

0      NaN
1    Green
2      Red
3     Blue
4      NaN
dtype: category
Categories (3, object): [Blue, Red, Green]

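Another solution, from EdChum's comment, is to use cat.categories (note that the categories are then listed in the categorical's own order, Blue, Green, Red, rather than in the order the values appear in the Series):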

# Take the category labels directly from the categorical Series' `.cat`
# accessor instead of converting the Series to a list first.
category_index = car_colors.cat.categories
car_data = pd.Categorical(['Yellow', 'Green', 'Red', 'Blue', 'Purple'],
                          categories=category_index, ordered=False)
s = pd.Series(car_data)
print(s)
0      NaN
1    Green
2      Red
3     Blue
4      NaN
dtype: category
Categories (3, object): [Blue, Green, Red]