shubham_827 shubham_827 - 1 month ago 22
Python Question

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte, while reading csv file in pandas

I know similar questions have been asked already; I have seen all of them and tried their suggestions, but they were of little help. I am using OSX 10.11 El Capitan with Python 3.6, in a virtual environment (I also tried without one). I am using Jupyter Notebook and Spyder 3.

I am new to python, but know basic ML and following a post to learn how to solve Kaggle challenges: Link to Blog, Link to Data Set

I am stuck at the first few lines of code:

import pandas as pd

destinations = pd.read_csv("destinations.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")


and it is giving me error

UnicodeDecodeError Traceback (most recent call last)
<ipython-input-19-a928a98eb1ff> in <module>()
1 import pandas as pd
----> 2 df = pd.read_csv('destinations.csv', compression='infer',date_parser=True, usecols=([0,1,3]))
3 df.head()

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
403
404 # Create the parser.
--> 405 parser = TextFileReader(filepath_or_buffer, **kwds)
406
407 if chunksize or iterator:

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
762 self.options['has_index_names'] = kwds['has_index_names']
763
--> 764 self._make_engine(self.engine)
765
766 def close(self):

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
983 def _make_engine(self, engine='c'):
984 if engine == 'c':
--> 985 self._engine = CParserWrapper(self.f, **self.options)
986 else:
987 if engine == 'python':

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
1603 kwds['allow_leading_cols'] = self.index_col is not False
1604
-> 1605 self._reader = parsers.TextReader(src, **kwds)
1606
1607 # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:6175)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._get_header (pandas/_libs/parsers.c:9691)()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte


Some answers on Stack Overflow suggested that this happens because the file is gzipped, but Chrome downloaded it as a .csv file; no .csv.gz file was anywhere to be seen, and trying to open one returned a file-not-found error.

I then read somewhere to use
encoding='latin1'
, but after doing this I am getting a parser error:

---------------------------------------------------------------------------
ParserError Traceback (most recent call last)
<ipython-input-21-f9c451f864a2> in <module>()
1 import pandas as pd
2
----> 3 destinations = pd.read_csv("destinations.csv",encoding='latin1')
4 test = pd.read_csv("test.csv")
5 train = pd.read_csv("train.csv")

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
653 skip_blank_lines=skip_blank_lines)
654
--> 655 return _read(filepath_or_buffer, kwds)
656
657 parser_f.__name__ = name

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
409
410 try:
--> 411 data = parser.read(nrows)
412 finally:
413 parser.close()

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in read(self, nrows)
1003 raise ValueError('skipfooter not supported for iteration')
1004
-> 1005 ret = self._engine.read(nrows)
1006
1007 if self.options.get('as_recarray'):

/usr/local/lib/python3.6/site-packages/pandas/io/parsers.py in read(self, nrows)
1746 def read(self, nrows=None):
1747 try:
-> 1748 data = self._reader.read(nrows)
1749 except StopIteration:
1750 if self._first_chunk:

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.read (pandas/_libs/parsers.c:10862)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas/_libs/parsers.c:11138)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas/_libs/parsers.c:11884)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows (pandas/_libs/parsers.c:11755)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error (pandas/_libs/parsers.c:28765)()

ParserError: Error tokenizing data. C error: Expected 2 fields in line 11, saw 3


I have spent hours debugging this. I tried opening the CSV files in Atom (no other app could open them) and in online web apps (some of which crashed), but to no avail. I have also tried using the kernels of other people who have solved this problem, without success.

Answer Source

It's still most likely gzipped data. gzip's magic number is 0x1f 0x8b, which is consistent with the UnicodeDecodeError you get.

You could try decompressing the data on the fly:

import gzip

with open('destinations.csv', 'rb') as fd:
    gzip_fd = gzip.GzipFile(fileobj=fd)
    destinations = pd.read_csv(gzip_fd)

Alternatively, let pandas do the decompression for you by passing compression='gzip' to pd.read_csv.