muhammadn muhammadn - 6 months ago 18
Python Question

process data with header in rows in python csv

I have a csv file which has the header (name of item) within the same row as the USER_ID which i want to process.

The csv file looks like this:

adidas,,
USER_ID,USER_NAME
b012345,zaihan,Process
b212345,nurhanani,Check
b451234,nasirah,Call
c234567,ibrahim,Check
nike,,
USER_ID,USER_NAME
b842134,khalee,Call
h123455,shabree,Process
b712345,ibrahim,Process
c143322,zaihan,Check
b843432,nasirah,Call


Is there a way to process the data above into something like this:

USER_ID,USER_NAME,adidas,nike,
b12345,zaihan,Process,Check,
b212345,nurhanani,Check,,
b451234,nasirah,Call,Call,
c234567,ibrahim,Check,Process,
b842134,khalee,,Call,
h123455,shabree,,Process,


I had been writing the code for sometime and i figured i have to hardcode the header (for example, 'adidas' and 'nike') since what i read on stackoverflow is that i require unique header name because they are not unique
and it is fine if i go this way.

My python code is:

import csvkit
import sys
import os
from csvkit import convert

with open('/tmp/csvdata.csv', 'rb') as q:
reader = csvkit.reader(q)
with open('/tmp/csvdata2.csv', 'wb') as s:
data = csvkit.writer(s)
data.writerow(['Name', 'Userid', 'adidas', 'nike'])
for row in reader:
row_data = [row[0], row[1], row[2], '']
data = csvkit.writer(s)
data.writerow(row_data)

Answer

Ok, this is complicated.

Solution

from StringIO import StringIO
import re
import pandas as pd

text = """adidas,,
USER_ID,USER_NAME
b012345,zaihan,Process
b212345,nurhanani,Check
b451234,nasirah,Call
c234567,ibrahim,Check
nike,,
USER_ID,USER_NAME
b842134,khalee,Call
h123455,shabree,Process
c234567,ibrahim,Process
c143322,zaihan,Check
b451234,nasirah,Call
"""

m = re.findall(r'(.*,,\n(.*([^,]|,[^,])\n)*)', text)

dfs = range(len(m))
keys = range(len(m))
for i, f in enumerate(m):
    lines = f[0].split('\n')
    lines[1] += ','
    keys[i] = lines[0].split(',')[0]
    dfs[i] = pd.read_csv(StringIO('\n'.join(lines[1:])))

df = pd.concat(dfs, keys=keys)
df = df.set_index(['USER_ID', 'USER_NAME'], append=True).unstack(0)

df.index = df.index.droplevel(0)
df.columns = df.columns.droplevel(0)

df = df.stack().unstack()

Demonstration

print df.to_csv()

USER_ID,USER_NAME,adidas,nike
b012345,zaihan,Process,
b212345,nurhanani,Check,
b451234,nasirah,Call,Call
b842134,khalee,,Call
c143322,zaihan,,Check
c234567,ibrahim,Check,Process
h123455,shabree,,Process

Explanation

# regular expression to match line with a single value identified
# by having two commas at the end of the line.
# This grabs nike and adidas.
# It also grabs all lines after that until the next single valued line.
m = re.findall(r'(.*,,\n(.*([^,]|,[^,])\n)*)', text)

# place holder for list of sub dataframes
dfs = range(len(m))
# place holder for list of keys.  In this example this will be nike and adidas
keys = range(len(m))

# Loop through each regex match.  This example will only have 2.
for i, f in enumerate(m):
    # split on new line so I can grab and fix stuff
    lines = f[0].split('\n')
    # Fix that header row only has 2 columns and data has 3
    lines[1] += ','
    # Grab nike or adidas or other single value
    keys[i] = lines[0].split(',')[0]
    # Create dataframe by reading in rest of lines
    dfs[i] = pd.read_csv(StringIO('\n'.join(lines[1:])))

# Concat dataframes with appropriate keys and pivot stuff
df = pd.concat(dfs, keys=keys)
df = df.set_index(['USER_ID', 'USER_NAME'], append=True).unstack(0)

df.index = df.index.droplevel(0)
df.columns = df.columns.droplevel(0)

df = df.stack().unstack()