GOU7HAM GOU7HAM - 20 days ago 5
Python Question

I would like to split a huge file into multiple smaller files using Python, with the header row repeated in every split file.

"SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"


Suppose this is the format of the huge file. I would like to split it into multiple files of a specified size, and each file must start with the header ("SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE"). Thanks in advance.




import os
import sys

def getfilesize(filename):
    """Return the size of *filename* in bytes.

    The file is opened in binary mode and the size is taken as the stream
    position after seeking to its end.  The size is also printed to stdout
    as a diagnostic message.

    Args:
        filename: Path of the file to measure.

    Returns:
        The file size in bytes as an int.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filename, "rb") as fr:
        fr.seek(0, 2)  # whence=2 means "relative to the end of the file"
        size = fr.tell()
    print("getfilesize: size: %s" % size)
    return size

def splitfile(filename, splitsize):
    """Split *filename* into consecutive parts of at most *splitsize* bytes.

    Parts are written next to the original file and are named
    ``<stem>_<n><ext>`` with ``n`` starting at 1 (``data.csv`` becomes
    ``data_1.csv``, ``data_2.csv``, ...).  An existing part file with the
    same name is overwritten.  The split is purely byte-based: a part may
    end in the middle of a line, and no header is copied into the parts.

    Every part except possibly the last holds exactly *splitsize* bytes;
    no empty trailing part is created, and an empty input produces no
    parts at all.

    Args:
        filename: Path of the file to split.
        splitsize: Maximum size of each part, in bytes.

    Returns:
        None.  If *filename* is not an existing file, a message is printed
        and nothing is written.

    Raises:
        ValueError: If *splitsize* is not a positive number of bytes.
    """
    if not os.path.isfile(filename):
        print("No such file as: \"%s\"" % filename)
        return

    splitsize = int(splitsize)
    if splitsize <= 0:
        raise ValueError("splitsize must be a positive number of bytes")

    # Upper bound on how many bytes are held in memory at once.
    readlimit = 1000000

    filesize = os.path.getsize(filename)
    n_splits = -(-filesize // splitsize)  # ceiling division
    print("splitfile: No of splits required: %s" % str(n_splits))

    # splitext only looks at the final path component, so dots in directory
    # names and names without an extension are handled correctly.
    stem, ext = os.path.splitext(filename)

    with open(filename, "rb") as fr:
        counter = 1
        while True:
            data = fr.read(min(readlimit, splitsize))
            if not data:
                break  # input exhausted: do not create an empty part

            remaining = splitsize
            part_name = "%s_%d%s" % (stem, counter, ext)
            # "wb" truncates any part left over from a previous run.
            with open(part_name, "wb") as fw:
                while data:
                    fw.write(data)
                    remaining -= len(data)
                    # Once the part is full, remaining is 0 and read(0)
                    # returns b"", which ends this inner loop.
                    data = fr.read(min(readlimit, remaining))
            counter += 1

if __name__ == "__main__":
    # Command-line entry point: filesplit.py <filename> <split size in kB>
    if len(sys.argv) < 3:
        print("Filename or splitsize not provided: Usage: filesplit.py filename splitsizeinkb ")
    else:
        filename = sys.argv[1]
        # The size argument is given in kB (1 kB = 1000 bytes), while
        # splitfile() expects a size in bytes.
        splitsize = int(sys.argv[2]) * 1000
        splitfile(filename, splitsize)


This works fine, but the split files do not contain the header row. Apologies if anything is unclear; I'm new to Stack Overflow.

Answer

This should do it

import os

maxlines = 1000  # how many data lines each new file should hold (header not counted)
infilepath = 'path/to/file'


def split_with_header(infilepath, maxlines=1000):
    """Split a text file by line count, repeating its first line in every part.

    The first line of *infilepath* is treated as the header.  The remaining
    lines are distributed over files written next to the input and named
    ``<stem><n><ext>`` with ``n`` starting at 0 (``data.csv`` becomes
    ``data0.csv``, ``data1.csv``, ...).  Each part starts with the header
    followed by up to *maxlines* data lines.  If the input has no data
    lines, no part is created.

    Args:
        infilepath: Path of the file to split.
        maxlines: Maximum number of data lines per part; must be >= 1.

    Raises:
        ValueError: If *maxlines* is smaller than 1.
        OSError: If the input cannot be read or a part cannot be written.
    """
    if maxlines < 1:
        raise ValueError("maxlines must be at least 1")

    # splitext keeps the directory in the stem and copes with names that
    # have no extension, so the parts land beside the input file.
    stem, ext = os.path.splitext(infilepath)

    outfile = None
    try:
        with open(infilepath) as infile:
            header = infile.readline()
            for i, line in enumerate(infile):
                if i % maxlines == 0:
                    # Start a new part: close the previous one, if any, and
                    # begin the next file with a copy of the header.
                    if outfile is not None:
                        outfile.close()
                    outfile = open("{}{}{}".format(stem, i // maxlines, ext), 'w')
                    outfile.write(header)
                outfile.write(line)
    finally:
        # Close the last part even if reading or writing failed midway.
        if outfile is not None:
            outfile.close()


if __name__ == "__main__":
    split_with_header(infilepath, maxlines)