GOU7HAM GOU7HAM - 2 months ago 13
Python Question

I would like to split a huge file into a number of smaller files using Python, with the header row repeated in every split file.

"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770

Let's suppose this is the format of the huge file. I would like to split it into a number of files of a specified size, and each file needs to start with the header ("SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE"). Thanks in advance.

import os
import sys

def getfilesize(filename):
    """Return the size of *filename* in bytes.

    The size is measured by seeking to the end of the file and reading the
    resulting offset. The value is also printed for diagnostics.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(filename, "rb") as fr:
        fr.seek(0, os.SEEK_END)  # move to end of the file
        # The offset must be captured before it is printed; the previous
        # version printed an unassigned name and raised NameError.
        size = fr.tell()
    print("getfilesize: size: %s" % size)
    return size

def splitfile(filename, splitsize):
    """Split *filename* into consecutive pieces of at most *splitsize* bytes.

    Pieces are written next to the input as ``<root>_<n><ext>`` with ``n``
    starting at 1 (``data.csv`` -> ``data_1.csv``, ``data_2.csv``, ...).
    Existing files with those names are overwritten. The split is purely
    byte-based, so a piece may end in the middle of a line.

    Args:
        filename: path of the file to split.
        splitsize: maximum size of each piece, in bytes.

    Returns:
        None. If *filename* is not an existing file a message is printed
        and nothing is written.

    Raises:
        ValueError: if *splitsize* is not a positive number of bytes.
    """
    if not os.path.isfile(filename):
        print("No such file as: \"%s\"" % filename)
        return  # nothing to split; do not fall through to open()

    splitsize = int(splitsize)
    if splitsize <= 0:
        raise ValueError("splitsize must be a positive number of bytes")

    filesize = os.path.getsize(filename)
    # splitext copes with paths that contain no dot or several dots.
    root, ext = os.path.splitext(filename)
    readlimit = 1000000  # upper bound on the bytes held in memory per read
    # Ceiling division: an exact multiple must not produce a trailing empty file.
    n_splits = -(-filesize // splitsize)
    print("splitfile: No of splits required: %s" % str(n_splits))

    with open(filename, "rb") as fr:
        for counter in range(1, n_splits + 1):
            remaining = splitsize
            # "wb" truncates any stale piece left over from an earlier run.
            with open("%s_%d%s" % (root, counter, ext), "wb") as fw:
                while remaining > 0:
                    data = fr.read(min(readlimit, remaining))
                    if not data:
                        break  # input exhausted
                    fw.write(data)
                    remaining -= len(data)

if __name__ == "__main__":
    # Usage: filesplit.py filename splitsizeinkb
    if len(sys.argv) < 3:
        print("Filename or splitsize not provided: Usage: filesplit.py filename splitsizeinkb ")
        # Stop here: continuing would raise IndexError on sys.argv[2].
        sys.exit(1)
    filename = sys.argv[1]
    # Despite its name this is the size of each split: the command-line
    # value is given in kB and converted to bytes here.
    filesize = int(sys.argv[2]) * 1000
    splitfile(filename, filesize)

This works fine for splitting, but I couldn't get the header into each split file. I'm sorry if the question is unclear; I'm new to Stack Overflow.


This should do it

import os

maxlines = 1000  # how many lines did you want each new file to have?
infilepath = 'path/to/file'


def split_with_header(infilepath, maxlines=1000):
    """Split a text file into numbered pieces that each repeat the header.

    The first line of *infilepath* is treated as the header. The remaining
    lines are distributed over files named ``<name><n><ext>`` in the same
    directory (``data.csv`` -> ``data0.csv``, ``data1.csv``, ...), each
    starting with the header followed by at most *maxlines* data lines.
    If the input has no data lines, a single file 0 holding only the
    header is written.

    Args:
        infilepath: path of the file to split.
        maxlines: maximum number of data lines per output file.

    Returns:
        List of the output paths, in the order they were written.

    Raises:
        ValueError: if *maxlines* is not positive.
        OSError: if the input cannot be read or an output cannot be written.
    """
    if maxlines <= 0:
        raise ValueError("maxlines must be a positive integer")

    dirpath = os.path.dirname(infilepath)
    # splitext keeps working for names without an extension.
    fname, ext = os.path.splitext(os.path.basename(infilepath))

    def start_part(index):
        # Open output file number *index* and seed it with the header.
        path = os.path.join(dirpath, "{}{}{}".format(fname, index, ext))
        handle = open(path, 'w')
        handle.write(header)
        created.append(path)
        return handle

    created = []
    outfile = None
    with open(infilepath) as infile:
        header = infile.readline()
        try:
            for i, line in enumerate(infile):
                if i % maxlines == 0:
                    # Close the finished piece before starting the next one
                    # so file handles do not accumulate.
                    if outfile is not None:
                        outfile.close()
                    outfile = start_part(i // maxlines)
                outfile.write(line)
            if outfile is None:
                outfile = start_part(0)
        finally:
            if outfile is not None:
                outfile.close()
    return created


if __name__ == "__main__":
    split_with_header(infilepath, maxlines)