Ryanb58 Ryanb58 - 28 days ago 8
Python Question

How do you append bytes onto a file inside of a zip file via Python 2.7?

I am currently working on a piece of a larger puzzle. For my piece, I have a file object and the destination of the zip file. I never know the size of the file object in advance; I only know that I have one. Therefore the zip has to support zip64.

My goal is to take that file object (a pointer to the file) and write it to the zip file without loading the entire file into memory. I would like to do this chunk by chunk (especially if the file object is really big).

Any ideas on how I can go about doing this?

import zipfile


zip_path = "/tmp/file.zip"
file_to_zip_path = "/home/ryanb58/Desktop/movie.mp4"

# Feed the source file into the archive 1024 bytes at a time.
# NOTE: every writestr() call below is handed the same member name, and the
# last call receives the empty chunk that signals end-of-file.
with zipfile.ZipFile(zip_path, mode="w", allowZip64=True) as zip:
    source = open(file_to_zip_path, 'rb')
    chunk = source.read(1024)
    zip.writestr("file.mp4", chunk)
    while chunk:
        chunk = source.read(1024)
        zip.writestr("file.mp4", chunk)


My issue is with writing the new bytes to the file inside the zip: when it finishes and I open the zip, it is just a huge list of small files with the same name, each about 1024 bytes in size. The code above is what I have so far. I am kind of stuck, so any ideas or solutions would be great.

Answer Source

Following the advice that @J.F.Sebastian gave in his comment, I was able to write my file to a zip without bringing the full file into memory.

Here is my solution for the override.

import binascii
import os
import stat
import time
import zipfile

BUFFER_SIZE = 1024 * 10000  # Bytes read per chunk: 10,240,000 (about 10 MB).


class Zip(zipfile.ZipFile):
    """ZipFile variant whose write() streams from an open file object.

    The data is copied into the archive BUFFER_SIZE bytes at a time, so the
    whole source file is never held in memory.
    """

    def write(self, fileobj, arcname=None, compress_type=None):
        """Stream the bytes of ``fileobj`` into the archive as ``arcname``.

        Args:
            fileobj: Open, readable binary file object. Its ``name``
                attribute must be a path that ``os.stat`` accepts; the mode
                bits, modification time and initial size are taken from it.
            arcname: Member name inside the archive. A drive letter and any
                leading path separators are removed. Defaults to
                ``"/temp.zip"`` (stored as ``temp.zip``).
            compress_type: ``zipfile.ZIP_STORED`` or
                ``zipfile.ZIP_DEFLATED``; defaults to the archive's own
                compression setting.

        Raises:
            RuntimeError: if the archive is already closed, or if the data
                turned out larger than the ZIP64 limit although its initial
                size did not call for ZIP64.
        """
        try:
            import zlib  # Needed for deflate; also provides crc32.
            crc32 = zlib.crc32
        except ImportError:
            zlib = None
            crc32 = binascii.crc32

        if not self.fp:
            raise RuntimeError(
                "Attempt to write to ZIP archive that was already closed")

        st = os.stat(fileobj.name)
        isdir = stat.S_ISDIR(st.st_mode)
        date_time = time.localtime(st.st_mtime)[0:6]

        # Work out the member name.
        if arcname is None:
            arcname = "/temp.zip"
        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
        # Strip any leading forward or back slashes.
        while arcname[0] in (os.sep, os.altsep):
            arcname = arcname[1:]
        if isdir:
            arcname += '/'

        # Create the ZipInfo instance that stores the file information.
        zinfo = zipfile.ZipInfo(arcname, date_time)
        zinfo.external_attr = (st.st_mode & 0xFFFF) << 16  # Unix attributes

        if isdir:
            zinfo.compress_type = zipfile.ZIP_STORED
        elif compress_type is None:
            zinfo.compress_type = self.compression
        else:
            zinfo.compress_type = compress_type

        zinfo.file_size = st.st_size
        zinfo.flag_bits = 0x00
        zinfo.header_offset = self.fp.tell()  # Start of header bytes

        self._writecheck(zinfo)
        self._didModify = True

        # FileHeader() is always called without the zip64 argument: that
        # parameter only exists from Python 2.7.4 on, and this class has to
        # keep working on 2.7.2.
        if isdir:
            zinfo.file_size = 0
            zinfo.compress_size = 0
            zinfo.CRC = 0
            zinfo.external_attr |= 0x10  # MS-DOS directory flag
            self.filelist.append(zinfo)
            self.NameToInfo[zinfo.filename] = zinfo
            self.fp.write(zinfo.FileHeader())
            # See the note on start_dir at the end of this method.
            self.start_dir = self.fp.tell()
            return

        # CRC and sizes are placeholders for now; the header is rewritten
        # with the real values once all data has been streamed.
        zinfo.CRC = CRC = 0
        zinfo.compress_size = compress_size = 0
        # Compressed size can be larger than uncompressed size, hence the
        # 5% safety margin.
        zip64 = (self._allowZip64 and
                 zinfo.file_size * 1.05 > zipfile.ZIP64_LIMIT)
        self.fp.write(zinfo.FileHeader())

        if zinfo.compress_type == zipfile.ZIP_DEFLATED:
            # A negative window size produces a raw deflate stream.
            cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                    zlib.DEFLATED, -15)
        else:
            cmpr = None

        file_size = 0
        while True:
            buf = fileobj.read(BUFFER_SIZE)
            if not buf:
                break
            file_size += len(buf)
            CRC = crc32(buf, CRC) & 0xffffffff
            if cmpr:
                buf = cmpr.compress(buf)
                compress_size += len(buf)
            self.fp.write(buf)

        if cmpr:
            buf = cmpr.flush()
            compress_size += len(buf)
            self.fp.write(buf)
            zinfo.compress_size = compress_size
        else:
            zinfo.compress_size = file_size
        zinfo.CRC = CRC
        zinfo.file_size = file_size

        if not zip64 and self._allowZip64:
            if file_size > zipfile.ZIP64_LIMIT:
                raise RuntimeError(
                    'File size has increased during compressing')
            if compress_size > zipfile.ZIP64_LIMIT:
                raise RuntimeError(
                    'Compressed size larger than uncompressed size')

        # Seek backwards and rewrite the file header, which now includes
        # the correct CRC and sizes, then return to the end of the data.
        end_of_data = self.fp.tell()
        self.fp.seek(zinfo.header_offset, 0)
        self.fp.write(zinfo.FileHeader())
        self.fp.seek(end_of_data, 0)
        self.filelist.append(zinfo)
        self.NameToInfo[zinfo.filename] = zinfo
        # Python 3's ZipFile.close() seeks to start_dir before writing the
        # central directory, so it must point just past the data written
        # here. Python 2.7 does not consult it when closing, so the
        # assignment is harmless there.
        self.start_dir = end_of_data

As you can see, I can't pass zip64 into the FileHeader method because the system the code runs on only supports Python 2.7.2, whereas writing the correct headers for zip64 files requires Python 2.7.4 at the minimum.

https://github.com/python/cpython/blob/2e46376c8c10908afed56ace4c7f0f7c64e80c5e/Misc/NEWS#L189