Milan Skála - 9 months ago 36
Python Question

How to download large files in Python 2

I'm trying to download large files (approx. 1GB) with the mechanize module, but I have been unsuccessful. I've been searching for similar threads, but I have found only ones where the files are publicly accessible and no login is required to obtain a file. But this is not my case, as the file is located in a private section and I need to log in before the download. Here is what I've done so far.

import mechanize

# Module-level state used to pass the target form id to the
# is_form_found predicate (mechanize's select_form predicate callback
# takes a single form argument, hence the global).
g_form_id = ""

def is_form_found(form1):
    """Predicate for mechanize's select_form.

    Returns True when *form1* has an 'id' attribute equal to the
    module-level g_form_id, False otherwise.
    """
    # Note: the original paste lost this line's indentation, which made
    # the snippet a syntax error; the logic itself is unchanged.
    return "id" in form1.attrs and form1.attrs['id'] == g_form_id

def select_form_with_id_using_br(br1, id1):
    """Select, on browser *br1*, the form whose id attribute equals *id1*.

    Stores *id1* in the module-level g_form_id so the is_form_found
    predicate can see it, then asks mechanize to select the matching
    form. Prints a diagnostic when no such form exists.
    """
    global g_form_id
    g_form_id = id1
    try:
        # Reconstructed: the original paste dropped the `try:` and the
        # select_form call, leaving an orphan `except` clause.
        br1.select_form(predicate=is_form_found)
    except mechanize.FormNotFoundError:
        # Parenthesized print behaves identically on Python 2 and 3
        # for a single string argument.
        print("form not found, id: " + g_form_id)

url_to_login = ""
url_to_file = ""
local_filename = "fname.exe"

br = mechanize.Browser()
br.set_handle_robots(False)   # ignore robots.txt
br.set_handle_refresh(False)  # can sometimes hang without this
br.addheaders = [('User-agent', 'Firefox')]

# Open the login page.  Reconstructed: the original paste truncated
# this line to `response =`.
response = br.open(url_to_login)
# Find login form
select_form_with_id_using_br(br, 'login-form')
# Fill in credentials
br.form['email'] = ''
br.form['password'] = 'password'
br.set_all_readonly(False)  # allow everything to be written to
# Submit the login form so the session is authenticated before the
# download; without this the retrieve below fetches anonymously.
br.submit()

# Try to download file.
# NOTE(review): mechanize's retrieve() buffers the response in memory,
# which is what triggers the MemoryError on ~1GB files described below;
# streaming br.open(url_to_file) in fixed-size chunks to the local file
# avoids that.
br.retrieve(url_to_file, local_filename)

But I'm getting an error when 512MB is downloaded:

Traceback (most recent call last):
File "", line 34, in <module>
br.retrieve(url_to_file, local_filename)
File "C:\Python27\lib\site-packages\mechanize\", line 277, in retrieve
block =
File "C:\Python27\lib\site-packages\mechanize\", line 199, in read
MemoryError: out of memory

Do you have any ideas how to solve this?


You can use bs4 and requests to log you in, then write the streamed content to a file. There are a few form fields required, including a _token_ field that is definitely necessary:

from bs4 import BeautifulSoup
import requests
from urlparse import urljoin

data = {'email': '', 'password': 'password'}
base = ""

with requests.Session() as s:
    # update headers
    s.headers.update({'User-agent': 'Firefox'})

    # use bs4 to parse the form fields
    soup = BeautifulSoup(s.get(base).content)
    form = soup.select_one("#frm-loginForm")
    # works as it is a relative path. Not always the case.
    action = form["action"]

    # Get rest of the fields, ignore password and email.
    for inp in form.find_all("input", {"name":True,"value":True}):
        name, value = inp["name"], inp["value"]
        if name not in data:
            data[name] = value
    # login, action), data=data)
    # get protected url
    with open(local_filename, "wb") as f:
        for chk in s.get(url_to_file, stream=True).iter_content(1024):