Milan Skála Milan Skála - 2 months ago 12
Python Question

How to download large files in Python 2

I'm trying to download large files (approx. 1GB) with mechanize module, but I have been unsuccessful. I've been searching for similar threads, but I have found only those, where the files are publicly accessible and no login is required to obtain a file. But this is not my case as the file is located in the private section and I need to login before the download. Here is what I've done so far.

import mechanize

g_form_id = ""

def is_form_found(form1):
return "id" in form1.attrs and form1.attrs['id'] == g_form_id

def select_form_with_id_using_br(br1, id1):
global g_form_id
g_form_id = id1
try:
br1.select_form(predicate=is_form_found)
except mechanize.FormNotFoundError:
print "form not found, id: " + g_form_id
exit()

url_to_login = "https://example.com/"
url_to_file = "https://example.com/download/files/filename=fname.exe"
local_filename = "fname.exe"

br = mechanize.Browser()
br.set_handle_robots(False) # ignore robots
br.set_handle_refresh(False) # can sometimes hang without this
br.addheaders = [('User-agent', 'Firefox')]

response = br.open(url_to_login)
# Find login form
select_form_with_id_using_br(br, 'login-form')
# Fill in data
br.form['email'] = 'email@domain.com'
br.form['password'] = 'password'
br.set_all_readonly(False) # allow everything to be written to
br.submit()

# Try to download file
br.retrieve(url_to_file, local_filename)


But I'm getting an error when 512MB is downloaded:

Traceback (most recent call last):
File "dl.py", line 34, in <module>
br.retrieve(br.retrieve(url_to_file, local_filename)
File "C:\Python27\lib\site-packages\mechanize\_opener.py", line 277, in retrieve
block = fp.read(bs)
File "C:\Python27\lib\site-packages\mechanize\_response.py", line 199, in read
self.__cache.write(data)
MemoryError: out of memory


Do you have any ideas how to solve this?
Thanks

Answer

You can use bs4 and requests to get you logged in then write the streamed content. There are a few form fields required including a _token_ field that is definitely necessary:

from bs4 import BeautifulSoup
import requests
from urlparse import urljoin

data = {'email': 'email@domain.com', 'password': 'password'}
base = "https://support.codasip.com"

with requests.Session() as s:
    # update headers
    s.headers.update({'User-agent': 'Firefox'})

    # use bs4 to parse the from fields
    soup = BeautifulSoup(s.get(base).content)
    form = soup.select_one("#frm-loginForm")
    # works as it is a relative path. Not always the case.
    action = form["action"]

    # Get rest of the fields, ignore password and email.
    for inp in form.find_all("input", {"name":True,"value":True}):
        name, value = inp["name"], inp["value"]
        if name not in data:
            data[name] = value
    # login
    s.post(urljoin(base, action), data=data)
    # get protected url
    with open(local_filename, "wb") as f:
        for chk in s.get(url_to_file, stream=True).iter_content(1024):
            f.write(chk)
Comments