hoju hoju - 6 months ago 33
Python Question

how to follow meta refreshes in Python

Python's urllib2 follows 3xx redirects to get the final content. Is there a way to make urllib2 (or some other library such as httplib2) also follow meta refreshes? Or do I need to parse the HTML manually for the refresh meta tags?


OK, seems no library supports it so I have been using this code:

import urllib2
import urlparse
import re

def get_hops(url):
    redirect_re = re.compile('<meta[^>]*?url=(.*?)["\']', re.IGNORECASE)
    hops = []
    while url:
        if url in hops:
            url = None
            hops.insert(0, url)
            response = urllib2.urlopen(url)
            if response.geturl() != url:
                hops.insert(0, response.geturl())
            # check for redirect meta tag
            match = redirect_re.search(response.read())
            if match:
                url = urlparse.urljoin(url, match.groups()[0].strip())
                url = None
    return hops