xralf xralf - 2 months ago 18
Python Question

Get image width from HTML code

I can get the

width
attribute of an image using
BeautifulSoup
as follows:

# Look up an <img> tag in the parsed document (`soup` is the BeautifulSoup object).
img = soup.find("img")
# Read the tag's own "width" attribute; this only sees a value set inline in the HTML.
width = img["width"]


The problem is that
width
can be set in
CSS
file or not set at all.

I would like to extract the value without downloading the image from
img["src"]
How can I do it in Python to extract the value if it's set somewhere (HTML or CSS) or get the default value the browser will render (if not set)?

Answer

You can download the image only partially — just enough bytes to read its width/height — by setting a Range header in the request, and then parse those bytes with some variant of getimageinfo.py.

Example usage:

def check_is_small_pic(url, pic_size, header_bytes=5000):
    """Return True if the image at *url* is narrower or shorter than *pic_size*.

    Only the leading part of the image is requested, via an HTTP ``Range``
    header, and the dimensions are parsed from those bytes.

    Args:
        url: Address of the image to inspect.
        pic_size: Threshold in pixels; the image counts as small when its
            width or its height is below this value.
        header_bytes: How many leading bytes to request. It must be large
            enough to contain the format's size fields.

    Returns:
        bool: True when either dimension is below *pic_size*.

    Note:
        ``getimageinfo.getImageInfo`` is expected to return
        ``(content_type, width, height)``. If it reports an unknown
        dimension as a negative number, such an image is classified as
        small, exactly as before.
    """
    # A Range value needs a unit and an inclusive byte span ("bytes=0-N");
    # a bare number such as "50" is not a valid range request.
    byte_range = "bytes=0-{}".format(header_bytes - 1)
    r_check = requests.get(url, headers={"Range": byte_range})
    image_info = getimageinfo.getImageInfo(r_check.content)
    return image_info[1] < pic_size or image_info[2] < pic_size

A version of getimageinfo.py, quickly adapted for Python 3.5:

import io
import struct
# import urllib.request as urllib2

# JPEG start-of-frame markers (SOF0-SOF15). 0xC4 (DHT), 0xC8 (JPG) and
# 0xCC (DAC) share the numeric range but do not carry frame dimensions.
_JPEG_SOF_MARKERS = frozenset(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}

# JPEG markers that stand alone, i.e. are not followed by a length field:
# TEM (0x01), RST0-RST7 (0xD0-0xD7) and SOI (0xD8).
_JPEG_STANDALONE_MARKERS = frozenset([0x01]) | frozenset(range(0xD0, 0xD9))


def _jpeg_dimensions(data):
    """Return ``(width, height)`` from the first SOF segment, or None.

    None is returned when *data* ends (or image data starts) before a
    complete start-of-frame header has been seen.
    """
    size = len(data)
    pos = 2  # skip the SOI marker already checked by the caller
    while pos < size:
        # Advance to the next marker prefix and past any 0xFF fill bytes.
        pos = data.find(b'\xff', pos)
        if pos < 0:
            return None
        while pos < size and data[pos] == 0xFF:
            pos += 1
        if pos >= size:
            return None
        marker = data[pos]
        pos += 1

        if marker in _JPEG_SOF_MARKERS:
            # Segment layout: length (2), precision (1), height (2), width (2).
            if pos + 7 > size:
                return None
            height, width = struct.unpack_from(">HH", data, pos + 3)
            return width, height
        if marker in (0xDA, 0xD9):
            # Start of scan / end of image: no frame header will follow.
            return None
        if marker in _JPEG_STANDALONE_MARKERS:
            continue

        # Any other segment: skip it using its big-endian length field,
        # which counts the two length bytes themselves.
        if pos + 2 > size:
            return None
        (segment_length,) = struct.unpack_from(">H", data, pos)
        if segment_length < 2:
            return None
        pos += segment_length
    return None


def getImageInfo(data):
    """Detect the image type and pixel dimensions from raw image bytes.

    Works on a leading fragment of the file as long as the fragment
    contains the format's size fields, which makes it usable with
    partially downloaded images.

    Args:
        data: ``bytes`` holding the beginning (or all) of a GIF, PNG or
            JPEG file.

    Returns:
        tuple: ``(content_type, width, height)``. ``content_type`` is
        ``'image/gif'``, ``'image/png'``, ``'image/jpeg'`` or ``''`` when
        the format is not recognised. ``width`` and ``height`` are ``-1``
        when they could not be determined (unknown format, or a JPEG whose
        frame header is missing from *data*).
    """
    size = len(data)
    height = -1
    width = -1
    content_type = ''

    # GIF: 6-byte signature followed by little-endian 16-bit width, height.
    if (size >= 10) and data[:6] in (b'GIF87a', b'GIF89a'):
        content_type = 'image/gif'
        width, height = struct.unpack("<HH", data[6:10])

    # PNG (see the PNG spec, http://www.w3.org/TR/PNG/): 8-byte signature,
    # 4-byte chunk length, 'IHDR', then big-endian 32-bit width and height.
    elif ((size >= 24) and data.startswith(b'\211PNG\r\n\032\n')
          and (data[12:16] == b'IHDR')):
        content_type = 'image/png'
        width, height = struct.unpack(">LL", data[16:24])

    # PNG signature without an IHDR chunk at the expected offset; the
    # dimensions are read right after the signature (kept from the
    # original code, which attributes this to an older PNG layout).
    elif (size >= 16) and data.startswith(b'\211PNG\r\n\032\n'):
        content_type = 'image/png'
        width, height = struct.unpack(">LL", data[8:16])

    # JPEG: walk the marker segments until a start-of-frame header.
    elif (size >= 2) and data.startswith(b'\377\330'):
        content_type = 'image/jpeg'
        dimensions = _jpeg_dimensions(data)
        if dimensions is not None:
            width, height = dimensions

    return content_type, width, height



# from PIL import Image
# import requests
# hrefs = ['http://farm4.staticflickr.com/3894/15008518202_b016d7d289_m.jpg','https://farm4.staticflickr.com/3920/15008465772_383e697089_m.jpg','https://farm4.staticflickr.com/3902/14985871946_86abb8c56f_m.jpg']
# RANGE = 5000
# for href in hrefs:
#     req  = requests.get(href,headers={'User-Agent':'Mozilla5.0(Google spider)','Range':'bytes=0-{}'.format(RANGE)})
#     im = getImageInfo(req.content)
# 
#     print(im)
# req = urllib2.Request("http://vn-sharing.net/forum/images/smilies/onion/ngai.gif", headers={"Range": "5000"})
# r = urllib2.urlopen(req)
# 
# f = open("D:\\Pictures\\1.jpg", "rb")
# print(getImageInfo(r.read()))
# Output: >> ('image/gif', 50, 50)
# print(getImageInfo(f.read()))