Phrogz Phrogz - 4 months ago 28
HTML Question

Unescape numeric XML Entities with Lua

What's a good implementation for unescaping numeric HTML/XML entities, e.g. `&#10;` (decimal) or `&#x0A;` (hexadecimal), and replacing them with the ASCII equivalent?

Expressed as a unit test:

-- The input mixes numeric character references with one unknown named entity
-- (&ok;), which must pass through untouched. The entity form of the literal is
-- reconstructed from the expected value asserted below.
local orig = "It&#39;s the &#34;end&#34; &ok;&#10;"
local fixd = unescape(orig) -- Implement this
assert( fixd == "It's the \"end\" &ok;\n" )

Answer

Here's a simple implementation that also handles the core named XML entities:

--- Decode the predefined XML entities and numeric character references.
-- Each kind of entity is handled by its own substitution pass, run in order.
-- Known limitation: because "&amp;" is decoded in a separate, final pass, text
-- such as "&#38;amp;" is decoded twice and yields "&" instead of "&amp;".
-- @param str string that may contain entities
-- @return the decoded string (a single value)
function unescape(str)
  -- Turn a code point into a one-byte string. Returning nil makes gsub keep
  -- the original entity text, so references that do not fit in a single byte
  -- are left alone instead of making string.char raise an error.
  local function to_char(code)
    if code and code <= 255 then
      return string.char(code)
    end
  end

  str = string.gsub( str, '&lt;', '<' )
  str = string.gsub( str, '&gt;', '>' )
  str = string.gsub( str, '&quot;', '"' )
  str = string.gsub( str, '&apos;', "'" )
  str = string.gsub( str, '&#(%d+);', function(n) return to_char(tonumber(n, 10)) end )
  -- %x (not %d) so that hexadecimal digits a-f / A-F are accepted as well.
  str = string.gsub( str, '&#x(%x+);', function(n) return to_char(tonumber(n, 16)) end )
  str = string.gsub( str, '&amp;', '&' ) -- Be sure to do this after all others
  return str
end

print(unescape("&#34;Hello&quot; &apos;World&#39;")) --> "Hello" 'World'

However, note that this fails for one pathological case: a numeric ampersand entity followed by the text amp;:

-- A numeric ampersand reference immediately followed by the literal text "amp;".
local sample = "Ampersand entity is &#38;amp;"
print(unescape(sample)) --> Ampersand entity is &
-- The result should actually be "Ampersand entity is &amp;"

We can fix this edge case by handling all entities at once, but the code gets a good bit uglier:

--- Decode XML entities in a single pass over the string.
-- Handles the five predefined named entities (lt, gt, amp, quot, apos) and
-- decimal (&#NN;) / hexadecimal (&#xHH;) character references. Because every
-- entity is replaced in the same gsub pass, decoded output is never scanned
-- again, so "&#38;amp;" correctly becomes "&amp;".
-- Anything that is not a recognised entity (unknown names, malformed numbers,
-- code points above 255) is left in the string unchanged.
-- @param str string that may contain entities
-- @return the decoded string (a single value)
function unescape(str)
  local map = { lt = "<", gt = ">", amp = "&", quot = '"', apos = "'" }
  local decoded = string.gsub( str, '&(#?)(%w+);', function(hash, body)
    -- No '#': a named entity. A nil result tells gsub to keep the match as is.
    if hash == "" then
      return map[body]
    end
    -- With '#': the body must be all decimal digits, or 'x' plus hex digits.
    local code
    local hex = string.match(body, "^x(%x+)$")
    if hex then
      code = tonumber(hex, 16)
    elseif string.match(body, "^%d+$") then
      code = tonumber(body, 10)
    end
    -- string.char only accepts 0..255; larger references are left untouched.
    if code and code <= 255 then
      return string.char(code)
    end
  end )
  return decoded
end

print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &amp;

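Finally, we can hoist the entity map and the replacement callback out of the function, so that they are created once rather than on every call, for a little more speed: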

-- Library functions cached in locals so each call avoids the table lookups.
local gsub, char, match = string.gsub, string.char, string.match
-- Replacement text for the five predefined XML named entities.
local entityMap  = {["lt"]="<",["gt"]=">",["amp"]="&",["quot"]='"',["apos"]="'"}

-- gsub callback. `hash` is "#" for a numeric reference and "" for a named
-- entity; `body` is the text between the prefix and the ';'.
-- Returning nil makes gsub keep the original match, which is how unknown
-- names, malformed numbers and out-of-range code points are left alone.
local entitySwap = function(hash, body)
  if hash == "" then
    return entityMap[body]
  end
  local code
  local hex = match(body, "^x(%x+)$")
  if hex then
    code = tonumber(hex, 16)
  elseif match(body, "^%d+$") then
    code = tonumber(body, 10)
  end
  -- string.char only accepts 0..255.
  if code and code <= 255 then
    return char(code)
  end
end

--- Decode XML named entities and numeric character references in one pass.
-- @param str string that may contain entities
-- @return the decoded string (a single value)
function unescape(str)
  -- The parentheses drop gsub's second result (the substitution count).
  return (gsub( str, '&(#?)(%w+);', entitySwap ))
end
Comments