TigerShark TigerShark - 2 months ago 11
Javascript Question

JavaScript: How to convert multi-byte string array to 32bits int array?

I have a string that holds UTF-32 code points (although the upper 16 bits will probably always be 0). Each comma-separated token is one of the four bytes of a character's code point, and the string contains the bytes of every character of a long text, one after another.
Note that the bytes are interpreted as signed ints before being turned into a string; I have no control over this.

// Provided: one signed byte per token, four tokens (most significant byte
// first) per code point.
var intEncodedBytesString = "0,0,0,-31,0,0,0,-15,0,0,0,-31"; // 3 chars: áñá

// Wanted: one unsigned code point per character.
var actualCodePoints = [225, 241, 225];


I need to turn intEncodedBytesString into the actualCodePoints array.
So far I came up with this:

// Decode the comma-separated signed bytes held in `intEncodedBytesString`
// into text, four bytes (most significant first) per code point. The decoded
// text is accumulated in `str`.
var intEncodedBytesStringArray = intEncodedBytesString.toString().split(',');
var i, str = '';
// Only complete four-byte groups are decoded. Without the floor, an empty or
// ragged input would make the loop run once more on missing tokens.
var charAmount = Math.floor(intEncodedBytesStringArray.length / 4);

for (i = 0; i < charAmount; i++) {
  var codePoint = 0;

  for (var j = 0; j < 4; j++) {
    var num = parseInt(intEncodedBytesStringArray[i * 4 + j], 10);
    if (num < 0) {
      // A signed byte becomes its unsigned value by adding 256. The offset is
      // the same for every byte, regardless of its position in the group.
      num += 256;
    }

    // Move the byte to its position: 24, 16, 8 and 0 bits for j = 0..3.
    codePoint += (num << (8 * (3 - j)));
  }

  // String.fromCodePoint throws a RangeError for values above 0x10FFFF.
  str += String.fromCodePoint(codePoint);
}


Is there a better, simpler and/or more efficient way of doing this?

I've seen dozens of answers and code snippets that deal with similar things, but nothing that addresses the problem that my input bytes are in a string of signed ints :S

Edit: an offset computed as `1 << 32` won't work for the highest code points, since in JavaScript `1 << 32` is 1 and not 2^32 (shift counts are taken modulo 32).

Answer

Since it's the nice simple UTF-32, yes, there's a simpler way: Just work in four-byte blocks. Also, the simple way to handle the possible negativity is (value + 256) % 256.

So:

/**
 * Decodes a comma-separated list of signed byte values into code points,
 * reading four bytes (most significant first) per code point.
 *
 * @param {string} encoded - e.g. "0,0,0,-31" for the single code point 225.
 * @returns {number[]} One unsigned 32-bit value per complete four-byte group.
 *   An empty string, or a trailing group with fewer than four bytes,
 *   contributes nothing.
 */
function decodeSignedUtf32Bytes(encoded) {
  var bytes = String(encoded).split(",").map(Number);
  var codePoints = [];
  // `i + 4 <= bytes.length` stops before an incomplete trailing group, which
  // would otherwise read undefined entries and produce NaN.
  for (var i = 0; i + 4 <= bytes.length; i += 4) {
    codePoints.push(
      // `<<` yields a signed 32-bit result, so a first byte of 128 or more
      // makes the sum negative; `>>> 0` reinterprets it as unsigned.
      ((((bytes[i]     + 256) % 256) << 24) +
       (((bytes[i + 1] + 256) % 256) << 16) +
       (((bytes[i + 2] + 256) % 256) << 8) +
       ((bytes[i + 3]  + 256) % 256)) >>> 0
    );
  }
  return codePoints;
}

var intEncodedBytesString = "0,0,0,-31,0,0,0,-15,0,0,0,-31"; //3 char
var actualCodePoints = decodeSignedUtf32Bytes(intEncodedBytesString);

Example with detailed explanation in comments:

/**
 * Decodes a comma-separated list of signed byte values into code points,
 * reading four bytes (most significant first) per code point.
 *
 * @param {string} encoded - e.g. "0,0,0,-31" for the single code point 225.
 * @returns {number[]} One unsigned 32-bit value per complete four-byte group.
 *   An empty string, or a trailing group with fewer than four bytes,
 *   contributes nothing.
 */
function decodeSignedUtf32Bytes(encoded) {
  // Get the bytes as numbers by splitting on commas and running the array
  // through Number to convert each token to a number.
  var bytes = String(encoded).split(",").map(Number);
  // Array the code points are collected in
  var codePoints = [];

  // Loop through the bytes building code points. The condition
  // `i + 4 <= bytes.length` stops before an incomplete trailing group (and
  // before the single empty token an empty string splits into), which would
  // otherwise read undefined entries and produce NaN.
  var i, cp;
  for (i = 0; i + 4 <= bytes.length; i += 4) {
    // (x + 256) % 256 will handle turning (for instance) -31 into 225
    // We shift the value for the first byte left 24 bits, the next byte 16 bits,
    // the next 8 bits, and don't shift the last one at all. Adding them all
    // together gives us the code point, which we push into the array.
    // `<<` yields a signed 32-bit result, so a first byte of 128 or more makes
    // the sum negative; the final `>>> 0` reinterprets it as unsigned.
    cp = ((((bytes[i]     + 256) % 256) << 24) +
          (((bytes[i + 1] + 256) % 256) << 16) +
          (((bytes[i + 2] + 256) % 256) << 8) +
          ((bytes[i + 3]  + 256) % 256)) >>> 0;
    codePoints.push(cp);
  }
  return codePoints;
}

// Starting point
var intEncodedBytesString = "0,0,0,-31,0,0,0,-15,0,0,0,-31"; //3 char
// Target array
var actualCodePoints = decodeSignedUtf32Bytes(intEncodedBytesString);

// Show the result
console.log(actualCodePoints);

// If the JavaScript engine supports it, show the string
if (String.fromCodePoint) { // ES2015+
  var str = String.fromCodePoint.apply(String, actualCodePoints);
  // The above could be
  // `let str = String.fromCodePoint(...actualCodePoints);`
  // on an ES2015+ engine
  console.log(str);
} else {
  console.log("(Your browser doesn't support String.fromCodePoint)");
}