alexanderson alexanderson - 2 months ago 7
C Question

BOM endian in C

I understand that big-endian and little-endian encodings can be identified by a BOM at the start of the file, but I'm confused about how I can check for this in C, given a file.

00 00 FE FF -> UTF-32, big-endian
FF FE 00 00 -> UTF-32, little-endian
FE FF -> UTF-16, big-endian
FF FE -> UTF-16, little-endian


I have this piece of code to get the bytes from the file, but how do I know whether it's little-endian or big-endian UTF-16, assuming the file starts with a BOM of \xFF\xFE or \xFE\xFF?

#include <stdio.h>
#include <stdlib.h>

/*
 * Dump every byte of the file named by argv[1] to stdout, one lowercase
 * hexadecimal value per line.
 *
 * Returns 0 on success, or 1 on a usage error, a failure to open the
 * file, or a read error.
 */
int main(int argc, char *argv[]){
    if (argc != 2){
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return 1;
    }

    /* Binary mode: the bytes are inspected raw, so no text-mode
     * translation (line endings, end-of-file markers) is wanted. */
    FILE *f = fopen(argv[1], "rb");
    if (f == NULL){
        fprintf(stderr, "%s: failed to open file %s for reading\n",
                argv[0], argv[1]);
        return 1;
    }

    unsigned char c;
    while (fread(&c, sizeof c, 1, f) == 1){
        fprintf(stdout, "%x\n", c);
    }

    /* fread() returns 0 for both end-of-file and error; tell them apart. */
    int status = ferror(f) ? 1 : 0;
    fclose(f);
    return status;
}


And what would a file containing this BOM look like (either as bytes or as regular text)?
I hope someone can help. Thanks.

I'm confused about how to read the file and test whether the first bytes, which contain the BOM, indicate little endian or big endian. How do I do that?

Answer

Maybe there's a slicker way to do it than this, but it seems to work:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Report which Unicode byte-order mark (BOM), if any, starts the file
 * named by argv[1].
 *
 * Up to four leading bytes are read and compared against:
 *     FF FE 00 00 -> UTF-32LE        00 00 FE FF -> UTF-32BE
 *     FE FF       -> UTF-16BE        FF FE       -> UTF-16LE
 * The UTF-32 patterns are tested first because the UTF-32LE BOM begins
 * with the same two bytes as the UTF-16LE BOM.
 *
 * The result is printed on stdout. Returns 0 when two or four bytes were
 * classified (or a UTF-16 BOM was found in a three-byte read), and 1 on a
 * usage error, open failure, read error, or when too few bytes were read
 * to classify.
 */
int main(int argc, char *argv[])
{
    unsigned char c[4];

    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return 1;
    }

    /* Binary mode: a BOM is raw bytes, so text-mode translation of line
     * endings or end-of-file markers must not touch the data. */
    FILE *f = fopen(argv[1], "rb");

    if (f == NULL)
    {
        fprintf(stderr, "%s: failed to open file %s for reading\n",
                argv[0], argv[1]);
        return 1;
    }

    int status = 0;
    size_t nbytes = fread(c, sizeof(c[0]), sizeof(c), f);

    if (ferror(f))
    {
        /* A short count caused by an I/O error is not a short file. */
        fprintf(stderr, "%s: error reading file %s\n", argv[0], argv[1]);
        status = 1;
    }
    else if (nbytes == 4 && memcmp(c, "\xFF\xFE\x00\x00", 4) == 0)
    {
        printf("UTF-32LE\n");
    }
    else if (nbytes == 4 && memcmp(c, "\x00\x00\xFE\xFF", 4) == 0)
    {
        printf("UTF-32BE\n");
    }
    else if (nbytes >= 2 && memcmp(c, "\xFE\xFF", 2) == 0)
    {
        /* Only the first two bytes matter, so a 2-, 3- or 4-byte read
         * can all carry a UTF-16 BOM. */
        printf("UTF-16BE\n");
    }
    else if (nbytes >= 2 && memcmp(c, "\xFF\xFE", 2) == 0)
    {
        printf("UTF-16LE\n");
    }
    else if (nbytes == 4)
    {
        printf("Four random (non-UTF) bytes 0x%.2X, 0x%.2X, 0x%.2X, 0x%.2X\n",
               c[0], c[1], c[2], c[3]);
    }
    else if (nbytes == 2)
    {
        printf("Two random (non-UTF) bytes 0x%.2X and 0x%.2X\n", c[0], c[1]);
    }
    else
    {
        /* 0, 1, or 3 bytes with no recognisable BOM. */
        fprintf(stderr, "%s: Odd-ball data size %zu\n", argv[0], nbytes);
        status = 1;
    }

    fclose(f);
    return status;
}

I used a number of custom programs to create the data I tested it with, but the results were convincing enough to me:

$ cat chk.sh     
# For each utf-* file in the current directory: list it, dump it with odx
# (dropping the second line of that output), and print what utf61 reports.
for file in utf-*
do
    ls -l "$file"
    odx "$file" | sed 2d
    # Quote the command substitution so that a multi-word result stays a
    # single %s argument instead of making printf reuse its format string.
    printf 'File: %-12s - content: %s\n' "$file" "$(utf61 "$file")"
done
$ sh chk.sh
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-16BE
0x0000: FE FF 00 30                                       ...0
File: utf-16BE     - content: UTF-16BE
-rw-r--r-- 1 jleffler rd 2 Sep 19 15:01 utf-16BE.2
0x0000: FE FF                                             ..
File: utf-16BE.2   - content: UTF-16BE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-16LE
0x0000: FF FE 30 00                                       ..0.
File: utf-16LE     - content: UTF-16LE
-rw-r--r-- 1 jleffler rd 2 Sep 19 15:01 utf-16LE.2
0x0000: FF FE                                             ..
File: utf-16LE.2   - content: UTF-16LE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-32BE
0x0000: 00 00 FE FF                                       ....
File: utf-32BE     - content: UTF-32BE
-rw-r--r-- 1 jleffler rd 4 Sep 19 15:01 utf-32LE
0x0000: FF FE 00 00                                       ....
File: utf-32LE     - content: UTF-32LE
$
Comments