stoyanov stoyanov - 21 days ago 5
C++ Question

Different behaviors algorithm when working with a UTF8 on different operating systems

Simple code of algorithm:

#include <iostream>
#include <string>

std::string::size_type GetLengthWithUTF(std::string &sValue);

int main()
{
std::string sTestValueUTF8 = "\xD0\xB6\xD0\xB6\xD0\xB6";
std::string sTestValueASCII = "\x67\x67\x67";
std::string sTestValueMIX = "\x67\x67\x67\xD0\xB6\xD0\xB6\xD0\xB6";
std::string::size_type iFuncResult = 0;

std::cout << "=========== START TEST ==========\n\n";

std::cout << "+TEST UTF8 STRING\n";
std::cout << "+----+Bytes of string (sTestValueUTF8.length()) = " << sTestValueUTF8.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueUTF8);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueUTF8 << "\")) = " << iFuncResult<< "\n\n";

std::cout << "+TEST ASCII STRING\n";
std::cout << "+----+Bytes of string (sTestValueASCII.length()) = " << sTestValueASCII.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueASCII);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueASCII << "\")) = " << iFuncResult<< "\n\n";

std::cout << "+TEST MIX STRING\n";
std::cout << "+----+Bytes of string (sTestValueMIX.length()) = " << sTestValueMIX.length() << "\n";
iFuncResult = GetLengthWithUTF(sTestValueMIX);
std::cout << "+----+Function result (GetLengthWithUTF(\"" << sTestValueMIX << "\")) = " << iFuncResult<< "\n\n";

std::cout << "\n=========== END TEST ==========\n\n";
}

std::string::size_type GetLengthWithUTF(std::string &sValue)
{
std::cout << " +----+START GetLengthWithUTF\n";
std::cout << " +Input string is: " << sValue << "\n";
std::string::size_type i;
std::cout << " +Start cycle\n";
int iCountUTF8characters = 0;
for (i = 0; i < sValue.length(); i++)
{
std::cout << " +----+Iteration N " << i << "\n";
std::cout << " +Current character is: " << sValue[i] << ", integer value = " << (int)sValue[i] << "\n";
if (sValue[i] > 127)
{
iCountUTF8characters++;
std::cout << " +----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: " << iCountUTF8characters << "\n";
}
else
{
std::cout << " +----+If statement (sValue[i] > 127) is false.\n";
}
}

std::cout << " +End cycle\n";
iCountUTF8characters = iCountUTF8characters / 2;
std::cout << " +Return sValue.length() - (iCountUTF8characters / 2) ---> " << sValue.length() << " - (" << iCountUTF8characters << " / 2) = " << (sValue.length() - (std::string::size_type)iCountUTF8characters) <<"\n";
std::cout << " +----+ASCIID GetLengthWithUTF\n";
return (sValue.length() - (std::string::size_type)iCountUTF8characters);
}


Console compile commands:


AIX 6

g++ -o test test.cpp


RHEL Server 6.7 Santiago

g++ -o test test.cpp


Microsoft Windows v10.0.14393

cl /EHsc test.cpp






Results:


AIX 6

=========== START TEST ==========

+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1
+----+Iteration N 1
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2
+----+Iteration N 2
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3
+----+Iteration N 3
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4
+----+Iteration N 4
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5
+----+Iteration N 5
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 6 - (3 / 2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 3

+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 3 - (0 / 2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3

+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 1
+----+Iteration N 4
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 2
+----+Iteration N 5
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 3
+----+Iteration N 6
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 4
+----+Iteration N 7
+Current character is: Ь integer value = 208
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 5
+----+Iteration N 8
+Current character is: ֬ integer value = 182
+----+If statement (sValue[i] > 127) is true, value of iCountUTF8characters is: 6
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 9 - (3 / 2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 6


=========== END TEST ==========


RHEL Server 6.7 Santiago

=========== START TEST ==========

+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 6 - (0 / 2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 6

+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 3 - (0 / 2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3

+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 6
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 7
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 8
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 9 - (0 / 2) = 9
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 9


=========== END TEST ==========


Microsoft Windows v10.0.14393

=========== START TEST ==========

+TEST UTF8 STRING
+----+Bytes of string (sTestValueUTF8.length()) = 6
+----+START GetLengthWithUTF
+Input string is: жжж
+Start cycle
+----+Iteration N 0
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 6 - (0 / 2) = 6
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("жжж")) = 6

+TEST ASCII STRING
+----+Bytes of string (sTestValueASCII.length()) = 3
+----+START GetLengthWithUTF
+Input string is: ggg
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 3 - (0 / 2) = 3
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("ggg")) = 3

+TEST MIX STRING
+----+Bytes of string (sTestValueMIX.length()) = 9
+----+START GetLengthWithUTF
+Input string is: gggжжж
+Start cycle
+----+Iteration N 0
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 1
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 2
+Current character is: g, integer value = 103
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 3
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 4
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 5
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 6
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 7
+Current character is: Ь integer value = -48
+----+If statement (sValue[i] > 127) is false.
+----+Iteration N 8
+Current character is: ֬ integer value = -74
+----+If statement (sValue[i] > 127) is false.
+End cycle
+Return sValue.length() - (iCountUTF8characters / 2) ---> 9 - (0 / 2) = 9
+----+ASCIID GetLengthWithUTF
+----+Function result (GetLengthWithUTF("gggжжж")) = 9


=========== END TEST ==========


The algorithm must calculate the number of characters in a string. As you can see from the results of the tests, it works correctly only under AIX.


I'll be glad if someone help me understand this absurd for me behavior of the algorithm for different operating systems. The algorithm was created under OS AIX. After migration from AIX to LINUX is found that there is a problem with it and I made more extensive tests whose results you see. My main question is how damn algorithm works under AIX. I can not explain it in any logical way.

Answer

It appears that the two kinds of system differ in the way they treat the sign of characters, which is allowed by the standard. Your AIX compiler treats chars as unsigned, while the other two systems treat them as signed.

On systems with unsigned characters the condition sValue[i] > 127 behaves exactly in the way that one would expect. However, the same expression never succeeds on systems with signed characters.

That is why you get negative numbers for characters with codes of 128 and above. For example, 208 becomes -48 when it is treated as a single-byte signed value.

You can fix this by forcing a conversion to unsigned, or by checking the eight's bit with a bit mask:

if (sValue[i] & 128) {
    ... // MSB is set
}