I'm saving some strings from a third party into my database (postgres). Sometimes these strings are too long and need to be truncated to fit into the column in my table.
Occasionally I truncate the string in the middle of a multi-unit Unicode character (for example, between the two halves of a surrogate pair that encodes an emoji), which gives me a "broken" string that I cannot save into the database. I get the following error:
Unable to translate Unicode character \uD83D at index XXX to specified code page
I ended up using a modification of xanatos's answer here. The difference is that this version drops the last grapheme if adding it would make the result longer than length.
/// <summary>
/// Returns at most <paramref name="length"/> UTF-16 code units of <paramref name="str"/>, beginning at
/// <paramref name="startIndex"/>, without ever splitting a text element (a surrogate pair, or a base
/// character together with its combining marks).
/// </summary>
/// <remarks>
/// Leading text elements that start with a low surrogate or a combining/enclosing mark are skipped,
/// because <paramref name="startIndex"/> may point into the middle of a text element; skipped elements
/// are not counted against <paramref name="length"/>. The result is built from consecutive text
/// elements and ends at the first one that would not fit, so it may be shorter than
/// <paramref name="length"/>.
/// </remarks>
/// <param name="str">The source string.</param>
/// <param name="startIndex">Zero-based index in <paramref name="str"/> at which to start.</param>
/// <param name="length">Maximum number of UTF-16 code units in the result.</param>
/// <returns>A string of at most <paramref name="length"/> code units made of whole text elements.</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="startIndex"/> is negative or greater than the length of <paramref name="str"/>, or
/// <paramref name="length"/> is negative or extends past the end of <paramref name="str"/>.
/// </exception>
public static string UnicodeSafeSubstring(this string str, int startIndex, int length)
{
    if (str == null)
    {
        throw new ArgumentNullException(nameof(str));
    }

    if (startIndex < 0 || startIndex > str.Length)
    {
        throw new ArgumentOutOfRangeException(nameof(startIndex));
    }

    // Compare against the remaining length instead of computing startIndex + length,
    // so a very large length cannot overflow int and slip past this check.
    if (length < 0 || length > str.Length - startIndex)
    {
        throw new ArgumentOutOfRangeException(nameof(length));
    }

    if (length == 0)
    {
        return string.Empty;
    }

    var stringBuilder = new StringBuilder(length);
    var enumerator = StringInfo.GetTextElementEnumerator(str, startIndex);

    while (enumerator.MoveNext())
    {
        var grapheme = enumerator.GetTextElement();

        // Skip initial low surrogates / combining marks: they belong to a text element
        // that started before startIndex and would be meaningless on their own.
        if (stringBuilder.Length == 0)
        {
            if (char.IsLowSurrogate(grapheme[0]))
            {
                continue;
            }

            var cat = char.GetUnicodeCategory(grapheme, 0);
            if (cat == UnicodeCategory.NonSpacingMark
                || cat == UnicodeCategory.SpacingCombiningMark
                || cat == UnicodeCategory.EnclosingMark)
            {
                continue;
            }
        }

        // Stop at the first text element that does not fit. Continuing would let a later,
        // shorter element be appended and produce text that is not contiguous in the source.
        if (stringBuilder.Length + grapheme.Length > length)
        {
            break;
        }

        stringBuilder.Append(grapheme);

        if (stringBuilder.Length == length)
        {
            break;
        }
    }

    return stringBuilder.ToString();
}
}