beshio beshio - 1 month ago 27
Swift Question

Is String.cString(using: String.Encoding.utf16) working correctly?

I'm seeing strange behavior w/ string.cString(using: String.Encoding.utf16).
What I'm trying to do is convert a Swift String to UTF-16 as an UnsafePointer<UInt16>. However, String.Encoding.utf16 seems to terminate the conversion after it outputs the first '\0'.
The same logic works for utf-8 or shiftJIS, since they don't produce a NUL byte in the middle of the output.

Here is the code.

// Converts one sample string with four different approaches -- (A) UTF-8,
// (B) Shift JIS, (C) UTF-16 through cString(using:), (D) UTF-16 by iterating
// str.utf16 -- dumping the encoded bytes and printing the round-tripped string
// for each. The trailing comments record the results the author observed.
func test() {
let str = "helloはろー"
// Number of Characters in str; only used to size the scratch buffer in (D).
let cnt = str.characters.count
// (A) utf-8 ==> works fine
let utf8 = str.cString(using: String.Encoding.utf8)!
// NOTE(review): the pointer built inside the closure is returned and used after
// the closure has finished. The withUnsafeBufferPointer(_:) documentation quoted
// further down this page says the pointer is valid only during the closure.
let int8p = utf8.withUnsafeBufferPointer {
UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: Int8.self)
}
// A length of 0 makes dumpmem fall back to strlen() to find the end.
dumpmem(0, int8p)
let str1 = String(cString: int8p)
print(str1) // helloはろー
// (B) shift-JIS ==> works fine
let sjis = str.cString(using: String.Encoding.shiftJIS)!
// Same escaping-pointer pattern as (A), viewed as UInt8 instead of Int8.
let uint8p = sjis.withUnsafeBufferPointer {
UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt8.self)
}
dumpmem(0, uint8p)
let str2 = String(sjisptr: uint8p)!
print(str2) // helloはろー
// (C) utf-16 ==> doesn't work nicely
let utf16 = str.cString(using: String.Encoding.utf16)!
// Same escaping-pointer pattern as (A), viewed as UInt16 code units.
let uint16p = utf16.withUnsafeBufferPointer {
UnsafeRawPointer($0.baseAddress!).assumingMemoryBound(to: UInt16.self)
}
// The dump length is given explicitly here (Character count * 2 bytes) because
// dumpmem's strlen() fallback would stop at the first zero byte.
dumpmem(Int32(str.characters.count*2), uint16p) // only top char (='h' converted)
let str3 = String(utf16ptr: uint16p)!
print(str3) // h+garbage ...
// (D) utf-16 w/ iteration ==> works fine.
// Scratch buffer of cnt*4 UInt16 slots, filled by hand from str.utf16.
let u16s = UnsafeMutablePointer<UInt16>.allocate(capacity: cnt*4)
var len = 0
for code in str.utf16 {
u16s[len] = code
len += 1
}
// Terminate with a zero code unit so String(utf16ptr:) can find the end.
u16s[len] = 0
// len counts UInt16 code units, so len*2 is the number of bytes to dump.
dumpmem(Int32(len*2), u16s)
let str4 = String(utf16ptr: u16s)!
u16s.deallocate(capacity: cnt*4)
print(str4) // helloはろー
}


and support functions follow.

/*
 * Dump a memory region to stdout, one byte per line, formatted as
 * "[index] : decimal : 0xhex", followed by " : <char>" when isascii()
 * accepts the byte.
 *
 * len - number of bytes to dump; 0 means "treat ptr as a NUL-terminated
 *       C string and dump strlen(ptr) bytes".
 * ptr - start of the region to dump.
 */
void dumpmem(int len, const void *ptr) {
    const unsigned char *bytes = (const unsigned char *)ptr;
    int count = len;
    int idx = 0;

    /* No explicit length: measure the region as a C string. */
    if (count == 0) {
        count = (int)strlen((const char *)bytes);
    }

    while (idx < count) {
        unsigned char value = bytes[idx];

        printf("[%d] : %d : 0x%x", idx, value, value);
        if (isascii(value)) {
            printf(" : %c", value);
        }
        printf("\n");
        idx++;
    }
}
// Build a String from a zero-terminated Shift JIS byte sequence (UInt8 pointer).
extension String {
    /// Decodes the bytes starting at `sjisptr`, up to but not including the
    /// first zero byte, as Shift JIS.
    ///
    /// - Parameter sjisptr: Start of a zero-terminated byte sequence.
    /// - Returns: `nil` when `NSString` cannot decode the bytes as Shift JIS.
    init?(sjisptr: UnsafePointer<UInt8>) {
        // Scan forward to the terminating zero byte to learn the payload size.
        var byteCount = 0
        while sjisptr.advanced(by: byteCount).pointee != 0 {
            byteCount += 1
        }

        let payload = Data(bytes: sjisptr, count: byteCount)
        guard let decoded = NSString(data: payload, encoding: String.Encoding.shiftJIS.rawValue) else {
            return nil
        }
        self = decoded as String
    }
}
// create string from UTF-16 pointer=UInt16
extension String {
// Builds a String from the UInt16 code units starting at `utf16ptr`, up to but
// not including the first zero code unit. The bytes are decoded as
// little-endian UTF-16; the initializer fails (nil) when NSString cannot
// decode them.
init?(utf16ptr: UnsafePointer<UInt16>) {
// Count code units up to the terminating zero.
var len = 0
while (utf16ptr[len] != 0) {
len += 1
}
// Double the count: len now holds the number of bytes (2 per UInt16 unit).
len += len
let data = Data(bytes: utf16ptr, count: len)
if let ns = NSString(data: data, encoding: String.Encoding.utf16LittleEndian.rawValue) {
self = ns as String
} else {
return nil
}
}
}


Since utf-16 conversion doesn't work nicely w/ cString(using: String.Encoding.utf16) (C), I ended up iterating (scan) string as shown in (D) above.

I may be missing something. Can anyone explain why this is happening ?

Answer

Is String.cString(using: String.Encoding.utf16) working correctly?

The answer is NO.

The word "CString" means NUL terminated byte sequence. And NUL always means a single byte 0x00 in CString.

As you know, a UTF-16 representation may contain many 0x00 bytes, so such encodings cannot be used as a CString.

And other points to fix your code:

  • You should not take the bufferPointer or its baseAddress out of the closure passed to withUnsafeBufferPointer(_:).

withUnsafeBufferPointer(_:)

Parameters

body

A closure with an UnsafeBufferPointer parameter that points to the contiguous storage for the array. If body has a return value, it is used as the return value for the withUnsafeBufferPointer(_:) method. The pointer argument is valid only for the duration of the closure’s execution.

And in many cases, you have no need to use withUnsafeBufferPointer(_:).

  • To get the size of UTF-16 representation, you should use str.utf16.count, not str.characters.count.

  • Swift 3's String has an initializer, init(cString:encoding:). You have no need to re-implement it.

  • len += len is sort of "unreadable". Use len * MemoryLayout<UInt16>.size or at least len * 2.

Try this code:

// Memory dump helper, Swift version.
/// Prints `len` bytes starting at `ptr`, one byte per line, formatted as
/// "[index] : decimal : 0xhex", followed by " : <char>" when `isascii`
/// accepts the byte.
///
/// - Parameters:
///   - len: Number of bytes to dump; 0 means "measure `ptr` with `strlen`".
///   - ptr: Start of the region to dump.
func dumpmem(_ len: Int, _ ptr: UnsafeRawPointer) {
    let byteCount: Int
    if len == 0 {
        // No explicit length: treat the region as a NUL-terminated C string.
        byteCount = Int(strlen(ptr.assumingMemoryBound(to: CChar.self)))
    } else {
        byteCount = len
    }

    for offset in 0..<byteCount {
        let byte = ptr.load(fromByteOffset: offset, as: UInt8.self)
        var line = String(format: "[%d] : %2$d : 0x%2$x", offset, byte)
        if isascii(Int32(byte)) != 0 {
            line += String(format: " : %c", byte)
        }
        print(line)
    }
}
// If you use UnsafePointer<CChar>, this can be simplified.
extension String {
    /// Creates a string from a NUL-terminated Shift JIS byte sequence.
    ///
    /// - Parameter sjisptr: Start of the NUL-terminated byte sequence.
    /// - Returns: Whatever `init(cString:encoding:)` yields for `.shiftJIS`,
    ///   including `nil` when that initializer fails.
    init?(sjisptr: UnsafePointer<UInt8>) {
        // `init(cString:encoding:)` takes CChar, so view the same bytes
        // through a CChar-typed pointer.
        let rawBytes = UnsafeRawPointer(sjisptr)
        let cString = rawBytes.assumingMemoryBound(to: CChar.self)
        self.init(cString: cString, encoding: .shiftJIS)
    }
}
// Build a String from UTF-16 code units terminated by U+0000.
extension String {
    /// Decodes the code units starting at `utf16ptr`, up to but not including
    /// the first zero code unit, as little-endian UTF-16.
    ///
    /// - Parameter utf16ptr: Start of a U+0000-terminated run of code units.
    /// - Returns: Whatever `init(data:encoding:)` yields, including `nil` when
    ///   that initializer fails.
    init?(utf16ptr: UnsafePointer<UInt16>) {
        // U+0000 marks the end of the run.
        var unitCount = 0
        while utf16ptr.advanced(by: unitCount).pointee != 0 {
            unitCount += 1
        }

        // Data is sized in bytes, not code units.
        let byteCount = unitCount * MemoryLayout<UInt16>.size
        let payload = Data(bytes: utf16ptr, count: byteCount)
        self.init(data: payload, encoding: .utf16LittleEndian)
    }
}
/// Round-trips one mixed ASCII/Japanese string through UTF-8, Shift JIS and
/// two UTF-16 paths, dumping the encoded bytes and printing the decoded
/// string for each.
func test() {
    let str = "helloはろー"

    // (A) UTF-8: the [CChar] array is handed directly to the pointer parameters.
    let utf8Bytes = str.cString(using: .utf8)!
    dumpmem(0, utf8Bytes)
    let fromUTF8 = String(cString: utf8Bytes)
    print(fromUTF8)  // helloはろー

    // (B) Shift JIS: the UInt8 view is used only inside the closure.
    let sjisBytes = str.cString(using: .shiftJIS)!
    sjisBytes.withUnsafeBufferPointer { buffer in
        let bytes = UnsafeRawPointer(buffer.baseAddress!).assumingMemoryBound(to: UInt8.self)
        dumpmem(0, bytes)
        let fromSJIS = String(sjisptr: bytes)!
        print(fromSJIS)  // helloはろー
    }

    // (C) UTF-16 as raw little-endian bytes, terminated with U+0000.
    var utf16Data = str.data(using: .utf16LittleEndian)!
    utf16Data.append(contentsOf: [0,0]) // two zero bytes == one U+0000 terminator
    let dumpLength = utf16Data.count
    utf16Data.withUnsafeBytes { (units: UnsafePointer<UInt16>) -> Void in
        dumpmem(dumpLength, units)
        let fromData = String(utf16ptr: units)!
        print(fromData)
    }

    // (D) UTF-16 copied by hand into a buffer with room for the terminator.
    let codeUnits = str.utf16
    let capacity = codeUnits.count + 1
    let buffer = UnsafeMutablePointer<UInt16>.allocate(capacity: capacity)
    defer { buffer.deallocate(capacity: capacity) }
    for (slot, unit) in codeUnits.enumerated() {
        buffer[slot] = unit
    }
    buffer[codeUnits.count] = 0 // U+0000 terminator
    dumpmem(capacity * MemoryLayout<UInt16>.size, buffer)
    let fromBuffer = String(utf16ptr: buffer)!
    print(fromBuffer)  // helloはろー
}