KindDragon KindDragon - 1 month ago 16
C++ Question

How can I optimize this C++ code?

I'm trying to improve the performance of my function. The profiler points to the code in the inner loop. Can I improve the performance of that code, maybe by using SSE intrinsics?

void ConvertImageFrom_R16_FLOAT_To_R32_FLOAT(char* buffer, void* convertedData, DWORD width, DWORD height, UINT rowPitch)
{
struct SINGLE_FLOAT
{
union {
struct {
unsigned __int32 R_m : 23;
unsigned __int32 R_e : 8;
unsigned __int32 R_s : 1;
};
struct {
float r;
};
};
};
C_ASSERT(sizeof(SINGLE_FLOAT) == 4); // 4 bytes
struct HALF_FLOAT
{
unsigned __int16 R_m : 10;
unsigned __int16 R_e : 5;
unsigned __int16 R_s : 1;
};
C_ASSERT(sizeof(HALF_FLOAT) == 2);
SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;
for(DWORD j = 0; j< height; j++)
{
HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
for(DWORD i = 0; i< width; i++)
{
d->R_s = s->R_s;
d->R_e = s->R_e - 15 + 127;
d->R_m = s->R_m << (23-10);
d++;
s++;
}
}
}


Update:

Disassembly

; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01

TITLE Utils.cpp
.686P
.XMM
include listing.inc
.model flat

INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES

PUBLIC ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
; Function compile flags: /Ogtp
; COMDAT ?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z
; ----------------------------------------------------------------------
; Compiler-generated code for ConvertImageFrom_R16_FLOAT_To_R32_FLOAT.
; The symbols below are ebp-relative offsets of the stack arguments.
; tv83 shares offset 12 with _convertedData$: once convertedData has been
; loaded into esi, that argument slot is reused as the remaining-rows counter.
;
; Register roles inside the loops:
;   esi = d   (destination pointer, +4 per pixel)
;   ebx = start of the current source row (+rowPitch per row)
;   edx = s   (source pointer inside the row, +2 per pixel)
;   edi = pixels left in the current row
;   eax, ecx = scratch
; ----------------------------------------------------------------------
_TEXT SEGMENT
_buffer$ = 8 ; size = 4
tv83 = 12 ; size = 4
_convertedData$ = 12 ; size = 4
_width$ = 16 ; size = 4
_height$ = 20 ; size = 4
_rowPitch$ = 24 ; size = 4
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z PROC ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT, COMDAT

; 323 : {

push ebp
mov ebp, esp

; 343 : for(DWORD j = 0; j< height; j++)

; eax = height, esi = d; a height of zero skips both loops entirely.
mov eax, DWORD PTR _height$[ebp]
push esi
mov esi, DWORD PTR _convertedData$[ebp]
test eax, eax
je SHORT $LN4@ConvertIma

; 324 : union SINGLE_FLOAT {
; 325 : struct {
; 326 : unsigned __int32 R_m : 23;
; 327 : unsigned __int32 R_e : 8;
; 328 : unsigned __int32 R_s : 1;
; 329 : };
; 330 : struct {
; 331 : float r;
; 332 : };
; 333 : };
; 334 : C_ASSERT(sizeof(SINGLE_FLOAT) == 4);
; 335 : struct HALF_FLOAT
; 336 : {
; 337 : unsigned __int16 R_m : 10;
; 338 : unsigned __int16 R_e : 5;
; 339 : unsigned __int16 R_s : 1;
; 340 : };
; 341 : C_ASSERT(sizeof(HALF_FLOAT) == 2);
; 342 : SINGLE_FLOAT* d = (SINGLE_FLOAT*)convertedData;

; ebx = buffer (first source row); the row counter (height) is parked in
; the tv83 stack slot, overwriting the convertedData argument already in esi.
push ebx
mov ebx, DWORD PTR _buffer$[ebp]
push edi
mov DWORD PTR tv83[ebp], eax
$LL13@ConvertIma:

; 344 : {
; 345 : HALF_FLOAT* s = (HALF_FLOAT*)((char*)buffer + rowPitch * j);
; 346 : for(DWORD i = 0; i< width; i++)

; edi = width (reloaded every row), edx = s; a width of zero skips the row.
mov edi, DWORD PTR _width$[ebp]
mov edx, ebx
test edi, edi
je SHORT $LN5@ConvertIma
; npad comes from listing.inc; presumably padding to align the loop label below.
npad 1
$LL3@ConvertIma:

; 347 : {
; 348 : d->R_s = s->R_s;

; Bit-field merge ((new ^ old) & mask) ^ new: bit 31 comes from the source
; word shifted left by 16, bits 0-30 are kept from the CURRENT contents of
; [esi]. The destination is therefore read before it is written
; (read-modify-write), and the source word is loaded twice.
movzx ecx, WORD PTR [edx]
movzx eax, WORD PTR [edx]
shl ecx, 16 ; 00000010H
xor ecx, DWORD PTR [esi]
shl eax, 16 ; 00000010H
and ecx, 2147483647 ; 7fffffffH
xor ecx, eax
mov DWORD PTR [esi], ecx

; 349 : d->R_e = s->R_e - 15 + 127;

; Third load of the source word. eax = ((h >> 10) & 31) + 112, where
; 112 = 127 - 15, moved into bits 23-30. The xor/and/xor sequence replaces
; only the exponent field of ecx. Second store to the same destination dword.
movzx eax, WORD PTR [edx]
shr eax, 10 ; 0000000aH
and eax, 31 ; 0000001fH
add eax, 112 ; 00000070H
shl eax, 23 ; 00000017H
xor eax, ecx
and eax, 2139095040 ; 7f800000H
xor eax, ecx
mov DWORD PTR [esi], eax

; 350 : d->R_m = s->R_m << (23-10);

; Fourth load of the source word. The 10 mantissa bits move to bits 13-22;
; sign and exponent (mask ff800000H) are kept from eax. Third and final
; store to the same destination dword.
; NOTE(review): the repeated loads/stores are presumably kept because the
; compiler cannot prove that s and d never alias - confirm before relying on it.
movzx ecx, WORD PTR [edx]
and ecx, 1023 ; 000003ffH
shl ecx, 13 ; 0000000dH
and eax, -8388608 ; ff800000H
or ecx, eax
mov DWORD PTR [esi], ecx

; 351 : d++;

add esi, 4

; 352 : s++;

; Next source pixel; loop while pixels remain in this row.
add edx, 2
dec edi
jne SHORT $LL3@ConvertIma
$LN5@ConvertIma:

; 343 : for(DWORD j = 0; j< height; j++)

; Step to the next source row and count down the rows left in tv83.
add ebx, DWORD PTR _rowPitch$[ebp]
dec DWORD PTR tv83[ebp]
jne SHORT $LL13@ConvertIma
pop edi
pop ebx
$LN4@ConvertIma:
pop esi

; 353 : }
; 354 : }
; 355 : }

; ret 0 releases no argument bytes; the arguments stay on the stack for the caller.
pop ebp
ret 0
?ConvertImageFrom_R16_FLOAT_To_R32_FLOAT@@YAXPADPAXKKI@Z ENDP ; ConvertImageFrom_R16_FLOAT_To_R32_FLOAT
_TEXT ENDS

Answer

Accessing bitfields in memory can be really tricky, depending on the architecture, of course.

You might achieve better performance if you made a union of a float and a 32-bit integer and performed all decomposition and composition using local variables. That way the generated code could perform the entire operation using only processor registers.