user98418468459 user98418468459 - 9 days ago 4
C# Question

Significant drop in performance of Math.Round on x64 platform

I've noticed a very significant (~15x) drop in performance when using Math.Round to convert double to int while targeting x64 compared to x86. I tested it on 64 bit Windows on Core i7 3770K. Can anyone reproduce it? Is there any good reason why this is the case? Maybe some weird boundary conditions?

Just for reference I compared

Math.Round
(Test1) with 2 approximations: conditional cast (Test2) and 6755399441055744 trick (Test3).

Running times are:

---------------------------
| | x86 | x64 |
|-------+--------+--------|
| Test1 | 0,0662 | 0,9975 |
| Test2 | 0,1517 | 0,1513 |
| Test3 | 0,1966 | 0,0978 |
---------------------------


Here is the benchmark code:

using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
namespace MathRoundTester
{
class Program
{
private const int IterationCount = 1000000;

private static int dummy;
static void Main(string[] args)
{
var data = new double[100];
var rand = new Random(0);
for (int i = 0; i < data.Length; ++i)
{
data[i] = rand.NextDouble() * int.MaxValue * 2 +
int.MinValue + rand.NextDouble();
}

dummy ^= Test1(data);
dummy ^= Test2(data);
dummy ^= Test3(data);
RecordTime(data, Test1);
RecordTime(data, Test2);
RecordTime(data, Test3);
Console.WriteLine(dummy);
Console.Read();
}
private static void RecordTime(double[] data, Func<double[], int> action)
{
GC.Collect();
GC.WaitForPendingFinalizers();
GC.Collect();

var sw = Stopwatch.StartNew();
dummy ^= action(data);
sw.Stop();
Console.WriteLine((sw.ElapsedTicks / (double)Stopwatch.Frequency).ToString("F4"));
}
private static int Test1(double[] data)
{
int d = 0;
for (int i = 0; i < IterationCount; ++i)
{
for (int j = 0; j < data.Length; ++j)
{
var x = data[j];
d ^= (int)Math.Round(x);
}
}
return d;
}
private static int Test2(double[] data)
{
int d = 0;
for (int i = 0; i < IterationCount; ++i)
{
for (int j = 0; j < data.Length; ++j)
{
var x = data[j];
d ^= x > 0 ? (int)(x + 0.5) : (int)(x - 0.5);
}
}
return d;
}
[StructLayout(LayoutKind.Explicit)]
private struct DoubleIntUnion
{
public DoubleIntUnion(double a)
{
Int = 0;
Double = a;
}
[FieldOffset(0)]
public double Double;
[FieldOffset(0)]
public int Int;
}
private static int Test3(double[] data)
{
int d = 0;
for (int i = 0; i < IterationCount; ++i)
{
for (int j = 0; j < data.Length; ++j)
{
var x = data[j];
d ^= new DoubleIntUnion(x + 6755399441055744.0).Int;
}
}
return d;
}
}
}


Update 2016-11-23:

Some time after AndreyAkinshin kindly posted a question on the dotnet/coreclr repo, it was added to the 1.2.0 milestone. So it seems that this issue is just an oversight and will be fixed.

Answer

Let's look at the asm of (int) Math.Round(data[j]).

LegacyJIT-x86:

01172EB0  fld         qword ptr [eax+edi*8+8]  
01172EB4  fistp       dword ptr [ebp-14h]

RyuJIT-x64:

`d7350617 c4e17b1044d010  vmovsd  xmm0,qword ptr [rax+rdx*8+10h]
`d735061e e83dce605f      call    clr!COMDouble::Round (`3695d460)
`d7350623 c4e17b2ce8      vcvttsd2si ebp,xmm0

Source of clr!COMDouble::Round:

clr!COMDouble::Round:
`3695d460 4883ec58        sub     rsp,58h
`3695d464 0f29742440      movaps  xmmword ptr [rsp+40h],xmm6
`3695d469 0f57c9          xorps   xmm1,xmm1
`3695d46c f2480f2cc0      cvttsd2si rax,xmm0
`3695d471 0f297c2430      movaps  xmmword ptr [rsp+30h],xmm7
`3695d476 0f28f0          movaps  xmm6,xmm0
`3695d479 440f29442420    movaps  xmmword ptr [rsp+20h],xmm8
`3695d47f f2480f2ac8      cvtsi2sd xmm1,rax
`3695d484 660f2ec1        ucomisd xmm0,xmm1
`3695d488 7a17            jp      clr!COMDouble::Round+0x41 (`3695d4a1)
`3695d48a 7515            jne     clr!COMDouble::Round+0x41 (`3695d4a1)
`3695d48c 0f28742440      movaps  xmm6,xmmword ptr [rsp+40h]
`3695d491 0f287c2430      movaps  xmm7,xmmword ptr [rsp+30h]
`3695d496 440f28442420    movaps  xmm8,xmmword ptr [rsp+20h]
`3695d49c 4883c458        add     rsp,58h
`3695d4a0 c3              ret
`3695d4a1 440f28c0        movaps  xmm8,xmm0
`3695d4a5 f2440f5805c23a7100 
            addsd xmm8,mmword ptr [clr!_real (`37070f70)] ds:`37070f70=3fe0000000000000
`3695d4ae 410f28c0        movaps  xmm0,xmm8
`3695d4b2 e821000000      call    clr!floor (`3695d4d8)
`3695d4b7 66410f2ec0      ucomisd xmm0,xmm8
`3695d4bc 0f28f8          movaps  xmm7,xmm0
`3695d4bf 7a06            jp      clr!COMDouble::Round+0x67 (`3695d4c7)
`3695d4c1 0f8465af3c00    je      clr! ?? ::FNODOBFM::`string'+0xdd8c4 (`36d2842c)
`3695d4c7 0f28ce          movaps  xmm1,xmm6
`3695d4ca 0f28c7          movaps  xmm0,xmm7
`3695d4cd ff1505067000    call    qword ptr [clr!_imp__copysign (`3705dad8)]
`3695d4d3 ebb7            jmp     clr!COMDouble::Round+0x2c (`3695d48c)

As you can see, LegacyJIT-x86 uses an extremely fast fld-fistp pair; according to the Instruction tables by Agner Fog, we have the following numbers for Haswell:

Instruction | Latency | Reciprocal throughput
------------|---------|----------------------
FLD m32/64  | 3       | 0.5
FIST(P) m   | 7       | 1

RyuJIT-x64 directly calls clr!COMDouble::Round (LegacyJIT-x64 do the same). You can find source code for this method in the dotnet/coreclr repo. If you are working with release-1.0.0, you need floatnative.cpp:

#if defined(_TARGET_X86_)
__declspec(naked)
double __fastcall COMDouble::Round(double d)
{
    LIMITED_METHOD_CONTRACT;

    __asm {
        fld QWORD PTR [ESP+4]
        frndint
        ret 8
    }
}

#else // !defined(_TARGET_X86_)
FCIMPL1_V(double, COMDouble::Round, double d) 
    FCALL_CONTRACT;

    double tempVal;
    double flrTempVal;
    // If the number has no fractional part do nothing
    // This shortcut is necessary to workaround precision loss in borderline cases on some platforms
    if ( d == (double)(__int64)d )
        return d;
    tempVal = (d+0.5);
    //We had a number that was equally close to 2 integers. 
    //We need to return the even one.
    flrTempVal = floor(tempVal);
    if (flrTempVal==tempVal) {
        if (0 != fmod(tempVal, 2.0)) {
            flrTempVal -= 1.0;
        }
    }
    flrTempVal = _copysign(flrTempVal, d);
    return flrTempVal;
FCIMPLEND
#endif // defined(_TARGET_X86_)

If you are working with the master branch, you could find a similar code in floatdouble.cpp.

FCIMPL1_V(double, COMDouble::Round, double x)
    FCALL_CONTRACT;

    // If the number has no fractional part do nothing
    // This shortcut is necessary to workaround precision loss in borderline cases on some platforms
    if (x == (double)((INT64)x)) {
        return x;
    }

    // We had a number that was equally close to 2 integers.
    // We need to return the even one.

    double tempVal = (x + 0.5);
    double flrTempVal = floor(tempVal);

    if ((flrTempVal == tempVal) && (fmod(tempVal, 2.0) != 0)) {
        flrTempVal -= 1.0;
    }

    return _copysign(flrTempVal, x);
FCIMPLEND

It seems that the full .NET Framework uses the same logic.

Thus, (int)Math.Round really works much faster on x86 than on x64 because of a difference in the internal implementations of different JIT compilers. Note that this behavior can be changed in the future.

By the way, you could write a small and reliable benchmark with help of BenchmarkDotNet:

[LegacyJitX86Job, LegacyJitX64Job, RyuJitX64Job]
public class MathRoundBenchmarks
{
    private const int N = 100;
    private double[] data;

    [Setup]
    public void Setup()
    {
        var rand = new Random(0);
        data = new double[N];
        for (int i = 0; i < data.Length; ++i)
        {
            data[i] = rand.NextDouble() * int.MaxValue * 2 +
                      int.MinValue + rand.NextDouble();
        }
    }

    [Benchmark(OperationsPerInvoke = N)]
    public int MathRound()
    {
        int d = 0;
        for (int i = 0; i < data.Length; ++i)
            d ^= (int) Math.Round(data[i]);
        return d;
    }
}

Results:

BenchmarkDotNet.Core=v0.9.9.0
OS=Microsoft Windows NT 6.2.9200.0
Processor=Intel(R) Core(TM) i7-4702MQ CPU 2.20GHz, ProcessorCount=8
Frequency=2143475 ticks, Resolution=466.5321 ns, Timer=TSC
CLR=MS.NET 4.0.30319.42000, Arch=64-bit RELEASE [RyuJIT]
GC=Concurrent Workstation
JitModules=clrjit-v4.6.1586.0

Type=MathRoundBenchmarks  Mode=Throughput

    Method | Platform |       Jit |     Median |    StdDev |
---------- |--------- |---------- |----------- |---------- |
 MathRound |      X64 | LegacyJit | 12.8640 ns | 0.2796 ns |
 MathRound |      X64 |    RyuJit | 13.4390 ns | 0.4365 ns |
 MathRound |      X86 | LegacyJit |  1.0278 ns | 0.0373 ns |
Comments