C++ 高度优化的矩阵乘法代码在 MSVC 和 GCC 之间的性能差异
声明:本页面是 StackOverflow 热门问题的中英对照翻译,遵循 CC BY-SA 4.0 协议。如果您需要使用它,必须同样遵循 CC BY-SA 许可,注明原文地址和作者信息,并将它归于原作者(不是我):StackOverflow
原文地址: http://stackoverflow.com/questions/21134279/
Warning: these are provided under the CC BY-SA 4.0 license. You are free to use/share it, but you must attribute it to the original authors (not me):
StackOverFlow
Difference in performance between MSVC and GCC for highly optimized matrix multiplication code
提问by Z boson
I'm seeing a big difference in performance between code compiled in MSVC (on Windows) and GCC (on Linux) for an Ivy Bridge system. The code does dense matrix multiplication. I'm getting 70% of the peak flops with GCC and only 50% with MSVC. I think I may have isolated the difference to how they both convert the following three intrinsics.
对于一个 Ivy Bridge 系统,我发现用 MSVC(在 Windows 上)和 GCC(在 Linux 上)编译的代码之间存在很大的性能差异。该代码执行稠密矩阵乘法。我用 GCC 达到了峰值浮点性能 (flops) 的 70%,而用 MSVC 只有 50%。我想我可能已经把差异定位到了两者如何转换以下三个内在函数上。
__m256 breg0 = _mm256_loadu_ps(&b[8*i]);
tmp0 = _mm256_add_ps(_mm256_mul_ps(areg0,breg0), tmp0);
GCC does this
GCC 这样做
vmovups ymm9, YMMWORD PTR [rax-256]
vmulps ymm9, ymm0, ymm9
vaddps ymm8, ymm8, ymm9
MSVC does this
MSVC 这样做
vmulps ymm1, ymm2, YMMWORD PTR [rax-256]
vaddps ymm3, ymm1, ymm3
Could somebody please explain to me if and why these two solutions could give such a big difference in performance?
有人能向我解释一下,这两种方案是否会、以及为什么会产生如此大的性能差异吗?
Despite MSVC using one less instruction it ties the load to the mult and maybe that makes it more dependent (maybe the load can't be done out of order)? I mean Ivy Bridge can do one AVX load, one AVX mult, and one AVX add in one clock cycle but this requires each operation to be independent.
尽管 MSVC 少用了一条指令,但它把加载和乘法绑在一起,也许这增加了依赖性(也许加载不能乱序完成)?我的意思是,Ivy Bridge 可以在一个时钟周期内完成一次 AVX 加载、一次 AVX 乘法和一次 AVX 加法,但这要求每个操作都是独立的。
Maybe the problem lies elsewhere? You can see the full assembly code for GCC and MSVC for the innermost loop below. You can see the C++ code for the loop here Loop unrolling to achieve maximum throughput with Ivy Bridge and Haswell
也许问题出在别处?您可以在下面看到 GCC 和 MSVC 为最内层循环生成的完整汇编代码。该循环的 C++ 代码可以在这里查看:Loop unrolling to achieve maximum throughput with Ivy Bridge and Haswell
g++ -S -masm=intel matrix.cpp -O3 -mavx -fopenmp
.L4:
vbroadcastss ymm0, DWORD PTR [rcx+rdx*4]
add rdx, 1
add rax, 256
vmovups ymm9, YMMWORD PTR [rax-256]
vmulps ymm9, ymm0, ymm9
vaddps ymm8, ymm8, ymm9
vmovups ymm9, YMMWORD PTR [rax-224]
vmulps ymm9, ymm0, ymm9
vaddps ymm7, ymm7, ymm9
vmovups ymm9, YMMWORD PTR [rax-192]
vmulps ymm9, ymm0, ymm9
vaddps ymm6, ymm6, ymm9
vmovups ymm9, YMMWORD PTR [rax-160]
vmulps ymm9, ymm0, ymm9
vaddps ymm5, ymm5, ymm9
vmovups ymm9, YMMWORD PTR [rax-128]
vmulps ymm9, ymm0, ymm9
vaddps ymm4, ymm4, ymm9
vmovups ymm9, YMMWORD PTR [rax-96]
vmulps ymm9, ymm0, ymm9
vaddps ymm3, ymm3, ymm9
vmovups ymm9, YMMWORD PTR [rax-64]
vmulps ymm9, ymm0, ymm9
vaddps ymm2, ymm2, ymm9
vmovups ymm9, YMMWORD PTR [rax-32]
cmp esi, edx
vmulps ymm0, ymm0, ymm9
vaddps ymm1, ymm1, ymm0
jg .L4
MSVC /FAc /O2 /openmp /arch:AVX ...
vbroadcastss ymm2, DWORD PTR [r10]
lea rax, QWORD PTR [rax+256]
lea r10, QWORD PTR [r10+4]
vmulps ymm1, ymm2, YMMWORD PTR [rax-320]
vaddps ymm3, ymm1, ymm3
vmulps ymm1, ymm2, YMMWORD PTR [rax-288]
vaddps ymm4, ymm1, ymm4
vmulps ymm1, ymm2, YMMWORD PTR [rax-256]
vaddps ymm5, ymm1, ymm5
vmulps ymm1, ymm2, YMMWORD PTR [rax-224]
vaddps ymm6, ymm1, ymm6
vmulps ymm1, ymm2, YMMWORD PTR [rax-192]
vaddps ymm7, ymm1, ymm7
vmulps ymm1, ymm2, YMMWORD PTR [rax-160]
vaddps ymm8, ymm1, ymm8
vmulps ymm1, ymm2, YMMWORD PTR [rax-128]
vaddps ymm9, ymm1, ymm9
vmulps ymm1, ymm2, YMMWORD PTR [rax-96]
vaddps ymm10, ymm1, ymm10
dec rdx
jne SHORT $LL3@AddDot4x4_
EDIT:
编辑:
I benchmark the code by calculating the total floating point operations as 2.0*n^3, where n is the width of the square matrix, and dividing by the time measured with omp_get_wtime(). I repeat the loop several times. In the output below I repeated it 100 times.
我对代码进行基准测试的方法是:将总浮点运算次数计为 2.0*n^3(其中 n 是方阵的宽度),再除以用 omp_get_wtime() 测得的时间。我会把循环重复若干次;在下面的输出中,我重复了 100 次。
Output from MSVC2012 on an Intel Xeon E5 1620 (Ivy Bridge); turbo for all cores is 3.7 GHz
以下是 MSVC2012 在 Intel Xeon E5 1620 (Ivy Bridge) 上的输出;全核 turbo 频率为 3.7 GHz
maximum GFLOPS = 236.8 = (8-wide SIMD) * (1 AVX mult + 1 AVX add) * (4 cores) * 3.7 GHz
n 64, 0.02 ms, GFLOPs 0.001, GFLOPs/s 23.88, error 0.000e+000, efficiency/core 40.34%, efficiency 10.08%, mem 0.05 MB
n 128, 0.05 ms, GFLOPs 0.004, GFLOPs/s 84.54, error 0.000e+000, efficiency/core 142.81%, efficiency 35.70%, mem 0.19 MB
n 192, 0.17 ms, GFLOPs 0.014, GFLOPs/s 85.45, error 0.000e+000, efficiency/core 144.34%, efficiency 36.09%, mem 0.42 MB
n 256, 0.29 ms, GFLOPs 0.034, GFLOPs/s 114.48, error 0.000e+000, efficiency/core 193.37%, efficiency 48.34%, mem 0.75 MB
n 320, 0.59 ms, GFLOPs 0.066, GFLOPs/s 110.50, error 0.000e+000, efficiency/core 186.66%, efficiency 46.67%, mem 1.17 MB
n 384, 1.39 ms, GFLOPs 0.113, GFLOPs/s 81.39, error 0.000e+000, efficiency/core 137.48%, efficiency 34.37%, mem 1.69 MB
n 448, 3.27 ms, GFLOPs 0.180, GFLOPs/s 55.01, error 0.000e+000, efficiency/core 92.92%, efficiency 23.23%, mem 2.30 MB
n 512, 3.60 ms, GFLOPs 0.268, GFLOPs/s 74.63, error 0.000e+000, efficiency/core 126.07%, efficiency 31.52%, mem 3.00 MB
n 576, 3.93 ms, GFLOPs 0.382, GFLOPs/s 97.24, error 0.000e+000, efficiency/core 164.26%, efficiency 41.07%, mem 3.80 MB
n 640, 5.21 ms, GFLOPs 0.524, GFLOPs/s 100.60, error 0.000e+000, efficiency/core 169.93%, efficiency 42.48%, mem 4.69 MB
n 704, 6.73 ms, GFLOPs 0.698, GFLOPs/s 103.63, error 0.000e+000, efficiency/core 175.04%, efficiency 43.76%, mem 5.67 MB
n 768, 8.55 ms, GFLOPs 0.906, GFLOPs/s 105.95, error 0.000e+000, efficiency/core 178.98%, efficiency 44.74%, mem 6.75 MB
n 832, 10.89 ms, GFLOPs 1.152, GFLOPs/s 105.76, error 0.000e+000, efficiency/core 178.65%, efficiency 44.66%, mem 7.92 MB
n 896, 13.26 ms, GFLOPs 1.439, GFLOPs/s 108.48, error 0.000e+000, efficiency/core 183.25%, efficiency 45.81%, mem 9.19 MB
n 960, 16.36 ms, GFLOPs 1.769, GFLOPs/s 108.16, error 0.000e+000, efficiency/core 182.70%, efficiency 45.67%, mem 10.55 MB
n 1024, 17.74 ms, GFLOPs 2.147, GFLOPs/s 121.05, error 0.000e+000, efficiency/core 204.47%, efficiency 51.12%, mem 12.00 MB
采纳答案by iwolf
Since we've covered the alignment issue, I would guess it's this: http://en.wikipedia.org/wiki/Out-of-order_execution
既然我们已经讨论过对齐问题,我猜原因是这个:http://en.wikipedia.org/wiki/Out-of-order_execution
Since g++ issues a standalone load instruction, your processor can reorder the instructions to be pre-fetching the next data that will be needed while also adding and multiplying. MSVC throwing a pointer at mul makes the load and mul tied to the same instruction, so changing the execution order of the instructions doesn't help anything.
由于 g++ 发出的是独立的加载指令,您的处理器可以重排这些指令,在做加法和乘法的同时预取下一批需要的数据。而 MSVC 把内存操作数直接塞给 mul,使得加载和乘法绑定在同一条指令上,因此改变指令的执行顺序帮不上什么忙。
EDIT: Intel's server(s) with all the docs are less angry today, so here's more research on why out of order execution is (part of) the answer.
编辑:英特尔存放全部文档的服务器今天不那么闹脾气了,所以这里有更多关于为什么乱序执行是(部分)答案的研究。
First of all, it looks like your comment is completely right about it being possible for the MSVC version of the multiplication instruction to decode to separate μ-ops that can be optimized by a CPU's out-of-order engine. The fun part here is that modern microcode sequencers are programmable, so the actual behavior is both hardware and firmware dependent. The differences in the generated assembly seem to be from GCC and MSVC each trying to fight different potential bottlenecks. The GCC version tries to give leeway to the out-of-order engine (as we've already covered). However, the MSVC version ends up taking advantage of a feature called "micro-op fusion". This is because of the μ-op retirement limitations: the end of the pipeline can only retire 3 μ-ops per tick. Micro-op fusion, in specific cases, takes two μ-ops that must be done on two different execution units (i.e. memory read and arithmetic) and ties them to a single μ-op for most of the pipeline. The fused μ-op is only split into the two real μ-ops right before execution unit assignment. After the execution, the ops are fused again, allowing them to be retired as one.
首先,您的评论似乎完全正确:MSVC 版本的乘法指令有可能被解码为单独的微操作(μ-op),从而可以由 CPU 的乱序引擎进行优化。这里有趣的地方在于,现代微码排序器是可编程的,所以实际行为同时取决于硬件和固件。生成的汇编的差异似乎源于 GCC 和 MSVC 各自试图对抗不同的潜在瓶颈。GCC 版本试图给乱序引擎留出余地(如前所述)。而 MSVC 版本最终利用了一种称为“微操作融合”(micro-op fusion)的特性。原因在于微操作的退役限制:流水线末端每个时钟周期最多只能退役 3 个微操作。在特定情况下,微操作融合会把两个必须在不同执行单元上完成的微操作(例如内存读取和算术运算)在流水线的大部分阶段绑定为单个微操作;融合的微操作只在分派到执行单元之前才被拆分为两个真正的微操作。执行之后,它们再次融合,从而可以作为一个整体退役。
The out of order engine only sees the fused μ-op, so it can't pull the load op away from the multiplication. This causes the pipeline to hang while waiting for the next operand to finish its bus ride.
乱序引擎只能看到融合后的微操作,因此无法把加载操作从乘法中分离出来提前执行。这会导致流水线在等待下一个操作数完成其总线行程时停顿。
ALL THE LINKS!!!: http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf
所有链接!!!:http://download-software.intel.com/sites/default/files/managed/71/2e/319433-017.pdf
http://www.agner.org/optimize/microarchitecture.pdf
http://www.agner.org/optimize/microarchitecture.pdf
http://www.agner.org/optimize/optimizing_assembly.pdf
http://www.agner.org/optimize/optimizing_assembly.pdf
http://www.agner.org/optimize/instruction_tables.ods (NOTE: Excel complains that this spreadsheet is partially corrupted or otherwise sketchy, so open at your own risk. It doesn't seem to be malicious, though, and according to the rest of my research, Agner Fog is awesome. After I opted in to the Excel recovery step, I found it full of tons of great data.)
http://www.agner.org/optimize/instruction_tables.ods(注意:Excel 提示此电子表格部分损坏或存在可疑之处,打开需自担风险。不过它似乎并无恶意,而且根据我其余的研究,Agner Fog 非常出色。在我选择执行 Excel 的恢复步骤后,发现里面满是大量很棒的数据。)
http://www.syncfusion.com/Content/downloads/ebook/Assembly_Language_Succinctly.pdf
http://www.syncfusion.com/Content/downloads/ebook/Assembly_Language_Succinctly.pdf
MUCH LATER EDIT: Wow, there has been some interesting update to the discussion here. I guess I was mistaken about how much of the pipeline is actually affected by micro-op fusion. Maybe there is more perf gain than I expected from the differences in the loop condition check, where the unfused instructions allow GCC to interleave the compare and jump with the last vector load and arithmetic steps?
很久以后的编辑:哇,这里的讨论有了一些有趣的更新。我想我对微操作融合实际影响流水线的程度判断有误。也许循环条件检查的差异带来的性能增益比我预期的要多:未融合的指令让 GCC 能把比较和跳转与最后一次向量加载及算术步骤交错执行?
vmovups ymm9, YMMWORD PTR [rax-32]
cmp esi, edx
vmulps ymm0, ymm0, ymm9
vaddps ymm1, ymm1, ymm0
jg .L4
回答by Z boson
I can confirm that using the GCC code in Visual Studio does indeed improve the performance. I did this by converting the GCC object file from Linux to work in Visual Studio. The efficiency went from 50% to 60% using all four cores (and from 60% to 70% for a single core).
我可以确认,在 Visual Studio 中使用 GCC 的代码确实提高了性能。我是通过把 Linux 下的 GCC 目标文件转换为可在 Visual Studio 中使用来做到这一点的。使用全部四个内核时,效率从 50% 提升到 60%(单核则从 60% 提升到 70%)。
Microsoft has removed inline assembly from 64-bit code and also broken their 64-bit disassembler, so that code can't be reassembled without modification (but the 32-bit version still works). They evidently thought intrinsics would be sufficient, but as this case shows, they were wrong.
Microsoft 已从 64 位代码中移除了内联汇编,并且还弄坏了他们的 64 位反汇编器,导致反汇编出的代码无法不经修改就重新汇编(但 32 位版本仍然可用)。他们显然认为内在函数就足够了,但正如这个案例所示,他们错了。
Maybe fused instructions should be separate intrinsics?
也许融合指令应该是单独的内在函数?
But Microsoft is not the only one that produces less than optimal code from intrinsics. If you put the code below into http://gcc.godbolt.org/ you can see what Clang, ICC, and GCC do. ICC gave even worse performance than MSVC. It is using vinsertf128 but I don't know why. I'm not sure what Clang is doing, but it looks to be closer to GCC, just in a different order (and with more code).
但微软并不是唯一一个由内在函数生成欠优化代码的厂商。如果把下面的代码放进 http://gcc.godbolt.org/,您可以看到 Clang、ICC 和 GCC 各自的生成结果。ICC 的性能甚至比 MSVC 还差。它使用了 vinsertf128,但我不知道为什么。我不确定 Clang 在做什么,但它看起来更接近 GCC,只是顺序不同(代码也更多)。
This explains why Agner Fog wrote in his manual "Optimizing subroutines in assembly language" in regards to "disadvantages of using intrinsic functions":
这就解释了为什么 Agner Fog 在他的手册“用汇编语言优化子程序”中写到“使用内在函数的缺点”:
The compiler can modify the code or implement it in a less efficient way than the programmer intended. It may be necessary to look at the code generated by the compiler to see if it is optimized in the way the programmer intended.
编译器可以修改代码或以比程序员预期的效率低的方式实现它。可能有必要查看编译器生成的代码,看看它是否按照程序员预期的方式进行了优化。
This is disappointing for the case of using intrinsics. It means one either still has to write 64-bit assembly code sometimes, or find a compiler which implements the intrinsics the way the programmer intended. In this case only GCC appears to do that (and perhaps Clang).
这对于使用内在函数的理由来说是令人失望的。这意味着有时仍然必须编写 64 位汇编代码,或者找到一个按程序员意图实现内在函数的编译器。在这种情况下,似乎只有 GCC(也许还有 Clang)能做到。
#include <immintrin.h>
extern "C" void AddDot4x4_vec_block_8wide(const int n, const float *a, const float *b, float *c, const int stridea, const int strideb, const int stridec) {
    const int vec_size = 8;
    __m256 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    tmp0 = _mm256_loadu_ps(&c[0*vec_size]);
    tmp1 = _mm256_loadu_ps(&c[1*vec_size]);
    tmp2 = _mm256_loadu_ps(&c[2*vec_size]);
    tmp3 = _mm256_loadu_ps(&c[3*vec_size]);
    tmp4 = _mm256_loadu_ps(&c[4*vec_size]);
    tmp5 = _mm256_loadu_ps(&c[5*vec_size]);
    tmp6 = _mm256_loadu_ps(&c[6*vec_size]);
    tmp7 = _mm256_loadu_ps(&c[7*vec_size]);
    for(int i=0; i<n; i++) {
        __m256 areg0 = _mm256_set1_ps(a[i]);
        __m256 breg0 = _mm256_loadu_ps(&b[vec_size*(8*i + 0)]);
        tmp0 = _mm256_add_ps(_mm256_mul_ps(areg0,breg0), tmp0);
        __m256 breg1 = _mm256_loadu_ps(&b[vec_size*(8*i + 1)]);
        tmp1 = _mm256_add_ps(_mm256_mul_ps(areg0,breg1), tmp1);
        __m256 breg2 = _mm256_loadu_ps(&b[vec_size*(8*i + 2)]);
        tmp2 = _mm256_add_ps(_mm256_mul_ps(areg0,breg2), tmp2);
        __m256 breg3 = _mm256_loadu_ps(&b[vec_size*(8*i + 3)]);
        tmp3 = _mm256_add_ps(_mm256_mul_ps(areg0,breg3), tmp3);
        __m256 breg4 = _mm256_loadu_ps(&b[vec_size*(8*i + 4)]);
        tmp4 = _mm256_add_ps(_mm256_mul_ps(areg0,breg4), tmp4);
        __m256 breg5 = _mm256_loadu_ps(&b[vec_size*(8*i + 5)]);
        tmp5 = _mm256_add_ps(_mm256_mul_ps(areg0,breg5), tmp5);
        __m256 breg6 = _mm256_loadu_ps(&b[vec_size*(8*i + 6)]);
        tmp6 = _mm256_add_ps(_mm256_mul_ps(areg0,breg6), tmp6);
        __m256 breg7 = _mm256_loadu_ps(&b[vec_size*(8*i + 7)]);
        tmp7 = _mm256_add_ps(_mm256_mul_ps(areg0,breg7), tmp7);
    }
    _mm256_storeu_ps(&c[0*vec_size], tmp0);
    _mm256_storeu_ps(&c[1*vec_size], tmp1);
    _mm256_storeu_ps(&c[2*vec_size], tmp2);
    _mm256_storeu_ps(&c[3*vec_size], tmp3);
    _mm256_storeu_ps(&c[4*vec_size], tmp4);
    _mm256_storeu_ps(&c[5*vec_size], tmp5);
    _mm256_storeu_ps(&c[6*vec_size], tmp6);
    _mm256_storeu_ps(&c[7*vec_size], tmp7);
}
回答by Ben Voigt
MSVC did exactly what you asked it to. If you want a vmovups instruction emitted, use the _mm256_loadu_ps intrinsic.
MSVC 完全按照您的要求执行。如果您想让它发出 vmovups 指令,请使用 _mm256_loadu_ps 内在函数。