mirror of
https://github.com/llvm/llvm-project.git
synced 2025-04-26 01:26:05 +00:00
[libc] Implement branchless head-tail comparison for bcmp (#107540)
Binary size changes: | Bytes (cache lines) | before | after | |---------------------|----------|---------| | sse4 | 419 (7) | 288 (5) | | avx | 430 (7) | 308 (5) | | avx512f | 589 (10) | 390 (7) | Benchmarks for different CPUs using https://github.com/google/fleetbench. - indus-cascadelake ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 1.96GB/s ± 1% 2.19GB/s ± 0% +11.49% (p=0.000 n=29+24) BM_LIBC_Bcmp_Fleet_L2 1.90GB/s ± 1% 2.14GB/s ± 1% +12.68% (p=0.000 n=29+24) BM_LIBC_Bcmp_Fleet_LLC 513MB/s ± 4% 531MB/s ± 4% +3.53% (p=0.000 n=24+24) BM_LIBC_Bcmp_Fleet_Cold 452MB/s ± 3% 456MB/s ± 4% ~ (p=0.103 n=30+30) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 2.98GB/s ± 1% 3.15GB/s ± 1% +5.59% (p=0.000 n=29+30) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 2.86GB/s ± 1% 3.07GB/s ± 1% +7.21% (p=0.000 n=29+30) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 738MB/s ± 7% 751MB/s ± 3% +1.68% (p=0.000 n=24+25) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 643MB/s ± 3% 642MB/s ± 4% ~ (p=0.522 n=29+30) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.08GB/s ± 0% 3.25GB/s ± 0% +5.35% (p=0.000 n=28+30) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 2.97GB/s ± 1% 3.17GB/s ± 1% +6.65% (p=0.000 n=29+30) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 901MB/s ±59% 871MB/s ±36% ~ (p=0.676 n=29+27) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 686MB/s ± 4% 686MB/s ± 3% ~ (p=0.934 n=29+30) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.63GB/s ± 0% 1.80GB/s ± 1% +10.19% (p=0.000 n=29+30) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.57GB/s ± 1% 1.75GB/s ± 1% +11.46% (p=0.000 n=29+30) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 451MB/s ±61% 427MB/s ±28% ~ (p=0.469 n=29+25) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 353MB/s ± 4% 354MB/s ± 5% ~ (p=0.467 n=30+30) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 1.91GB/s ± 1% 2.10GB/s ± 1% +9.90% (p=0.000 n=29+29) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 1.84GB/s ± 1% 2.03GB/s ± 1% +10.63% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 491MB/s ±24% 538MB/s ±24% +9.66% (p=0.000 n=24+27) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 417MB/s ± 4% 421MB/s ± 3% ~ (p=0.063 n=30+29) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 761MB/s ± 1% 867MB/s ± 1% +14.02% (p=0.000 n=28+30) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 748MB/s ± 1% 860MB/s ± 1% +15.04% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 227MB/s ±29% 260MB/s ±64% +14.70% (p=0.000 n=26+27) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 187MB/s ± 3% 191MB/s ± 5% +2.26% (p=0.000 n=30+30) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.48GB/s ± 1% 1.71GB/s ± 1% +15.26% (p=0.000 n=29+30) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.42GB/s ± 1% 1.67GB/s ± 1% +17.68% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 412MB/s ±34% 519MB/s ±80% +25.87% (p=0.000 n=27+30) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 336MB/s ± 4% 343MB/s ± 6% +2.05% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 2.87GB/s ± 0% 3.24GB/s ± 1% +12.88% (p=0.000 n=26+30) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 2.78GB/s ± 1% 3.20GB/s ± 1% +15.15% (p=0.000 n=26+30) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 926MB/s ±43% 1227MB/s ±76% +32.53% (p=0.000 n=27+30) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 716MB/s ± 4% 737MB/s ± 6% +3.02% (p=0.000 n=28+29) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.54GB/s ± 1% 1.56GB/s ± 0% +1.40% (p=0.000 n=29+30) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.47GB/s ± 1% 1.52GB/s ± 1% +2.97% (p=0.000 n=27+30) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 351MB/s ±23% 436MB/s ±83% +24.04% (p=0.005 n=24+29) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 283MB/s ± 4% 282MB/s ± 4% ~ (p=0.644 n=30+30) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 824MB/s ± 1% 1048MB/s ± 1% +27.18% (p=0.000 n=29+30) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 808MB/s ± 1% 1027MB/s ± 1% +27.12% (p=0.000 n=29+29) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 317MB/s ±79% 332MB/s ±74% ~ (p=0.338 n=30+29) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 207MB/s ± 5% 212MB/s ± 5% +2.27% (p=0.000 n=30+30) ``` - indus-skylake ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 2.06GB/s ± 2% 2.25GB/s ± 3% +9.66% (p=0.000 n=27+24) BM_LIBC_Bcmp_Fleet_L2 1.96GB/s ± 2% 2.17GB/s ± 2% +10.61% (p=0.000 n=30+24) BM_LIBC_Bcmp_Fleet_LLC 1.18GB/s ± 6% 1.32GB/s ± 5% +12.27% (p=0.000 n=28+28) BM_LIBC_Bcmp_Fleet_Cold 456MB/s ± 2% 466MB/s ± 2% +2.22% (p=0.000 n=28+28) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 3.08GB/s ± 2% 3.20GB/s ± 1% +3.72% (p=0.000 n=28+22) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 2.92GB/s ± 1% 3.05GB/s ± 2% +4.49% (p=0.000 n=23+23) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 1.83GB/s ± 8% 1.94GB/s ± 4% +6.24% (p=0.000 n=25+27) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 654MB/s ± 2% 659MB/s ± 2% +0.76% (p=0.012 n=30+29) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.19GB/s ± 2% 3.34GB/s ± 2% +4.41% (p=0.000 n=26+23) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 3.05GB/s ± 2% 3.21GB/s ± 2% +5.32% (p=0.000 n=28+25) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 1.95GB/s ± 4% 2.03GB/s ±10% +3.61% (p=0.000 n=27+30) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 700MB/s ± 2% 702MB/s ± 2% ~ (p=0.150 n=30+30) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.69GB/s ± 2% 1.85GB/s ± 1% +9.31% (p=0.000 n=30+26) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.60GB/s ± 2% 1.78GB/s ± 2% +10.90% (p=0.000 n=26+27) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 1.01GB/s ± 5% 1.12GB/s ± 5% +11.40% (p=0.000 n=27+28) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 355MB/s ± 3% 360MB/s ± 3% +1.46% (p=0.000 n=30+30) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 1.98GB/s ± 2% 2.15GB/s ± 2% +8.89% (p=0.000 n=29+27) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 1.87GB/s ± 3% 2.05GB/s ± 2% +10.06% (p=0.000 n=30+26) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 1.19GB/s ± 4% 1.31GB/s ± 6% +9.82% (p=0.000 n=27+29) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 424MB/s ± 3% 431MB/s ± 3% +1.58% (p=0.000 n=28+30) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 849MB/s ± 2% 949MB/s ± 2% +11.84% (p=0.000 n=27+28) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 815MB/s ± 3% 913MB/s ± 3% +12.06% (p=0.000 n=29+30) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 512MB/s ± 9% 571MB/s ± 7% +11.40% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 187MB/s ± 3% 192MB/s ± 2% +2.56% (p=0.000 n=30+28) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.55GB/s ± 2% 1.77GB/s ± 3% +13.93% (p=0.000 n=30+28) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.47GB/s ± 2% 1.70GB/s ± 2% +15.96% (p=0.000 n=27+26) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 939MB/s ± 5% 1084MB/s ± 4% +15.36% (p=0.000 n=28+27) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 340MB/s ± 2% 347MB/s ± 3% +1.93% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 3.06GB/s ± 3% 3.40GB/s ± 2% +11.13% (p=0.000 n=30+28) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 2.89GB/s ± 3% 3.24GB/s ± 2% +12.20% (p=0.000 n=29+26) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 1.93GB/s ± 4% 2.09GB/s ±11% +8.16% (p=0.000 n=26+30) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 746MB/s ± 2% 762MB/s ± 2% +2.11% (p=0.000 n=30+28) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.59GB/s ± 2% 1.62GB/s ± 2% +1.72% (p=0.000 n=25+27) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.49GB/s ± 2% 1.53GB/s ± 2% +2.62% (p=0.000 n=27+29) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 852MB/s ±10% 909MB/s ± 6% +6.71% (p=0.000 n=30+29) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 283MB/s ± 3% 283MB/s ± 2% ~ (p=0.617 n=30+27) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 891MB/s ± 2% 1083MB/s ± 2% +21.64% (p=0.000 n=27+24) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 855MB/s ± 2% 1045MB/s ± 1% +22.31% (p=0.000 n=25+23) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 568MB/s ± 7% 659MB/s ± 8% +16.04% (p=0.000 n=29+30) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 207MB/s ± 2% 212MB/s ± 2% +2.31% (p=0.000 n=30+27) ``` - arcadia-rome ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 2.16GB/s ± 2% 2.27GB/s ± 2% +5.13% (p=0.000 n=26+30) BM_LIBC_Bcmp_Fleet_L2 2.15GB/s ± 2% 2.25GB/s ± 2% +4.64% (p=0.000 n=27+30) BM_LIBC_Bcmp_Fleet_LLC 1.73GB/s ± 3% 1.81GB/s ± 3% +4.66% (p=0.000 n=25+28) BM_LIBC_Bcmp_Fleet_Cold 494MB/s ± 1% 496MB/s ± 2% +0.45% (p=0.023 n=22+24) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 3.30GB/s ± 1% 3.24GB/s ± 2% -1.70% (p=0.000 n=27+30) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 3.23GB/s ± 2% 3.19GB/s ± 2% -1.28% (p=0.000 n=28+28) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 2.59GB/s ± 3% 2.58GB/s ± 2% -0.65% (p=0.010 n=26+26) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 720MB/s ± 1% 707MB/s ± 3% -1.75% (p=0.000 n=22+25) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.37GB/s ± 1% 3.36GB/s ± 2% ~ (p=0.102 n=28+29) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 3.32GB/s ± 2% 3.30GB/s ± 2% -0.51% (p=0.038 n=28+29) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 2.67GB/s ± 4% 2.70GB/s ± 4% +0.96% (p=0.009 n=28+27) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 755MB/s ± 1% 751MB/s ± 2% -0.57% (p=0.000 n=22+25) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.79GB/s ± 1% 1.86GB/s ± 2% +3.92% (p=0.000 n=27+29) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.77GB/s ± 2% 1.82GB/s ± 2% +2.99% (p=0.000 n=28+29) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 1.41GB/s ± 4% 1.47GB/s ± 3% +3.97% (p=0.000 n=28+28) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 386MB/s ± 1% 389MB/s ± 1% +0.60% (p=0.000 n=21+23) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 2.07GB/s ± 2% 2.17GB/s ± 2% +4.87% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 2.07GB/s ± 2% 2.13GB/s ± 2% +3.02% (p=0.000 n=28+30) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 1.66GB/s ± 2% 1.73GB/s ± 2% +4.08% (p=0.000 n=29+26) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 466MB/s ± 2% 469MB/s ± 3% +0.66% (p=0.001 n=22+25) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 861MB/s ± 1% 964MB/s ± 2% +11.98% (p=0.000 n=29+29) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 853MB/s ± 2% 935MB/s ± 2% +9.54% (p=0.000 n=28+29) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 707MB/s ± 3% 743MB/s ± 4% +5.08% (p=0.000 n=29+29) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 199MB/s ± 3% 199MB/s ± 2% ~ (p=0.107 n=29+25) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.65GB/s ± 1% 1.75GB/s ± 2% +6.15% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.64GB/s ± 3% 1.73GB/s ± 2% +5.37% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 1.32GB/s ± 2% 1.40GB/s ± 2% +6.21% (p=0.000 n=28+27) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 370MB/s ± 3% 371MB/s ± 2% +0.16% (p=0.008 n=29+25) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 3.25GB/s ± 2% 3.47GB/s ± 2% +6.74% (p=0.000 n=28+29) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 3.26GB/s ± 1% 3.44GB/s ± 1% +5.43% (p=0.000 n=28+29) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 2.66GB/s ± 2% 2.79GB/s ± 3% +4.90% (p=0.000 n=27+29) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 812MB/s ± 3% 799MB/s ± 2% -1.57% (p=0.000 n=29+25) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.71GB/s ± 2% 1.66GB/s ± 2% -3.14% (p=0.000 n=29+29) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.63GB/s ± 2% 1.59GB/s ± 2% -2.50% (p=0.000 n=29+28) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 1.25GB/s ± 4% 1.25GB/s ± 2% ~ (p=0.530 n=28+26) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 311MB/s ± 3% 308MB/s ± 1% ~ (p=0.127 n=29+24) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 869MB/s ± 2% 1098MB/s ± 2% +26.28% (p=0.000 n=27+29) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 873MB/s ± 2% 1075MB/s ± 1% +23.06% (p=0.000 n=27+29) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 743MB/s ± 4% 859MB/s ± 4% +15.58% (p=0.000 n=27+27) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 221MB/s ± 4% 221MB/s ± 3% +0.14% (p=0.034 n=29+25) ``` - ixion-haswell ``` name old speed new speed delta BM_LIBC_Bcmp_Fleet_L1 2.27GB/s ± 5% 2.41GB/s ± 6% +6.10% (p=0.000 n=29+28) BM_LIBC_Bcmp_Fleet_L2 2.14GB/s ± 6% 2.33GB/s ± 5% +9.21% (p=0.000 n=29+30) BM_LIBC_Bcmp_Fleet_LLC 1.30GB/s ± 9% 1.43GB/s ± 8% +9.85% (p=0.000 n=30+30) BM_LIBC_Bcmp_Fleet_Cold 475MB/s ± 6% 475MB/s ± 5% ~ (p=0.839 n=30+29) BM_LIBC_Bcmp_0_L1 [Bcmp_0] 3.38GB/s ± 7% 3.46GB/s ± 6% +2.35% (p=0.009 n=30+29) BM_LIBC_Bcmp_0_L2 [Bcmp_0] 3.20GB/s ± 5% 3.32GB/s ± 6% +3.52% (p=0.000 n=28+30) BM_LIBC_Bcmp_0_LLC [Bcmp_0] 1.88GB/s ± 9% 2.00GB/s ± 6% +6.63% (p=0.000 n=30+28) BM_LIBC_Bcmp_0_Cold [Bcmp_0] 664MB/s ± 6% 655MB/s ± 6% -1.32% (p=0.025 n=30+30) BM_LIBC_Bcmp_1_L1 [Bcmp_1] 3.50GB/s ± 8% 3.61GB/s ±10% +3.09% (p=0.001 n=29+30) BM_LIBC_Bcmp_1_L2 [Bcmp_1] 3.32GB/s ± 7% 3.48GB/s ± 8% +4.89% (p=0.000 n=29+30) BM_LIBC_Bcmp_1_LLC [Bcmp_1] 2.02GB/s ± 7% 2.14GB/s ± 9% +5.82% (p=0.000 n=28+29) BM_LIBC_Bcmp_1_Cold [Bcmp_1] 716MB/s ± 6% 709MB/s ± 5% -0.97% (p=0.040 n=30+28) BM_LIBC_Bcmp_2_L1 [Bcmp_2] 1.83GB/s ± 7% 1.97GB/s ± 8% +7.90% (p=0.000 n=30+30) BM_LIBC_Bcmp_2_L2 [Bcmp_2] 1.74GB/s ± 6% 1.92GB/s ± 6% +10.29% (p=0.000 n=30+29) BM_LIBC_Bcmp_2_LLC [Bcmp_2] 1.05GB/s ± 9% 1.15GB/s ± 9% +9.73% (p=0.000 n=30+30) BM_LIBC_Bcmp_2_Cold [Bcmp_2] 379MB/s ± 6% 372MB/s ± 6% -1.74% (p=0.012 n=30+30) BM_LIBC_Bcmp_3_L1 [Bcmp_3] 2.17GB/s ± 5% 2.29GB/s ± 6% +5.61% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_L2 [Bcmp_3] 2.02GB/s ± 6% 2.20GB/s ± 6% +8.75% (p=0.000 n=29+30) BM_LIBC_Bcmp_3_LLC [Bcmp_3] 1.22GB/s ± 8% 1.34GB/s ± 9% +9.19% (p=0.000 n=30+30) BM_LIBC_Bcmp_3_Cold [Bcmp_3] 447MB/s ± 3% 441MB/s ± 7% -1.40% (p=0.033 n=30+30) BM_LIBC_Bcmp_4_L1 [Bcmp_4] 902MB/s ± 6% 995MB/s ±10% +10.37% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_L2 [Bcmp_4] 863MB/s ± 5% 945MB/s ±11% +9.50% (p=0.000 n=29+30) BM_LIBC_Bcmp_4_LLC [Bcmp_4] 528MB/s ±11% 559MB/s ±12% +5.75% (p=0.000 n=30+30) BM_LIBC_Bcmp_4_Cold [Bcmp_4] 183MB/s ± 4% 181MB/s ± 7% ~ (p=0.088 n=28+30) BM_LIBC_Bcmp_5_L1 [Bcmp_5] 1.70GB/s ± 6% 1.87GB/s ± 8% +10.14% (p=0.000 n=29+29) BM_LIBC_Bcmp_5_L2 [Bcmp_5] 1.60GB/s ± 5% 1.80GB/s ± 9% +12.61% (p=0.000 n=29+30) BM_LIBC_Bcmp_5_LLC [Bcmp_5] 994MB/s ±13% 1094MB/s ± 8% +10.10% (p=0.000 n=29+30) BM_LIBC_Bcmp_5_Cold [Bcmp_5] 362MB/s ± 6% 358MB/s ± 7% ~ (p=0.123 n=30+30) BM_LIBC_Bcmp_6_L1 [Bcmp_6] 3.31GB/s ± 5% 3.67GB/s ± 6% +10.90% (p=0.000 n=28+30) BM_LIBC_Bcmp_6_L2 [Bcmp_6] 3.11GB/s ± 5% 3.53GB/s ± 5% +13.59% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_LLC [Bcmp_6] 1.98GB/s ± 9% 2.18GB/s ± 8% +10.34% (p=0.000 n=30+30) BM_LIBC_Bcmp_6_Cold [Bcmp_6] 754MB/s ± 5% 752MB/s ± 5% ~ (p=0.592 n=30+30) BM_LIBC_Bcmp_7_L1 [Bcmp_7] 1.72GB/s ± 5% 1.72GB/s ± 6% ~ (p=0.549 n=29+29) BM_LIBC_Bcmp_7_L2 [Bcmp_7] 1.61GB/s ± 7% 1.63GB/s ± 8% ~ (p=0.191 n=30+29) BM_LIBC_Bcmp_7_LLC [Bcmp_7] 913MB/s ± 8% 905MB/s ± 9% ~ (p=0.423 n=30+30) BM_LIBC_Bcmp_7_Cold [Bcmp_7] 304MB/s ± 6% 287MB/s ± 4% -5.57% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_L1 [Bcmp_8] 961MB/s ± 5% 1124MB/s ± 6% +16.94% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_L2 [Bcmp_8] 915MB/s ± 8% 1100MB/s ± 7% +20.16% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_LLC [Bcmp_8] 593MB/s ± 8% 669MB/s ± 8% +12.92% (p=0.000 n=30+30) BM_LIBC_Bcmp_8_Cold [Bcmp_8] 220MB/s ± 4% 220MB/s ± 6% ~ (p=0.572 n=30+30) ``` Co-authored-by: goldvitaly@google.com <%username%@google.com>
This commit is contained in:
parent
691e3c64d0
commit
66a03295de
@ -63,6 +63,15 @@ struct Memcpy {
|
||||
namespace LIBC_NAMESPACE_DECL {
|
||||
namespace generic {
|
||||
|
||||
// Not equals: returns non-zero iff values at head or tail differ.
|
||||
// This function typically loads more data than necessary when the two buffer
|
||||
// differs.
|
||||
template <typename T>
|
||||
LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
|
||||
static_assert(cpp::is_integral_v<T>);
|
||||
return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Specializations for uint16_t
|
||||
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
|
||||
@ -133,6 +142,11 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
|
||||
#if defined(__SSE4_1__)
|
||||
template <> struct is_vector<__m128i> : cpp::true_type {};
|
||||
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
|
||||
LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
|
||||
const auto a = load<__m128i>(p1, offset);
|
||||
const auto b = load<__m128i>(p2, offset);
|
||||
return _mm_xor_si128(a, b);
|
||||
}
|
||||
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
|
||||
return _mm_max_epu8(a, b);
|
||||
}
|
||||
@ -144,17 +158,21 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
|
||||
return static_cast<uint16_t>(
|
||||
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
|
||||
}
|
||||
LIBC_INLINE bool is_zero(__m128i value) {
|
||||
return _mm_testz_si128(value, value) == 1;
|
||||
}
|
||||
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
const auto a = load<__m128i>(p1, offset);
|
||||
const auto b = load<__m128i>(p2, offset);
|
||||
const auto xored = _mm_xor_si128(a, b);
|
||||
return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
|
||||
return is_zero(load_and_xor_m128i(p1, p2, offset));
|
||||
}
|
||||
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
const auto a = load<__m128i>(p1, offset);
|
||||
const auto b = load<__m128i>(p2, offset);
|
||||
const auto xored = _mm_xor_si128(a, b);
|
||||
return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
|
||||
return !is_zero(load_and_xor_m128i(p1, p2, offset));
|
||||
}
|
||||
template <>
|
||||
LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
|
||||
size_t count) {
|
||||
const __m128i head = load_and_xor_m128i(p1, p2, 0);
|
||||
const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
|
||||
return !is_zero(_mm_or_si128(head, tail));
|
||||
}
|
||||
template <>
|
||||
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
@ -173,19 +191,34 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
#if defined(__AVX__)
|
||||
template <> struct is_vector<__m256i> : cpp::true_type {};
|
||||
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
|
||||
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
|
||||
return _mm256_castps_si256(
|
||||
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
|
||||
}
|
||||
LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
|
||||
return _mm256_castps_si256(
|
||||
_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
|
||||
}
|
||||
LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
|
||||
const auto a = load<__m256i>(p1, offset);
|
||||
const auto b = load<__m256i>(p2, offset);
|
||||
const auto xored = _mm256_castps_si256(
|
||||
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
|
||||
return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
|
||||
return xor_m256i(a, b);
|
||||
}
|
||||
LIBC_INLINE bool is_zero(__m256i value) {
|
||||
return _mm256_testz_si256(value, value) == 1;
|
||||
}
|
||||
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
return is_zero(load_and_xor_m256i(p1, p2, offset));
|
||||
}
|
||||
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
const auto a = load<__m256i>(p1, offset);
|
||||
const auto b = load<__m256i>(p2, offset);
|
||||
const auto xored = _mm256_castps_si256(
|
||||
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
|
||||
return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
|
||||
return !is_zero(load_and_xor_m256i(p1, p2, offset));
|
||||
}
|
||||
template <>
|
||||
LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
|
||||
size_t count) {
|
||||
const __m256i head = load_and_xor_m256i(p1, p2, 0);
|
||||
const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
|
||||
return !is_zero(or_m256i(head, tail));
|
||||
}
|
||||
#endif // __AVX__
|
||||
|
||||
@ -300,9 +333,22 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
const auto a = load<__m512i>(p1, offset);
|
||||
const auto b = load<__m512i>(p2, offset);
|
||||
const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
|
||||
return static_cast<uint32_t>(xored >> 32) |
|
||||
static_cast<uint32_t>(xored & 0xFFFFFFFF);
|
||||
return _mm512_cmpneq_epi8_mask(a, b) != 0;
|
||||
}
|
||||
LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
|
||||
const auto a = load<__m512i>(p1, offset);
|
||||
const auto b = load<__m512i>(p2, offset);
|
||||
return _mm512_xor_epi64(a, b);
|
||||
}
|
||||
LIBC_INLINE bool is_zero(__m512i value) {
|
||||
return _mm512_test_epi32_mask(value, value) == 0;
|
||||
}
|
||||
template <>
|
||||
LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
|
||||
size_t count) {
|
||||
const __m512i head = load_and_xor_m512i(p1, p2, 0);
|
||||
const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
|
||||
return !is_zero(_mm512_or_epi64(head, tail));
|
||||
}
|
||||
template <>
|
||||
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
|
||||
|
@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
|
||||
[[maybe_unused]] LIBC_INLINE BcmpReturnType
|
||||
inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
|
||||
if (count <= 32)
|
||||
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
|
||||
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
|
||||
return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
|
||||
}
|
||||
#endif // __SSE4_1__
|
||||
@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
|
||||
[[maybe_unused]] LIBC_INLINE BcmpReturnType
|
||||
inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
|
||||
if (count <= 32)
|
||||
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
|
||||
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
|
||||
if (count <= 64)
|
||||
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
|
||||
return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
|
||||
return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
|
||||
}
|
||||
#endif // __AVX__
|
||||
@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
|
||||
[[maybe_unused]] LIBC_INLINE BcmpReturnType
|
||||
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
|
||||
if (count <= 32)
|
||||
return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
|
||||
return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
|
||||
if (count <= 64)
|
||||
return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
|
||||
return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
|
||||
if (count <= 128)
|
||||
return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
|
||||
return generic::branchless_head_tail_neq<__m512i>(p1, p2, count);
|
||||
return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
|
||||
}
|
||||
#endif // __AVX512BW__
|
||||
@ -62,22 +62,12 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
|
||||
return BcmpReturnType::zero();
|
||||
if (count == 1)
|
||||
return generic::Bcmp<uint8_t>::block(p1, p2);
|
||||
if (count == 2)
|
||||
return generic::Bcmp<uint16_t>::block(p1, p2);
|
||||
if (count == 3)
|
||||
return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
|
||||
if (count == 4)
|
||||
return generic::Bcmp<uint32_t>::block(p1, p2);
|
||||
if (count == 5)
|
||||
return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
|
||||
if (count == 6)
|
||||
return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
|
||||
if (count == 7)
|
||||
return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
|
||||
if (count == 8)
|
||||
return generic::Bcmp<uint64_t>::block(p1, p2);
|
||||
if (count <= 4)
|
||||
return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count);
|
||||
if (count <= 8)
|
||||
return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count);
|
||||
if (count <= 16)
|
||||
return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
|
||||
return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count);
|
||||
#if defined(__AVX512BW__)
|
||||
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
|
||||
#elif defined(__AVX__)
|
||||
|
Loading…
x
Reference in New Issue
Block a user