mirror of
https://github.com/llvm/llvm-project.git
synced 2025-05-02 18:26:04 +00:00

Summary: While benchmarking uint128 division we found out that it has huge latency for small divisors https://reviews.llvm.org/D83027 ``` Benchmark Time(ns) CPU(ns) Iterations -------------------------------------------------------------------------------------------------- BM_DivideIntrinsic128UniformDivisor<unsigned __int128> 13.0 13.0 55000000 BM_DivideIntrinsic128UniformDivisor<__int128> 14.3 14.3 50000000 BM_RemainderIntrinsic128UniformDivisor<unsigned __int128> 13.5 13.5 52000000 BM_RemainderIntrinsic128UniformDivisor<__int128> 14.1 14.1 50000000 BM_DivideIntrinsic128SmallDivisor<unsigned __int128> 153 153 5000000 BM_DivideIntrinsic128SmallDivisor<__int128> 170 170 3000000 BM_RemainderIntrinsic128SmallDivisor<unsigned __int128> 153 153 5000000 BM_RemainderIntrinsic128SmallDivisor<__int128> 155 155 5000000 ``` This patch suggests a more optimized version of the division: If the divisor is 64 bit, we can proceed with the divq instruction on x86 or constant multiplication mechanisms for other platforms. Once both divisor and dividend are not less than 2**64, we use branch free subtract algorithm, it has at most 64 cycles. 
After that our benchmarks improved significantly ``` Benchmark Time(ns) CPU(ns) Iterations -------------------------------------------------------------------------------------------------- BM_DivideIntrinsic128UniformDivisor<unsigned __int128> 11.0 11.0 64000000 BM_DivideIntrinsic128UniformDivisor<__int128> 13.8 13.8 51000000 BM_RemainderIntrinsic128UniformDivisor<unsigned __int128> 11.6 11.6 61000000 BM_RemainderIntrinsic128UniformDivisor<__int128> 13.7 13.7 52000000 BM_DivideIntrinsic128SmallDivisor<unsigned __int128> 27.1 27.1 26000000 BM_DivideIntrinsic128SmallDivisor<__int128> 29.4 29.4 24000000 BM_RemainderIntrinsic128SmallDivisor<unsigned __int128> 27.9 27.8 26000000 BM_RemainderIntrinsic128SmallDivisor<__int128> 29.1 29.1 25000000 ``` If not using divq instrinsics, it is still much better ``` Benchmark Time(ns) CPU(ns) Iterations -------------------------------------------------------------------------------------------------- BM_DivideIntrinsic128UniformDivisor<unsigned __int128> 12.2 12.2 58000000 BM_DivideIntrinsic128UniformDivisor<__int128> 13.5 13.5 52000000 BM_RemainderIntrinsic128UniformDivisor<unsigned __int128> 12.7 12.7 56000000 BM_RemainderIntrinsic128UniformDivisor<__int128> 13.7 13.7 51000000 BM_DivideIntrinsic128SmallDivisor<unsigned __int128> 30.2 30.2 24000000 BM_DivideIntrinsic128SmallDivisor<__int128> 33.2 33.2 22000000 BM_RemainderIntrinsic128SmallDivisor<unsigned __int128> 31.4 31.4 23000000 BM_RemainderIntrinsic128SmallDivisor<__int128> 33.8 33.8 21000000 ``` PowerPC benchmarks: Was ``` BM_DivideIntrinsic128UniformDivisor<unsigned __int128> 22.3 22.3 32000000 BM_DivideIntrinsic128UniformDivisor<__int128> 23.8 23.8 30000000 BM_RemainderIntrinsic128UniformDivisor<unsigned __int128> 22.5 22.5 32000000 BM_RemainderIntrinsic128UniformDivisor<__int128> 24.9 24.9 29000000 BM_DivideIntrinsic128SmallDivisor<unsigned __int128> 394 394 2000000 BM_DivideIntrinsic128SmallDivisor<__int128> 397 397 2000000 
BM_RemainderIntrinsic128SmallDivisor<unsigned __int128> 399 399 2000000 BM_RemainderIntrinsic128SmallDivisor<__int128> 397 397 2000000 ``` With this patch ``` BM_DivideIntrinsic128UniformDivisor<unsigned __int128> 21.7 21.7 33000000 BM_DivideIntrinsic128UniformDivisor<__int128> 23.0 23.0 31000000 BM_RemainderIntrinsic128UniformDivisor<unsigned __int128> 21.9 21.9 33000000 BM_RemainderIntrinsic128UniformDivisor<__int128> 23.9 23.9 30000000 BM_DivideIntrinsic128SmallDivisor<unsigned __int128> 32.7 32.6 23000000 BM_DivideIntrinsic128SmallDivisor<__int128> 33.4 33.4 21000000 BM_RemainderIntrinsic128SmallDivisor<unsigned __int128> 31.1 31.1 22000000 BM_RemainderIntrinsic128SmallDivisor<__int128> 33.2 33.2 22000000 ``` My email: danilak@google.com, I don't have commit rights Reviewers: howard.hinnant, courbet, MaskRay Reviewed By: courbet Subscribers: steven.zhang, #sanitizers Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D81809
159 lines
4.9 KiB
C
159 lines
4.9 KiB
C
//===-- udivmodti4.c - Implement __udivmodti4 -----------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements __udivmodti4 for the compiler_rt library.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "int_lib.h"
|
|
|
|
#ifdef CRT_HAS_128BIT
|
|
|
|
// Returns the 128 bit division result by 64 bit. Result must fit in 64 bits.
// Remainder stored in r.
// Taken and adjusted from libdivide libdivide_128_div_64_to_64 division
// fallback. For a correctness proof see the reference for this algorithm
// in Knuth, Volume 2, section 4.3.1, Algorithm D.
//
// Arguments:
//   u1 - high 64 bits of the dividend
//   u0 - low 64 bits of the dividend
//   v  - 64-bit divisor; must be nonzero (__builtin_clzll(0) is undefined),
//        and u1 < v so the quotient fits in 64 bits -- both callers in this
//        file guarantee this before calling.
//   r  - out-parameter receiving the 64-bit remainder
UNUSED
static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v,
                                            du_int *r) {
  const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
  const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits)
  du_int un1, un0;                                // Norm. dividend LSD's
  du_int vn1, vn0;                                // Norm. divisor digits
  du_int q1, q0;                                  // Quotient digits
  du_int un64, un21, un10;                        // Dividend digit pairs
  du_int rhat;                                    // A remainder
  si_int s; // Shift amount for normalization

  // Normalize so the divisor's most significant bit is set; Algorithm D's
  // quotient-estimate error bound requires the divisor's leading digit to be
  // at least b/2 (see Knuth, Vol. 2, 4.3.1).
  s = __builtin_clzll(v);
  if (s > 0) {
    // Normalize the divisor.
    v = v << s;
    un64 = (u1 << s) | (u0 >> (n_udword_bits - s));
    un10 = u0 << s; // Shift dividend left
  } else {
    // Avoid undefined behavior of (u0 >> 64).
    un64 = u1;
    un10 = u0;
  }

  // Break divisor up into two 32-bit digits.
  vn1 = v >> (n_udword_bits / 2);
  vn0 = v & 0xFFFFFFFF;

  // Break right half of dividend into two digits.
  un1 = un10 >> (n_udword_bits / 2);
  un0 = un10 & 0xFFFFFFFF;

  // Compute the first quotient digit, q1 (an estimate that may be too large).
  q1 = un64 / vn1;
  rhat = un64 - q1 * vn1;

  // q1 has at most error 2. No more than 2 iterations.
  // Decrement q1 while it is provably too big; each decrement adds vn1 back
  // into the running remainder rhat. Once rhat >= b the second test can no
  // longer fail, so the loop exits.
  while (q1 >= b || q1 * vn0 > b * rhat + un1) {
    q1 = q1 - 1;
    rhat = rhat + vn1;
    if (rhat >= b)
      break;
  }

  // Multiply and subtract: un21 is the partial remainder that becomes the
  // high part of the dividend for the second division step.
  un21 = un64 * b + un1 - q1 * v;

  // Compute the second quotient digit.
  q0 = un21 / vn1;
  rhat = un21 - q0 * vn1;

  // q0 has at most error 2. No more than 2 iterations.
  // Same correction loop as above, now for the low quotient digit.
  while (q0 >= b || q0 * vn0 > b * rhat + un0) {
    q0 = q0 - 1;
    rhat = rhat + vn1;
    if (rhat >= b)
      break;
  }

  // Denormalize the remainder (undo the left shift applied above; s == 0 is
  // fine since >> 0 is a no-op) and assemble the 64-bit quotient from its
  // two 32-bit digits.
  *r = (un21 * b + un0 - q0 * v) >> s;
  return q1 * b + q0;
}
|
|
|
|
// Divides the 128-bit value (u1:u0) by the 64-bit divisor v; the quotient is
// returned and the remainder is stored in *r. The quotient must fit in
// 64 bits (i.e. u1 < v), which callers guarantee.
//
// On x86-64 this maps directly onto the hardware divq instruction; every
// other target falls back to the portable Algorithm D routine above.
static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v,
                                     du_int *r) {
#if defined(__x86_64__)
  // divq reads the 128-bit dividend from rdx:rax ("d"(u1), "a"(u0)) and
  // leaves the quotient in rax ("=a") and the remainder in rdx ("=d").
  du_int quotient;
  __asm__("divq %[v]"
          : "=a"(quotient), "=d"(*r)
          : [ v ] "r"(v), "a"(u0), "d"(u1));
  return quotient;
#else
  return udiv128by64to64default(u1, u0, v, r);
#endif
}
|
|
|
|
// Effects: if rem != 0, *rem = a % b
|
|
// Returns: a / b
|
|
|
|
COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) {
|
|
const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
|
|
utwords dividend;
|
|
dividend.all = a;
|
|
utwords divisor;
|
|
divisor.all = b;
|
|
utwords quotient;
|
|
utwords remainder;
|
|
if (divisor.all > dividend.all) {
|
|
if (rem)
|
|
*rem = dividend.all;
|
|
return 0;
|
|
}
|
|
// When the divisor fits in 64 bits, we can use an optimized path.
|
|
if (divisor.s.high == 0) {
|
|
remainder.s.high = 0;
|
|
if (dividend.s.high < divisor.s.low) {
|
|
// The result fits in 64 bits.
|
|
quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
|
|
divisor.s.low, &remainder.s.low);
|
|
quotient.s.high = 0;
|
|
} else {
|
|
// First, divide with the high part to get the remainder in dividend.s.high.
|
|
// After that dividend.s.high < divisor.s.low.
|
|
quotient.s.high = dividend.s.high / divisor.s.low;
|
|
dividend.s.high = dividend.s.high % divisor.s.low;
|
|
quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
|
|
divisor.s.low, &remainder.s.low);
|
|
}
|
|
if (rem)
|
|
*rem = remainder.all;
|
|
return quotient.all;
|
|
}
|
|
// 0 <= shift <= 63.
|
|
si_int shift =
|
|
__builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
|
|
divisor.all <<= shift;
|
|
quotient.s.high = 0;
|
|
quotient.s.low = 0;
|
|
for (; shift >= 0; --shift) {
|
|
quotient.s.low <<= 1;
|
|
// Branch free version of.
|
|
// if (dividend.all >= divisor.all)
|
|
// {
|
|
// dividend.all -= divisor.all;
|
|
// carry = 1;
|
|
// }
|
|
const ti_int s =
|
|
(ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
|
|
quotient.s.low |= s & 1;
|
|
dividend.all -= divisor.all & s;
|
|
divisor.all >>= 1;
|
|
}
|
|
if (rem)
|
|
*rem = dividend.all;
|
|
return quotient.all;
|
|
}
|
|
|
|
#endif // CRT_HAS_128BIT
|