diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h index e031e9f2f9d8..585ce89c0f33 100644 --- a/include/linux/reciprocal_div.h +++ b/include/linux/reciprocal_div.h @@ -25,6 +25,9 @@ struct reciprocal_value { u8 sh1, sh2; }; +/* "reciprocal_value" and "reciprocal_divide" together implement the basic + * version of the algorithm described in Figure 4.1 of the paper. + */ struct reciprocal_value reciprocal_value(u32 d); static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) @@ -33,4 +36,69 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) return (t + ((a - t) >> R.sh1)) >> R.sh2; } +struct reciprocal_value_adv { + u32 m; + u8 sh, exp; + bool is_wide_m; +}; + +/* "reciprocal_value_adv" implements the advanced version of the algorithm + * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose + * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The + * exception case could be easily handled before calling "reciprocal_value_adv". + * + * The advanced version requires more complex calculation to get the reciprocal + * multiplier and other control variables, but then could reduce the required + * emulation operations. + * + * It makes no sense to use this advanced version for host divide emulation, + * those extra complexities for calculating multiplier etc could completely + * waive our saving on emulation operations. + * + * However, it makes sense to use it for JIT divide code generation for which + * we are willing to trade performance of JITed code with that of host. As shown + * by the following pseudo code, the required emulation operations could go down + * from 6 (the basic version) to 3 or 4. + * + * To use the result of "reciprocal_value_adv", suppose we want to calculate + * n/d, the pseudo C code will be: + * + * struct reciprocal_value_adv rvalue; + * u8 pre_shift, exp; + * + * // handle exception case. + * if (d >= (1U << 31)) { + * result = n >= d; + * return; + * } + * + * rvalue = reciprocal_value_adv(d, 32) + * exp = rvalue.exp; + * if (rvalue.is_wide_m && !(d & 1)) { + * // floor(log2(d & (2^32 -d))) + * pre_shift = fls(d & -d) - 1; + * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); + * } else { + * pre_shift = 0; + * } + * + * // code generation starts. + * if (imm == 1U << exp) { + * result = n >> exp; + * } else if (rvalue.is_wide_m) { + * // pre_shift must be zero when reached here. + * t = (n * rvalue.m) >> 32; + * result = n - t; + * result >>= 1; + * result += t; + * result >>= rvalue.sh - 1; + * } else { + * if (pre_shift) + * result = n >> pre_shift; + * result = ((u64)result * rvalue.m) >> 32; + * result >>= rvalue.sh; + * } + */ +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); + #endif /* _LINUX_RECIPROCAL_DIV_H */ diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c index fcb4ce682c6f..bf043258fa00 100644 --- a/lib/reciprocal_div.c +++ b/lib/reciprocal_div.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -26,3 +27,43 @@ struct reciprocal_value reciprocal_value(u32 d) return R; } EXPORT_SYMBOL(reciprocal_value); + +struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec) +{ + struct reciprocal_value_adv R; + u32 l, post_shift; + u64 mhigh, mlow; + + /* ceil(log2(d)) */ + l = fls(d - 1); + /* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to + * be handled before calling "reciprocal_value_adv", please see the + * comment at include/linux/reciprocal_div.h. + */ + WARN(l == 32, + "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor", + d, __func__); + post_shift = l; + mlow = 1ULL << (32 + l); + do_div(mlow, d); + mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec)); + do_div(mhigh, d); + + for (; post_shift > 0; post_shift--) { + u64 lo = mlow >> 1, hi = mhigh >> 1; + + if (lo >= hi) + break; + + mlow = lo; + mhigh = hi; + } + + R.m = (u32)mhigh; + R.sh = post_shift; + R.exp = l; + R.is_wide_m = mhigh > U32_MAX; + + return R; +} +EXPORT_SYMBOL(reciprocal_value_adv);