LZ4: Port LZ4 1.9.x FAST_DEC_LOOP and enable it on x86 and ARM64
FAST_DEC_LOOP was introduced from LZ4 1.9.0[1] This change would be introduce 10% on decompress operation according to LZ4 benchmark result on X86 devices. Meanwhile, LZ4 with FAST_DEC_LOOP could get improvements on ARM64, however clang compiler has downgrade if FAST_DEC_LOOP enabled. So FAST_DEC_LOOP only enabled on X86/X86-64 or ARM64 with GCC build. LZ4 FAST_DEC_LOOP bug fixes include as well. 1. fixed read-after input in LZ4_decompress_safe() (issue 681) 2. Fix out-of-bound read in LZ4_decompress_fast() (issue 676) PS2: 1. Move common API to lz4defs.h 2. Add PPC related inline Macro defination. 3. Force inline new static apis. Here is the test result on ARM64(cortex-A53) Benchmark via ZRAM: Test case: taskset 03 /data/fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers \ --buffer_compress_percentage=75 --size=700M \ --scramble_buffers=1 --direct=1 --loops=100 --numjobs=1 \ --filename=/data/test/test --name=seq-read --rw=read --stonewall Patched: READ: bw=150MiB/s (157MB/s) Vanilla: READ: bw=135MiB/s (142MB/s) [1] https://github.com/lz4/lz4/releases/tag/v1.9.0 Signed-off-by: chenxi.mao <chenxi.mao@sony.com> Signed-off-by: Kazuki Hashimoto <kazukih@tuta.io>
This commit is contained in:
parent
70e706babd
commit
73837ba8d0
@ -322,7 +322,7 @@ static FORCE_INLINE int LZ4_compress_generic(
|
||||
*token = (BYTE)(litLength << ML_BITS);
|
||||
|
||||
/* Copy Literals */
|
||||
LZ4_wildCopy(op, anchor, op + litLength);
|
||||
LZ4_wildCopy8(op, anchor, op + litLength);
|
||||
op += litLength;
|
||||
}
|
||||
|
||||
@ -628,7 +628,7 @@ static int LZ4_compress_destSize_generic(
|
||||
*token = (BYTE)(litLength << ML_BITS);
|
||||
|
||||
/* Copy Literals */
|
||||
LZ4_wildCopy(op, anchor, op + litLength);
|
||||
LZ4_wildCopy8(op, anchor, op + litLength);
|
||||
op += litLength;
|
||||
}
|
||||
|
||||
|
@ -50,6 +50,96 @@
|
||||
#define assert(condition) ((void)0)
|
||||
#endif
|
||||
|
||||
#ifndef LZ4_FAST_DEC_LOOP
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
#define LZ4_FAST_DEC_LOOP 1
|
||||
#elif defined(__aarch64__) && !defined(__clang__)
|
||||
/* On aarch64, we disable this optimization for clang because on certain
|
||||
* mobile chipsets and clang, it reduces performance. For more information
|
||||
* refer to https://github.com/lz4/lz4/pull/707. */
|
||||
#define LZ4_FAST_DEC_LOOP 1
|
||||
#else
|
||||
#define LZ4_FAST_DEC_LOOP 0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if LZ4_FAST_DEC_LOOP
|
||||
#define FASTLOOP_SAFE_DISTANCE 64
|
||||
FORCE_O2_INLINE_GCC_PPC64LE void
|
||||
LZ4_memcpy_using_offset_base(BYTE * dstPtr, const BYTE * srcPtr, BYTE * dstEnd,
|
||||
const size_t offset)
|
||||
{
|
||||
if (offset < 8) {
|
||||
dstPtr[0] = srcPtr[0];
|
||||
|
||||
dstPtr[1] = srcPtr[1];
|
||||
dstPtr[2] = srcPtr[2];
|
||||
dstPtr[3] = srcPtr[3];
|
||||
srcPtr += inc32table[offset];
|
||||
memcpy(dstPtr + 4, srcPtr, 4);
|
||||
srcPtr -= dec64table[offset];
|
||||
dstPtr += 8;
|
||||
} else {
|
||||
memcpy(dstPtr, srcPtr, 8);
|
||||
dstPtr += 8;
|
||||
srcPtr += 8;
|
||||
}
|
||||
|
||||
LZ4_wildCopy8(dstPtr, srcPtr, dstEnd);
|
||||
}
|
||||
|
||||
/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
|
||||
* this version copies two times 16 bytes (instead of one time 32 bytes)
|
||||
* because it must be compatible with offsets >= 16. */
|
||||
FORCE_O2_INLINE_GCC_PPC64LE void
|
||||
LZ4_wildCopy32(void *dstPtr, const void *srcPtr, void *dstEnd)
|
||||
{
|
||||
BYTE *d = (BYTE *) dstPtr;
|
||||
const BYTE *s = (const BYTE *)srcPtr;
|
||||
BYTE *const e = (BYTE *) dstEnd;
|
||||
|
||||
do {
|
||||
memcpy(d, s, 16);
|
||||
memcpy(d + 16, s + 16, 16);
|
||||
d += 32;
|
||||
s += 32;
|
||||
} while (d < e);
|
||||
}
|
||||
|
||||
FORCE_O2_INLINE_GCC_PPC64LE void
|
||||
LZ4_memcpy_using_offset(BYTE *dstPtr, const BYTE *srcPtr, BYTE *dstEnd,
|
||||
const size_t offset)
|
||||
{
|
||||
BYTE v[8];
|
||||
switch (offset) {
|
||||
|
||||
case 1:
|
||||
memset(v, *srcPtr, 8);
|
||||
goto copy_loop;
|
||||
case 2:
|
||||
memcpy(v, srcPtr, 2);
|
||||
memcpy(&v[2], srcPtr, 2);
|
||||
memcpy(&v[4], &v[0], 4);
|
||||
goto copy_loop;
|
||||
case 4:
|
||||
memcpy(v, srcPtr, 4);
|
||||
memcpy(&v[4], srcPtr, 4);
|
||||
goto copy_loop;
|
||||
default:
|
||||
LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
|
||||
return;
|
||||
}
|
||||
|
||||
copy_loop:
|
||||
memcpy(dstPtr, v, 8);
|
||||
dstPtr += 8;
|
||||
while (dstPtr < dstEnd) {
|
||||
memcpy(dstPtr, v, 8);
|
||||
dstPtr += 8;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* LZ4_decompress_generic() :
|
||||
* This generic decompression function covers all use cases.
|
||||
@ -80,25 +170,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
const size_t dictSize
|
||||
)
|
||||
{
|
||||
const BYTE *ip = (const BYTE *) src;
|
||||
const BYTE * const iend = ip + srcSize;
|
||||
const BYTE *ip = (const BYTE *)src;
|
||||
const BYTE *const iend = ip + srcSize;
|
||||
|
||||
BYTE *op = (BYTE *) dst;
|
||||
BYTE * const oend = op + outputSize;
|
||||
BYTE *const oend = op + outputSize;
|
||||
BYTE *cpy;
|
||||
|
||||
const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize;
|
||||
static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4};
|
||||
static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3};
|
||||
|
||||
const int safeDecode = (endOnInput == endOnInputSize);
|
||||
const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB)));
|
||||
|
||||
/* Set up the "end" pointers for the shortcut. */
|
||||
const BYTE *const shortiend = iend -
|
||||
(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
|
||||
(endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/;
|
||||
const BYTE *const shortoend = oend -
|
||||
(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
|
||||
(endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/;
|
||||
|
||||
const BYTE *match;
|
||||
size_t offset;
|
||||
unsigned int token;
|
||||
size_t length;
|
||||
|
||||
DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__,
|
||||
srcSize, outputSize);
|
||||
@ -117,15 +209,195 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
if ((endOnInput) && unlikely(srcSize == 0))
|
||||
return -1;
|
||||
|
||||
#if LZ4_FAST_DEC_LOOP
|
||||
if ((oend - op) < FASTLOOP_SAFE_DISTANCE) {
|
||||
DEBUGLOG(6, "skip fast decode loop");
|
||||
goto safe_decode;
|
||||
}
|
||||
|
||||
/* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */
|
||||
while (1) {
|
||||
/* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */
|
||||
assert(oend - op >= FASTLOOP_SAFE_DISTANCE);
|
||||
if (endOnInput) {
|
||||
assert(ip < iend);
|
||||
}
|
||||
token = *ip++;
|
||||
length = token >> ML_BITS; /* literal length */
|
||||
|
||||
assert(!endOnInput || ip <= iend); /* ip < iend before the increment */
|
||||
|
||||
/* decode literal length */
|
||||
if (length == RUN_MASK) {
|
||||
variable_length_error error = ok;
|
||||
length +=
|
||||
read_variable_length(&ip, iend - RUN_MASK,
|
||||
endOnInput, endOnInput,
|
||||
&error);
|
||||
if (error == initial_error) {
|
||||
goto _output_error;
|
||||
}
|
||||
if ((safeDecode)
|
||||
&& unlikely((uptrval) (op) + length <
|
||||
(uptrval) (op))) {
|
||||
goto _output_error;
|
||||
} /* overflow detection */
|
||||
if ((safeDecode)
|
||||
&& unlikely((uptrval) (ip) + length <
|
||||
(uptrval) (ip))) {
|
||||
goto _output_error;
|
||||
}
|
||||
|
||||
/* overflow detection */
|
||||
/* copy literals */
|
||||
cpy = op + length;
|
||||
LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
|
||||
if (endOnInput) { /* LZ4_decompress_safe() */
|
||||
if ((cpy > oend - 32)
|
||||
|| (ip + length > iend - 32)) {
|
||||
goto safe_literal_copy;
|
||||
}
|
||||
LZ4_wildCopy32(op, ip, cpy);
|
||||
} else { /* LZ4_decompress_fast() */
|
||||
if (cpy > oend - 8) {
|
||||
goto safe_literal_copy;
|
||||
}
|
||||
LZ4_wildCopy8(op, ip, cpy);
|
||||
/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
|
||||
/* it doesn't know input length, and only relies on end-of-block */
|
||||
/* properties */
|
||||
}
|
||||
ip += length;
|
||||
op = cpy;
|
||||
} else {
|
||||
cpy = op + length;
|
||||
if (endOnInput) { /* LZ4_decompress_safe() */
|
||||
DEBUGLOG(7,
|
||||
"copy %u bytes in a 16-bytes stripe",
|
||||
(unsigned)length);
|
||||
/* We don't need to check oend */
|
||||
/* since we check it once for each loop below */
|
||||
if (ip > iend - (16 + 1)) { /*max lit + offset + nextToken */
|
||||
goto safe_literal_copy;
|
||||
}
|
||||
/* Literals can only be 14, but hope compilers optimize */
|
||||
/*if we copy by a register size */
|
||||
memcpy(op, ip, 16);
|
||||
} else {
|
||||
/* LZ4_decompress_fast() cannot copy more than 8 bytes at a time */
|
||||
/* it doesn't know input length, and relies on end-of-block */
|
||||
/* properties */
|
||||
memcpy(op, ip, 8);
|
||||
if (length > 8) {
|
||||
memcpy(op + 8, ip + 8, 8);
|
||||
}
|
||||
}
|
||||
ip += length;
|
||||
op = cpy;
|
||||
}
|
||||
|
||||
/* get offset */
|
||||
offset = LZ4_readLE16(ip);
|
||||
ip += 2; /* end-of-block condition violated */
|
||||
match = op - offset;
|
||||
|
||||
/* get matchlength */
|
||||
length = token & ML_MASK;
|
||||
|
||||
if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) {
|
||||
goto _output_error;
|
||||
}
|
||||
/* Error : offset outside buffers */
|
||||
if (length == ML_MASK) {
|
||||
variable_length_error error = ok;
|
||||
length +=
|
||||
read_variable_length(&ip, iend - LASTLITERALS + 1,
|
||||
endOnInput, 0, &error);
|
||||
if (error != ok) {
|
||||
goto _output_error;
|
||||
}
|
||||
if ((safeDecode)
|
||||
&& unlikely((uptrval) (op) + length < (uptrval) op)) {
|
||||
goto _output_error;
|
||||
} /* overflow detection */
|
||||
length += MINMATCH;
|
||||
if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
|
||||
goto safe_match_copy;
|
||||
}
|
||||
} else {
|
||||
length += MINMATCH;
|
||||
if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) {
|
||||
goto safe_match_copy;
|
||||
}
|
||||
|
||||
/* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */
|
||||
if (!(dict == usingExtDict) || (match >= lowPrefix)) {
|
||||
if (offset >= 8) {
|
||||
memcpy(op, match, 8);
|
||||
memcpy(op + 8, match + 8, 8);
|
||||
memcpy(op + 16, match + 16, 2);
|
||||
op += length;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* match starting within external dictionary */
|
||||
if ((dict == usingExtDict) && (match < lowPrefix)) {
|
||||
if (unlikely(op + length > oend - LASTLITERALS)) {
|
||||
if (partialDecoding) {
|
||||
/* reach end of buffer */
|
||||
length =
|
||||
min(length, (size_t) (oend - op));
|
||||
} else {
|
||||
/* end-of-block condition violated */
|
||||
goto _output_error;
|
||||
}
|
||||
}
|
||||
|
||||
if (length <= (size_t) (lowPrefix - match)) {
|
||||
/* match fits entirely within external dictionary : just copy */
|
||||
memmove(op, dictEnd - (lowPrefix - match),
|
||||
length);
|
||||
op += length;
|
||||
} else {
|
||||
/* match stretches into both external dict and current block */
|
||||
size_t const copySize =
|
||||
(size_t) (lowPrefix - match);
|
||||
size_t const restSize = length - copySize;
|
||||
memcpy(op, dictEnd - copySize, copySize);
|
||||
op += copySize;
|
||||
if (restSize > (size_t) (op - lowPrefix)) { /* overlap copy */
|
||||
BYTE *const endOfMatch = op + restSize;
|
||||
const BYTE *copyFrom = lowPrefix;
|
||||
while (op < endOfMatch) {
|
||||
*op++ = *copyFrom++;
|
||||
}
|
||||
} else {
|
||||
memcpy(op, lowPrefix, restSize);
|
||||
op += restSize;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/* copy match within block */
|
||||
cpy = op + length;
|
||||
|
||||
assert((op <= oend) && (oend - op >= 32));
|
||||
if (unlikely(offset < 16)) {
|
||||
LZ4_memcpy_using_offset(op, match, cpy, offset);
|
||||
} else {
|
||||
LZ4_wildCopy32(op, match, cpy);
|
||||
}
|
||||
|
||||
op = cpy; /* wildcopy correction */
|
||||
}
|
||||
safe_decode:
|
||||
#endif
|
||||
/* Main Loop : decode sequences */
|
||||
while (1) {
|
||||
size_t length;
|
||||
const BYTE *match;
|
||||
size_t offset;
|
||||
|
||||
/* get literal length */
|
||||
unsigned int const token = *ip++;
|
||||
length = token>>ML_BITS;
|
||||
length = token >> ML_BITS;
|
||||
|
||||
/* ip < iend before the increment */
|
||||
assert(!endOnInput || ip <= iend);
|
||||
@ -146,26 +418,27 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
* some compilers can produce better parallelized runtime code
|
||||
*/
|
||||
if ((endOnInput ? length != RUN_MASK : length <= 8)
|
||||
/*
|
||||
* strictly "less than" on input, to re-enter
|
||||
* the loop with at least one byte
|
||||
*/
|
||||
&& likely((endOnInput ? ip < shortiend : 1) &
|
||||
(op <= shortoend))) {
|
||||
/*
|
||||
* strictly "less than" on input, to re-enter
|
||||
* the loop with at least one byte
|
||||
*/
|
||||
&& likely((endOnInput ? ip < shortiend : 1) &
|
||||
(op <= shortoend))) {
|
||||
/* Copy the literals */
|
||||
LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
|
||||
op += length; ip += length;
|
||||
op += length;
|
||||
ip += length;
|
||||
|
||||
/*
|
||||
* The second stage:
|
||||
* prepare for match copying, decode full info.
|
||||
* If it doesn't work out, the info won't be wasted.
|
||||
*/
|
||||
length = token & ML_MASK; /* match length */
|
||||
length = token & ML_MASK; /* match length */
|
||||
offset = LZ4_readLE16(ip);
|
||||
ip += 2;
|
||||
match = op - offset;
|
||||
assert(match <= op); /* check overflow */
|
||||
assert(match <= op); /* check overflow */
|
||||
|
||||
/* Do not deal with overlapping matches. */
|
||||
if ((length != ML_MASK) &&
|
||||
@ -190,28 +463,24 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
|
||||
/* decode literal length */
|
||||
if (length == RUN_MASK) {
|
||||
unsigned int s;
|
||||
|
||||
if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) {
|
||||
/* overflow detection */
|
||||
variable_length_error error = ok;
|
||||
length +=
|
||||
read_variable_length(&ip, iend - RUN_MASK,
|
||||
endOnInput, endOnInput,
|
||||
&error);
|
||||
if (error == initial_error)
|
||||
goto _output_error;
|
||||
}
|
||||
do {
|
||||
s = *ip++;
|
||||
length += s;
|
||||
} while (likely(endOnInput
|
||||
? ip < iend - RUN_MASK
|
||||
: 1) & (s == 255));
|
||||
|
||||
if ((safeDecode)
|
||||
&& unlikely((uptrval)(op) +
|
||||
length < (uptrval)(op))) {
|
||||
&& unlikely((uptrval) (op) +
|
||||
length < (uptrval) (op))) {
|
||||
/* overflow detection */
|
||||
goto _output_error;
|
||||
}
|
||||
if ((safeDecode)
|
||||
&& unlikely((uptrval)(ip) +
|
||||
length < (uptrval)(ip))) {
|
||||
&& unlikely((uptrval) (ip) +
|
||||
length < (uptrval) (ip))) {
|
||||
/* overflow detection */
|
||||
goto _output_error;
|
||||
}
|
||||
@ -219,11 +488,15 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
|
||||
/* copy literals */
|
||||
cpy = op + length;
|
||||
#if LZ4_FAST_DEC_LOOP
|
||||
safe_literal_copy:
|
||||
#endif
|
||||
LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH);
|
||||
|
||||
if (((endOnInput) && ((cpy > oend - MFLIMIT)
|
||||
|| (ip + length > iend - (2 + 1 + LASTLITERALS))))
|
||||
|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
|
||||
|| (ip + length >
|
||||
iend - (2 + 1 + LASTLITERALS))))
|
||||
|| ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) {
|
||||
if (partialDecoding) {
|
||||
if (cpy > oend) {
|
||||
/*
|
||||
@ -234,7 +507,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
length = oend - op;
|
||||
}
|
||||
if ((endOnInput)
|
||||
&& (ip + length > iend)) {
|
||||
&& (ip + length > iend)) {
|
||||
/*
|
||||
* Error :
|
||||
* read attempt beyond
|
||||
@ -244,7 +517,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
}
|
||||
} else {
|
||||
if ((!endOnInput)
|
||||
&& (cpy != oend)) {
|
||||
&& (cpy != oend)) {
|
||||
/*
|
||||
* Error :
|
||||
* block decoding must
|
||||
@ -253,7 +526,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
goto _output_error;
|
||||
}
|
||||
if ((endOnInput)
|
||||
&& ((ip + length != iend)
|
||||
&& ((ip + length != iend)
|
||||
|| (cpy > oend))) {
|
||||
/*
|
||||
* Error :
|
||||
@ -276,7 +549,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
|
||||
break;
|
||||
} else {
|
||||
/* may overwrite up to WILDCOPYLENGTH beyond cpy */
|
||||
LZ4_wildCopy(op, ip, cpy);
|
||||
LZ4_wildCopy8(op, ip, cpy);
|
||||
ip += length;
|
||||
op = cpy;
|
||||
}
|
||||
@ -295,29 +568,14 @@ _copy_match:
|
||||
goto _output_error;
|
||||
}
|
||||
|
||||
/* costs ~1%; silence an msan warning when offset == 0 */
|
||||
/*
|
||||
* note : when partialDecoding, there is no guarantee that
|
||||
* at least 4 bytes remain available in output buffer
|
||||
*/
|
||||
if (!partialDecoding) {
|
||||
assert(oend > op);
|
||||
assert(oend - op >= 4);
|
||||
|
||||
LZ4_write32(op, (U32)offset);
|
||||
}
|
||||
|
||||
if (length == ML_MASK) {
|
||||
unsigned int s;
|
||||
|
||||
do {
|
||||
s = *ip++;
|
||||
|
||||
if ((endOnInput) && (ip > iend - LASTLITERALS))
|
||||
goto _output_error;
|
||||
|
||||
length += s;
|
||||
} while (s == 255);
|
||||
variable_length_error error = ok;
|
||||
length +=
|
||||
read_variable_length(&ip, iend - LASTLITERALS + 1,
|
||||
endOnInput, 0, &error);
|
||||
if (error != ok)
|
||||
goto _output_error;
|
||||
|
||||
if ((safeDecode)
|
||||
&& unlikely(
|
||||
@ -329,6 +587,10 @@ _copy_match:
|
||||
|
||||
length += MINMATCH;
|
||||
|
||||
#if LZ4_FAST_DEC_LOOP
|
||||
safe_match_copy:
|
||||
#endif
|
||||
|
||||
/* match starting within external dictionary */
|
||||
if ((dict == usingExtDict) && (match < lowPrefix)) {
|
||||
if (unlikely(op + length > oend - LASTLITERALS)) {
|
||||
@ -425,7 +687,7 @@ _copy_match:
|
||||
}
|
||||
|
||||
if (op < oCopyLimit) {
|
||||
LZ4_wildCopy(op, match, oCopyLimit);
|
||||
LZ4_wildCopy8(op, match, oCopyLimit);
|
||||
match += oCopyLimit - op;
|
||||
op = oCopyLimit;
|
||||
}
|
||||
@ -434,7 +696,7 @@ _copy_match:
|
||||
} else {
|
||||
LZ4_copy8(op, match);
|
||||
if (length > 16)
|
||||
LZ4_wildCopy(op + 8, match + 8, cpy);
|
||||
LZ4_wildCopy8(op + 8, match + 8, cpy);
|
||||
}
|
||||
op = cpy; /* wildcopy correction */
|
||||
}
|
||||
|
@ -40,6 +40,28 @@
|
||||
|
||||
#define FORCE_INLINE __always_inline
|
||||
|
||||
/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
|
||||
* gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8,
|
||||
* together with a simple 8-byte copy loop as a fall-back path.
|
||||
* However, this optimization hurts the decompression speed by >30%,
|
||||
* because the execution does not go to the optimized loop
|
||||
* for typical compressible data, and all of the preamble checks
|
||||
* before going to the fall-back path become useless overhead.
|
||||
* This optimization happens only with the -O3 flag, and -O2 generates
|
||||
* a simple 8-byte copy loop.
|
||||
* With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8
|
||||
* functions are annotated with __attribute__((optimize("O2"))),
|
||||
* and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute
|
||||
* of LZ4_wildCopy8 does not affect the compression speed.
|
||||
*/
|
||||
#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
|
||||
# define FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
|
||||
# define FORCE_O2_INLINE_GCC_PPC64LE (__attribute__((optimize("O2"))) FORCE_INLINE)
|
||||
#else
|
||||
# define FORCE_O2_GCC_PPC64LE FORCE_INLINE
|
||||
# define FORCE_O2_INLINE_GCC_PPC64LE FORCE_INLINE
|
||||
#endif
|
||||
|
||||
/*-************************************
|
||||
* Basic Types
|
||||
**************************************/
|
||||
@ -99,6 +121,9 @@ typedef uintptr_t uptrval;
|
||||
#define RUN_BITS (8 - ML_BITS)
|
||||
#define RUN_MASK ((1U << RUN_BITS) - 1)
|
||||
|
||||
static const unsigned inc32table[8] = { 0, 1, 2, 1, 0, 4, 4, 4 };
|
||||
static const int dec64table[8] = { 0, 0, 0, -1, -4, 1, 2, 3 };
|
||||
|
||||
/*-************************************
|
||||
* Reading and writing into memory
|
||||
**************************************/
|
||||
@ -167,7 +192,7 @@ static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
|
||||
* customized variant of memcpy,
|
||||
* which can overwrite up to 7 bytes beyond dstEnd
|
||||
*/
|
||||
static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
|
||||
static FORCE_O2_INLINE_GCC_PPC64LE void LZ4_wildCopy8(void *dstPtr,
|
||||
const void *srcPtr, void *dstEnd)
|
||||
{
|
||||
BYTE *d = (BYTE *)dstPtr;
|
||||
@ -231,6 +256,39 @@ static FORCE_INLINE unsigned int LZ4_count(
|
||||
return (unsigned int)(pIn - pStart);
|
||||
}
|
||||
|
||||
/* Read the variable-length literal or match length.
|
||||
*
|
||||
* ip - pointer to use as input.
|
||||
* lencheck - end ip. Return an error if ip advances >= lencheck.
|
||||
* loop_check - check ip >= lencheck in body of loop. Returns loop_error if so.
|
||||
* initial_check - check ip >= lencheck before start of loop. Returns initial_error if so.
|
||||
* error (output) - error code. Should be set to 0 before call.
|
||||
*/
|
||||
typedef enum { loop_error = -2, initial_error = -1, ok = 0} variable_length_error;
|
||||
static FORCE_INLINE unsigned read_variable_length(const BYTE **ip,
|
||||
const BYTE *lencheck,
|
||||
int loop_check, int initial_check,
|
||||
variable_length_error *error)
|
||||
{
|
||||
unsigned length = 0;
|
||||
unsigned s;
|
||||
if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */
|
||||
*error = initial_error;
|
||||
return length;
|
||||
}
|
||||
do {
|
||||
s = **ip;
|
||||
(*ip)++;
|
||||
length += s;
|
||||
if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */
|
||||
*error = loop_error;
|
||||
return length;
|
||||
}
|
||||
} while (s == 255);
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
|
||||
typedef enum { byPtr, byU32, byU16 } tableType_t;
|
||||
|
||||
|
@ -293,7 +293,7 @@ static FORCE_INLINE int LZ4HC_encodeSequence(
|
||||
*token = (BYTE)(length<<ML_BITS);
|
||||
|
||||
/* Copy Literals */
|
||||
LZ4_wildCopy(*op, *anchor, (*op) + length);
|
||||
LZ4_wildCopy8(*op, *anchor, (*op) + length);
|
||||
*op += length;
|
||||
|
||||
/* Encode Offset */
|
||||
|
Loading…
Reference in New Issue
Block a user