diff --git a/3rdparty/lzma/include/7zVersion.h b/3rdparty/lzma/include/7zVersion.h index 1ddef80b08..72733f7fae 100644 --- a/3rdparty/lzma/include/7zVersion.h +++ b/3rdparty/lzma/include/7zVersion.h @@ -1,7 +1,7 @@ -#define MY_VER_MAJOR 24 -#define MY_VER_MINOR 8 +#define MY_VER_MAJOR 25 +#define MY_VER_MINOR 0 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "24.08" +#define MY_VERSION_NUMBERS "25.00" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,12 +10,12 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2024-08-11" +#define MY_DATE "2025-07-05" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" -#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov" +#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov" #ifdef USE_COPYRIGHT_CR #define MY_COPYRIGHT MY_COPYRIGHT_CR diff --git a/3rdparty/lzma/include/Compiler.h b/3rdparty/lzma/include/Compiler.h index 2a9c2b7a08..b266b277bd 100644 --- a/3rdparty/lzma/include/Compiler.h +++ b/3rdparty/lzma/include/Compiler.h @@ -1,5 +1,5 @@ /* Compiler.h : Compiler specific defines and pragmas -2024-01-22 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_COMPILER_H #define ZIP7_INC_COMPILER_H @@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void); #define Z7_ATTRIB_NO_VECTORIZE #endif +#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920) + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )") + #define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )") +#else + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE + #define Z7_PRAGMA_OPTIMIZE_DEFAULT +#endif + + + #if defined(MY_CPU_X86_OR_AMD64) && ( \ defined(__clang__) && (__clang_major__ >= 4) \ || defined(__GNUC__) && (__GNUC__ >= 5)) diff --git a/3rdparty/lzma/include/CpuArch.h b/3rdparty/lzma/include/CpuArch.h index 683cfaa862..1690a5b616 100644 --- a/3rdparty/lzma/include/CpuArch.h +++ b/3rdparty/lzma/include/CpuArch.h @@ -1,5 +1,5 @@ /* CpuArch.h -- CPU specific code -2024-06-17 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #ifndef ZIP7_INC_CPU_ARCH_H #define ZIP7_INC_CPU_ARCH_H @@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. 
#define MY_CPU_SIZEOF_POINTER 4 #endif +#if defined(__SSE2__) \ + || defined(MY_CPU_AMD64) \ + || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) +#define MY_CPU_SSE2 +#endif + #if defined(_M_ARM64) \ || defined(_M_ARM64EC) \ @@ -509,11 +515,19 @@ problem-4 : performace: #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) +#if 0 +// Z7_BSWAP16 can be slow for x86-msvc +#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p))) +#else +#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16) +#endif + #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) #define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } #if defined(MY_CPU_LE_UNALIGN_64) #define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) +#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); } #endif #else @@ -536,21 +550,39 @@ problem-4 : performace: #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) #endif +#ifndef SetBe64 +#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \ + _ppp_[0] = (Byte)(_vvv_ >> 56); \ + _ppp_[1] = (Byte)(_vvv_ >> 48); \ + _ppp_[2] = (Byte)(_vvv_ >> 40); \ + _ppp_[3] = (Byte)(_vvv_ >> 32); \ + _ppp_[4] = (Byte)(_vvv_ >> 24); \ + _ppp_[5] = (Byte)(_vvv_ >> 16); \ + _ppp_[6] = (Byte)(_vvv_ >> 8); \ + _ppp_[7] = (Byte)_vvv_; } +#endif + #ifndef GetBe16 +#ifdef GetBe16_to32 +#define GetBe16(p) ( (UInt16) GetBe16_to32(p)) +#else #define GetBe16(p) ( (UInt16) ( \ ((UInt16)((const Byte *)(p))[0] << 8) | \ ((const Byte *)(p))[1] )) #endif +#endif #if defined(MY_CPU_BE) #define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) #define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) #define Z7_CONV_NATIVE_TO_BE_32(v) (v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8)) #elif defined(MY_CPU_LE) #define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) #define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) #define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8)) #else #error Stop_Compiling_Unknown_Endian_CONV #endif @@ -589,6 +621,11 @@ problem-4 : performace: #endif +#ifndef GetBe16_to32 +#define GetBe16_to32(p) GetBe16(p) +#endif + + #if defined(MY_CPU_X86_OR_AMD64) \ || defined(MY_CPU_ARM_OR_ARM64) \ || defined(MY_CPU_PPC_OR_PPC64) @@ -617,6 +654,7 @@ BoolInt CPU_IsSupported_SSE2(void); BoolInt CPU_IsSupported_SSSE3(void); BoolInt CPU_IsSupported_SSE41(void); BoolInt CPU_IsSupported_SHA(void); +BoolInt CPU_IsSupported_SHA512(void); BoolInt CPU_IsSupported_PageGB(void); #elif defined(MY_CPU_ARM_OR_ARM64) @@ -634,6 +672,7 @@ BoolInt CPU_IsSupported_SHA1(void); BoolInt CPU_IsSupported_SHA2(void); BoolInt CPU_IsSupported_AES(void); #endif +BoolInt CPU_IsSupported_SHA512(void); #endif diff --git a/3rdparty/lzma/include/LzFindMt.h b/3rdparty/lzma/include/LzFindMt.h index fcb479da9e..89984f52d1 100644 --- a/3rdparty/lzma/include/LzFindMt.h +++ b/3rdparty/lzma/include/LzFindMt.h @@ -1,5 +1,5 @@ /* LzFindMt.h -- multithreaded Match finder for LZ algorithms -2024-01-22 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_LZ_FIND_MT_H #define ZIP7_INC_LZ_FIND_MT_H @@ -12,8 +12,10 @@ EXTERN_C_BEGIN typedef struct { UInt32 numProcessedBlocks; - CThread thread; + Int32 affinityGroup; + UInt64 affinityInGroup; UInt64 affinity; + CThread thread; BoolInt wasCreated; BoolInt needStart; diff --git a/3rdparty/lzma/include/Lzma2Enc.h b/3rdparty/lzma/include/Lzma2Enc.h index 
cb25275c6b..1e6b50c6f4 100644 --- a/3rdparty/lzma/include/Lzma2Enc.h +++ b/3rdparty/lzma/include/Lzma2Enc.h @@ -18,6 +18,7 @@ typedef struct int numBlockThreads_Reduced; int numBlockThreads_Max; int numTotalThreads; + unsigned numThreadGroups; // 0 : no groups } CLzma2EncProps; void Lzma2EncProps_Init(CLzma2EncProps *p); diff --git a/3rdparty/lzma/include/LzmaEnc.h b/3rdparty/lzma/include/LzmaEnc.h index 9f8039a103..3feb5b4af0 100644 --- a/3rdparty/lzma/include/LzmaEnc.h +++ b/3rdparty/lzma/include/LzmaEnc.h @@ -1,5 +1,5 @@ /* LzmaEnc.h -- LZMA Encoder -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_LZMA_ENC_H #define ZIP7_INC_LZMA_ENC_H @@ -29,11 +29,13 @@ typedef struct int numThreads; /* 1 or 2, default = 2 */ // int _pad; + Int32 affinityGroup; UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. Encoder uses this value to reduce dictionary size */ UInt64 affinity; + UInt64 affinityInGroup; } CLzmaEncProps; void LzmaEncProps_Init(CLzmaEncProps *p); diff --git a/3rdparty/lzma/include/MtCoder.h b/3rdparty/lzma/include/MtCoder.h index 1231d3c2a5..8166ccac2e 100644 --- a/3rdparty/lzma/include/MtCoder.h +++ b/3rdparty/lzma/include/MtCoder.h @@ -1,5 +1,5 @@ /* MtCoder.h -- Multi-thread Coder -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_MT_CODER_H #define ZIP7_INC_MT_CODER_H @@ -16,7 +16,7 @@ EXTERN_C_BEGIN #ifndef Z7_ST #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) - #define MTCODER_THREADS_MAX 64 + #define MTCODER_THREADS_MAX 256 #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) #else #define MTCODER_THREADS_MAX 1 @@ -77,6 +77,7 @@ typedef struct CMtCoder_ size_t blockSize; /* size of input block */ unsigned numThreadsMax; + unsigned numThreadGroups; UInt64 expectedDataSize; ISeqInStreamPtr inStream; @@ -125,6 +126,8 @@ typedef struct CMtCoder_ CMtProgress mtProgress; CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; CMtCoderThread threads[MTCODER_THREADS_MAX]; + + CThreadNextGroup nextGroup; } CMtCoder; diff --git a/3rdparty/lzma/include/Sha256.h b/3rdparty/lzma/include/Sha256.h index 9e0422320c..75329cdf02 100644 --- a/3rdparty/lzma/include/Sha256.h +++ b/3rdparty/lzma/include/Sha256.h @@ -1,5 +1,5 @@ /* Sha256.h -- SHA-256 Hash -2023-04-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_SHA256_H #define ZIP7_INC_SHA256_H @@ -14,6 +14,9 @@ EXTERN_C_BEGIN #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) + + + typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); /* @@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt typedef struct { - SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; - UInt64 count; - UInt64 _pad_2[2]; + union + { + struct + { + SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; + UInt64 count; + } vars; + UInt64 _pad_64bit[4]; + void *_pad_align_ptr[2]; + } v; UInt32 state[SHA256_NUM_DIGEST_WORDS]; Byte buffer[SHA256_BLOCK_SIZE]; diff --git a/3rdparty/lzma/include/Sort.h b/3rdparty/lzma/include/Sort.h index 1817b652f5..de5a4e86cf 100644 --- a/3rdparty/lzma/include/Sort.h +++ b/3rdparty/lzma/include/Sort.h @@ -1,5 +1,5 @@ /* Sort.h -- Sort functions -2023-03-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_SORT_H #define 
ZIP7_INC_SORT_H @@ -8,10 +8,7 @@ EXTERN_C_BEGIN -void HeapSort(UInt32 *p, size_t size); -void HeapSort64(UInt64 *p, size_t size); - -/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */ +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size); EXTERN_C_END diff --git a/3rdparty/lzma/include/Threads.h b/3rdparty/lzma/include/Threads.h index c1484a2773..be12e6e7fa 100644 --- a/3rdparty/lzma/include/Threads.h +++ b/3rdparty/lzma/include/Threads.h @@ -1,5 +1,5 @@ /* Threads.h -- multithreading library -2024-03-28 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_THREADS_H #define ZIP7_INC_THREADS_H @@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param WRes Thread_Wait_Close(CThread *p); #ifdef _WIN32 +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask); #define Thread_Create_With_CpuSet(p, func, param, cs) \ Thread_Create_With_Affinity(p, func, param, *cs) #else WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); #endif +typedef struct +{ + unsigned NumGroups; + unsigned NextGroup; +} CThreadNextGroup; + +void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup); +unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p); + #ifdef _WIN32 diff --git a/3rdparty/lzma/include/Xz.h b/3rdparty/lzma/include/Xz.h index 42bc685341..ad63b48c71 100644 --- a/3rdparty/lzma/include/Xz.h +++ b/3rdparty/lzma/include/Xz.h @@ -1,5 +1,5 @@ /* Xz.h - Xz interface -2024-01-26 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #ifndef ZIP7_INC_XZ_H #define ZIP7_INC_XZ_H @@ -121,6 +121,7 @@ typedef struct UInt64 startOffset; } CXzStream; +#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; } void Xz_Construct(CXzStream *p); void Xz_Free(CXzStream *p, ISzAllocPtr alloc); @@ -136,8 +137,13 @@ typedef struct CXzStream *streams; } CXzs; +#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; } void Xzs_Construct(CXzs *p); void Xzs_Free(CXzs *p, ISzAllocPtr alloc); +/* +Xzs_ReadBackward() must be called for empty CXzs object. +Xzs_ReadBackward() can return non empty object with (p->num != 0) even in case of error. 
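The hunks above (LzFindMt.h, Lzma2Enc.h, LzmaEnc.h, MtCoder.h, Threads.h) thread Windows processor-group support through the encoders: MTCODER_THREADS_MAX rises from 64 to 256, the props structures gain numThreadGroups / affinityGroup / affinityInGroup fields, and Thread_Create_With_Group plus the CThreadNextGroup round-robin helper let worker threads be spread across groups. A minimal caller-side sketch of the new knob, assuming only the declarations visible in this patch (the helper name configure_lzma2_for_groups and the concrete values are ours, not part of the patch):

#include "Lzma2Enc.h"

/* Illustrative only: ask the multithreaded LZMA2 encoder to spread its block
   threads over two Windows processor groups. numThreadGroups == 0 keeps the
   old single-group behaviour ("0 : no groups" in Lzma2Enc.h above). */
static void configure_lzma2_for_groups(CLzma2EncProps *props)
{
  Lzma2EncProps_Init(props);
  props->numTotalThreads  = 96;  /* more threads than the old 64-thread MtCoder cap */
  props->numThreadGroups  = 2;   /* new block threads are handed out round-robin */
  /* Per LzmaEnc.h above, the match-finder threads of each block encoder can
     additionally be pinned (hypothetical values):
       props->lzmaProps.affinityGroup   = 1;     // -1 leaves the group unset
       props->lzmaProps.affinityInGroup = 0xFF;  // affinity mask inside that group
  */
}

Internally this reaches MtCoder_Code(), which (as shown later in this patch) initializes a CThreadNextGroup from numThreadGroups and passes ThreadNextGroup_GetNext() to Thread_Create_With_Group() for every worker it starts.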
+*/ SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); UInt64 Xzs_GetNumBlocks(const CXzs *p); @@ -268,8 +274,8 @@ typedef struct size_t outBufSize; size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked - Byte shaDigest[SHA256_DIGEST_SIZE]; - Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; + UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4]; + Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned for 4-bytes } CXzUnpacker; /* alloc : aligned for cache line allocation is better */ diff --git a/3rdparty/lzma/include/XzEnc.h b/3rdparty/lzma/include/XzEnc.h index 77b78c014b..ac6bbf7996 100644 --- a/3rdparty/lzma/include/XzEnc.h +++ b/3rdparty/lzma/include/XzEnc.h @@ -1,5 +1,5 @@ /* XzEnc.h -- Xz Encode -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_XZ_ENC_H #define ZIP7_INC_XZ_ENC_H @@ -31,6 +31,7 @@ typedef struct CLzma2EncProps lzma2Props; CXzFilterProps filterProps; unsigned checkId; + unsigned numThreadGroups; // 0 : no groups UInt64 blockSize; int numBlockThreads_Reduced; int numBlockThreads_Max; diff --git a/3rdparty/lzma/src/7zDec.c b/3rdparty/lzma/src/7zDec.c index c9b4064e3f..520cbfd833 100644 --- a/3rdparty/lzma/src/7zDec.c +++ b/3rdparty/lzma/src/7zDec.c @@ -1,5 +1,5 @@ /* 7zDec.c -- Decoding from 7z folder -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -312,8 +312,9 @@ static BoolInt IS_MAIN_METHOD(UInt32 m) case k_PPMD: #endif return True; + default: + return False; } - return False; } static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) diff --git a/3rdparty/lzma/src/AesOpt.c b/3rdparty/lzma/src/AesOpt.c index 58769ea059..b281807390 100644 --- a/3rdparty/lzma/src/AesOpt.c +++ b/3rdparty/lzma/src/AesOpt.c @@ -1,5 +1,5 @@ /* AesOpt.c -- AES optimized code for x86 AES hardware instructions -2024-03-01 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -80,19 +80,39 @@ AES_FUNC_START (name) #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) +#if 1 +// use aligned SSE load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// So we can use this branch of code. +// and compiler can use fused load-op SSE instructions: +// xorps xmm0, XMMWORD PTR [rdx] +#define LOAD_128(pp) (*(__m128i *)(void *)(pp)) +#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v +// use aligned SSE load/store for data. 
Alternative code with direct access +// #define LOAD_128(pp) _mm_load_si128(pp) +// #define STORE_128(pp, _v) _mm_store_si128(pp, _v) +#else +// use unaligned load/store for data: movdqu XMMWORD PTR [rdx] +#define LOAD_128(pp) _mm_loadu_si128(pp) +#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) +#endif + AES_FUNC_START2 (AesCbc_Encode_HW) { + if (numBlocks == 0) + return; + { __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i m = *p; const __m128i k0 = p[2]; const __m128i k1 = p[3]; const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - for (; numBlocks != 0; numBlocks--, data++) + do { UInt32 r = numRounds2; const __m128i *w = p + 4; - __m128i temp = *data; + __m128i temp = LOAD_128(data); MM_XOR (temp, k0) MM_XOR (m, temp) MM_OP_m (_mm_aesenc_si128, k1) @@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) } while (--r); MM_OP_m (_mm_aesenclast_si128, w[0]) - *data = m; + STORE_128(data, m); + data++; } + while (--numBlocks); *p = m; + } } @@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define WOP(op) op (m0, 0) WOP_M1(op) - #define DECLARE_VAR(reg, ii) __m128i reg; -#define LOAD_data( reg, ii) reg = data[ii]; -#define STORE_data( reg, ii) data[ii] = reg; +#define LOAD_data_ii(ii) LOAD_128(data + (ii)) +#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); +#define STORE_data( reg, ii) STORE_128(data + (ii), reg); #if (NUM_WAYS > 1) -#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) +#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) #endif #define MM_OP_key(op, reg) MM_OP(op, reg, key); @@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; -#define CTR_END( reg, ii) MM_XOR (data[ii], reg) - +#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ + LOAD_128 (data + (ii)))); #define WOP_KEY(op, n) { \ const __m128i key = w[n]; \ - WOP(op); } - + WOP(op) } #define WIDE_LOOP_START \ dataEnd = data + numBlocks; \ if (numBlocks >= NUM_WAYS) \ { dataEnd -= NUM_WAYS; do { \ - #define WIDE_LOOP_END \ data += NUM_WAYS; \ } while (data <= dataEnd); \ dataEnd += NUM_WAYS; } \ - #define SINGLE_LOOP \ for (; data < dataEnd; data++) @@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) #define AVX_DECLARE_VAR(reg, ii) __m256i reg; -#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; -#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; + +#if 1 +// use unaligned AVX load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// But we need 32-bytes reading. +// So we use intrinsics for unaligned AVX load/store. +// notes for _mm256_storeu_si256: +// msvc2022: uses vmovdqu and keeps the order of instruction sequence. +// new gcc11 uses vmovdqu +// old gcc9 could use pair of instructions: +// vmovups %xmm7, -224(%rax) +// vextracti128 $0x1, %ymm7, -208(%rax) +#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); +#else +// use aligned AVX load/store for data. +// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes. 
+// msvc2022 uses vmovdqu still +// gcc uses vmovdqa (that requires 32-bytes alignment) +#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; +#endif + +#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); +#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) /* -AVX_XOR_data_M1() needs unaligned memory load -if (we don't use _mm256_loadu_si256() here) -{ - Most compilers with enabled optimizations generate fused AVX (LOAD + OP) - instruction that can load unaligned data. - But GCC and CLANG without -O2 or -O1 optimizations can generate separated - LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. -} -Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. -v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. +AVX_XOR_data_M1() needs unaligned memory load, even if (data) +is aligned for 256-bits, because we read 32-bytes chunk that +crosses (data) position: from (data - 16bytes) to (data + 16bytes). */ -#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) -// for debug only: the following code will fail on execution, if compiled by some compilers: -// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) +#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) -#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); -#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) +#define AVX_CTR_START(reg, ii) \ + MM_OP (_mm256_add_epi64, ctr2, two) \ + reg = _mm256_xor_si256(ctr2, key); + +#define AVX_CTR_END(reg, ii) \ + AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ + AVX_LOAD ((__m256i *)(void *)data + (ii)))); + #define AVX_WOP_KEY(op, n) { \ const __m256i key = w[n]; \ - WOP(op); } + WOP(op) } #define NUM_AES_KEYS_MAX 15 #define WIDE_LOOP_START_AVX(OP) \ dataEnd = data + numBlocks; \ if (numBlocks >= NUM_WAYS * 2) \ - { __m256i keys[NUM_AES_KEYS_MAX]; \ - UInt32 ii; \ - OP \ - for (ii = 0; ii < numRounds; ii++) \ - keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ - dataEnd -= NUM_WAYS * 2; do { \ - + { __m256i keys[NUM_AES_KEYS_MAX]; \ + OP \ + { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ + keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ + dataEnd -= NUM_WAYS * 2; \ + do { \ #define WIDE_LOOP_END_AVX(OP) \ - data += NUM_WAYS * 2; \ - } while (data <= dataEnd); \ - dataEnd += NUM_WAYS * 2; \ - OP \ - _mm256_zeroupper(); \ + data += NUM_WAYS * 2; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS * 2; \ + OP \ + _mm256_zeroupper(); \ } \ /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, @@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; - const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; + const __m128i * const wStart = p 
+ (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; const __m128i *dataEnd; p += 2; WIDE_LOOP_START { const __m128i *w = wStart; - WOP (DECLARE_VAR) WOP (LOAD_data) WOP_KEY (AES_XOR, 1) - do { WOP_KEY (AES_DEC, 0) + w--; } while (w != p); @@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) MM_XOR (m0, iv) WOP_M1 (XOR_data_M1) - iv = data[NUM_WAYS - 1]; + LOAD_data(iv, NUM_WAYS - 1) WOP (STORE_data) } WIDE_LOOP_END @@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) SINGLE_LOOP { const __m128i *w = wStart - 1; - __m128i m = _mm_xor_si128 (w[2], *data); + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); + do { MM_OP_m (_mm_aesdec_si128, w[1]) @@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW) while (w != p); MM_OP_m (_mm_aesdec_si128, w[1]) MM_OP_m (_mm_aesdeclast_si128, w[0]) - MM_XOR (m, iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i ctr = *p; - UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; + const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; const __m128i *dataEnd; - __m128i one = _mm_cvtsi32_si128(1); + const __m128i one = _mm_cvtsi32_si128(1); p += 2; @@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW) } while (--r); WOP_KEY (AES_ENC_LAST, 0) - WOP (CTR_END) } WIDE_LOOP_END @@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) while (--numRounds2); MM_OP_m (_mm_aesenc_si128, w[0]) MM_OP_m (_mm_aesenclast_si128, w[1]) - MM_XOR (*data, m) + CTR_END (m, 0) } p[-2] = ctr; @@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; const __m128i *dataEnd; - UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; p += 2; WIDE_LOOP_START_AVX(;) @@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) while (w != keys); AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) - AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) + AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) WOP_M1 (AVX_XOR_data_M1) - iv = data[NUM_WAYS * 2 - 1]; + LOAD_data (iv, NUM_WAYS * 2 - 1) WOP (AVX_STORE_data) } WIDE_LOOP_END_AVX(;) SINGLE_LOOP { - const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; - __m128i m = _mm_xor_si128 (w[2], *data); + const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); do { MM_OP_m (_mm_aesdec_si128, w[1]) @@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) MM_OP_m (_mm_aesdeclast_si128, w[0]) MM_XOR (m, iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i ctr = *p; - UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; const __m128i *dataEnd; - __m128i one = _mm_cvtsi32_si128(1); + const __m128i one = _mm_cvtsi32_si128(1); __m256i ctr2, two; p += 2; @@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) while (--numRounds2); MM_OP_m (_mm_aesenc_si128, w[0]) MM_OP_m (_mm_aesenclast_si128, w[1]) - MM_XOR (*data, m) + CTR_END (m, 0) } p[-2] = ctr; @@ -731,9 +768,14 @@ AES_FUNC_START (name) AES_FUNC_START2 (AesCbc_Encode_HW) { - v128 * const p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + if (numBlocks == 0) + return; + { + v128 * const p = (v128 
*)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; v128 m = *p; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1); + const v128 *w = p + (size_t)numRounds2 * 2; const v128 k0 = p[2]; const v128 k1 = p[3]; const v128 k2 = p[4]; @@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW) const v128 k7 = p[9]; const v128 k8 = p[10]; const v128 k9 = p[11]; - const UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const v128 *w = p + ((size_t)numRounds2 * 2); + const v128 k_z4 = w[-2]; + const v128 k_z3 = w[-1]; + const v128 k_z2 = w[0]; const v128 k_z1 = w[1]; const v128 k_z0 = w[2]; - for (; numBlocks != 0; numBlocks--, data++) + // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle, + // because gcc/clang compilers are not good for that optimization. + do { MM_XOR_m (*data) AES_E_MC_m (k0) @@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW) AES_E_MC_m (k3) AES_E_MC_m (k4) AES_E_MC_m (k5) - AES_E_MC_m (k6) - AES_E_MC_m (k7) - AES_E_MC_m (k8) if (numRounds2 >= 6) { - AES_E_MC_m (k9) - AES_E_MC_m (p[12]) + AES_E_MC_m (k6) + AES_E_MC_m (k7) if (numRounds2 != 6) { - AES_E_MC_m (p[13]) - AES_E_MC_m (p[14]) + AES_E_MC_m (k8) + AES_E_MC_m (k9) } } - AES_E_m (k_z1) - MM_XOR_m (k_z0) - *data = m; + AES_E_MC_m (k_z4) + AES_E_MC_m (k_z3) + AES_E_MC_m (k_z2) + AES_E_m (k_z1) + MM_XOR_m (k_z0) + *data++ = m; } + while (--numBlocks); *p = m; + } } @@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW) AES_FUNC_START2 (AesCbc_Decode_HW) { - v128 *p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; v128 iv = *p; - const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; const v128 *dataEnd; p += 2; @@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) WOP_KEY (AES_XOR, 0) MM_XOR (m0, iv) WOP_M1 (XOR_data_M1) - iv = data[NUM_WAYS - 1]; + LOAD_data(iv, NUM_WAYS - 1) WOP (STORE_data) } WIDE_LOOP_END @@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) SINGLE_LOOP { const v128 *w = wStart; - v128 m = *data; + v128 m; LOAD_data(m, 0) AES_D_IMC_m (w[2]) do { @@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) AES_D_m (w[1]) MM_XOR_m (w[0]) MM_XOR_m (iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW) AES_FUNC_START2 (AesCtr_Code_HW) { - v128 *p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; uint64x2_t ctr = vreinterpretq_u64_u8(*p); - const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; const v128 *dataEnd; - uint64x2_t one = vdupq_n_u64(0); - // the bug in clang: // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); #if defined(__clang__) && (__clang_major__ <= 9) #pragma GCC diagnostic ignored "-Wvector-conversion" #endif - one = vsetq_lane_u64(1, one, 0); + const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); p += 2; WIDE_LOOP_START diff --git a/3rdparty/lzma/src/CpuArch.c b/3rdparty/lzma/src/CpuArch.c index e792f39deb..6e02551e2d 100644 --- a/3rdparty/lzma/src/CpuArch.c +++ b/3rdparty/lzma/src/CpuArch.c @@ -1,5 +1,5 @@ /* CpuArch.c -- CPU specific code -2024-07-04 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -17,7 +17,7 @@ /* cpuid instruction supports (subFunction) parameter in ECX, that is used only with 
some specific (function) parameter values. - But we always use only (subFunction==0). + most functions use only (subFunction==0). */ /* __cpuid(): MSVC and GCC/CLANG use same function/macro name @@ -49,43 +49,49 @@ #if defined(MY_CPU_AMD64) && defined(__PIC__) \ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) -#define x86_cpuid_MACRO(p, func) { \ + /* "=&r" selects free register. It can select even rbx, if that register is free. + "=&D" for (RDI) also works, but the code can be larger with "=&D" + "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */ + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "mov %%rbx, %q1" \ ASM_LN "cpuid" \ ASM_LN "xchg %%rbx, %q1" \ - : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } - - /* "=&r" selects free register. It can select even rbx, if that register is free. - "=&D" for (RDI) also works, but the code can be larger with "=&D" - "2"(0) means (subFunction = 0), - 2 is (zero-based) index in the output constraint list "=c" (ECX). */ + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #elif defined(MY_CPU_X86) && defined(__PIC__) \ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) -#define x86_cpuid_MACRO(p, func) { \ +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "mov %%ebx, %k1" \ ASM_LN "cpuid" \ ASM_LN "xchg %%ebx, %k1" \ - : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #else -#define x86_cpuid_MACRO(p, func) { \ +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "cpuid" \ - : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } + : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #endif +#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0) void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) { x86_cpuid_MACRO(p, func) } +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + x86_cpuid_MACRO_2(p, func, subFunc) +} + Z7_NO_INLINE UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) @@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) __asm ret 0 } +static +void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + UNUSED_VAR(p) + UNUSED_VAR(func) + UNUSED_VAR(subFunc) + __asm push ebx + __asm push edi + __asm mov edi, ecx // p + __asm mov eax, edx // func + __asm mov ecx, [esp + 12] // subFunc + __asm cpuid + __asm mov [edi ], eax + __asm mov [edi + 4], ebx + __asm mov [edi + 8], ecx + __asm mov [edi + 12], edx + __asm pop edi + __asm pop ebx + __asm ret 4 +} + #else // MY_CPU_AMD64 #if _MSC_VER >= 1600 #include #define MY_cpuidex __cpuidex + +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + __cpuidex((int *)p, func, subFunc); +} + #else /* __cpuid (func == (0 or 7)) requires subfunction number in ECX. @@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) We still can use __cpuid for low (func) values that don't require ECX, but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). 
So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, - where ECX value is first parameter for FASTCALL / NO_INLINE func, + where ECX value is first parameter for FASTCALL / NO_INLINE func. So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. @@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int } #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) #pragma message("======== MY_cpuidex_HACK WAS USED ========") +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + MY_cpuidex_HACK(subFunc, func, (Int32 *)p); +} #endif // _MSC_VER >= 1600 #if !defined(MY_CPU_AMD64) @@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void) } } + +BoolInt CPU_IsSupported_SHA512(void) +{ + if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here + + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + z7_x86_cpuid_subFunc(d, 7, 0); + if (d[0] < 1) // d[0] - is max supported subleaf value + return False; + z7_x86_cpuid_subFunc(d, 7, 1); + return (BoolInt)(d[0]) & 1; + } +} + /* MSVC: _xgetbv() intrinsic is available since VS2010SP1. MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in @@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void) return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); } +BoolInt CPU_IsSupported_SHA512(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512"); +} + +/* +BoolInt CPU_IsSupported_SHA3(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3"); +} +*/ + #ifdef MY_CPU_ARM64 #define APPLE_CRYPTO_SUPPORT_VAL 1 #else @@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32) MY_HWCAP_CHECK_FUNC (SHA1) MY_HWCAP_CHECK_FUNC (SHA2) MY_HWCAP_CHECK_FUNC (AES) +#ifdef MY_CPU_ARM64 +// supports HWCAP_SHA512 and HWCAP_SHA3 since 2017. +// we define them here, if they are not defined +#ifndef HWCAP_SHA3 +// #define HWCAP_SHA3 (1 << 17) +#endif +#ifndef HWCAP_SHA512 +// #pragma message("=== HWCAP_SHA512 define === ") +#define HWCAP_SHA512 (1 << 21) +#endif +MY_HWCAP_CHECK_FUNC (SHA512) +// MY_HWCAP_CHECK_FUNC (SHA3) +#endif #endif // __APPLE__ #endif // _WIN32 diff --git a/3rdparty/lzma/src/LzFind.c b/3rdparty/lzma/src/LzFind.c index 1ce404648e..6aba919d02 100644 --- a/3rdparty/lzma/src/LzFind.c +++ b/3rdparty/lzma/src/LzFind.c @@ -1,5 +1,5 @@ /* LzFind.c -- Match finder for LZ algorithms -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, const unsigned nbMax = (p->numHashBytes == 2 ? 16 : (p->numHashBytes == 3 ? 
24 : 32)); - if (numBits > nbMax) + if (numBits >= nbMax) numBits = nbMax; if (numBits >= 32) hs = (UInt32)0 - 1; @@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, hs |= (256 << kLzHash_CrcShift_2) - 1; { const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); - if (hs > hs2) + if (hs >= hs2) hs = hs2; } hsCur = hs; if (p->expectedDataSize < historySize) { const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); - if (hsCur > hs2) + if (hsCur >= hs2) hsCur = hs2; } } @@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, if (p->expectedDataSize < historySize) { hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); - if (hsCur > hs) // is it possible? + if (hsCur >= hs) // is it possible? hsCur = hs; } } @@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, return d; { const Byte *pb = cur - delta; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; if (pb[maxLen] == cur[maxLen] && *pb == *cur) { UInt32 len = 0; @@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, break; { ptrdiff_t diff; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; diff = (ptrdiff_t)0 - (ptrdiff_t)delta; if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) { @@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } cmCheck = (UInt32)(pos - _cyclicBufferSize); - if ((UInt32)pos <= _cyclicBufferSize) + if ((UInt32)pos < _cyclicBufferSize) cmCheck = 0; if (cmCheck < curMatch) @@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt { const UInt32 delta = pos - curMatch; { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); const UInt32 pair0 = pair[0]; @@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const UInt32 cmCheck; cmCheck = (UInt32)(pos - _cyclicBufferSize); - if ((UInt32)pos <= _cyclicBufferSize) + if ((UInt32)pos < _cyclicBufferSize) cmCheck = 0; if (// curMatch >= pos || // failure @@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const { const UInt32 delta = pos - curMatch; { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); if (pb[len] == cur[len]) @@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) UInt32 pos = p->pos; \ UInt32 num2 = num; \ /* (p->pos == p->posLimit) is not allowed here !!! 
*/ \ - { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ + { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \ num -= num2; \ { const UInt32 cycPos = p->cyclicBufferPos; \ son = p->son + cycPos; \ diff --git a/3rdparty/lzma/src/LzFindMt.c b/3rdparty/lzma/src/LzFindMt.c index ac9d59d0fd..25fcc46517 100644 --- a/3rdparty/lzma/src/LzFindMt.c +++ b/3rdparty/lzma/src/LzFindMt.c @@ -1,5 +1,5 @@ /* LzFindMt.c -- multithreaded Match finder for LZ algorithms -2024-01-22 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes; Z7_NO_INLINE static void MtSync_Construct(CMtSync *p) { + p->affinityGroup = -1; + p->affinityInGroup = 0; p->affinity = 0; p->wasCreated = False; p->csWasInitialized = False; @@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void * // return ERROR_TOO_MANY_POSTS; // for debug // return EINVAL; // for debug +#ifdef _WIN32 + if (p->affinityGroup >= 0) + wres = Thread_Create_With_Group(&p->thread, startAddress, obj, + (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup); + else +#endif if (p->affinity != 0) wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); else diff --git a/3rdparty/lzma/src/Lzma2Enc.c b/3rdparty/lzma/src/Lzma2Enc.c index 703e146b57..72aec69533 100644 --- a/3rdparty/lzma/src/Lzma2Enc.c +++ b/3rdparty/lzma/src/Lzma2Enc.c @@ -1,5 +1,5 @@ /* Lzma2Enc.c -- LZMA2 Encoder -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p) p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; p->numTotalThreads = -1; + p->numThreadGroups = 0; } void Lzma2EncProps_Normalize(CLzma2EncProps *p) @@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p, } p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; + p->mtCoder.numThreadGroups = p->props.numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; { diff --git a/3rdparty/lzma/src/LzmaEnc.c b/3rdparty/lzma/src/LzmaEnc.c index 37b2787db6..84a29a5c25 100644 --- a/3rdparty/lzma/src/LzmaEnc.c +++ b/3rdparty/lzma/src/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2024-01-24: Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p) p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; p->numHashOutBits = 0; p->writeEndMark = 0; + p->affinityGroup = -1; p->affinity = 0; + p->affinityInGroup = 0; } void LzmaEncProps_Normalize(CLzmaEncProps *p) @@ -72,11 +74,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) p->level = level; if (p->dictSize == 0) - p->dictSize = - ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : - ( level <= 6 ? ((UInt32)1 << (level + 19)) : - ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) - ))); + p->dictSize = (unsigned)level <= 4 ? + (UInt32)1 << (level * 2 + 16) : + (unsigned)level <= sizeof(size_t) / 2 + 4 ? + (UInt32)1 << (level + 20) : + (UInt32)1 << (sizeof(size_t) / 2 + 24); if (p->dictSize > p->reduceSize) { @@ -92,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) if (p->lp < 0) p->lp = 0; if (p->pb < 0) p->pb = 2; - if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); - if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); + if (p->algo < 0) p->algo = (unsigned)level < 5 ? 
0 : 1; + if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64; if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); @@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2) p->multiThread = (props.numThreads > 1); p->matchFinderMt.btSync.affinity = p->matchFinderMt.hashSync.affinity = props.affinity; + p->matchFinderMt.btSync.affinityGroup = + p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup; + p->matchFinderMt.btSync.affinityInGroup = + p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup; #endif return SZ_OK; diff --git a/3rdparty/lzma/src/MtCoder.c b/3rdparty/lzma/src/MtCoder.c index 03959b6cad..923b19ac48 100644 --- a/3rdparty/lzma/src/MtCoder.c +++ b/3rdparty/lzma/src/MtCoder.c @@ -1,5 +1,5 @@ /* MtCoder.c -- Multi-thread Coder -2023-09-07 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p) static THREAD_FUNC_DECL ThreadFunc(void *pp); -static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) +static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t +#ifdef _WIN32 + , CMtCoder * const mtc +#endif + ) { WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); + // printf("\n====== MtCoderThread_CreateAndStart : \n"); if (wres == 0) { t->stop = False; if (!Thread_WasCreated(&t->thread)) - wres = Thread_Create(&t->thread, ThreadFunc, t); + { +#ifdef _WIN32 + if (mtc->numThreadGroups) + wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t, + ThreadNextGroup_GetNext(&mtc->nextGroup), // group + 0); // affinityMask + else +#endif + wres = Thread_Create(&t->thread, ThreadFunc, t); + } if (wres == 0) wres = Event_Set(&t->startEvent); } @@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) } +Z7_FORCE_INLINE static void MtCoderThread_Destruct(CMtCoderThread *t) { if (Thread_WasCreated(&t->thread)) @@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t) static SRes ThreadFunc2(CMtCoderThread *t) { - CMtCoder *mtc = t->mtCoder; + CMtCoder * const mtc = t->mtCoder; for (;;) { @@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t) if (mtc->numStartedThreads < mtc->numStartedThreadsLimit && mtc->expectedDataSize != readProcessed) { - res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); + res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads] +#ifdef _WIN32 + , mtc +#endif + ); if (res == SZ_OK) mtc->numStartedThreads++; else @@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) } { - CMtCoderBlock *block = &mtc->blocks[bi]; + CMtCoderBlock * const block = &mtc->blocks[bi]; block->res = res; block->bufIndex = bufIndex; block->finished = finished; @@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) static THREAD_FUNC_DECL ThreadFunc(void *pp) { - CMtCoderThread *t = (CMtCoderThread *)pp; + CMtCoderThread * const t = (CMtCoderThread *)pp; for (;;) { if (Event_Wait(&t->startEvent) != 0) @@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) if (t->stop) return 0; { - SRes res = ThreadFunc2(t); + const SRes res = ThreadFunc2(t); CMtCoder *mtc = t->mtCoder; if (res != SZ_OK) { @@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) #ifndef MTCODER_USE_WRITE_THREAD { - unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); + 
const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); if (numFinished == mtc->numStartedThreads) if (Event_Set(&mtc->finishedEvent) != 0) return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; @@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p) p->blockSize = 0; p->numThreadsMax = 0; + p->numThreadGroups = 0; p->expectedDataSize = (UInt64)(Int64)-1; p->inStream = NULL; @@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p) unsigned i; SRes res = SZ_OK; + // printf("\n====== MtCoder_Code : \n"); + if (numThreads > MTCODER_THREADS_MAX) numThreads = MTCODER_THREADS_MAX; numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); @@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p) p->numStartedThreadsLimit = numThreads; p->numStartedThreads = 0; + ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup // for (i = 0; i < numThreads; i++) { + // here we create new thread for first block. + // And each new thread will create another new thread after block reading + // until numStartedThreadsLimit is reached. CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; - RINOK(MtCoderThread_CreateAndStart(nextThread)) + { + const SRes res2 = MtCoderThread_CreateAndStart(nextThread +#ifdef _WIN32 + , p +#endif + ); + RINOK(res2) + } } RINOK_THREAD(Event_Set(&p->readEvent)) @@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p) RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) { - const CMtCoderBlock *block = &p->blocks[bi]; - unsigned bufIndex = block->bufIndex; - BoolInt finished = block->finished; + const CMtCoderBlock * const block = &p->blocks[bi]; + const unsigned bufIndex = block->bufIndex; + const BoolInt finished = block->finished; if (res == SZ_OK && block->res != SZ_OK) res = block->res; @@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p) } #else { - WRes wres = Event_Wait(&p->finishedEvent); + const WRes wres = Event_Wait(&p->finishedEvent); res = MY_SRes_HRESULT_FROM_WRes(wres); } #endif diff --git a/3rdparty/lzma/src/Sha256.c b/3rdparty/lzma/src/Sha256.c index 14d3be9c6a..ea7ed8e751 100644 --- a/3rdparty/lzma/src/Sha256.c +++ b/3rdparty/lzma/src/Sha256.c @@ -1,18 +1,14 @@ /* Sha256.c -- SHA-256 Hash -2024-03-01 : Igor Pavlov : Public domain +: Igor Pavlov : Public domain This code is based on public domain code from Wei Dai's Crypto++ library. 
*/ #include "Precomp.h" #include -#include "CpuArch.h" -#include "RotateDefs.h" #include "Sha256.h" - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -// #define USE_MY_MM -#endif +#include "RotateDefs.h" +#include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ @@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; - #define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks + #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks #else #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks #endif @@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) return False; #endif - p->func_UpdateBlocks = func; + p->v.vars.func_UpdateBlocks = func; return True; } @@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) void Sha256_InitState(CSha256 *p) { - p->count = 0; + p->v.vars.count = 0; p->state[0] = 0x6a09e667; p->state[1] = 0xbb67ae85; p->state[2] = 0x3c6ef372; @@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p) p->state[7] = 0x5be0cd19; } + + + + + + + void Sha256_Init(CSha256 *p) { - p->func_UpdateBlocks = + p->v.vars.func_UpdateBlocks = #ifdef Z7_COMPILER_SHA256_SUPPORTED g_SHA256_FUNC_UPDATE_BLOCKS; #else @@ -133,10 +136,10 @@ void Sha256_Init(CSha256 *p) Sha256_InitState(p); } -#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) -#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) +#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22)) +#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25)) #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) -#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) +#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10)) #define Ch(x,y,z) (z^(x&(y^z))) #define Maj(x,y,z) ((x&y)|(z&(x|y))) @@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p) #endif -// static -extern MY_ALIGN(64) -const UInt32 SHA256_K_ARRAY[64]; -MY_ALIGN(64) -const UInt32 SHA256_K_ARRAY[64] = { +extern +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -248,27 +249,29 @@ const UInt32 SHA256_K_ARRAY[64] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; -#define K SHA256_K_ARRAY + + +#define K SHA256_K_ARRAY + Z7_NO_INLINE void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) { UInt32 W - #ifdef Z7_SHA256_BIG_W +#ifdef Z7_SHA256_BIG_W [64]; - #else +#else [16]; - #endif - +#endif unsigned j; - UInt32 a,b,c,d,e,f,g,h; - - #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) +#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) UInt32 tmp; - #endif +#endif + if (numBlocks == 0) return; + a = state[0]; b = state[1]; c = state[2]; @@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n g = state[6]; h = state[7]; - while (numBlocks) + do { for (j = 0; j < 16; j += STEP_PRE) @@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n g += state[6]; state[6] = g; h += state[7]; state[7] = h; - data += 64; - numBlocks--; + data += SHA256_BLOCK_SIZE; } - - /* Wipe variables */ - /* 
memset(W, 0, sizeof(W)); */ + while (--numBlocks); } -#undef S0 -#undef S1 -#undef s0 -#undef s1 -#undef K #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) @@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) { if (size == 0) return; - { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned num; - - p->count += size; - - num = 64 - pos; + const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); + const unsigned num = SHA256_BLOCK_SIZE - pos; + p->v.vars.count += size; if (num > size) { memcpy(p->buffer + pos, data, size); return; } - if (pos != 0) { size -= num; @@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) } } { - size_t numBlocks = size >> 6; + const size_t numBlocks = size >> 6; + // if (numBlocks) SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); - size &= 0x3F; + size &= SHA256_BLOCK_SIZE - 1; if (size == 0) return; data += (numBlocks << 6); @@ -408,82 +399,69 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) void Sha256_Final(CSha256 *p, Byte *digest) { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned i; - + unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); p->buffer[pos++] = 0x80; - - if (pos > (64 - 8)) + if (pos > (SHA256_BLOCK_SIZE - 4 * 2)) { - while (pos != 64) { p->buffer[pos++] = 0; } - // memset(&p->buf.buffer[pos], 0, 64 - pos); + while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos); Sha256_UpdateBlock(p); pos = 0; } - - /* - if (pos & 3) + memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos); { - p->buffer[pos] = 0; - p->buffer[pos + 1] = 0; - p->buffer[pos + 2] = 0; - pos += 3; - pos &= ~3; + const UInt64 numBits = p->v.vars.count << 3; + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) } - { - for (; pos < 64 - 8; pos += 4) - *(UInt32 *)(&p->buffer[pos]) = 0; - } - */ - - memset(&p->buffer[pos], 0, (64 - 8) - pos); - - { - UInt64 numBits = (p->count << 3); - SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) - SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) - } - Sha256_UpdateBlock(p); - - for (i = 0; i < 8; i += 2) +#if 1 && defined(MY_CPU_BE) + memcpy(digest, p->state, SHA256_DIGEST_SIZE); +#else { - UInt32 v0 = p->state[i]; - UInt32 v1 = p->state[(size_t)i + 1]; - SetBe32(digest , v0) - SetBe32(digest + 4, v1) - digest += 8; + unsigned i; + for (i = 0; i < 8; i += 2) + { + const UInt32 v0 = p->state[i]; + const UInt32 v1 = p->state[(size_t)i + 1]; + SetBe32(digest , v0) + SetBe32(digest + 4, v1) + digest += 4 * 2; + } } - + + + + +#endif Sha256_InitState(p); } void Sha256Prepare(void) { - #ifdef Z7_COMPILER_SHA256_SUPPORTED +#ifdef Z7_COMPILER_SHA256_SUPPORTED SHA256_FUNC_UPDATE_BLOCKS f, f_hw; f = Sha256_UpdateBlocks; f_hw = NULL; - #ifdef MY_CPU_X86_OR_AMD64 - #ifndef USE_MY_MM +#ifdef MY_CPU_X86_OR_AMD64 if (CPU_IsSupported_SHA() && CPU_IsSupported_SSSE3() - // && CPU_IsSupported_SSE41() ) - #endif - #else +#else if (CPU_IsSupported_SHA2()) - #endif +#endif { // printf("\n========== HW SHA256 ======== \n"); f = f_hw = Sha256_UpdateBlocks_HW; } g_SHA256_FUNC_UPDATE_BLOCKS = f; g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; - #endif +#endif } +#undef U64C +#undef K #undef S0 #undef S1 #undef s0 diff --git a/3rdparty/lzma/src/Sha256Opt.c b/3rdparty/lzma/src/Sha256Opt.c index eb38166646..1c6b50f8d3 100644 --- a/3rdparty/lzma/src/Sha256Opt.c +++ 
b/3rdparty/lzma/src/Sha256Opt.c @@ -1,18 +1,11 @@ /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Compiler.h" #include "CpuArch.h" -#if defined(_MSC_VER) -#if (_MSC_VER < 1900) && (_MSC_VER >= 1200) -// #define USE_MY_MM -#endif -#endif - // #define Z7_USE_HW_SHA_STUB // for debug - #ifdef MY_CPU_X86_OR_AMD64 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check #define USE_HW_SHA @@ -20,19 +13,14 @@ || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) #define USE_HW_SHA - #if !defined(_INTEL_COMPILER) + #if !defined(__INTEL_COMPILER) // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) #if !defined(__SHA__) || !defined(__SSSE3__) #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) #endif #endif #elif defined(_MSC_VER) - #ifdef USE_MY_MM - #define USE_VER_MIN 1300 - #else - #define USE_VER_MIN 1900 - #endif - #if (_MSC_VER >= USE_VER_MIN) + #if (_MSC_VER >= 1900) #define USE_HW_SHA #else #define Z7_USE_HW_SHA_STUB @@ -47,23 +35,20 @@ // #pragma message("Sha256 HW") + + + // sse/sse2/ssse3: #include // sha*: #include #if defined (__clang__) && defined(_MSC_VER) - // #if !defined(__SSSE3__) - // #endif #if !defined(__SHA__) #include #endif #else -#ifdef USE_MY_MM -#include "My_mm.h" -#endif - #endif /* @@ -91,60 +76,44 @@ SHA: extern MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; - #define K SHA256_K_ARRAY #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); -#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); - +#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); #define LOAD_SHUFFLE(m, k) \ m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ m = _mm_shuffle_epi8(m, mask); \ -#define SM1(g0, g1, g2, g3) \ - SHA256_MSG1(g3, g0); \ +#define NNN(m0, m1, m2, m3) -#define SM2(g0, g1, g2, g3) \ - tmp = _mm_alignr_epi8(g1, g0, 4); \ - ADD_EPI32(g2, tmp) \ - SHA25G_MSG2(g2, g1); \ - -// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) -// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1) - - -#define NNN(g0, g1, g2, g3) +#define SM1(m1, m2, m3, m0) \ + SHA256_MSG1(m0, m1); \ +#define SM2(m2, m3, m0, m1) \ + ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \ + SHA256_MSG2(m0, m3); \ #define RND2(t0, t1) \ t0 = _mm_sha256rnds2_epu32(t0, t1, msg); -#define RND2_0(m, k) \ - msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \ + + +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \ RND2(state0, state1); \ msg = _mm_shuffle_epi32(msg, 0x0E); \ - - -#define RND2_1 \ + OP0(m0, m1, m2, m3) \ RND2(state1, state0); \ - - -// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 - -#define R4(k, g0, g1, g2, g3, OP0, OP1) \ - RND2_0(g0, k) \ - OP0(g0, g1, g2, g3) \ - RND2_1 \ - OP1(g0, g1, g2, g3) \ + OP1(m0, m1, m2, m3) \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ - R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ - R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ - R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ - R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ + R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3,m0,m1,m2, OP6, 
OP7 ) \ #define PREPARE_STATE \ tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ @@ -161,8 +130,9 @@ ATTRIB_SHA void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) { const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); - __m128i tmp; - __m128i state0, state1; + + + __m128i tmp, state0, state1; if (numBlocks == 0) return; @@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #define _ARM_USE_NEW_NEON_INTRINSICS #endif - - - - #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else - - - - - - - - #if defined(__clang__) && __clang_major__ < 16 #if !defined(__ARM_FEATURE_SHA2) && \ !defined(__ARM_FEATURE_CRYPTO) @@ -324,41 +282,70 @@ typedef uint32x4_t v128; // typedef __n128 v128; // MSVC #ifdef MY_CPU_BE - #define MY_rev32_for_LE(x) + #define MY_rev32_for_LE(x) x #else - #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) + #define MY_rev32_for_LE(x) vrev32q_u8(x) #endif -#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) -#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) +#if 1 // 0 for debug +// for arm32: it works slower by some reason than direct code +/* +for arm32 it generates: +MSVC-2022, GCC-9: + vld1.32 {d18,d19}, [r10] + vst1.32 {d4,d5}, [r3] + vld1.8 {d20-d21}, [r4] +there is no align hint (like [r10:128]). So instruction allows unaligned access +*/ +#define LOAD_128_32(_p) vld1q_u32(_p) +#define LOAD_128_8(_p) vld1q_u8 (_p) +#define STORE_128_32(_p, _v) vst1q_u32(_p, _v) +#else +/* +for arm32: +MSVC-2022: + vldm r10,{d18,d19} + vstm r3,{d4,d5} + does it require strict alignment? +GCC-9: + vld1.64 {d30-d31}, [r0:64] + vldr d28, [r0, #16] + vldr d29, [r0, #24] + vst1.64 {d30-d31}, [r0:64] + vstr d28, [r0, #16] + vstr d29, [r0, #24] +there is hint [r0:64], so does it requires 64-bit alignment. +*/ +#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p)) +#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p)) +#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v) +#endif #define LOAD_SHUFFLE(m, k) \ - m = LOAD_128((data + (k) * 16)); \ - MY_rev32_for_LE(m); \ + m = vreinterpretq_u32_u8( \ + MY_rev32_for_LE( \ + LOAD_128_8(data + (k) * 16))); \ // K array must be aligned for 16-bytes at least. 
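/*
A minimal standalone sketch of what the LOAD_SHUFFLE macro above does on
little-endian targets, assuming <arm_neon.h>; the helper name load_be32x4()
is illustrative only and is not part of the upstream sources.
*/
#include <arm_neon.h>
#include <stdint.h>

/* Load 16 message bytes and convert the four big-endian 32-bit words to
   native lane order, as LOAD_SHUFFLE does via LOAD_128_8 + MY_rev32_for_LE
   + vreinterpretq_u32_u8. */
static inline uint32x4_t load_be32x4(const uint8_t *p)
{
  uint8x16_t b = vld1q_u8(p);      /* unaligned 16-byte load of message data */
  b = vrev32q_u8(b);               /* byte-swap inside each 32-bit lane      */
  return vreinterpretq_u32_u8(b);
}
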
extern MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; - #define K SHA256_K_ARRAY - #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); -#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); +#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); -#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) -#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) -#define NNN(g0, g1, g2, g3) +#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0) +#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1) +#define NNN(m0, m1, m2, m3) - -#define R4(k, g0, g1, g2, g3, OP0, OP1) \ - msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \ +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \ tmp = state0; \ state0 = vsha256hq_u32( state0, state1, msg ); \ state1 = vsha256h2q_u32( state1, tmp, msg ); \ - OP0(g0, g1, g2, g3); \ - OP1(g0, g1, g2, g3); \ + OP0(m0, m1, m2, m3); \ + OP1(m0, m1, m2, m3); \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ @@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ if (numBlocks == 0) return; - state0 = LOAD_128(&state[0]); - state1 = LOAD_128(&state[4]); + state0 = LOAD_128_32(&state[0]); + state1 = LOAD_128_32(&state[4]); do { @@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ } while (--numBlocks); - STORE_128(&state[0], state0); - STORE_128(&state[4], state1); + STORE_128_32(&state[0], state0); + STORE_128_32(&state[4], state1); } #endif // USE_HW_SHA @@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #endif - #undef K #undef RND2 -#undef RND2_0 -#undef RND2_1 - #undef MY_rev32_for_LE + #undef NNN #undef LOAD_128 #undef STORE_128 @@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #undef SM1 #undef SM2 -#undef NNN + #undef R4 #undef R16 #undef PREPARE_STATE diff --git a/3rdparty/lzma/src/Sort.c b/3rdparty/lzma/src/Sort.c index e1097e3806..20e3e69dc1 100644 --- a/3rdparty/lzma/src/Sort.c +++ b/3rdparty/lzma/src/Sort.c @@ -1,141 +1,268 @@ /* Sort.c -- Sort functions -2014-04-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Sort.h" +#include "CpuArch.h" -#define HeapSortDown(p, k, size, temp) \ - { for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && p[s + 1] > p[s]) s++; \ - if (temp >= p[s]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } +#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ + ) +// the code with prefetch is slow for small arrays on x86. +// So we disable prefetch for x86. +#ifndef MY_CPU_X86 + // #pragma message("Z7_PREFETCH : __builtin_prefetch") + #define Z7_PREFETCH(a) __builtin_prefetch((a)) +#endif -void HeapSort(UInt32 *p, size_t size) -{ - if (size <= 1) - return; - p--; - { - size_t i = size / 2; - do - { - UInt32 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do - { - size_t k = 1; - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt32 temp = p[size]; - size_t k = (p[3] > p[2]) ? 
3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) - } - { - UInt32 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) - { - p[1] = p[2]; - p[2] = temp; - } - else - p[1] = temp; - } -} +#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) -void HeapSort64(UInt64 *p, size_t size) -{ - if (size <= 1) - return; - p--; - { - size_t i = size / 2; - do - { - UInt64 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do - { - size_t k = 1; - UInt64 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt64 temp = p[size]; - size_t k = (p[3] > p[2]) ? 3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) - } - { - UInt64 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) - { - p[1] = p[2]; - p[2] = temp; - } - else - p[1] = temp; - } -} +#include "7zWindows.h" + +// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. +// For example, clang-cl can generate "prefetcht2" instruction for +// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. +// But we want to generate "prefetcht0" instruction. +// So for CLANG/GCC we must use __builtin_prefetch() in code branch above +// instead of PreFetchCacheLine() / _mm_prefetch(). + +// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. +// But old x86 cpus don't support "prefetcht0". +// So we will use PreFetchCacheLine(), only if we are sure that +// generated instruction is supported by all cpus of that isa. +#if defined(MY_CPU_AMD64) \ + || defined(MY_CPU_ARM64) \ + || defined(MY_CPU_IA64) +// we need to use additional braces for (a) in PreFetchCacheLine call, because +// PreFetchCacheLine macro doesn't use braces: +// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) + // #pragma message("Z7_PREFETCH : PreFetchCacheLine") + #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) +#endif + +#endif // _WIN32 + + +#define PREFETCH_NO(p,k,s,size) + +#ifndef Z7_PREFETCH + #define SORT_PREFETCH(p,k,s,size) +#else + +// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes +#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) +// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) + +#if PREFETCH_LEVEL == 0 + + #define SORT_PREFETCH(p,k,s,size) + +#else // PREFETCH_LEVEL != 0 /* -#define HeapSortRefDown(p, vals, n, size, temp) \ - { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ - if (val >= vals[p[s]]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } +if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we prefetch one value per cache line. + Use it if array is aligned for cache line size (64 bytes) + or if array is small (less than L1 cache size). -void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) +if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we perfetch all cache lines that can be required. + it can be faster for big unaligned arrays. 
+*/ + #define USE_PREFETCH_FOR_ALIGNED_ARRAY + +// s == k * 2 +#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) + // x86 supports (lea r1*8+offset) + #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) +#else + #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) +#endif + +#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + #define PREFETCH_ADD_OFFSET 0 +#else + // last offset that can be reqiured in PREFETCH_LEVEL step: + #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) + #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 +#endif + +#if PREFETCH_LEVEL <= 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#else // PREFETCH_LEVEL > 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - 16)); \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#endif // PREFETCH_LEVEL > 3 +#endif // PREFETCH_LEVEL != 0 +#endif // Z7_PREFETCH + + +#if defined(MY_CPU_ARM64) \ + /* || defined(MY_CPU_AMD64) */ \ + /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ + // we want to use cmov, if cmov is very fast: + // - this cmov version is slower for clang-x64. + // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. + #define Z7_FAST_CMOV_SUPPORTED +#endif + +#ifdef Z7_FAST_CMOV_SUPPORTED + // we want to use cmov here, if cmov is fast: new arm64 cpus. + // we want the compiler to use conditional move for this branch + #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; +#else + // use this branch, if cpu doesn't support fast conditional move. + // it uses slow array access reading: + #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; +#endif + +#define HeapSortDown(p, k, size, temp, macro_prefetch) \ +{ \ + for (;;) { \ + UInt32 n0, n1; \ + size_t s = k * 2; \ + if (s >= size) { \ + if (s == size) { \ + n0 = p[s]; \ + p[k] = n0; \ + if (temp < n0) k = s; \ + } \ + break; \ + } \ + n0 = p[k * 2]; \ + n1 = p[k * 2 + 1]; \ + s += n0 < n1; \ + GET_MAX_VAL(n0, n1, p[s]) \ + if (temp >= n0) break; \ + macro_prefetch(p, k, s, size) \ + p[k] = n0; \ + k = s; \ + } \ + p[k] = temp; \ +} + + +/* +stage-1 : O(n) : + we generate intermediate partially sorted binary tree: + p[0] : it's additional item for better alignment of tree structure in memory. + p[1] + p[2] p[3] + p[4] p[5] p[6] p[7] + ... + p[x] >= p[x * 2] + p[x] >= p[x * 2 + 1] + +stage-2 : O(n)*log2(N): + we move largest item p[0] from head of tree to the end of array + and insert last item to sorted binary tree. 
+*/ + +// (p) must be aligned for cache line size (64-bytes) for best performance + +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) { - if (size <= 1) + if (size < 2) return; - p--; + if (size == 2) { - size_t i = size / 2; + const UInt32 a0 = p[0]; + const UInt32 a1 = p[1]; + const unsigned k = a1 < a0; + p[k] = a0; + p[k ^ 1] = a1; + return; + } + { + // stage-1 : O(n) + // we transform array to partially sorted binary tree. + size_t i = --size / 2; + // (size) now is the index of the last item in tree, + // if (i) + { + do + { + const UInt32 temp = p[i]; + size_t k = i; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } + while (--i); + } + { + const UInt32 temp = p[0]; + const UInt32 a1 = p[1]; + if (temp < a1) + { + size_t k = 1; + p[0] = a1; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } + } + } + + if (size < 3) + { + // size == 2 + const UInt32 a0 = p[0]; + p[0] = p[2]; + p[2] = a0; + return; + } + if (size != 3) + { + // stage-2 : O(size) * log2(size): + // we move largest item p[0] from head to the end of array, + // and insert last item to sorted binary tree. do { - UInt32 temp = p[i]; - HeapSortRefDown(p, vals, i, size, temp); + const UInt32 temp = p[size]; + size_t k = p[2] < p[3] ? 3 : 2; + p[size--] = p[0]; + p[0] = p[1]; + p[1] = p[k]; + HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO } - while (--i != 0); + while (size != 3); } - do { - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortRefDown(p, vals, 1, size, temp); + const UInt32 a2 = p[2]; + const UInt32 a3 = p[3]; + const size_t k = a2 < a3; + p[2] = p[1]; + p[3] = p[0]; + p[k] = a3; + p[k ^ 1] = a2; } - while (size > 1); } -*/ diff --git a/3rdparty/lzma/src/Threads.c b/3rdparty/lzma/src/Threads.c index 464efeca49..177d1d9343 100644 --- a/3rdparty/lzma/src/Threads.c +++ b/3rdparty/lzma/src/Threads.c @@ -1,5 +1,5 @@ /* Threads.c -- multithreading library -2024-03-28 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p) return (res != 0 ? res : res2); } +typedef struct MY_PROCESSOR_NUMBER { + WORD Group; + BYTE Number; + BYTE Reserved; +} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER; + +typedef struct MY_GROUP_AFFINITY { +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000) + // KAFFINITY is not defined in old mingw + ULONG_PTR +#else + KAFFINITY +#endif + Mask; + WORD Group; + WORD Reserved[3]; +} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY; + +typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)( + HANDLE hThread, + CONST MY_GROUP_AFFINITY *GroupAffinity, + MY_PGROUP_AFFINITY PreviousGroupAffinity); + +typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)( + HANDLE hThread, + MY_PGROUP_AFFINITY GroupAffinity); + +typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)( + HANDLE hProcess, + PUSHORT GroupCount, + PUSHORT GroupArray); + +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + +#if 0 +#include +#define PRF(x) x +/* +-- + before call of SetThreadGroupAffinity() + GetProcessGroupAffinity return one group. + after call of SetThreadGroupAffinity(): + GetProcessGroupAffinity return more than group, + if SetThreadGroupAffinity() was to another group. +-- + GetProcessAffinityMask MS DOCs: + { + If the calling process contains threads in multiple groups, + the function returns zero for both affinity masks. + } + but tests in win10 with 2 groups (less than 64 cores total): + GetProcessAffinityMask() still returns non-zero affinity masks + even after SetThreadGroupAffinity() calls. 
+*/ +static void PrintProcess_Info() +{ + { + const + Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity = + (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetProcessGroupAffinity"); + if (fn_GetProcessGroupAffinity) + { + unsigned i; + USHORT GroupCounts[64]; + USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts); + BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(), + &GroupCount, GroupCounts); + printf("\n====== GetProcessGroupAffinity : " + "boolRes=%u GroupCounts = %u :", + boolRes, (unsigned)GroupCount); + for (i = 0; i < GroupCount; i++) + printf(" %u", GroupCounts[i]); + printf("\n"); + } + } + { + DWORD_PTR processAffinityMask, systemAffinityMask; + if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask)) + { + PRF(printf("\n====== GetProcessAffinityMask : " + ": processAffinityMask=%x, systemAffinityMask=%x\n", + (UInt32)processAffinityMask, (UInt32)systemAffinityMask);) + } + else + printf("\n==GetProcessAffinityMask FAIL"); + } +} +#else +#ifndef USE_THREADS_CreateThread +// #define PRF(x) +#endif +#endif + WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) { /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ @@ -72,7 +166,43 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) unsigned threadId; *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); - + +#if 0 // 1 : for debug + { + DWORD_PTR prevMask; + DWORD_PTR affinity = 1 << 0; + prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity); + prevMask = prevMask; + } +#endif +#if 0 // 1 : for debug + { + /* win10: new thread will be created in same group that is assigned to parent thread + but affinity mask will contain all allowed threads of that group, + even if affinity mask of parent group is not full + win11: what group it will be created, if we have set + affinity of parent thread with ThreadGroupAffinity? + */ + const + Func_GetThreadGroupAffinity fn = + (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetThreadGroupAffinity"); + if (fn) + { + // BOOL wres2; + MY_GROUP_AFFINITY groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + /* wres2 = */ fn(*p, &groupAffinity); + PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): " + "wres2_BOOL = %u, group=%u mask=%x\n", + GetCurrentThreadId(), + wres2, + groupAffinity.Group, + (UInt32)groupAffinity.Mask);) + } + } +#endif + #endif /* maybe we must use errno here, but probably GetLastError() is also OK. */ @@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param */ } { - DWORD prevSuspendCount = ResumeThread(h); + const DWORD prevSuspendCount = ResumeThread(h); + /* ResumeThread() returns: + 0 : was_not_suspended + 1 : was_resumed + -1 : error + */ + if (prevSuspendCount == (DWORD)-1) + wres = GetError(); + } + } + + /* maybe we must use errno here, but probably GetLastError() is also OK. 
*/ + return wres; + + #endif +} + + +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask) +{ +#ifdef USE_THREADS_CreateThread + + UNUSED_VAR(group) + UNUSED_VAR(affinityMask) + return Thread_Create(p, func, param); + +#else + + /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + HANDLE h; + WRes wres; + unsigned threadId; + h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); + *p = h; + wres = HandleToWRes(h); + if (h) + { + // PrintProcess_Info(); + { + const + Func_SetThreadGroupAffinity fn = + (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "SetThreadGroupAffinity"); + if (fn) + { + // WRes wres2; + MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + // groupAffinity.Mask must use only bits that supported by current group + // (groupAffinity.Mask = 0) means all allowed bits + groupAffinity.Mask = affinityMask; + groupAffinity.Group = (WORD)group; + // wres2 = + fn(h, &groupAffinity, &prev_groupAffinity); + /* + if (groupAffinity.Group == prev_groupAffinity.Group) + wres2 = wres2; + else + wres2 = wres2; + if (wres2 == 0) + { + wres2 = GetError(); + PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);) + } + else + { + PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()" + " threadId = %6u" + " group=%u mask=%x\n", + threadId, + prev_groupAffinity.Group, + (UInt32)prev_groupAffinity.Mask);) + } + */ + } + } + { + const DWORD prevSuspendCount = ResumeThread(h); /* ResumeThread() returns: 0 : was_not_suspended 1 : was_resumed @@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) return Thread_Create_With_CpuSet(p, func, param, NULL); } +/* +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity) +{ + UNUSED_VAR(group) + return Thread_Create_With_Affinity(p, func, param, affinity); +} +*/ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) { @@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) return AutoResetEvent_CreateNotSignaled(p); } +void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup) +{ + // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup); + if (numGroups == 0) + numGroups = 1; + p->NumGroups = numGroups; + p->NextGroup = startGroup % numGroups; +} + + +UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p) +{ + const UInt32 next = p->NextGroup; + p->NextGroup = (next + 1) % p->NumGroups; + return next; +} + #undef PRF #undef Print diff --git a/3rdparty/lzma/src/XzCrc64Opt.c b/3rdparty/lzma/src/XzCrc64Opt.c index 0c1fc2ffec..6eea4a3b6a 100644 --- a/3rdparty/lzma/src/XzCrc64Opt.c +++ b/3rdparty/lzma/src/XzCrc64Opt.c @@ -1,5 +1,5 @@ /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) -2023-12-08 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) v = Q32BE(1, w1) ^ Q32BE(0, w0); v ^= Q32BE(3, d1) ^ Q32BE(2, d0); #endif -#elif +#else #error Stop_Compiling_Bad_CRC64_NUM_TABLES #endif p += Z7_CRC64_NUM_TABLES_USE; diff --git a/3rdparty/lzma/src/XzDec.c b/3rdparty/lzma/src/XzDec.c index 3d1c98e631..2dac3247f9 100644 --- a/3rdparty/lzma/src/XzDec.c +++ 
b/3rdparty/lzma/src/XzDec.c @@ -1,5 +1,5 @@ /* XzDec.c -- Xz Decode -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) for (i = 0; i < limit;) { - Byte b = p[i]; + const unsigned b = p[i]; *value |= (UInt64)(b & 0x7F) << (7 * i++); if ((b & 0x80) == 0) return (b == 0 && i != 1) ? 0 : i; @@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf) static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) { - return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) - && GetUi32(buf) == CrcCalc(buf + 4, 6) - && flags == GetBe16(buf + 8) - && buf[10] == XZ_FOOTER_SIG_0 - && buf[11] == XZ_FOOTER_SIG_1; + return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2) + && GetUi32a(buf) == CrcCalc(buf + 4, 6) + && flags == GetBe16a(buf + 8) + && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)); } #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ @@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); p->indexPos = p->indexPreSize; p->indexSize += p->indexPreSize; - Sha256_Final(&p->sha, p->shaDigest); + Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32); Sha256_Init(&p->sha); p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); p->state = XZ_STATE_STREAM_INDEX; @@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, break; } { - Byte digest[XZ_CHECK_SIZE_MAX]; + UInt32 digest32[XZ_CHECK_SIZE_MAX / 4]; p->state = XZ_STATE_BLOCK_HEADER; p->pos = 0; - if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0) + if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0) return SZ_ERROR_CRC; if (p->decodeOnlyOneBlock) { @@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - Byte digest[SHA256_DIGEST_SIZE]; + UInt32 digest32[SHA256_DIGEST_SIZE / 4]; p->state = XZ_STATE_STREAM_INDEX_CRC; p->indexSize += 4; p->pos = 0; - Sha256_Final(&p->sha, digest); - if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) + Sha256_Final(&p->sha, (void *)digest32); + if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0) return SZ_ERROR_CRC; } } @@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, const Byte *ptr = p->buf; p->state = XZ_STATE_STREAM_FOOTER; p->pos = 0; - if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr)) + if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr)) return SZ_ERROR_CRC; } break; @@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, { if (*src != 0) { - if (((UInt32)p->padSize & 3) != 0) + if ((unsigned)p->padSize & 3) return SZ_ERROR_NO_ARCHIVE; p->pos = 0; p->state = XZ_STATE_STREAM_HEADER; diff --git a/3rdparty/lzma/src/XzEnc.c b/3rdparty/lzma/src/XzEnc.c index c1affadfa6..e40f0c88eb 100644 --- a/3rdparty/lzma/src/XzEnc.c +++ b/3rdparty/lzma/src/XzEnc.c @@ -1,5 +1,5 @@ /* XzEnc.c -- Xz Encode -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size) } } +Z7_FORCE_INLINE static void SeqInFilter_Construct(CSeqInFilter *p) { p->buf = NULL; @@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p) p->vt.Read = SeqInFilter_Read; } +Z7_FORCE_INLINE static void 
SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) { if (p->StateCoder.p) @@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p) void XzProps_Init(CXzProps *p) { p->checkId = XZ_CHECK_CRC32; + p->numThreadGroups = 0; p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; @@ -689,6 +692,7 @@ typedef struct } CLzma2WithFilters; +Z7_FORCE_INLINE static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) { p->lzma2 = NULL; @@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz } +Z7_FORCE_INLINE static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) { #ifdef USE_SUBBLOCK @@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in } p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; + p->mtCoder.numThreadGroups = props->numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; RINOK(MtCoder_Code(&p->mtCoder)) diff --git a/3rdparty/lzma/src/XzIn.c b/3rdparty/lzma/src/XzIn.c index b68af965c1..ba316360d9 100644 --- a/3rdparty/lzma/src/XzIn.c +++ b/3rdparty/lzma/src/XzIn.c @@ -1,38 +1,39 @@ /* XzIn.c - Xz input -2023-09-07 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include #include "7zCrc.h" -#include "CpuArch.h" #include "Xz.h" +#include "CpuArch.h" -/* -#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) -*/ -#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1) - +#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \ + (GetUi16a((const Byte *)(const void *)(p) + 10) == \ + (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8))) SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) { - Byte sig[XZ_STREAM_HEADER_SIZE]; + UInt32 data32[XZ_STREAM_HEADER_SIZE / 4]; size_t processedSize = XZ_STREAM_HEADER_SIZE; - RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize)) + RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize)) if (processedSize != XZ_STREAM_HEADER_SIZE - || memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) + || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0) return SZ_ERROR_NO_ARCHIVE; - return Xz_ParseHeader(p, sig); + return Xz_ParseHeader(p, (const Byte *)(const void *)data32); } -#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ +#define READ_VARINT_AND_CHECK(buf, size, res) \ +{ const unsigned s = Xz_ReadVarInt(buf, size, res); \ if (s == 0) return SZ_ERROR_ARCHIVE; \ - pos += s; } + size -= s; \ + buf += s; \ +} SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) { + MY_ALIGN(4) Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; unsigned headerSize; *headerSizeRes = 0; @@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, return XzBlock_Parse(p, header); } + #define ADD_SIZE_CHECK(size, val) \ - { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } +{ const UInt64 newSize = size + (val); \ + if (newSize < size) return XZ_SIZE_OVERFLOW; \ + size = newSize; \ +} UInt64 Xz_GetUnpackSize(const CXzStream *p) { @@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p) return size; } -/* -SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream) -{ - return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f)); -} -*/ -static SRes Xz_ReadIndex2(CXzStream *p, const Byte 
*buf, size_t size, ISzAllocPtr alloc) +// input; +// CXzStream (p) is empty object. +// size != 0 +// (size & 3) == 0 +// (buf) is aligned for at least 4 bytes. +// output: +// p->numBlocks is number of allocated items in p->blocks +// p->blocks[*] values must be ignored, if function returns error. +static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) { - size_t numBlocks, pos = 1; - UInt32 crc; - + size_t numBlocks; if (size < 5 || buf[0] != 0) return SZ_ERROR_ARCHIVE; - size -= 4; - crc = CrcCalc(buf, size); - if (crc != GetUi32(buf + size)) - return SZ_ERROR_ARCHIVE; - + { + const UInt32 crc = CrcCalc(buf, size); + if (crc != GetUi32a(buf + size)) + return SZ_ERROR_ARCHIVE; + } + buf++; + size--; { UInt64 numBlocks64; - READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64) + READ_VARINT_AND_CHECK(buf, size, &numBlocks64) + // (numBlocks64) is 63-bit value, so we can calculate (numBlocks64 * 2): + if (numBlocks64 * 2 > size) + return SZ_ERROR_ARCHIVE; + if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes)) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE numBlocks = (size_t)numBlocks64; - if (numBlocks != numBlocks64 || numBlocks * 2 > size) - return SZ_ERROR_ARCHIVE; } - - Xz_Free(p, alloc); - if (numBlocks != 0) + // Xz_Free(p, alloc); // it's optional, because (p) is empty already + if (numBlocks) { - size_t i; - p->numBlocks = numBlocks; - p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); - if (!p->blocks) + CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); + if (!blocks) return SZ_ERROR_MEM; - for (i = 0; i < numBlocks; i++) + p->blocks = blocks; + p->numBlocks = numBlocks; + // the caller will call Xz_Free() in case of error + do { - CXzBlockSizes *block = &p->blocks[i]; - READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize) - READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize) - if (block->totalSize == 0) + READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize) + READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize) + if (blocks->totalSize == 0) return SZ_ERROR_ARCHIVE; + blocks++; } + while (--numBlocks); } - while ((pos & 3) != 0) - if (buf[pos++] != 0) + if (size >= 4) + return SZ_ERROR_ARCHIVE; + while (size) + if (buf[--size]) return SZ_ERROR_ARCHIVE; - return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; + return SZ_OK; } + +/* static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) { SRes res; size_t size; Byte *buf; - if (indexSize > ((UInt32)1 << 31)) - return SZ_ERROR_UNSUPPORTED; + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE size = (size_t)indexSize; - if (size != indexSize) - return SZ_ERROR_UNSUPPORTED; buf = (Byte *)ISzAlloc_Alloc(alloc, size); if (!buf) return SZ_ERROR_MEM; res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); if (res == SZ_OK) - res = Xz_ReadIndex2(p, buf, size, alloc); + res = Xz_ParseIndex(p, buf, size, alloc); ISzAlloc_Free(alloc, buf); return res; } +*/ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) { @@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ } + +/* +in: + (*startOffset) is position in (stream) where xz_stream must be finished. 
+out: + if returns SZ_OK, then (*startOffset) is position in stream that shows start of xz_stream. +*/ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) { - UInt64 indexSize; - Byte buf[XZ_STREAM_FOOTER_SIZE]; + #define TEMP_BUF_SIZE (1 << 10) + UInt32 buf32[TEMP_BUF_SIZE / 4]; UInt64 pos = (UInt64)*startOffset; - if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) + if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; - pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) { - UInt32 total = 0; pos += XZ_STREAM_FOOTER_SIZE; - for (;;) { - size_t i; - #define TEMP_BUF_SIZE (1 << 10) - Byte temp[TEMP_BUF_SIZE]; - - i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos; + // pos != 0 + // (pos & 3) == 0 + size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos; pos -= i; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)) - total += (UInt32)i; - for (; i != 0; i--) - if (temp[i - 1] != 0) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i)) + i /= 4; + do + if (buf32[i - 1] != 0) break; - if (i != 0) - { - if ((i & 3) != 0) - return SZ_ERROR_NO_ARCHIVE; - pos += i; - break; - } - if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16)) + while (--i); + + pos += i * 4; + #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16) + // here we don't support rare case with big padding for xz stream. + // so we have padding limit for backward reading. + if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX) return SZ_ERROR_NO_ARCHIVE; + if (i) + break; } - + // we try to open xz stream after skipping zero padding. + // ((UInt64)*startOffset == pos) is possible here! if (pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) return SZ_ERROR_NO_ARCHIVE; } - p->flags = (CXzStreamFlags)GetBe16(buf + 8); - + p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2); if (!XzFlags_IsSupported(p->flags)) return SZ_ERROR_UNSUPPORTED; - { /* to eliminate GCC 6.3 warning: dereferencing type-punned pointer will break strict-aliasing rules */ - const Byte *buf_ptr = buf; - if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6)) + const UInt32 *buf_ptr = buf32; + if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6)) return SZ_ERROR_ARCHIVE; } - - indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2; - - if (pos < indexSize) - return SZ_ERROR_ARCHIVE; - - pos -= indexSize; - RINOK(LookInStream_SeekTo(stream, pos)) - RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) - { - UInt64 totalSize = Xz_GetPackSize(p); - if (totalSize == XZ_SIZE_OVERFLOW - || totalSize >= ((UInt64)1 << 63) - || pos < totalSize + XZ_STREAM_HEADER_SIZE) + const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2; + if (pos < indexSize) return SZ_ERROR_ARCHIVE; - pos -= (totalSize + XZ_STREAM_HEADER_SIZE); + pos -= indexSize; + // v25.00: relaxed indexSize check. We allow big index table. 
+ // if (indexSize > ((UInt32)1 << 31)) + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE + RINOK(LookInStream_SeekTo(stream, pos)) + // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) + { + SRes res; + const size_t size = (size_t)indexSize; + // if (size != indexSize) return SZ_ERROR_UNSUPPORTED; + Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size); + if (!buf) + return SZ_ERROR_MEM; + res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); + if (res == SZ_OK) + res = Xz_ParseIndex(p, buf, size, alloc); + ISzAlloc_Free(alloc, buf); + RINOK(res) + } + } + { + UInt64 total = Xz_GetPackSize(p); + if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63)) + return SZ_ERROR_ARCHIVE; + total += XZ_STREAM_HEADER_SIZE; + if (pos < total) + return SZ_ERROR_ARCHIVE; + pos -= total; RINOK(LookInStream_SeekTo(stream, pos)) *startOffset = (Int64)pos; } @@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO CSecToRead secToRead; SecToRead_CreateVTable(&secToRead); secToRead.realStream = stream; - RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; } @@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO void Xzs_Construct(CXzs *p) { - p->num = p->numAllocated = 0; - p->streams = 0; + Xzs_CONSTRUCT(p) } void Xzs_Free(CXzs *p, ISzAllocPtr alloc) @@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc) Xz_Free(&p->streams[i], alloc); ISzAlloc_Free(alloc, p->streams); p->num = p->numAllocated = 0; - p->streams = 0; + p->streams = NULL; } UInt64 Xzs_GetNumBlocks(const CXzs *p) @@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p) SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) { Int64 endOffset = 0; + // it's supposed that CXzs object is empty here. + // if CXzs object is not empty, it will add new streams to that non-empty object. + // Xzs_Free(p, alloc); // it's optional call to empty CXzs object. RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) *startOffset = endOffset; for (;;) { CXzStream st; SRes res; - Xz_Construct(&st); + Xz_CONSTRUCT(&st) res = Xz_ReadBackward(&st, stream, startOffset, alloc); + // if (res == SZ_OK), then (*startOffset) is start offset of new stream if + // if (res != SZ_OK), then (*startOffset) is unchend or it's expected start offset of stream with error st.startOffset = (UInt64)*startOffset; - RINOK(res) + // we must store (st) object to array, or we must free (st) local object. + if (res != SZ_OK) + { + Xz_Free(&st, alloc); + return res; + } if (p->num == p->numAllocated) { const size_t newNum = p->num + p->num / 4 + 1; void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); if (!data) + { + Xz_Free(&st, alloc); return SZ_ERROR_MEM; + } p->numAllocated = newNum; if (p->num != 0) memcpy(data, p->streams, p->num * sizeof(CXzStream)); ISzAlloc_Free(alloc, p->streams); p->streams = (CXzStream *)data; } + // we use direct copying of raw data from local variable (st) to object in array. 
+ // so we don't need to call Xz_Free(&st, alloc) after copying and after p->num++ p->streams[p->num++] = st; if (*startOffset == 0) - break; - RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) + return SZ_OK; + // seek operation is optional: + // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) return SZ_ERROR_PROGRESS; } - return SZ_OK; }
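
/*
A minimal sketch of the 12-byte xz stream footer layout that Xz_ReadBackward()
above validates, assuming plain C99; the names rd32le() and parse_xz_footer()
are illustrative only, and the CRC verification step (upstream: CrcCalc() over
bytes 4..9 compared with the first 4 bytes) is left out here.
*/
#include <stdint.h>

static uint32_t rd32le(const uint8_t *p)
{
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8)
       | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* footer bytes: [0..3]  CRC32 of bytes 4..9 (little-endian)
                 [4..7]  backward-size field (little-endian)
                 [8..9]  stream flags (read big-endian by the code above)
                 [10..11] footer signature "YZ" */
static int parse_xz_footer(const uint8_t f[12], uint64_t *indexSize, unsigned *flags)
{
  if (f[10] != 'Y' || f[11] != 'Z')
    return 0;                                      /* bad footer signature   */
  /* the stored field is (real_index_size / 4) - 1, so recover the byte count */
  *indexSize = ((uint64_t)rd32le(f + 4) + 1) << 2;
  *flags = ((unsigned)f[8] << 8) | f[9];           /* matches GetBe16a(buf + 8) */
  return 1;
}
/* The flags value recovered here is what gets compared against the copy in the
   stream header, which is why the code above re-reads the header after seeking
   back past the index and packed blocks. */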