3rdparty: Update LZMA/7zipSDK to 25.00

JordanTheToaster authored on 2025-07-06 00:40:37 +01:00; committed by lightningterror
parent 97ea52a6c1
commit a14c8eb7d5
28 changed files with 1238 additions and 611 deletions

View File

@ -1,7 +1,7 @@
#define MY_VER_MAJOR 24
#define MY_VER_MINOR 8
#define MY_VER_MAJOR 25
#define MY_VER_MINOR 0
#define MY_VER_BUILD 0
#define MY_VERSION_NUMBERS "24.08"
#define MY_VERSION_NUMBERS "25.00"
#define MY_VERSION MY_VERSION_NUMBERS
#ifdef MY_CPU_NAME
@ -10,12 +10,12 @@
#define MY_VERSION_CPU MY_VERSION
#endif
#define MY_DATE "2024-08-11"
#define MY_DATE "2025-07-05"
#undef MY_COPYRIGHT
#undef MY_VERSION_COPYRIGHT_DATE
#define MY_AUTHOR_NAME "Igor Pavlov"
#define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov"
#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov"
#ifdef USE_COPYRIGHT_CR
#define MY_COPYRIGHT MY_COPYRIGHT_CR

View File

@ -1,5 +1,5 @@
/* Compiler.h : Compiler specific defines and pragmas
2024-01-22 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_COMPILER_H
#define ZIP7_INC_COMPILER_H
@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void);
#define Z7_ATTRIB_NO_VECTORIZE
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )")
#define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )")
#else
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
#define Z7_PRAGMA_OPTIMIZE_DEFAULT
#endif
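The pragma wrappers added above expand to MSVC's optimize pragma on MSVC 2019+ (Z7_MSC_VER_ORIGINAL >= 1920) and to nothing elsewhere. A sketch of how such wrappers are typically used; the helper function below is invented for the example and is not part of the SDK:

Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
static void PrintUsage_coldPath(void)   /* hypothetical rarely-executed helper */
{
  /* compiled for minimal size on MSVC; other compilers see plain code */
}
Z7_PRAGMA_OPTIMIZE_DEFAULT   /* restore the default optimization settings */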
#if defined(MY_CPU_X86_OR_AMD64) && ( \
defined(__clang__) && (__clang_major__ >= 4) \
|| defined(__GNUC__) && (__GNUC__ >= 5))

View File

@ -1,5 +1,5 @@
/* CpuArch.h -- CPU specific code
2024-06-17 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#ifndef ZIP7_INC_CPU_ARCH_H
#define ZIP7_INC_CPU_ARCH_H
@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#define MY_CPU_SIZEOF_POINTER 4
#endif
#if defined(__SSE2__) \
|| defined(MY_CPU_AMD64) \
|| defined(_M_IX86_FP) && (_M_IX86_FP >= 2)
#define MY_CPU_SSE2
#endif
#if defined(_M_ARM64) \
|| defined(_M_ARM64EC) \
@ -509,11 +515,19 @@ problem-4 : performace:
#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
#if 0
// Z7_BSWAP16 can be slow for x86-msvc
#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p)))
#else
#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16)
#endif
#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }
#if defined(MY_CPU_LE_UNALIGN_64)
#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); }
#endif
#else
@ -536,21 +550,39 @@ problem-4 : performace:
#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
#endif
#ifndef SetBe64
#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \
_ppp_[0] = (Byte)(_vvv_ >> 56); \
_ppp_[1] = (Byte)(_vvv_ >> 48); \
_ppp_[2] = (Byte)(_vvv_ >> 40); \
_ppp_[3] = (Byte)(_vvv_ >> 32); \
_ppp_[4] = (Byte)(_vvv_ >> 24); \
_ppp_[5] = (Byte)(_vvv_ >> 16); \
_ppp_[6] = (Byte)(_vvv_ >> 8); \
_ppp_[7] = (Byte)_vvv_; }
#endif
#ifndef GetBe16
#ifdef GetBe16_to32
#define GetBe16(p) ( (UInt16) GetBe16_to32(p))
#else
#define GetBe16(p) ( (UInt16) ( \
((UInt16)((const Byte *)(p))[0] << 8) | \
((const Byte *)(p))[1] ))
#endif
#endif
#if defined(MY_CPU_BE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_NATIVE_TO_BE_32(v) (v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8))
#elif defined(MY_CPU_LE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8))
#else
#error Stop_Compiling_Unknown_Endian_CONV
#endif
@ -589,6 +621,11 @@ problem-4 : performace:
#endif
#ifndef GetBe16_to32
#define GetBe16_to32(p) GetBe16(p)
#endif
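As a small orientation example for the byte-order helpers above (GetBe32 / GetBe16_to32 come from this header; the 6-byte field layout is invented for the example):

#include "CpuArch.h"

static UInt32 ReadTag_example(const Byte *buf)
{
  const UInt32 magic = GetBe32(buf);           /* bytes 0..3, big-endian */
  const UInt32 flags = GetBe16_to32(buf + 4);  /* bytes 4..5, zero-extended to 32-bit */
  return magic ^ flags;
}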
#if defined(MY_CPU_X86_OR_AMD64) \
|| defined(MY_CPU_ARM_OR_ARM64) \
|| defined(MY_CPU_PPC_OR_PPC64)
@ -617,6 +654,7 @@ BoolInt CPU_IsSupported_SSE2(void);
BoolInt CPU_IsSupported_SSSE3(void);
BoolInt CPU_IsSupported_SSE41(void);
BoolInt CPU_IsSupported_SHA(void);
BoolInt CPU_IsSupported_SHA512(void);
BoolInt CPU_IsSupported_PageGB(void);
#elif defined(MY_CPU_ARM_OR_ARM64)
@ -634,6 +672,7 @@ BoolInt CPU_IsSupported_SHA1(void);
BoolInt CPU_IsSupported_SHA2(void);
BoolInt CPU_IsSupported_AES(void);
#endif
BoolInt CPU_IsSupported_SHA512(void);
#endif

View File

@ -1,5 +1,5 @@
/* LzFindMt.h -- multithreaded Match finder for LZ algorithms
2024-01-22 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZ_FIND_MT_H
#define ZIP7_INC_LZ_FIND_MT_H
@ -12,8 +12,10 @@ EXTERN_C_BEGIN
typedef struct
{
UInt32 numProcessedBlocks;
CThread thread;
Int32 affinityGroup;
UInt64 affinityInGroup;
UInt64 affinity;
CThread thread;
BoolInt wasCreated;
BoolInt needStart;

View File

@ -18,6 +18,7 @@ typedef struct
int numBlockThreads_Reduced;
int numBlockThreads_Max;
int numTotalThreads;
unsigned numThreadGroups; // 0 : no groups
} CLzma2EncProps;
void Lzma2EncProps_Init(CLzma2EncProps *p);
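A sketch of how a caller opts in to the new field (thread counts are arbitrary; 0 keeps the old ungrouped behavior, as the comment above says):

#include "Lzma2Enc.h"

static void InitProps_example(CLzma2EncProps *props)
{
  Lzma2EncProps_Init(props);
  props->numTotalThreads = 16;   /* arbitrary */
  props->numThreadGroups = 2;    /* 0 : no groups (the default after Init) */
}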

View File

@ -1,5 +1,5 @@
/* LzmaEnc.h -- LZMA Encoder
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA_ENC_H
#define ZIP7_INC_LZMA_ENC_H
@ -29,11 +29,13 @@ typedef struct
int numThreads; /* 1 or 2, default = 2 */
// int _pad;
Int32 affinityGroup;
UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1.
Encoder uses this value to reduce dictionary size */
UInt64 affinity;
UInt64 affinityInGroup;
} CLzmaEncProps;
void LzmaEncProps_Init(CLzmaEncProps *p);
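The companion fields here let the match-finder threads be pinned to a Windows processor group; per the LzmaEnc.c hunk later in this commit, the defaults after LzmaEncProps_Init() are affinityGroup = -1 and affinityInGroup = 0. A sketch with made-up values:

#include "LzmaEnc.h"

static void InitAffinity_example(CLzmaEncProps *props)
{
  LzmaEncProps_Init(props);
  props->numThreads = 2;
  props->affinityGroup = 1;       /* -1 keeps the old ungrouped behavior */
  props->affinityInGroup = 0xFF;  /* CPU mask within the chosen group */
}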

View File

@ -1,5 +1,5 @@
/* MtCoder.h -- Multi-thread Coder
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_MT_CODER_H
#define ZIP7_INC_MT_CODER_H
@ -16,7 +16,7 @@ EXTERN_C_BEGIN
#ifndef Z7_ST
#define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
#define MTCODER_THREADS_MAX 64
#define MTCODER_THREADS_MAX 256
#define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3)
#else
#define MTCODER_THREADS_MAX 1
@ -77,6 +77,7 @@ typedef struct CMtCoder_
size_t blockSize; /* size of input block */
unsigned numThreadsMax;
unsigned numThreadGroups;
UInt64 expectedDataSize;
ISeqInStreamPtr inStream;
@ -125,6 +126,8 @@ typedef struct CMtCoder_
CMtProgress mtProgress;
CMtCoderBlock blocks[MTCODER_BLOCKS_MAX];
CMtCoderThread threads[MTCODER_THREADS_MAX];
CThreadNextGroup nextGroup;
} CMtCoder;
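With MTCODER_THREADS_MAX raised from 64 to 256, the derived block counts grow accordingly; working the macros above through:

/* MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) = 256 + 256/8 + 1 = 289
   MTCODER_BLOCKS_MAX                       = 289 + 3         = 292
   (previously, with 64 threads: 64 + 8 + 1 + 3 = 76 blocks)  */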

View File

@ -1,5 +1,5 @@
/* Sha256.h -- SHA-256 Hash
2023-04-02 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA256_H
#define ZIP7_INC_SHA256_H
@ -14,6 +14,9 @@ EXTERN_C_BEGIN
#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4)
#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4)
typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
/*
@ -31,10 +34,17 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt
*/
typedef struct
{
union
{
struct
{
SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
UInt64 _pad_2[2];
} vars;
UInt64 _pad_64bit[4];
void *_pad_align_ptr[2];
} v;
UInt32 state[SHA256_NUM_DIGEST_WORDS];
Byte buffer[SHA256_BLOCK_SIZE];
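The padding members of the union appear intended to keep this leading block the same size and alignment in 32-bit and 64-bit builds; fields are now reached through the nested names, as the Sha256.c hunks later in this commit show:

p->v.vars.func_UpdateBlocks = func;   /* previously p->func_UpdateBlocks */
p->v.vars.count = 0;                  /* previously p->count */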

View File

@ -1,5 +1,5 @@
/* Sort.h -- Sort functions
2023-03-05 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SORT_H
#define ZIP7_INC_SORT_H
@ -8,10 +8,7 @@
EXTERN_C_BEGIN
void HeapSort(UInt32 *p, size_t size);
void HeapSort64(UInt64 *p, size_t size);
/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */
void Z7_FASTCALL HeapSort(UInt32 *p, size_t size);
EXTERN_C_END
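HeapSort keeps its signature apart from the added Z7_FASTCALL, so callers are unchanged. A minimal usage example with arbitrary values:

#include "Sort.h"

static void HeapSort_example(void)
{
  UInt32 a[5] = { 42, 7, 7, 100, 1 };
  HeapSort(a, 5);   /* sorts in place, ascending: 1, 7, 7, 42, 100 */
}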

View File

@ -1,5 +1,5 @@
/* Threads.h -- multithreading library
2024-03-28 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_THREADS_H
#define ZIP7_INC_THREADS_H
@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
WRes Thread_Wait_Close(CThread *p);
#ifdef _WIN32
WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask);
#define Thread_Create_With_CpuSet(p, func, param, cs) \
Thread_Create_With_Affinity(p, func, param, *cs)
#else
WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet);
#endif
typedef struct
{
unsigned NumGroups;
unsigned NextGroup;
} CThreadNextGroup;
void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup);
unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p);
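CThreadNextGroup is the small helper the multi-thread coder uses to spread new worker threads across processor groups. Its implementation isn't shown in this excerpt, so the behavior sketched here (round-robin starting at startGroup) is inferred from the MtCoder.c call sites later in this commit:

CThreadNextGroup ng;
ThreadNextGroup_Init(&ng, 2, 0);               /* numGroups = 2, startGroup = 0 */
unsigned group = ThreadNextGroup_GetNext(&ng); /* expected: 0, then 1, 0, 1, ... */
/* group is then passed as the (group) argument of Thread_Create_With_Group() */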
#ifdef _WIN32

View File

@ -1,5 +1,5 @@
/* Xz.h - Xz interface
2024-01-26 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#ifndef ZIP7_INC_XZ_H
#define ZIP7_INC_XZ_H
@ -121,6 +121,7 @@ typedef struct
UInt64 startOffset;
} CXzStream;
#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; }
void Xz_Construct(CXzStream *p);
void Xz_Free(CXzStream *p, ISzAllocPtr alloc);
@ -136,8 +137,13 @@ typedef struct
CXzStream *streams;
} CXzs;
#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; }
void Xzs_Construct(CXzs *p);
void Xzs_Free(CXzs *p, ISzAllocPtr alloc);
/*
Xzs_ReadBackward() must be called for empty CXzs object.
Xzs_ReadBackward() can return non empty object with (p->num != 0) even in case of error.
*/
SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc);
UInt64 Xzs_GetNumBlocks(const CXzs *p);
@ -268,8 +274,8 @@ typedef struct
size_t outBufSize;
size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked
Byte shaDigest[SHA256_DIGEST_SIZE];
Byte buf[XZ_BLOCK_HEADER_SIZE_MAX];
UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4];
Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned for 4-bytes
} CXzUnpacker;
/* alloc : aligned for cache line allocation is better */
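A lifecycle sketch matching the new comment on Xzs_ReadBackward(). Here lookStream, alloc, archiveSize and totalBlocks stand for caller-provided variables; starting the scan from the end of the archive and passing NULL for progress are assumptions, not stated by the header:

CXzs xzs;
Int64 startOffset = archiveSize;   /* assumed: scan backward from the end */
Xzs_Construct(&xzs);               /* ReadBackward requires an empty object */
if (Xzs_ReadBackward(&xzs, lookStream, &startOffset, NULL, alloc) == SZ_OK)
  totalBlocks = Xzs_GetNumBlocks(&xzs);
Xzs_Free(&xzs, alloc);             /* may hold streams even after an error */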

View File

@ -1,5 +1,5 @@
/* XzEnc.h -- Xz Encode
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_XZ_ENC_H
#define ZIP7_INC_XZ_ENC_H
@ -31,6 +31,7 @@ typedef struct
CLzma2EncProps lzma2Props;
CXzFilterProps filterProps;
unsigned checkId;
unsigned numThreadGroups; // 0 : no groups
UInt64 blockSize;
int numBlockThreads_Reduced;
int numBlockThreads_Max;

View File

@ -1,5 +1,5 @@
/* 7zDec.c -- Decoding from 7z folder
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -312,9 +312,10 @@ static BoolInt IS_MAIN_METHOD(UInt32 m)
case k_PPMD:
#endif
return True;
}
default:
return False;
}
}
static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c)
{

View File

@ -1,5 +1,5 @@
/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2024-03-01 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#include "Precomp.h"
@ -80,7 +80,27 @@ AES_FUNC_START (name)
#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
#if 1
// use aligned SSE load/store for data.
// It is required for our Aes functions, that data is aligned for 16-bytes.
// So we can use this branch of code.
// and compiler can use fused load-op SSE instructions:
// xorps xmm0, XMMWORD PTR [rdx]
#define LOAD_128(pp) (*(__m128i *)(void *)(pp))
#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v
// use aligned SSE load/store for data. Alternative code with direct access
// #define LOAD_128(pp) _mm_load_si128(pp)
// #define STORE_128(pp, _v) _mm_store_si128(pp, _v)
#else
// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
#define LOAD_128(pp) _mm_loadu_si128(pp)
#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v)
#endif
AES_FUNC_START2 (AesCbc_Encode_HW)
{
if (numBlocks == 0)
return;
{
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
@ -88,11 +108,11 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
const __m128i k0 = p[2];
const __m128i k1 = p[3];
const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
for (; numBlocks != 0; numBlocks--, data++)
do
{
UInt32 r = numRounds2;
const __m128i *w = p + 4;
__m128i temp = *data;
__m128i temp = LOAD_128(data);
MM_XOR (temp, k0)
MM_XOR (m, temp)
MM_OP_m (_mm_aesenc_si128, k1)
@ -104,10 +124,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
}
while (--r);
MM_OP_m (_mm_aesenclast_si128, w[0])
*data = m;
STORE_128(data, m);
data++;
}
while (--numBlocks);
*p = m;
}
}
#define WOP_1(op)
@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
#define WOP(op) op (m0, 0) WOP_M1(op)
#define DECLARE_VAR(reg, ii) __m128i reg;
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#define LOAD_data_ii(ii) LOAD_128(data + (ii))
#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii);
#define STORE_data( reg, ii) STORE_128(data + (ii), reg);
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1)))
#endif
#define MM_OP_key(op, reg) MM_OP(op, reg, key);
@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \
LOAD_128 (data + (ii))));
#define WOP_KEY(op, n) { \
const __m128i key = w[n]; \
WOP(op); }
WOP(op) }
#define WIDE_LOOP_START \
dataEnd = data + numBlocks; \
if (numBlocks >= NUM_WAYS) \
{ dataEnd -= NUM_WAYS; do { \
#define WIDE_LOOP_END \
data += NUM_WAYS; \
} while (data <= dataEnd); \
dataEnd += NUM_WAYS; } \
#define SINGLE_LOOP \
for (; data < dataEnd; data++)
@ -184,34 +204,54 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
#if 1
// use unaligned AVX load/store for data.
// It is required for our Aes functions, that data is aligned for 16-bytes.
// But we need 32-bytes reading.
// So we use intrinsics for unaligned AVX load/store.
// notes for _mm256_storeu_si256:
// msvc2022: uses vmovdqu and keeps the order of instruction sequence.
// new gcc11 uses vmovdqu
// old gcc9 could use pair of instructions:
// vmovups %xmm7, -224(%rax)
// vextracti128 $0x1, %ymm7, -208(%rax)
#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v);
#else
// use aligned AVX load/store for data.
// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes.
// msvc2022 uses vmovdqu still
// gcc uses vmovdqa (that requires 32-bytes alignment)
#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v;
#endif
#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg)
/*
AVX_XOR_data_M1() needs unaligned memory load
if (we don't use _mm256_loadu_si256() here)
{
Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
instruction that can load unaligned data.
But GCC and CLANG without -O2 or -O1 optimizations can generate separated
LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
}
Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
AVX_XOR_data_M1() needs unaligned memory load, even if (data)
is aligned for 256-bits, because we read 32-bytes chunk that
crosses (data) position: from (data - 16bytes) to (data + 16bytes).
*/
#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
// for debug only: the following code will fail on execution, if compiled by some compilers:
// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))
#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
#define AVX_CTR_START(reg, ii) \
MM_OP (_mm256_add_epi64, ctr2, two) \
reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END(reg, ii) \
AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
AVX_LOAD ((__m256i *)(void *)data + (ii))));
#define AVX_WOP_KEY(op, n) { \
const __m256i key = w[n]; \
WOP(op); }
WOP(op) }
#define NUM_AES_KEYS_MAX 15
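Restating the AVX_XOR_data_M1 comment above with concrete offsets (no new behavior, just the memory layout it describes):

/* the 32-byte read starts one 16-byte block before (data):
     bytes [-16 .. -1] : previous ciphertext block (the CBC chaining value)
     bytes [  0 .. 15] : current ciphertext block
   that address is only guaranteed 16-byte alignment, so _mm256_loadu_si256 is
   required; a plain dereference could let the compiler emit vmovdqa, which
   faults on a 16-byte-aligned (but not 32-byte-aligned) address. */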
@ -219,12 +259,11 @@ v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any
dataEnd = data + numBlocks; \
if (numBlocks >= NUM_WAYS * 2) \
{ __m256i keys[NUM_AES_KEYS_MAX]; \
UInt32 ii; \
OP \
for (ii = 0; ii < numRounds; ii++) \
keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
dataEnd -= NUM_WAYS * 2; do { \
{ UInt32 ii; for (ii = 0; ii < numRounds; ii++) \
keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \
dataEnd -= NUM_WAYS * 2; \
do { \
#define WIDE_LOOP_END_AVX(OP) \
data += NUM_WAYS * 2; \
@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
__m128i iv = *p;
const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
const __m128i *dataEnd;
p += 2;
WIDE_LOOP_START
{
const __m128i *w = wStart;
WOP (DECLARE_VAR)
WOP (LOAD_data)
WOP_KEY (AES_XOR, 1)
do
{
WOP_KEY (AES_DEC, 0)
w--;
}
while (w != p);
@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
MM_XOR (m0, iv)
WOP_M1 (XOR_data_M1)
iv = data[NUM_WAYS - 1];
LOAD_data(iv, NUM_WAYS - 1)
WOP (STORE_data)
}
WIDE_LOOP_END
@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
SINGLE_LOOP
{
const __m128i *w = wStart - 1;
__m128i m = _mm_xor_si128 (w[2], *data);
__m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
do
{
MM_OP_m (_mm_aesdec_si128, w[1])
@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
while (w != p);
MM_OP_m (_mm_aesdec_si128, w[1])
MM_OP_m (_mm_aesdeclast_si128, w[0])
MM_XOR (m, iv)
iv = *data;
*data = m;
LOAD_data(iv, 0)
STORE_data(m, 0)
}
p[-2] = iv;
@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW)
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
__m128i ctr = *p;
UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
const __m128i *dataEnd;
__m128i one = _mm_cvtsi32_si128(1);
const __m128i one = _mm_cvtsi32_si128(1);
p += 2;
@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW)
}
while (--r);
WOP_KEY (AES_ENC_LAST, 0)
WOP (CTR_END)
}
WIDE_LOOP_END
@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
while (--numRounds2);
MM_OP_m (_mm_aesenc_si128, w[0])
MM_OP_m (_mm_aesenclast_si128, w[1])
MM_XOR (*data, m)
CTR_END (m, 0)
}
p[-2] = ctr;
@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
__m128i *data = (__m128i *)(void *)data8;
__m128i iv = *p;
const __m128i *dataEnd;
UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
p += 2;
WIDE_LOOP_START_AVX(;)
@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
while (w != keys);
AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
WOP_M1 (AVX_XOR_data_M1)
iv = data[NUM_WAYS * 2 - 1];
LOAD_data (iv, NUM_WAYS * 2 - 1)
WOP (AVX_STORE_data)
}
WIDE_LOOP_END_AVX(;)
SINGLE_LOOP
{
const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
__m128i m = _mm_xor_si128 (w[2], *data);
const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
__m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
do
{
MM_OP_m (_mm_aesdec_si128, w[1])
@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
MM_OP_m (_mm_aesdeclast_si128, w[0])
MM_XOR (m, iv)
iv = *data;
*data = m;
LOAD_data(iv, 0)
STORE_data(m, 0)
}
p[-2] = iv;
@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
__m128i ctr = *p;
UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
const __m128i *dataEnd;
__m128i one = _mm_cvtsi32_si128(1);
const __m128i one = _mm_cvtsi32_si128(1);
__m256i ctr2, two;
p += 2;
@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
while (--numRounds2);
MM_OP_m (_mm_aesenc_si128, w[0])
MM_OP_m (_mm_aesenclast_si128, w[1])
MM_XOR (*data, m)
CTR_END (m, 0)
}
p[-2] = ctr;
@ -730,10 +767,15 @@ AES_FUNC_START (name)
AES_FUNC_START2 (AesCbc_Encode_HW)
{
if (numBlocks == 0)
return;
{
v128 * const p = (v128 *)(void *)ivAes;
v128 *data = (v128 *)(void *)data8;
v128 m = *p;
const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
const v128 *w = p + (size_t)numRounds2 * 2;
const v128 k0 = p[2];
const v128 k1 = p[3];
const v128 k2 = p[4];
@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
const v128 k7 = p[9];
const v128 k8 = p[10];
const v128 k9 = p[11];
const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
const v128 *w = p + ((size_t)numRounds2 * 2);
const v128 k_z4 = w[-2];
const v128 k_z3 = w[-1];
const v128 k_z2 = w[0];
const v128 k_z1 = w[1];
const v128 k_z0 = w[2];
for (; numBlocks != 0; numBlocks--, data++)
// we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle,
// because gcc/clang compilers are not good for that optimization.
do
{
MM_XOR_m (*data)
AES_E_MC_m (k0)
@ -757,25 +802,27 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
AES_E_MC_m (k3)
AES_E_MC_m (k4)
AES_E_MC_m (k5)
AES_E_MC_m (k6)
AES_E_MC_m (k7)
AES_E_MC_m (k8)
if (numRounds2 >= 6)
{
AES_E_MC_m (k9)
AES_E_MC_m (p[12])
AES_E_MC_m (k6)
AES_E_MC_m (k7)
if (numRounds2 != 6)
{
AES_E_MC_m (p[13])
AES_E_MC_m (p[14])
AES_E_MC_m (k8)
AES_E_MC_m (k9)
}
}
AES_E_MC_m (k_z4)
AES_E_MC_m (k_z3)
AES_E_MC_m (k_z2)
AES_E_m (k_z1)
MM_XOR_m (k_z0)
*data = m;
*data++ = m;
}
while (--numBlocks);
*p = m;
}
}
#define WOP_1(op)
@ -837,7 +884,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
v128 *p = (v128 *)(void *)ivAes;
v128 *data = (v128 *)(void *)data8;
v128 iv = *p;
const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
const v128 *dataEnd;
p += 2;
@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
WOP_KEY (AES_XOR, 0)
MM_XOR (m0, iv)
WOP_M1 (XOR_data_M1)
iv = data[NUM_WAYS - 1];
LOAD_data(iv, NUM_WAYS - 1)
WOP (STORE_data)
}
WIDE_LOOP_END
@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
SINGLE_LOOP
{
const v128 *w = wStart;
v128 m = *data;
v128 m; LOAD_data(m, 0)
AES_D_IMC_m (w[2])
do
{
@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
AES_D_m (w[1])
MM_XOR_m (w[0])
MM_XOR_m (iv)
iv = *data;
*data = m;
LOAD_data(iv, 0)
STORE_data(m, 0)
}
p[-2] = iv;
@ -891,16 +938,14 @@ AES_FUNC_START2 (AesCtr_Code_HW)
v128 *p = (v128 *)(void *)ivAes;
v128 *data = (v128 *)(void *)data8;
uint64x2_t ctr = vreinterpretq_u64_u8(*p);
const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
const v128 *dataEnd;
uint64x2_t one = vdupq_n_u64(0);
// the bug in clang:
// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
one = vsetq_lane_u64(1, one, 0);
const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
p += 2;
WIDE_LOOP_START

View File

@ -1,5 +1,5 @@
/* CpuArch.c -- CPU specific code
2024-07-04 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#include "Precomp.h"
@ -17,7 +17,7 @@
/*
cpuid instruction supports (subFunction) parameter in ECX,
that is used only with some specific (function) parameter values.
But we always use only (subFunction==0).
most functions use only (subFunction==0).
*/
/*
__cpuid(): MSVC and GCC/CLANG use same function/macro name
@ -49,43 +49,49 @@
#if defined(MY_CPU_AMD64) && defined(__PIC__) \
&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
#define x86_cpuid_MACRO(p, func) { \
/* "=&r" selects free register. It can select even rbx, if that register is free.
"=&D" for (RDI) also works, but the code can be larger with "=&D"
"2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */
#define x86_cpuid_MACRO_2(p, func, subFunc) { \
__asm__ __volatile__ ( \
ASM_LN "mov %%rbx, %q1" \
ASM_LN "cpuid" \
ASM_LN "xchg %%rbx, %q1" \
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
/* "=&r" selects free register. It can select even rbx, if that register is free.
"=&D" for (RDI) also works, but the code can be larger with "=&D"
"2"(0) means (subFunction = 0),
2 is (zero-based) index in the output constraint list "=c" (ECX). */
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
#elif defined(MY_CPU_X86) && defined(__PIC__) \
&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
#define x86_cpuid_MACRO(p, func) { \
#define x86_cpuid_MACRO_2(p, func, subFunc) { \
__asm__ __volatile__ ( \
ASM_LN "mov %%ebx, %k1" \
ASM_LN "cpuid" \
ASM_LN "xchg %%ebx, %k1" \
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
#else
#define x86_cpuid_MACRO(p, func) { \
#define x86_cpuid_MACRO_2(p, func, subFunc) { \
__asm__ __volatile__ ( \
ASM_LN "cpuid" \
: "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
: "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
#endif
#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0)
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
x86_cpuid_MACRO(p, func)
}
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
x86_cpuid_MACRO_2(p, func, subFunc)
}
Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
__asm ret 0
}
static
void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
UNUSED_VAR(p)
UNUSED_VAR(func)
UNUSED_VAR(subFunc)
__asm push ebx
__asm push edi
__asm mov edi, ecx // p
__asm mov eax, edx // func
__asm mov ecx, [esp + 12] // subFunc
__asm cpuid
__asm mov [edi ], eax
__asm mov [edi + 4], ebx
__asm mov [edi + 8], ecx
__asm mov [edi + 12], edx
__asm pop edi
__asm pop ebx
__asm ret 4
}
#else // MY_CPU_AMD64
#if _MSC_VER >= 1600
#include <intrin.h>
#define MY_cpuidex __cpuidex
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
__cpuidex((int *)p, func, subFunc);
}
#else
/*
__cpuid (func == (0 or 7)) requires subfunction number in ECX.
@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
We still can use __cpuid for low (func) values that don't require ECX,
but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,
where ECX value is first parameter for FASTCALL / NO_INLINE func,
where ECX value is first parameter for FASTCALL / NO_INLINE func.
So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and
old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.
@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int
}
#define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info)
#pragma message("======== MY_cpuidex_HACK WAS USED ========")
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
}
#endif // _MSC_VER >= 1600
#if !defined(MY_CPU_AMD64)
@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void)
}
}
BoolInt CPU_IsSupported_SHA512(void)
{
if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here
if (z7_x86_cpuid_GetMaxFunc() < 7)
return False;
{
UInt32 d[4];
z7_x86_cpuid_subFunc(d, 7, 0);
if (d[0] < 1) // d[0] - is max supported subleaf value
return False;
z7_x86_cpuid_subFunc(d, 7, 1);
return (BoolInt)(d[0]) & 1;
}
}
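Spelled out, the new x86 check above performs two cpuid queries; the leaf-7 layout is the usual CPUID convention and is assumed here rather than restated by the source:

/* 1. cpuid(EAX=7, ECX=0): EAX returns the highest supported subleaf,
      which must be >= 1, otherwise subleaf 1 does not exist.
   2. cpuid(EAX=7, ECX=1): EAX bit 0 reports the SHA512 extension.
   The CPU_IsSupported_AVX2() pre-check stays because the SHA512 instructions
   operate on 256-bit (YMM) registers; the inline comment notes that plain
   AVX support might already be sufficient. */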
/*
MSVC: _xgetbv() intrinsic is available since VS2010SP1.
MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in
@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void)
return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}
BoolInt CPU_IsSupported_SHA512(void)
{
return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
}
/*
BoolInt CPU_IsSupported_SHA3(void)
{
return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
}
*/
#ifdef MY_CPU_ARM64
#define APPLE_CRYPTO_SUPPORT_VAL 1
#else
@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32)
MY_HWCAP_CHECK_FUNC (SHA1)
MY_HWCAP_CHECK_FUNC (SHA2)
MY_HWCAP_CHECK_FUNC (AES)
#ifdef MY_CPU_ARM64
// <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.
// we define them here, if they are not defined
#ifndef HWCAP_SHA3
// #define HWCAP_SHA3 (1 << 17)
#endif
#ifndef HWCAP_SHA512
// #pragma message("=== HWCAP_SHA512 define === ")
#define HWCAP_SHA512 (1 << 21)
#endif
MY_HWCAP_CHECK_FUNC (SHA512)
// MY_HWCAP_CHECK_FUNC (SHA3)
#endif
#endif // __APPLE__
#endif // _WIN32

View File

@ -1,5 +1,5 @@
/* LzFind.c -- Match finder for LZ algorithms
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
const unsigned nbMax =
(p->numHashBytes == 2 ? 16 :
(p->numHashBytes == 3 ? 24 : 32));
if (numBits > nbMax)
if (numBits >= nbMax)
numBits = nbMax;
if (numBits >= 32)
hs = (UInt32)0 - 1;
@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
hs |= (256 << kLzHash_CrcShift_2) - 1;
{
const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
if (hs > hs2)
if (hs >= hs2)
hs = hs2;
}
hsCur = hs;
if (p->expectedDataSize < historySize)
{
const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
if (hsCur > hs2)
if (hsCur >= hs2)
hsCur = hs2;
}
}
@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
if (p->expectedDataSize < historySize)
{
hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
if (hsCur > hs) // is it possible?
if (hsCur >= hs) // is it possible?
hsCur = hs;
}
}
@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
return d;
{
const Byte *pb = cur - delta;
curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
if (pb[maxLen] == cur[maxLen] && *pb == *cur)
{
UInt32 len = 0;
@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
break;
{
ptrdiff_t diff;
curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
{
@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
// if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
cmCheck = (UInt32)(pos - _cyclicBufferSize);
if ((UInt32)pos <= _cyclicBufferSize)
if ((UInt32)pos < _cyclicBufferSize)
cmCheck = 0;
if (cmCheck < curMatch)
@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
{
const UInt32 delta = pos - curMatch;
{
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
const Byte *pb = cur - delta;
unsigned len = (len0 < len1 ? len0 : len1);
const UInt32 pair0 = pair[0];
@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
UInt32 cmCheck;
cmCheck = (UInt32)(pos - _cyclicBufferSize);
if ((UInt32)pos <= _cyclicBufferSize)
if ((UInt32)pos < _cyclicBufferSize)
cmCheck = 0;
if (// curMatch >= pos || // failure
@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
{
const UInt32 delta = pos - curMatch;
{
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
const Byte *pb = cur - delta;
unsigned len = (len0 < len1 ? len0 : len1);
if (pb[len] == cur[len])
@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
UInt32 pos = p->pos; \
UInt32 num2 = num; \
/* (p->pos == p->posLimit) is not allowed here !!! */ \
{ const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
{ const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \
num -= num2; \
{ const UInt32 cycPos = p->cyclicBufferPos; \
son = p->son + cycPos; \

View File

@ -1,5 +1,5 @@
/* LzFindMt.c -- multithreaded Match finder for LZ algorithms
2024-01-22 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes;
Z7_NO_INLINE
static void MtSync_Construct(CMtSync *p)
{
p->affinityGroup = -1;
p->affinityInGroup = 0;
p->affinity = 0;
p->wasCreated = False;
p->csWasInitialized = False;
@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
// return ERROR_TOO_MANY_POSTS; // for debug
// return EINVAL; // for debug
#ifdef _WIN32
if (p->affinityGroup >= 0)
wres = Thread_Create_With_Group(&p->thread, startAddress, obj,
(unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
else
#endif
if (p->affinity != 0)
wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
else

View File

@ -1,5 +1,5 @@
/* Lzma2Enc.c -- LZMA2 Encoder
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p)
p->numBlockThreads_Reduced = -1;
p->numBlockThreads_Max = -1;
p->numTotalThreads = -1;
p->numThreadGroups = 0;
}
void Lzma2EncProps_Normalize(CLzma2EncProps *p)
@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
}
p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max;
p->mtCoder.numThreadGroups = p->props.numThreadGroups;
p->mtCoder.expectedDataSize = p->expectedDataSize;
{

View File

@ -1,5 +1,5 @@
/* LzmaEnc.c -- LZMA Encoder
2024-01-24: Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#include "Precomp.h"
@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p)
p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1;
p->numHashOutBits = 0;
p->writeEndMark = 0;
p->affinityGroup = -1;
p->affinity = 0;
p->affinityInGroup = 0;
}
void LzmaEncProps_Normalize(CLzmaEncProps *p)
@ -72,11 +74,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
p->level = level;
if (p->dictSize == 0)
p->dictSize =
( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) :
( level <= 6 ? ((UInt32)1 << (level + 19)) :
( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26)
)));
p->dictSize = (unsigned)level <= 4 ?
(UInt32)1 << (level * 2 + 16) :
(unsigned)level <= sizeof(size_t) / 2 + 4 ?
(UInt32)1 << (level + 20) :
(UInt32)1 << (sizeof(size_t) / 2 + 24);
if (p->dictSize > p->reduceSize)
{
@ -92,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
if (p->lp < 0) p->lp = 0;
if (p->pb < 0) p->pb = 2;
if (p->algo < 0) p->algo = (level < 5 ? 0 : 1);
if (p->fb < 0) p->fb = (level < 7 ? 32 : 64);
if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1;
if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64;
if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5);
if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1);
@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2)
p->multiThread = (props.numThreads > 1);
p->matchFinderMt.btSync.affinity =
p->matchFinderMt.hashSync.affinity = props.affinity;
p->matchFinderMt.btSync.affinityGroup =
p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup;
p->matchFinderMt.btSync.affinityInGroup =
p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup;
#endif
return SZ_OK;
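For reference, the reworked dictionary-size defaults near the top of this file's diff evaluate as follows on a 64-bit build, where sizeof(size_t)/2 + 4 == 8:

/* level 1..4 : 1 << (level*2 + 16)  ->  256 KiB, 1 MiB, 4 MiB, 16 MiB
   level 5..8 : 1 << (level + 20)    ->  32 MiB, 64 MiB, 128 MiB, 256 MiB
   level 9    : 1 << (4 + 24)        ->  256 MiB
   on a 32-bit build the middle branch stops at level 6 (64 MiB) and the
   cap becomes 1 << 26 = 64 MiB. */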

View File

@ -1,5 +1,5 @@
/* MtCoder.c -- Multi-thread Coder
2023-09-07 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p)
static THREAD_FUNC_DECL ThreadFunc(void *pp);
static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
#ifdef _WIN32
, CMtCoder * const mtc
#endif
)
{
WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent);
// printf("\n====== MtCoderThread_CreateAndStart : \n");
if (wres == 0)
{
t->stop = False;
if (!Thread_WasCreated(&t->thread))
{
#ifdef _WIN32
if (mtc->numThreadGroups)
wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t,
ThreadNextGroup_GetNext(&mtc->nextGroup), // group
0); // affinityMask
else
#endif
wres = Thread_Create(&t->thread, ThreadFunc, t);
}
if (wres == 0)
wres = Event_Set(&t->startEvent);
}
@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
}
Z7_FORCE_INLINE
static void MtCoderThread_Destruct(CMtCoderThread *t)
{
if (Thread_WasCreated(&t->thread))
@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
static SRes ThreadFunc2(CMtCoderThread *t)
{
CMtCoder *mtc = t->mtCoder;
CMtCoder * const mtc = t->mtCoder;
for (;;)
{
@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t)
if (mtc->numStartedThreads < mtc->numStartedThreadsLimit
&& mtc->expectedDataSize != readProcessed)
{
res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]);
res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]
#ifdef _WIN32
, mtc
#endif
);
if (res == SZ_OK)
mtc->numStartedThreads++;
else
@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
}
{
CMtCoderBlock *block = &mtc->blocks[bi];
CMtCoderBlock * const block = &mtc->blocks[bi];
block->res = res;
block->bufIndex = bufIndex;
block->finished = finished;
@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
static THREAD_FUNC_DECL ThreadFunc(void *pp)
{
CMtCoderThread *t = (CMtCoderThread *)pp;
CMtCoderThread * const t = (CMtCoderThread *)pp;
for (;;)
{
if (Event_Wait(&t->startEvent) != 0)
@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
if (t->stop)
return 0;
{
SRes res = ThreadFunc2(t);
const SRes res = ThreadFunc2(t);
CMtCoder *mtc = t->mtCoder;
if (res != SZ_OK)
{
@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
#ifndef MTCODER_USE_WRITE_THREAD
{
unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
if (numFinished == mtc->numStartedThreads)
if (Event_Set(&mtc->finishedEvent) != 0)
return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD;
@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p)
p->blockSize = 0;
p->numThreadsMax = 0;
p->numThreadGroups = 0;
p->expectedDataSize = (UInt64)(Int64)-1;
p->inStream = NULL;
@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p)
unsigned i;
SRes res = SZ_OK;
// printf("\n====== MtCoder_Code : \n");
if (numThreads > MTCODER_THREADS_MAX)
numThreads = MTCODER_THREADS_MAX;
numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p)
p->numStartedThreadsLimit = numThreads;
p->numStartedThreads = 0;
ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup
// for (i = 0; i < numThreads; i++)
{
// here we create new thread for first block.
// And each new thread will create another new thread after block reading
// until numStartedThreadsLimit is reached.
CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++];
RINOK(MtCoderThread_CreateAndStart(nextThread))
{
const SRes res2 = MtCoderThread_CreateAndStart(nextThread
#ifdef _WIN32
, p
#endif
);
RINOK(res2)
}
}
RINOK_THREAD(Event_Set(&p->readEvent))
@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p)
RINOK_THREAD(Event_Wait(&p->writeEvents[bi]))
{
const CMtCoderBlock *block = &p->blocks[bi];
unsigned bufIndex = block->bufIndex;
BoolInt finished = block->finished;
const CMtCoderBlock * const block = &p->blocks[bi];
const unsigned bufIndex = block->bufIndex;
const BoolInt finished = block->finished;
if (res == SZ_OK && block->res != SZ_OK)
res = block->res;
@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p)
}
#else
{
WRes wres = Event_Wait(&p->finishedEvent);
const WRes wres = Event_Wait(&p->finishedEvent);
res = MY_SRes_HRESULT_FROM_WRes(wres);
}
#endif

View File

@ -1,18 +1,14 @@
/* Sha256.c -- SHA-256 Hash
2024-03-01 : Igor Pavlov : Public domain
: Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "CpuArch.h"
#include "RotateDefs.h"
#include "Sha256.h"
#if defined(_MSC_VER) && (_MSC_VER < 1900)
// #define USE_MY_MM
#endif
#include "RotateDefs.h"
#include "CpuArch.h"
#ifdef MY_CPU_X86_OR_AMD64
#if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW;
#define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks
#define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#else
#define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks
#endif
@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
return False;
#endif
p->func_UpdateBlocks = func;
p->v.vars.func_UpdateBlocks = func;
return True;
}
@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
void Sha256_InitState(CSha256 *p)
{
p->count = 0;
p->v.vars.count = 0;
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p)
p->state[7] = 0x5be0cd19;
}
void Sha256_Init(CSha256 *p)
{
p->func_UpdateBlocks =
p->v.vars.func_UpdateBlocks =
#ifdef Z7_COMPILER_SHA256_SUPPORTED
g_SHA256_FUNC_UPDATE_BLOCKS;
#else
@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p)
#endif
// static
extern MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64] = {
extern
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64];
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@ -248,9 +249,12 @@ const UInt32 SHA256_K_ARRAY[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
#define K SHA256_K_ARRAY
#define K SHA256_K_ARRAY
Z7_NO_INLINE
void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
{
@ -260,15 +264,14 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
#else
[16];
#endif
unsigned j;
UInt32 a,b,c,d,e,f,g,h;
#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt32 tmp;
#endif
if (numBlocks == 0) return;
a = state[0];
b = state[1];
c = state[2];
@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g = state[6];
h = state[7];
while (numBlocks)
do
{
for (j = 0; j < 16; j += STEP_PRE)
@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g += state[6]; state[6] = g;
h += state[7]; state[7] = h;
data += 64;
numBlocks--;
data += SHA256_BLOCK_SIZE;
}
while (--numBlocks);
}
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
#undef K
#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
unsigned pos = (unsigned)p->count & 0x3F;
unsigned num;
p->count += size;
num = 64 - pos;
const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
const unsigned num = SHA256_BLOCK_SIZE - pos;
p->v.vars.count += size;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
}
}
{
size_t numBlocks = size >> 6;
const size_t numBlocks = size >> 6;
// if (numBlocks)
SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= 0x3F;
size &= SHA256_BLOCK_SIZE - 1;
if (size == 0)
return;
data += (numBlocks << 6);
@ -408,53 +399,41 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
void Sha256_Final(CSha256 *p, Byte *digest)
{
unsigned pos = (unsigned)p->count & 0x3F;
unsigned i;
unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80;
if (pos > (64 - 8))
if (pos > (SHA256_BLOCK_SIZE - 4 * 2))
{
while (pos != 64) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, 64 - pos);
while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos);
Sha256_UpdateBlock(p);
pos = 0;
}
/*
if (pos & 3)
memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos);
{
p->buffer[pos] = 0;
p->buffer[pos + 1] = 0;
p->buffer[pos + 2] = 0;
pos += 3;
pos &= ~3;
const UInt64 numBits = p->v.vars.count << 3;
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
}
{
for (; pos < 64 - 8; pos += 4)
*(UInt32 *)(&p->buffer[pos]) = 0;
}
*/
memset(&p->buffer[pos], 0, (64 - 8) - pos);
{
UInt64 numBits = (p->count << 3);
SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32))
SetBe32(p->buffer + 64 - 4, (UInt32)(numBits))
}
Sha256_UpdateBlock(p);
#if 1 && defined(MY_CPU_BE)
memcpy(digest, p->state, SHA256_DIGEST_SIZE);
#else
{
unsigned i;
for (i = 0; i < 8; i += 2)
{
UInt32 v0 = p->state[i];
UInt32 v1 = p->state[(size_t)i + 1];
const UInt32 v0 = p->state[i];
const UInt32 v1 = p->state[(size_t)i + 1];
SetBe32(digest , v0)
SetBe32(digest + 4, v1)
digest += 8;
digest += 4 * 2;
}
}
#endif
Sha256_InitState(p);
}
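A worked example of the padding arithmetic in Sha256_Final above, using an arbitrary 100-byte message:

/* count = 100, so pos = 100 & 63 = 36
   buffer[36] = 0x80 and pos becomes 37, which is <= 56, so no extra block;
   memset() zero-fills bytes 37..55;
   numBits = 100 << 3 = 800, stored big-endian in bytes 56..63;
   one final Sha256_UpdateBlock() then yields the digest. */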
@ -466,12 +445,9 @@ void Sha256Prepare(void)
f = Sha256_UpdateBlocks;
f_hw = NULL;
#ifdef MY_CPU_X86_OR_AMD64
#ifndef USE_MY_MM
if (CPU_IsSupported_SHA()
&& CPU_IsSupported_SSSE3()
// && CPU_IsSupported_SSE41()
)
#endif
#else
if (CPU_IsSupported_SHA2())
#endif
@ -484,6 +460,8 @@ void Sha256Prepare(void)
#endif
}
#undef U64C
#undef K
#undef S0
#undef S1
#undef s0

View File

@ -1,18 +1,11 @@
/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"
#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#define USE_HW_SHA
@ -20,19 +13,14 @@
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(_INTEL_COMPILER)
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA__) || !defined(__SSSE3__)
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#endif
#endif
#elif defined(_MSC_VER)
#ifdef USE_MY_MM
#define USE_VER_MIN 1300
#else
#define USE_VER_MIN 1900
#endif
#if (_MSC_VER >= USE_VER_MIN)
#if (_MSC_VER >= 1900)
#define USE_HW_SHA
#else
#define Z7_USE_HW_SHA_STUB
@ -47,23 +35,20 @@
// #pragma message("Sha256 HW")
// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
// #if !defined(__SSSE3__)
// #endif
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else
#ifdef USE_MY_MM
#include "My_mm.h"
#endif
#endif
/*
@ -91,54 +76,38 @@ SHA:
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
#define LOAD_SHUFFLE(m, k) \
m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
m = _mm_shuffle_epi8(m, mask); \
#define SM1(g0, g1, g2, g3) \
SHA256_MSG1(g3, g0); \
#define NNN(m0, m1, m2, m3)
#define SM2(g0, g1, g2, g3) \
tmp = _mm_alignr_epi8(g1, g0, 4); \
ADD_EPI32(g2, tmp) \
SHA25G_MSG2(g2, g1); \
// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
#define NNN(g0, g1, g2, g3)
#define SM1(m1, m2, m3, m0) \
SHA256_MSG1(m0, m1); \
#define SM2(m2, m3, m0, m1) \
ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
SHA256_MSG2(m0, m3); \
#define RND2(t0, t1) \
t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
#define RND2_0(m, k) \
msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
RND2(state0, state1); \
msg = _mm_shuffle_epi32(msg, 0x0E); \
#define RND2_1 \
OP0(m0, m1, m2, m3) \
RND2(state1, state0); \
// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
RND2_0(g0, k) \
OP0(g0, g1, g2, g3) \
RND2_1 \
OP1(g0, g1, g2, g3) \
OP1(m0, m1, m2, m3) \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
@ -161,8 +130,9 @@ ATTRIB_SHA
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
__m128i tmp;
__m128i state0, state1;
__m128i tmp, state0, state1;
if (numBlocks == 0)
return;
@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
@ -324,41 +282,70 @@ typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC
#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x)
#define MY_rev32_for_LE(x) x
#else
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#define MY_rev32_for_LE(x) vrev32q_u8(x)
#endif
#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
#if 1 // 0 for debug
// for arm32: it works slower by some reason than direct code
/*
for arm32 it generates:
MSVC-2022, GCC-9:
vld1.32 {d18,d19}, [r10]
vst1.32 {d4,d5}, [r3]
vld1.8 {d20-d21}, [r4]
there is no align hint (like [r10:128]). So instruction allows unaligned access
*/
#define LOAD_128_32(_p) vld1q_u32(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
#else
/*
for arm32:
MSVC-2022:
vldm r10,{d18,d19}
vstm r3,{d4,d5}
does it require strict alignment?
GCC-9:
vld1.64 {d30-d31}, [r0:64]
vldr d28, [r0, #16]
vldr d29, [r0, #24]
vst1.64 {d30-d31}, [r0:64]
vstr d28, [r0, #16]
vstr d29, [r0, #24]
there is hint [r0:64], so does it requires 64-bit alignment.
*/
#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p))
#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
#endif
#define LOAD_SHUFFLE(m, k) \
m = LOAD_128((data + (k) * 16)); \
MY_rev32_for_LE(m); \
m = vreinterpretq_u32_u8( \
MY_rev32_for_LE( \
LOAD_128_8(data + (k) * 16))); \
// K array must be aligned for 16-bytes at least.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY
#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)
#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
#define NNN(m0, m1, m2, m3)
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
tmp = state0; \
state0 = vsha256hq_u32( state0, state1, msg ); \
state1 = vsha256h2q_u32( state1, tmp, msg ); \
OP0(g0, g1, g2, g3); \
OP1(g0, g1, g2, g3); \
OP0(m0, m1, m2, m3); \
OP1(m0, m1, m2, m3); \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
if (numBlocks == 0)
return;
state0 = LOAD_128(&state[0]);
state1 = LOAD_128(&state[4]);
state0 = LOAD_128_32(&state[0]);
state1 = LOAD_128_32(&state[4]);
do
{
@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
}
while (--numBlocks);
STORE_128(&state[0], state0);
STORE_128(&state[4], state1);
STORE_128_32(&state[0], state0);
STORE_128_32(&state[4], state1);
}
#endif // USE_HW_SHA
@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#endif
#undef K
#undef RND2
#undef RND2_0
#undef RND2_1
#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#undef SM1
#undef SM2
#undef NNN
#undef R4
#undef R16
#undef PREPARE_STATE

View File

@ -1,141 +1,268 @@
/* Sort.c -- Sort functions
2014-04-05 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Sort.h"
#include "CpuArch.h"
#define HeapSortDown(p, k, size, temp) \
{ for (;;) { \
size_t s = (k << 1); \
if (s > size) break; \
if (s < size && p[s + 1] > p[s]) s++; \
if (temp >= p[s]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
|| (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
)
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
// #pragma message("Z7_PREFETCH : __builtin_prefetch")
#define Z7_PREFETCH(a) __builtin_prefetch((a))
#endif
#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)
#include "7zWindows.h"
// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate "prefetcht2" instruction for
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().
// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we will use PreFetchCacheLine(), only if we are sure that
// generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
|| defined(MY_CPU_ARM64) \
|| defined(MY_CPU_IA64)
// we need to use additional parentheses around (a) in the PreFetchCacheLine call, because
// the PreFetchCacheLine macro doesn't parenthesize its arguments:
// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif
#endif // _WIN32
#define PREFETCH_NO(p,k,s,size)
#ifndef Z7_PREFETCH
#define SORT_PREFETCH(p,k,s,size)
#else
// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)
#if PREFETCH_LEVEL == 0
#define SORT_PREFETCH(p,k,s,size)
#else // PREFETCH_LEVEL != 0
void HeapSort(UInt32 *p, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (--i != 0);
}
/*
if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch one value per cache line.
Use it if the array is aligned to the cache line size (64 bytes)
or if the array is small (less than the L1 cache size).
if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch all cache lines that can be required.
it can be faster for big unaligned arrays.
*/
#define USE_PREFETCH_FOR_ALIGNED_ARRAY
// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif
#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// last offset that can be required in the PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
#endif
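/* Illustrative note (added comment, not from the upstream source):
   with PREFETCH_LEVEL == 3 and the generic offset branch above,
   PREFETCH_OFFSET(k,s) == (k << 4) == k * 16.
   the descendants of node (k) four levels below it occupy indices
   [k * 16, k * 16 + 15]: 16 UInt32 values == 64 bytes == one cache line
   when (p) is 64-byte aligned, so a single prefetch at the start of that
   span (PREFETCH_ADD_OFFSET == 0) covers all of them;
   PREFETCH_RANGE == (2 << 3) - 1 == 15 is the last offset inside the span. */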
#if PREFETCH_LEVEL <= 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
Z7_PREFETCH((p + s2)); \
}}
#endif
#else // PREFETCH_LEVEL > 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - 16)); \
Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
Z7_PREFETCH((p + s2)); \
}}
#endif
#endif // PREFETCH_LEVEL > 3
#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH
#if defined(MY_CPU_ARM64) \
/* || defined(MY_CPU_AMD64) */ \
/* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov, if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif
#ifdef Z7_FAST_CMOV_SUPPORTED
// we want to use cmov here, if cmov is fast: new arm64 cpus.
// we want the compiler to use conditional move for this branch
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// use this branch, if cpu doesn't support fast conditional move.
// it uses slow array access reading:
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif
#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
for (;;) { \
UInt32 n0, n1; \
size_t s = k * 2; \
if (s >= size) { \
if (s == size) { \
n0 = p[s]; \
p[k] = n0; \
if (temp < n0) k = s; \
} \
break; \
} \
n0 = p[k * 2]; \
n1 = p[k * 2 + 1]; \
s += n0 < n1; \
GET_MAX_VAL(n0, n1, p[s]) \
if (temp >= n0) break; \
macro_prefetch(p, k, s, size) \
p[k] = n0; \
k = s; \
} \
p[k] = temp; \
}
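/* Readability sketch (added for illustration, not part of the upstream source):
   the HeapSortDown macro above expanded as a plain function, using the
   GET_MAX_VAL branch that relies on a conditional move and with
   macro_prefetch == PREFETCH_NO.
   (p) is used as a 1-based heap: children of node (k) are p[k*2] and p[k*2+1];
   (size) is the index of the last item of the tree. */
static void HeapSortDown_Sketch(UInt32 *p, size_t k, size_t size, UInt32 temp)
{
  for (;;)
  {
    UInt32 n0, n1;
    size_t s = k * 2;
    if (s >= size)
    {
      if (s == size)
      {
        // only one child exists
        n0 = p[s];
        p[k] = n0;
        if (temp < n0)
          k = s;
      }
      break;
    }
    n0 = p[s];
    n1 = p[s + 1];
    s += n0 < n1;   // index of the larger child
    if (n0 < n1)
      n0 = n1;      // n0 = max(p[k*2], p[k*2+1])
    if (temp >= n0)
      break;
    p[k] = n0;      // move the larger child up
    k = s;
  }
  p[k] = temp;      // final position of the sifted value
}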
/*
stage-1 : O(n) :
we generate an intermediate, partially sorted binary tree:
p[0] : it's an additional item for better alignment of the tree structure in memory.
p[1]
p[2] p[3]
p[4] p[5] p[6] p[7]
...
p[x] >= p[x * 2]
p[x] >= p[x * 2 + 1]
stage-2 : O(n * log2(n)):
we move the largest item p[0] from the head of the tree to the end of the array
and insert the last item into the sorted binary tree.
*/
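/* Worked example (added for illustration; it shows the two stages on a plain
   1-based heap and ignores the special handling of p[0] and of small sizes):
   input, nodes p[1..5]:        { 1, 5, 3, 4, 2 }
   stage-1 (build heap):        { 5, 4, 3, 1, 2 }   p[x] >= p[2x], p[x] >= p[2x+1]
   stage-2 (extract maximum):
     swap p[1] and p[5], sift:  { 4, 2, 3, 1 | 5 }
     swap p[1] and p[4], sift:  { 3, 2, 1 | 4, 5 }
     swap p[1] and p[3], sift:  { 2, 1 | 3, 4, 5 }
     swap p[1] and p[2]:        { 1 | 2, 3, 4, 5 }  sorted
*/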
// (p) must be aligned for cache line size (64-bytes) for best performance
void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
if (size < 2)
return;
if (size == 2)
{
const UInt32 a0 = p[0];
const UInt32 a1 = p[1];
const unsigned k = a1 < a0;
p[k] = a0;
p[k ^ 1] = a1;
return;
}
{
// stage-1 : O(n)
// we transform array to partially sorted binary tree.
size_t i = --size / 2;
// (size) is now the index of the last item in the tree,
// if (i)
{
do
{
const UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
while (--i);
}
{
const UInt32 temp = p[0];
const UInt32 a1 = p[1];
if (temp < a1)
{
size_t k = 1;
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
p[0] = a1;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
while (size > 1);
*/
while (size > 3)
{
UInt32 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
}
{
UInt32 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
}
}
void HeapSort64(UInt64 *p, size_t size)
if (size < 3)
{
if (size <= 1)
// size == 2
const UInt32 a0 = p[0];
p[0] = p[2];
p[2] = a0;
return;
p--;
}
if (size != 3)
{
size_t i = size / 2;
// stage-2 : O(size * log2(size)):
// we move the largest item p[0] from the head to the end of the array,
// and insert the last item into the sorted binary tree.
do
{
UInt64 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (--i != 0);
}
/*
do
{
size_t k = 1;
UInt64 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
}
while (size > 1);
*/
while (size > 3)
{
UInt64 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
const UInt32 temp = p[size];
size_t k = p[2] < p[3] ? 3 : 2;
p[size--] = p[0];
p[0] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
}
while (size != 3);
}
{
UInt64 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
const UInt32 a2 = p[2];
const UInt32 a3 = p[3];
const size_t k = a2 < a3;
p[2] = p[1];
p[3] = p[0];
p[k] = a3;
p[k ^ 1] = a2;
}
}
/*
#define HeapSortRefDown(p, vals, n, size, temp) \
{ size_t k = n; UInt32 val = vals[temp]; for (;;) { \
size_t s = (k << 1); \
if (s > size) break; \
if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
if (val >= vals[p[s]]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
HeapSortRefDown(p, vals, i, size, temp);
}
while (--i != 0);
}
do
{
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortRefDown(p, vals, 1, size, temp);
}
while (size > 1);
}
*/

View File

@ -1,5 +1,5 @@
/* Threads.c -- multithreading library
2024-03-28 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p)
return (res != 0 ? res : res2);
}
typedef struct MY_PROCESSOR_NUMBER {
WORD Group;
BYTE Number;
BYTE Reserved;
} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER;
typedef struct MY_GROUP_AFFINITY {
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000)
// KAFFINITY is not defined in old mingw
ULONG_PTR
#else
KAFFINITY
#endif
Mask;
WORD Group;
WORD Reserved[3];
} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY;
typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)(
HANDLE hThread,
CONST MY_GROUP_AFFINITY *GroupAffinity,
MY_PGROUP_AFFINITY PreviousGroupAffinity);
typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)(
HANDLE hThread,
MY_PGROUP_AFFINITY GroupAffinity);
typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)(
HANDLE hProcess,
PUSHORT GroupCount,
PUSHORT GroupArray);
Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
#if 0
#include <stdio.h>
#define PRF(x) x
/*
--
before the call of SetThreadGroupAffinity(),
GetProcessGroupAffinity() returns one group.
after the call of SetThreadGroupAffinity(),
GetProcessGroupAffinity() returns more than one group,
if SetThreadGroupAffinity() was called for another group.
--
GetProcessAffinityMask MS DOCs:
{
If the calling process contains threads in multiple groups,
the function returns zero for both affinity masks.
}
but tests in win10 with 2 groups (less than 64 cores total):
GetProcessAffinityMask() still returns non-zero affinity masks
even after SetThreadGroupAffinity() calls.
*/
static void PrintProcess_Info()
{
{
const
Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity =
(Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"GetProcessGroupAffinity");
if (fn_GetProcessGroupAffinity)
{
unsigned i;
USHORT GroupCounts[64];
USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts);
BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(),
&GroupCount, GroupCounts);
printf("\n====== GetProcessGroupAffinity : "
"boolRes=%u GroupCounts = %u :",
boolRes, (unsigned)GroupCount);
for (i = 0; i < GroupCount; i++)
printf(" %u", GroupCounts[i]);
printf("\n");
}
}
{
DWORD_PTR processAffinityMask, systemAffinityMask;
if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask))
{
PRF(printf("\n====== GetProcessAffinityMask : "
": processAffinityMask=%x, systemAffinityMask=%x\n",
(UInt32)processAffinityMask, (UInt32)systemAffinityMask);)
}
else
printf("\n==GetProcessAffinityMask FAIL");
}
}
#else
#ifndef USE_THREADS_CreateThread
// #define PRF(x)
#endif
#endif
WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
{
/* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
@ -73,6 +167,42 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
unsigned threadId;
*p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId));
#if 0 // 1 : for debug
{
DWORD_PTR prevMask;
DWORD_PTR affinity = 1 << 0;
prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity);
prevMask = prevMask;
}
#endif
#if 0 // 1 : for debug
{
/* win10: a new thread will be created in the same group that is assigned to the parent thread,
but its affinity mask will contain all allowed processors of that group,
even if the affinity mask of the parent group is not full.
win11: in what group will it be created, if we have set
the affinity of the parent thread with ThreadGroupAffinity?
*/
const
Func_GetThreadGroupAffinity fn =
(Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"GetThreadGroupAffinity");
if (fn)
{
// BOOL wres2;
MY_GROUP_AFFINITY groupAffinity;
memset(&groupAffinity, 0, sizeof(groupAffinity));
/* wres2 = */ fn(*p, &groupAffinity);
PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): "
"wres2_BOOL = %u, group=%u mask=%x\n",
GetCurrentThreadId(),
wres2,
groupAffinity.Group,
(UInt32)groupAffinity.Mask);)
}
}
#endif
#endif
/* maybe we must use errno here, but probably GetLastError() is also OK. */
@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
*/
}
{
DWORD prevSuspendCount = ResumeThread(h);
const DWORD prevSuspendCount = ResumeThread(h);
/* ResumeThread() returns:
0 : was_not_suspended
1 : was_resumed
-1 : error
*/
if (prevSuspendCount == (DWORD)-1)
wres = GetError();
}
}
/* maybe we must use errno here, but probably GetLastError() is also OK. */
return wres;
#endif
}
WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask)
{
#ifdef USE_THREADS_CreateThread
UNUSED_VAR(group)
UNUSED_VAR(affinityMask)
return Thread_Create(p, func, param);
#else
/* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
HANDLE h;
WRes wres;
unsigned threadId;
h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId));
*p = h;
wres = HandleToWRes(h);
if (h)
{
// PrintProcess_Info();
{
const
Func_SetThreadGroupAffinity fn =
(Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"SetThreadGroupAffinity");
if (fn)
{
// WRes wres2;
MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity;
memset(&groupAffinity, 0, sizeof(groupAffinity));
// groupAffinity.Mask must use only bits that are supported by the current group
// (groupAffinity.Mask = 0) means all allowed bits
groupAffinity.Mask = affinityMask;
groupAffinity.Group = (WORD)group;
// wres2 =
fn(h, &groupAffinity, &prev_groupAffinity);
/*
if (groupAffinity.Group == prev_groupAffinity.Group)
wres2 = wres2;
else
wres2 = wres2;
if (wres2 == 0)
{
wres2 = GetError();
PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);)
}
else
{
PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()"
" threadId = %6u"
" group=%u mask=%x\n",
threadId,
prev_groupAffinity.Group,
(UInt32)prev_groupAffinity.Mask);)
}
*/
}
}
{
const DWORD prevSuspendCount = ResumeThread(h);
/* ResumeThread() returns:
0 : was_not_suspended
1 : was_resumed
@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
return Thread_Create_With_CpuSet(p, func, param, NULL);
}
/*
WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity)
{
UNUSED_VAR(group)
return Thread_Create_With_Affinity(p, func, param, affinity);
}
*/
WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity)
{
@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
return AutoResetEvent_CreateNotSignaled(p);
}
void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup)
{
// printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup);
if (numGroups == 0)
numGroups = 1;
p->NumGroups = numGroups;
p->NextGroup = startGroup % numGroups;
}
UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p)
{
const UInt32 next = p->NextGroup;
p->NextGroup = (next + 1) % p->NumGroups;
return next;
}
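/* Usage sketch (added for illustration, not part of the upstream source):
   distributing worker threads across processor groups in round-robin order.
   The names StartWorkers/params are hypothetical; affinityMask == 0 means
   "all allowed bits of the selected group" (see Thread_Create_With_Group). */
/*
static WRes StartWorkers(CThread *threads, unsigned numThreads,
    THREAD_FUNC_TYPE func, void **params, UInt32 numGroups)
{
  CThreadNextGroup nextGroup;
  unsigned i;
  ThreadNextGroup_Init(&nextGroup, numGroups, 0);  // start from group 0
  for (i = 0; i < numThreads; i++)
  {
    const UInt32 group = ThreadNextGroup_GetNext(&nextGroup);
    const WRes wres = Thread_Create_With_Group(&threads[i], func, params[i],
        (unsigned)group, 0);
    if (wres != 0)
      return wres;
  }
  return 0;
}
*/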
#undef PRF
#undef Print

View File

@ -1,5 +1,5 @@
/* XzCrc64Opt.c -- CRC64 calculation (optimized functions)
2023-12-08 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE)
v = Q32BE(1, w1) ^ Q32BE(0, w0);
v ^= Q32BE(3, d1) ^ Q32BE(2, d0);
#endif
#elif
#else
#error Stop_Compiling_Bad_CRC64_NUM_TABLES
#endif
p += Z7_CRC64_NUM_TABLES_USE;

View File

@ -1,5 +1,5 @@
/* XzDec.c -- Xz Decode
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value)
for (i = 0; i < limit;)
{
Byte b = p[i];
const unsigned b = p[i];
*value |= (UInt64)(b & 0x7F) << (7 * i++);
if ((b & 0x80) == 0)
return (b == 0 && i != 1) ? 0 : i;
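/* Illustrative counterpart (added, not part of this diff): the xz "multibyte
   integer" format decoded above stores 7 payload bits per byte, least
   significant group first, with bit 0x80 set on every byte except the last.
   Example: 300 == 0x12C is stored as the two bytes { 0xAC, 0x02 }.
   A minimal encoder sketch (the real SDK provides Xz_WriteVarInt): */
/*
static unsigned Xz_WriteVarInt_Sketch(Byte *buf, UInt64 v)
{
  unsigned i = 0;
  do
  {
    buf[i++] = (Byte)((v & 0x7F) | 0x80);  // 7 payload bits + continuation bit
    v >>= 7;
  }
  while (v != 0);
  buf[i - 1] &= 0x7F;  // the last byte has no continuation bit
  return i;            // number of bytes written
}
*/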
@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf)
static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf)
{
return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2)
&& GetUi32(buf) == CrcCalc(buf + 4, 6)
&& flags == GetBe16(buf + 8)
&& buf[10] == XZ_FOOTER_SIG_0
&& buf[11] == XZ_FOOTER_SIG_1;
return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2)
&& GetUi32a(buf) == CrcCalc(buf + 4, 6)
&& flags == GetBe16a(buf + 8)
&& GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8));
}
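/* Footer layout reminder (added for illustration; see the .xz format spec):
     buf[0..3]   CRC32 of buf[4..9]
     buf[4..7]   stored backward size: (index_size / 4) - 1
     buf[8..9]   stream flags (must match the stream header flags)
     buf[10..11] footer magic "YZ"
   so the real index size is ((GetUi32a(buf + 4) + 1) << 2) bytes. */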
#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks);
p->indexPos = p->indexPreSize;
p->indexSize += p->indexPreSize;
Sha256_Final(&p->sha, p->shaDigest);
Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32);
Sha256_Init(&p->sha);
p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize);
p->state = XZ_STATE_STREAM_INDEX;
@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
break;
}
{
Byte digest[XZ_CHECK_SIZE_MAX];
UInt32 digest32[XZ_CHECK_SIZE_MAX / 4];
p->state = XZ_STATE_BLOCK_HEADER;
p->pos = 0;
if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0)
if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0)
return SZ_ERROR_CRC;
if (p->decodeOnlyOneBlock)
{
@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
}
else
{
Byte digest[SHA256_DIGEST_SIZE];
UInt32 digest32[SHA256_DIGEST_SIZE / 4];
p->state = XZ_STATE_STREAM_INDEX_CRC;
p->indexSize += 4;
p->pos = 0;
Sha256_Final(&p->sha, digest);
if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0)
Sha256_Final(&p->sha, (void *)digest32);
if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0)
return SZ_ERROR_CRC;
}
}
@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
const Byte *ptr = p->buf;
p->state = XZ_STATE_STREAM_FOOTER;
p->pos = 0;
if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr))
if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr))
return SZ_ERROR_CRC;
}
break;
@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
{
if (*src != 0)
{
if (((UInt32)p->padSize & 3) != 0)
if ((unsigned)p->padSize & 3)
return SZ_ERROR_NO_ARCHIVE;
p->pos = 0;
p->state = XZ_STATE_STREAM_HEADER;

View File

@ -1,5 +1,5 @@
/* XzEnc.c -- Xz Encode
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size)
}
}
Z7_FORCE_INLINE
static void SeqInFilter_Construct(CSeqInFilter *p)
{
p->buf = NULL;
@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p)
p->vt.Read = SeqInFilter_Read;
}
Z7_FORCE_INLINE
static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc)
{
if (p->StateCoder.p)
@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p)
void XzProps_Init(CXzProps *p)
{
p->checkId = XZ_CHECK_CRC32;
p->numThreadGroups = 0;
p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO;
p->numBlockThreads_Reduced = -1;
p->numBlockThreads_Max = -1;
@ -689,6 +692,7 @@ typedef struct
} CLzma2WithFilters;
Z7_FORCE_INLINE
static void Lzma2WithFilters_Construct(CLzma2WithFilters *p)
{
p->lzma2 = NULL;
@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz
}
Z7_FORCE_INLINE
static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc)
{
#ifdef USE_SUBBLOCK
@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in
}
p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max;
p->mtCoder.numThreadGroups = props->numThreadGroups;
p->mtCoder.expectedDataSize = p->expectedDataSize;
RINOK(MtCoder_Code(&p->mtCoder))

View File

@ -1,38 +1,39 @@
/* XzIn.c - Xz input
2023-09-07 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include <string.h>
#include "7zCrc.h"
#include "CpuArch.h"
#include "Xz.h"
#include "CpuArch.h"
/*
#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0)
*/
#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1)
#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \
(GetUi16a((const Byte *)(const void *)(p) + 10) == \
(XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)))
SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream)
{
Byte sig[XZ_STREAM_HEADER_SIZE];
UInt32 data32[XZ_STREAM_HEADER_SIZE / 4];
size_t processedSize = XZ_STREAM_HEADER_SIZE;
RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize))
RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize))
if (processedSize != XZ_STREAM_HEADER_SIZE
|| memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0)
|| memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0)
return SZ_ERROR_NO_ARCHIVE;
return Xz_ParseHeader(p, sig);
return Xz_ParseHeader(p, (const Byte *)(const void *)data32);
}
#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
{ const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \
#define READ_VARINT_AND_CHECK(buf, size, res) \
{ const unsigned s = Xz_ReadVarInt(buf, size, res); \
if (s == 0) return SZ_ERROR_ARCHIVE; \
pos += s; }
size -= s; \
buf += s; \
}
SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes)
{
MY_ALIGN(4)
Byte header[XZ_BLOCK_HEADER_SIZE_MAX];
unsigned headerSize;
*headerSizeRes = 0;
@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
return XzBlock_Parse(p, header);
}
#define ADD_SIZE_CHECK(size, val) \
{ const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; }
{ const UInt64 newSize = size + (val); \
if (newSize < size) return XZ_SIZE_OVERFLOW; \
size = newSize; \
}
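/* Note (added for illustration): for unsigned 64-bit addition the sum wraps
   around exactly when the result is smaller than either operand, so the
   (newSize < size) test above detects overflow without needing a wider type. */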
UInt64 Xz_GetUnpackSize(const CXzStream *p)
{
@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p)
return size;
}
/*
SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream)
{
return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f));
}
*/
static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
// input:
// CXzStream (p) is an empty object.
// size != 0
// (size & 3) == 0
// (buf) is aligned for at least 4 bytes.
// output:
// p->numBlocks is the number of allocated items in p->blocks
// p->blocks[*] values must be ignored if the function returns an error.
static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
{
size_t numBlocks, pos = 1;
UInt32 crc;
size_t numBlocks;
if (size < 5 || buf[0] != 0)
return SZ_ERROR_ARCHIVE;
size -= 4;
crc = CrcCalc(buf, size);
if (crc != GetUi32(buf + size))
{
const UInt32 crc = CrcCalc(buf, size);
if (crc != GetUi32a(buf + size))
return SZ_ERROR_ARCHIVE;
}
buf++;
size--;
{
UInt64 numBlocks64;
READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64)
READ_VARINT_AND_CHECK(buf, size, &numBlocks64)
// (numBlocks64) is a 63-bit value, so we can calculate (numBlocks64 * 2):
if (numBlocks64 * 2 > size)
return SZ_ERROR_ARCHIVE;
if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes))
return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
numBlocks = (size_t)numBlocks64;
if (numBlocks != numBlocks64 || numBlocks * 2 > size)
return SZ_ERROR_ARCHIVE;
}
Xz_Free(p, alloc);
if (numBlocks != 0)
// Xz_Free(p, alloc); // it's optional, because (p) is empty already
if (numBlocks)
{
size_t i;
p->numBlocks = numBlocks;
p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
if (!p->blocks)
CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
if (!blocks)
return SZ_ERROR_MEM;
for (i = 0; i < numBlocks; i++)
p->blocks = blocks;
p->numBlocks = numBlocks;
// the caller will call Xz_Free() in case of error
do
{
CXzBlockSizes *block = &p->blocks[i];
READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize)
READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize)
if (block->totalSize == 0)
READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize)
READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize)
if (blocks->totalSize == 0)
return SZ_ERROR_ARCHIVE;
blocks++;
}
while (--numBlocks);
}
while ((pos & 3) != 0)
if (buf[pos++] != 0)
if (size >= 4)
return SZ_ERROR_ARCHIVE;
return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE;
while (size)
if (buf[--size])
return SZ_ERROR_ARCHIVE;
return SZ_OK;
}
/*
static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc)
{
SRes res;
size_t size;
Byte *buf;
if (indexSize > ((UInt32)1 << 31))
return SZ_ERROR_UNSUPPORTED;
if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
size = (size_t)indexSize;
if (size != indexSize)
return SZ_ERROR_UNSUPPORTED;
buf = (Byte *)ISzAlloc_Alloc(alloc, size);
if (!buf)
return SZ_ERROR_MEM;
res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
if (res == SZ_OK)
res = Xz_ReadIndex2(p, buf, size, alloc);
res = Xz_ParseIndex(p, buf, size, alloc);
ISzAlloc_Free(alloc, buf);
return res;
}
*/
static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size)
{
@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset,
/* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */
}
/*
in:
(*startOffset) is the position in (stream) where the xz_stream must be finished.
out:
if it returns SZ_OK, then (*startOffset) is the position in the stream at which the xz_stream starts.
*/
static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc)
{
UInt64 indexSize;
Byte buf[XZ_STREAM_FOOTER_SIZE];
#define TEMP_BUF_SIZE (1 << 10)
UInt32 buf32[TEMP_BUF_SIZE / 4];
UInt64 pos = (UInt64)*startOffset;
if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE)
if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE)
return SZ_ERROR_NO_ARCHIVE;
pos -= XZ_STREAM_FOOTER_SIZE;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE))
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_SIG_CHECK(buf + 10))
if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
{
UInt32 total = 0;
pos += XZ_STREAM_FOOTER_SIZE;
for (;;)
{
size_t i;
#define TEMP_BUF_SIZE (1 << 10)
Byte temp[TEMP_BUF_SIZE];
i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos;
// pos != 0
// (pos & 3) == 0
size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos;
pos -= i;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i))
total += (UInt32)i;
for (; i != 0; i--)
if (temp[i - 1] != 0)
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i))
i /= 4;
do
if (buf32[i - 1] != 0)
break;
if (i != 0)
{
if ((i & 3) != 0)
return SZ_ERROR_NO_ARCHIVE;
pos += i;
break;
}
if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16))
return SZ_ERROR_NO_ARCHIVE;
}
while (--i);
pos += i * 4;
#define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16)
// here we don't support the rare case of big padding after the xz stream,
// so we have a padding limit for backward reading.
if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX)
return SZ_ERROR_NO_ARCHIVE;
if (i)
break;
}
// we try to open xz stream after skipping zero padding.
// ((UInt64)*startOffset == pos) is possible here!
if (pos < XZ_STREAM_FOOTER_SIZE)
return SZ_ERROR_NO_ARCHIVE;
pos -= XZ_STREAM_FOOTER_SIZE;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_SIG_CHECK(buf + 10))
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
return SZ_ERROR_NO_ARCHIVE;
}
p->flags = (CXzStreamFlags)GetBe16(buf + 8);
p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2);
if (!XzFlags_IsSupported(p->flags))
return SZ_ERROR_UNSUPPORTED;
{
/* to eliminate GCC 6.3 warning:
dereferencing type-punned pointer will break strict-aliasing rules */
const Byte *buf_ptr = buf;
if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6))
const UInt32 *buf_ptr = buf32;
if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6))
return SZ_ERROR_ARCHIVE;
}
indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2;
{
const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2;
if (pos < indexSize)
return SZ_ERROR_ARCHIVE;
pos -= indexSize;
// v25.00: relaxed indexSize check. We allow big index table.
// if (indexSize > ((UInt32)1 << 31))
if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
RINOK(LookInStream_SeekTo(stream, pos))
RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
// RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
{
UInt64 totalSize = Xz_GetPackSize(p);
if (totalSize == XZ_SIZE_OVERFLOW
|| totalSize >= ((UInt64)1 << 63)
|| pos < totalSize + XZ_STREAM_HEADER_SIZE)
SRes res;
const size_t size = (size_t)indexSize;
// if (size != indexSize) return SZ_ERROR_UNSUPPORTED;
Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size);
if (!buf)
return SZ_ERROR_MEM;
res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
if (res == SZ_OK)
res = Xz_ParseIndex(p, buf, size, alloc);
ISzAlloc_Free(alloc, buf);
RINOK(res)
}
}
{
UInt64 total = Xz_GetPackSize(p);
if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63))
return SZ_ERROR_ARCHIVE;
pos -= (totalSize + XZ_STREAM_HEADER_SIZE);
total += XZ_STREAM_HEADER_SIZE;
if (pos < total)
return SZ_ERROR_ARCHIVE;
pos -= total;
RINOK(LookInStream_SeekTo(stream, pos))
*startOffset = (Int64)pos;
}
@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
CSecToRead secToRead;
SecToRead_CreateVTable(&secToRead);
secToRead.realStream = stream;
RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt))
return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE;
}
@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
void Xzs_Construct(CXzs *p)
{
p->num = p->numAllocated = 0;
p->streams = 0;
Xzs_CONSTRUCT(p)
}
void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
Xz_Free(&p->streams[i], alloc);
ISzAlloc_Free(alloc, p->streams);
p->num = p->numAllocated = 0;
p->streams = 0;
p->streams = NULL;
}
UInt64 Xzs_GetNumBlocks(const CXzs *p)
@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p)
SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc)
{
Int64 endOffset = 0;
// it's supposed that the CXzs object is empty here.
// if the CXzs object is not empty, new streams will be added to that non-empty object.
// Xzs_Free(p, alloc); // optional call to empty the CXzs object.
RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END))
*startOffset = endOffset;
for (;;)
{
CXzStream st;
SRes res;
Xz_Construct(&st);
Xz_CONSTRUCT(&st)
res = Xz_ReadBackward(&st, stream, startOffset, alloc);
// if (res == SZ_OK), then (*startOffset) is the start offset of the new stream.
// if (res != SZ_OK), then (*startOffset) is unchanged, or it's the expected start offset of the stream with the error.
st.startOffset = (UInt64)*startOffset;
RINOK(res)
// we must store the (st) object into the array, or we must free the local (st) object.
if (res != SZ_OK)
{
Xz_Free(&st, alloc);
return res;
}
if (p->num == p->numAllocated)
{
const size_t newNum = p->num + p->num / 4 + 1;
void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream));
if (!data)
{
Xz_Free(&st, alloc);
return SZ_ERROR_MEM;
}
p->numAllocated = newNum;
if (p->num != 0)
memcpy(data, p->streams, p->num * sizeof(CXzStream));
ISzAlloc_Free(alloc, p->streams);
p->streams = (CXzStream *)data;
}
// we copy raw data directly from the local variable (st) to the object in the array,
// so we don't need to call Xz_Free(&st, alloc) after copying and after p->num++
p->streams[p->num++] = st;
if (*startOffset == 0)
break;
RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
return SZ_OK;
// seek operation is optional:
// RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK)
return SZ_ERROR_PROGRESS;
}
return SZ_OK;
}
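/* Usage sketch (added for illustration, not part of the upstream source):
   scanning all concatenated xz streams of a seekable source from its end.
   (lookStream) stands for any initialized ILookInStream (for example a
   CLookToRead2 over an open file) and (alloc) for an ISzAllocPtr. */
/*
  CXzs xzs;
  Int64 startOffset = 0;  // receives the start offset of the first stream
  Xzs_Construct(&xzs);
  if (Xzs_ReadBackward(&xzs, lookStream, &startOffset, NULL, alloc) == SZ_OK)
  {
    const UInt64 numBlocks = Xzs_GetNumBlocks(&xzs);
    // ... use the parsed index information (sizes, number of blocks) ...
  }
  Xzs_Free(&xzs, alloc);
*/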