diff --git a/3rdparty/lzma/include/7zVersion.h b/3rdparty/lzma/include/7zVersion.h index 1ddef80b08..72733f7fae 100644 --- a/3rdparty/lzma/include/7zVersion.h +++ b/3rdparty/lzma/include/7zVersion.h @@ -1,7 +1,7 @@ -#define MY_VER_MAJOR 24 -#define MY_VER_MINOR 8 +#define MY_VER_MAJOR 25 +#define MY_VER_MINOR 0 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "24.08" +#define MY_VERSION_NUMBERS "25.00" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,12 +10,12 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2024-08-11" +#define MY_DATE "2025-07-05" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" -#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov" +#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov" #ifdef USE_COPYRIGHT_CR #define MY_COPYRIGHT MY_COPYRIGHT_CR diff --git a/3rdparty/lzma/include/Compiler.h b/3rdparty/lzma/include/Compiler.h index 2a9c2b7a08..b266b277bd 100644 --- a/3rdparty/lzma/include/Compiler.h +++ b/3rdparty/lzma/include/Compiler.h @@ -1,5 +1,5 @@ /* Compiler.h : Compiler specific defines and pragmas -2024-01-22 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_COMPILER_H #define ZIP7_INC_COMPILER_H @@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void); #define Z7_ATTRIB_NO_VECTORIZE #endif +#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920) + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )") + #define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )") +#else + #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE + #define Z7_PRAGMA_OPTIMIZE_DEFAULT +#endif + + + #if defined(MY_CPU_X86_OR_AMD64) && ( \ defined(__clang__) && (__clang_major__ >= 4) \ || defined(__GNUC__) && (__GNUC__ >= 5)) diff --git a/3rdparty/lzma/include/CpuArch.h b/3rdparty/lzma/include/CpuArch.h index 683cfaa862..1690a5b616 100644 --- a/3rdparty/lzma/include/CpuArch.h +++ b/3rdparty/lzma/include/CpuArch.h @@ -1,5 +1,5 @@ /* CpuArch.h -- CPU specific code -2024-06-17 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #ifndef ZIP7_INC_CPU_ARCH_H #define ZIP7_INC_CPU_ARCH_H @@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. 
#define MY_CPU_SIZEOF_POINTER 4 #endif +#if defined(__SSE2__) \ + || defined(MY_CPU_AMD64) \ + || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) +#define MY_CPU_SSE2 +#endif + #if defined(_M_ARM64) \ || defined(_M_ARM64EC) \ @@ -509,11 +515,19 @@ problem-4 : performace: #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) +#if 0 +// Z7_BSWAP16 can be slow for x86-msvc +#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p))) +#else +#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16) +#endif + #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) #define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } #if defined(MY_CPU_LE_UNALIGN_64) #define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) +#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); } #endif #else @@ -536,21 +550,39 @@ problem-4 : performace: #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) #endif +#ifndef SetBe64 +#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \ + _ppp_[0] = (Byte)(_vvv_ >> 56); \ + _ppp_[1] = (Byte)(_vvv_ >> 48); \ + _ppp_[2] = (Byte)(_vvv_ >> 40); \ + _ppp_[3] = (Byte)(_vvv_ >> 32); \ + _ppp_[4] = (Byte)(_vvv_ >> 24); \ + _ppp_[5] = (Byte)(_vvv_ >> 16); \ + _ppp_[6] = (Byte)(_vvv_ >> 8); \ + _ppp_[7] = (Byte)_vvv_; } +#endif + #ifndef GetBe16 +#ifdef GetBe16_to32 +#define GetBe16(p) ( (UInt16) GetBe16_to32(p)) +#else #define GetBe16(p) ( (UInt16) ( \ ((UInt16)((const Byte *)(p))[0] << 8) | \ ((const Byte *)(p))[1] )) #endif +#endif #if defined(MY_CPU_BE) #define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) #define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) #define Z7_CONV_NATIVE_TO_BE_32(v) (v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8)) #elif defined(MY_CPU_LE) #define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) #define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) #define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) +// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8)) #else #error Stop_Compiling_Unknown_Endian_CONV #endif @@ -589,6 +621,11 @@ problem-4 : performace: #endif +#ifndef GetBe16_to32 +#define GetBe16_to32(p) GetBe16(p) +#endif + + #if defined(MY_CPU_X86_OR_AMD64) \ || defined(MY_CPU_ARM_OR_ARM64) \ || defined(MY_CPU_PPC_OR_PPC64) @@ -617,6 +654,7 @@ BoolInt CPU_IsSupported_SSE2(void); BoolInt CPU_IsSupported_SSSE3(void); BoolInt CPU_IsSupported_SSE41(void); BoolInt CPU_IsSupported_SHA(void); +BoolInt CPU_IsSupported_SHA512(void); BoolInt CPU_IsSupported_PageGB(void); #elif defined(MY_CPU_ARM_OR_ARM64) @@ -634,6 +672,7 @@ BoolInt CPU_IsSupported_SHA1(void); BoolInt CPU_IsSupported_SHA2(void); BoolInt CPU_IsSupported_AES(void); #endif +BoolInt CPU_IsSupported_SHA512(void); #endif diff --git a/3rdparty/lzma/include/LzFindMt.h b/3rdparty/lzma/include/LzFindMt.h index fcb479da9e..89984f52d1 100644 --- a/3rdparty/lzma/include/LzFindMt.h +++ b/3rdparty/lzma/include/LzFindMt.h @@ -1,5 +1,5 @@ /* LzFindMt.h -- multithreaded Match finder for LZ algorithms -2024-01-22 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_LZ_FIND_MT_H #define ZIP7_INC_LZ_FIND_MT_H @@ -12,8 +12,10 @@ EXTERN_C_BEGIN typedef struct { UInt32 numProcessedBlocks; - CThread thread; + Int32 affinityGroup; + UInt64 affinityInGroup; UInt64 affinity; + CThread thread; BoolInt wasCreated; BoolInt needStart; diff --git a/3rdparty/lzma/include/Lzma2Enc.h b/3rdparty/lzma/include/Lzma2Enc.h index 
cb25275c6b..1e6b50c6f4 100644 --- a/3rdparty/lzma/include/Lzma2Enc.h +++ b/3rdparty/lzma/include/Lzma2Enc.h @@ -18,6 +18,7 @@ typedef struct int numBlockThreads_Reduced; int numBlockThreads_Max; int numTotalThreads; + unsigned numThreadGroups; // 0 : no groups } CLzma2EncProps; void Lzma2EncProps_Init(CLzma2EncProps *p); diff --git a/3rdparty/lzma/include/LzmaEnc.h b/3rdparty/lzma/include/LzmaEnc.h index 9f8039a103..3feb5b4af0 100644 --- a/3rdparty/lzma/include/LzmaEnc.h +++ b/3rdparty/lzma/include/LzmaEnc.h @@ -1,5 +1,5 @@ /* LzmaEnc.h -- LZMA Encoder -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_LZMA_ENC_H #define ZIP7_INC_LZMA_ENC_H @@ -29,11 +29,13 @@ typedef struct int numThreads; /* 1 or 2, default = 2 */ // int _pad; + Int32 affinityGroup; UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. Encoder uses this value to reduce dictionary size */ UInt64 affinity; + UInt64 affinityInGroup; } CLzmaEncProps; void LzmaEncProps_Init(CLzmaEncProps *p); diff --git a/3rdparty/lzma/include/MtCoder.h b/3rdparty/lzma/include/MtCoder.h index 1231d3c2a5..8166ccac2e 100644 --- a/3rdparty/lzma/include/MtCoder.h +++ b/3rdparty/lzma/include/MtCoder.h @@ -1,5 +1,5 @@ /* MtCoder.h -- Multi-thread Coder -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_MT_CODER_H #define ZIP7_INC_MT_CODER_H @@ -16,7 +16,7 @@ EXTERN_C_BEGIN #ifndef Z7_ST #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) - #define MTCODER_THREADS_MAX 64 + #define MTCODER_THREADS_MAX 256 #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) #else #define MTCODER_THREADS_MAX 1 @@ -77,6 +77,7 @@ typedef struct CMtCoder_ size_t blockSize; /* size of input block */ unsigned numThreadsMax; + unsigned numThreadGroups; UInt64 expectedDataSize; ISeqInStreamPtr inStream; @@ -125,6 +126,8 @@ typedef struct CMtCoder_ CMtProgress mtProgress; CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; CMtCoderThread threads[MTCODER_THREADS_MAX]; + + CThreadNextGroup nextGroup; } CMtCoder; diff --git a/3rdparty/lzma/include/Sha256.h b/3rdparty/lzma/include/Sha256.h index 9e0422320c..75329cdf02 100644 --- a/3rdparty/lzma/include/Sha256.h +++ b/3rdparty/lzma/include/Sha256.h @@ -1,5 +1,5 @@ /* Sha256.h -- SHA-256 Hash -2023-04-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_SHA256_H #define ZIP7_INC_SHA256_H @@ -14,6 +14,9 @@ EXTERN_C_BEGIN #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) + + + typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); /* @@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt typedef struct { - SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; - UInt64 count; - UInt64 _pad_2[2]; + union + { + struct + { + SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; + UInt64 count; + } vars; + UInt64 _pad_64bit[4]; + void *_pad_align_ptr[2]; + } v; UInt32 state[SHA256_NUM_DIGEST_WORDS]; Byte buffer[SHA256_BLOCK_SIZE]; diff --git a/3rdparty/lzma/include/Sort.h b/3rdparty/lzma/include/Sort.h index 1817b652f5..de5a4e86cf 100644 --- a/3rdparty/lzma/include/Sort.h +++ b/3rdparty/lzma/include/Sort.h @@ -1,5 +1,5 @@ /* Sort.h -- Sort functions -2023-03-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_SORT_H #define 
ZIP7_INC_SORT_H @@ -8,10 +8,7 @@ EXTERN_C_BEGIN -void HeapSort(UInt32 *p, size_t size); -void HeapSort64(UInt64 *p, size_t size); - -/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */ +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size); EXTERN_C_END diff --git a/3rdparty/lzma/include/Threads.h b/3rdparty/lzma/include/Threads.h index c1484a2773..be12e6e7fa 100644 --- a/3rdparty/lzma/include/Threads.h +++ b/3rdparty/lzma/include/Threads.h @@ -1,5 +1,5 @@ /* Threads.h -- multithreading library -2024-03-28 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_THREADS_H #define ZIP7_INC_THREADS_H @@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param WRes Thread_Wait_Close(CThread *p); #ifdef _WIN32 +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask); #define Thread_Create_With_CpuSet(p, func, param, cs) \ Thread_Create_With_Affinity(p, func, param, *cs) #else WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); #endif +typedef struct +{ + unsigned NumGroups; + unsigned NextGroup; +} CThreadNextGroup; + +void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup); +unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p); + #ifdef _WIN32 diff --git a/3rdparty/lzma/include/Xz.h b/3rdparty/lzma/include/Xz.h index 42bc685341..ad63b48c71 100644 --- a/3rdparty/lzma/include/Xz.h +++ b/3rdparty/lzma/include/Xz.h @@ -1,5 +1,5 @@ /* Xz.h - Xz interface -2024-01-26 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #ifndef ZIP7_INC_XZ_H #define ZIP7_INC_XZ_H @@ -121,6 +121,7 @@ typedef struct UInt64 startOffset; } CXzStream; +#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; } void Xz_Construct(CXzStream *p); void Xz_Free(CXzStream *p, ISzAllocPtr alloc); @@ -136,8 +137,13 @@ typedef struct CXzStream *streams; } CXzs; +#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; } void Xzs_Construct(CXzs *p); void Xzs_Free(CXzs *p, ISzAllocPtr alloc); +/* +Xzs_ReadBackward() must be called for empty CXzs object. +Xzs_ReadBackward() can return non empty object with (p->num != 0) even in case of error. 
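The hunks above (LzFindMt.h, Lzma2Enc.h, LzmaEnc.h, MtCoder.h, Threads.h) thread Windows processor-group support through the encoders: MTCODER_THREADS_MAX rises from 64 to 256, the props structures gain numThreadGroups / affinityGroup / affinityInGroup fields, and Thread_Create_With_Group plus the CThreadNextGroup round-robin helper let worker threads be spread across groups. A minimal caller-side sketch of the new knob, assuming only the declarations visible in this patch (the helper name configure_lzma2_for_groups and the concrete values are ours, not part of the patch):

#include "Lzma2Enc.h"

/* Illustrative only: ask the multithreaded LZMA2 encoder to spread its block
   threads over two Windows processor groups. numThreadGroups == 0 keeps the
   old single-group behaviour ("0 : no groups" in Lzma2Enc.h above). */
static void configure_lzma2_for_groups(CLzma2EncProps *props)
{
  Lzma2EncProps_Init(props);
  props->numTotalThreads  = 96;  /* more threads than the old 64-thread MtCoder cap */
  props->numThreadGroups  = 2;   /* new block threads are handed out round-robin */
  /* Per LzmaEnc.h above, the match-finder threads of each block encoder can
     additionally be pinned (hypothetical values):
       props->lzmaProps.affinityGroup   = 1;     // -1 leaves the group unset
       props->lzmaProps.affinityInGroup = 0xFF;  // affinity mask inside that group
  */
}

Internally this reaches MtCoder_Code(), which (as shown later in this patch) initializes a CThreadNextGroup from numThreadGroups and passes ThreadNextGroup_GetNext() to Thread_Create_With_Group() for every worker it starts.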
+*/ SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); UInt64 Xzs_GetNumBlocks(const CXzs *p); @@ -268,8 +274,8 @@ typedef struct size_t outBufSize; size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked - Byte shaDigest[SHA256_DIGEST_SIZE]; - Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; + UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4]; + Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned for 4-bytes } CXzUnpacker; /* alloc : aligned for cache line allocation is better */ diff --git a/3rdparty/lzma/include/XzEnc.h b/3rdparty/lzma/include/XzEnc.h index 77b78c014b..ac6bbf7996 100644 --- a/3rdparty/lzma/include/XzEnc.h +++ b/3rdparty/lzma/include/XzEnc.h @@ -1,5 +1,5 @@ /* XzEnc.h -- Xz Encode -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_XZ_ENC_H #define ZIP7_INC_XZ_ENC_H @@ -31,6 +31,7 @@ typedef struct CLzma2EncProps lzma2Props; CXzFilterProps filterProps; unsigned checkId; + unsigned numThreadGroups; // 0 : no groups UInt64 blockSize; int numBlockThreads_Reduced; int numBlockThreads_Max; diff --git a/3rdparty/lzma/src/7zDec.c b/3rdparty/lzma/src/7zDec.c index c9b4064e3f..520cbfd833 100644 --- a/3rdparty/lzma/src/7zDec.c +++ b/3rdparty/lzma/src/7zDec.c @@ -1,5 +1,5 @@ /* 7zDec.c -- Decoding from 7z folder -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -312,8 +312,9 @@ static BoolInt IS_MAIN_METHOD(UInt32 m) case k_PPMD: #endif return True; + default: + return False; } - return False; } static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) diff --git a/3rdparty/lzma/src/AesOpt.c b/3rdparty/lzma/src/AesOpt.c index 58769ea059..b281807390 100644 --- a/3rdparty/lzma/src/AesOpt.c +++ b/3rdparty/lzma/src/AesOpt.c @@ -1,5 +1,5 @@ /* AesOpt.c -- AES optimized code for x86 AES hardware instructions -2024-03-01 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -80,19 +80,39 @@ AES_FUNC_START (name) #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) +#if 1 +// use aligned SSE load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// So we can use this branch of code. +// and compiler can use fused load-op SSE instructions: +// xorps xmm0, XMMWORD PTR [rdx] +#define LOAD_128(pp) (*(__m128i *)(void *)(pp)) +#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v +// use aligned SSE load/store for data. 
Alternative code with direct access +// #define LOAD_128(pp) _mm_load_si128(pp) +// #define STORE_128(pp, _v) _mm_store_si128(pp, _v) +#else +// use unaligned load/store for data: movdqu XMMWORD PTR [rdx] +#define LOAD_128(pp) _mm_loadu_si128(pp) +#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) +#endif + AES_FUNC_START2 (AesCbc_Encode_HW) { + if (numBlocks == 0) + return; + { __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i m = *p; const __m128i k0 = p[2]; const __m128i k1 = p[3]; const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - for (; numBlocks != 0; numBlocks--, data++) + do { UInt32 r = numRounds2; const __m128i *w = p + 4; - __m128i temp = *data; + __m128i temp = LOAD_128(data); MM_XOR (temp, k0) MM_XOR (m, temp) MM_OP_m (_mm_aesenc_si128, k1) @@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) } while (--r); MM_OP_m (_mm_aesenclast_si128, w[0]) - *data = m; + STORE_128(data, m); + data++; } + while (--numBlocks); *p = m; + } } @@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define WOP(op) op (m0, 0) WOP_M1(op) - #define DECLARE_VAR(reg, ii) __m128i reg; -#define LOAD_data( reg, ii) reg = data[ii]; -#define STORE_data( reg, ii) data[ii] = reg; +#define LOAD_data_ii(ii) LOAD_128(data + (ii)) +#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); +#define STORE_data( reg, ii) STORE_128(data + (ii), reg); #if (NUM_WAYS > 1) -#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) +#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) #endif #define MM_OP_key(op, reg) MM_OP(op, reg, key); @@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; -#define CTR_END( reg, ii) MM_XOR (data[ii], reg) - +#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ + LOAD_128 (data + (ii)))); #define WOP_KEY(op, n) { \ const __m128i key = w[n]; \ - WOP(op); } - + WOP(op) } #define WIDE_LOOP_START \ dataEnd = data + numBlocks; \ if (numBlocks >= NUM_WAYS) \ { dataEnd -= NUM_WAYS; do { \ - #define WIDE_LOOP_END \ data += NUM_WAYS; \ } while (data <= dataEnd); \ dataEnd += NUM_WAYS; } \ - #define SINGLE_LOOP \ for (; data < dataEnd; data++) @@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) #define AVX_DECLARE_VAR(reg, ii) __m256i reg; -#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; -#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; + +#if 1 +// use unaligned AVX load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// But we need 32-bytes reading. +// So we use intrinsics for unaligned AVX load/store. +// notes for _mm256_storeu_si256: +// msvc2022: uses vmovdqu and keeps the order of instruction sequence. +// new gcc11 uses vmovdqu +// old gcc9 could use pair of instructions: +// vmovups %xmm7, -224(%rax) +// vextracti128 $0x1, %ymm7, -208(%rax) +#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); +#else +// use aligned AVX load/store for data. +// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes. 
+// msvc2022 uses vmovdqu still +// gcc uses vmovdqa (that requires 32-bytes alignment) +#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; +#endif + +#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); +#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) /* -AVX_XOR_data_M1() needs unaligned memory load -if (we don't use _mm256_loadu_si256() here) -{ - Most compilers with enabled optimizations generate fused AVX (LOAD + OP) - instruction that can load unaligned data. - But GCC and CLANG without -O2 or -O1 optimizations can generate separated - LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. -} -Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. -v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. +AVX_XOR_data_M1() needs unaligned memory load, even if (data) +is aligned for 256-bits, because we read 32-bytes chunk that +crosses (data) position: from (data - 16bytes) to (data + 16bytes). */ -#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) -// for debug only: the following code will fail on execution, if compiled by some compilers: -// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) +#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) -#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); -#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) +#define AVX_CTR_START(reg, ii) \ + MM_OP (_mm256_add_epi64, ctr2, two) \ + reg = _mm256_xor_si256(ctr2, key); + +#define AVX_CTR_END(reg, ii) \ + AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ + AVX_LOAD ((__m256i *)(void *)data + (ii)))); + #define AVX_WOP_KEY(op, n) { \ const __m256i key = w[n]; \ - WOP(op); } + WOP(op) } #define NUM_AES_KEYS_MAX 15 #define WIDE_LOOP_START_AVX(OP) \ dataEnd = data + numBlocks; \ if (numBlocks >= NUM_WAYS * 2) \ - { __m256i keys[NUM_AES_KEYS_MAX]; \ - UInt32 ii; \ - OP \ - for (ii = 0; ii < numRounds; ii++) \ - keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ - dataEnd -= NUM_WAYS * 2; do { \ - + { __m256i keys[NUM_AES_KEYS_MAX]; \ + OP \ + { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ + keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ + dataEnd -= NUM_WAYS * 2; \ + do { \ #define WIDE_LOOP_END_AVX(OP) \ - data += NUM_WAYS * 2; \ - } while (data <= dataEnd); \ - dataEnd += NUM_WAYS * 2; \ - OP \ - _mm256_zeroupper(); \ + data += NUM_WAYS * 2; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS * 2; \ + OP \ + _mm256_zeroupper(); \ } \ /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, @@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; - const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; + const __m128i * const wStart = p 
+ (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; const __m128i *dataEnd; p += 2; WIDE_LOOP_START { const __m128i *w = wStart; - WOP (DECLARE_VAR) WOP (LOAD_data) WOP_KEY (AES_XOR, 1) - do { WOP_KEY (AES_DEC, 0) + w--; } while (w != p); @@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) MM_XOR (m0, iv) WOP_M1 (XOR_data_M1) - iv = data[NUM_WAYS - 1]; + LOAD_data(iv, NUM_WAYS - 1) WOP (STORE_data) } WIDE_LOOP_END @@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) SINGLE_LOOP { const __m128i *w = wStart - 1; - __m128i m = _mm_xor_si128 (w[2], *data); + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); + do { MM_OP_m (_mm_aesdec_si128, w[1]) @@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW) while (w != p); MM_OP_m (_mm_aesdec_si128, w[1]) MM_OP_m (_mm_aesdeclast_si128, w[0]) - MM_XOR (m, iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i ctr = *p; - UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; + const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; const __m128i *dataEnd; - __m128i one = _mm_cvtsi32_si128(1); + const __m128i one = _mm_cvtsi32_si128(1); p += 2; @@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW) } while (--r); WOP_KEY (AES_ENC_LAST, 0) - WOP (CTR_END) } WIDE_LOOP_END @@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) while (--numRounds2); MM_OP_m (_mm_aesenc_si128, w[0]) MM_OP_m (_mm_aesenclast_si128, w[1]) - MM_XOR (*data, m) + CTR_END (m, 0) } p[-2] = ctr; @@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; const __m128i *dataEnd; - UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; p += 2; WIDE_LOOP_START_AVX(;) @@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) while (w != keys); AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) - AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) + AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) WOP_M1 (AVX_XOR_data_M1) - iv = data[NUM_WAYS * 2 - 1]; + LOAD_data (iv, NUM_WAYS * 2 - 1) WOP (AVX_STORE_data) } WIDE_LOOP_END_AVX(;) SINGLE_LOOP { - const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; - __m128i m = _mm_xor_si128 (w[2], *data); + const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); do { MM_OP_m (_mm_aesdec_si128, w[1]) @@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) MM_OP_m (_mm_aesdeclast_si128, w[0]) MM_XOR (m, iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i ctr = *p; - UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; const __m128i *dataEnd; - __m128i one = _mm_cvtsi32_si128(1); + const __m128i one = _mm_cvtsi32_si128(1); __m256i ctr2, two; p += 2; @@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) while (--numRounds2); MM_OP_m (_mm_aesenc_si128, w[0]) MM_OP_m (_mm_aesenclast_si128, w[1]) - MM_XOR (*data, m) + CTR_END (m, 0) } p[-2] = ctr; @@ -731,9 +768,14 @@ AES_FUNC_START (name) AES_FUNC_START2 (AesCbc_Encode_HW) { - v128 * const p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + if (numBlocks == 0) + return; + { + v128 * const p = (v128 
*)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; v128 m = *p; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1); + const v128 *w = p + (size_t)numRounds2 * 2; const v128 k0 = p[2]; const v128 k1 = p[3]; const v128 k2 = p[4]; @@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW) const v128 k7 = p[9]; const v128 k8 = p[10]; const v128 k9 = p[11]; - const UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const v128 *w = p + ((size_t)numRounds2 * 2); + const v128 k_z4 = w[-2]; + const v128 k_z3 = w[-1]; + const v128 k_z2 = w[0]; const v128 k_z1 = w[1]; const v128 k_z0 = w[2]; - for (; numBlocks != 0; numBlocks--, data++) + // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle, + // because gcc/clang compilers are not good for that optimization. + do { MM_XOR_m (*data) AES_E_MC_m (k0) @@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW) AES_E_MC_m (k3) AES_E_MC_m (k4) AES_E_MC_m (k5) - AES_E_MC_m (k6) - AES_E_MC_m (k7) - AES_E_MC_m (k8) if (numRounds2 >= 6) { - AES_E_MC_m (k9) - AES_E_MC_m (p[12]) + AES_E_MC_m (k6) + AES_E_MC_m (k7) if (numRounds2 != 6) { - AES_E_MC_m (p[13]) - AES_E_MC_m (p[14]) + AES_E_MC_m (k8) + AES_E_MC_m (k9) } } - AES_E_m (k_z1) - MM_XOR_m (k_z0) - *data = m; + AES_E_MC_m (k_z4) + AES_E_MC_m (k_z3) + AES_E_MC_m (k_z2) + AES_E_m (k_z1) + MM_XOR_m (k_z0) + *data++ = m; } + while (--numBlocks); *p = m; + } } @@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW) AES_FUNC_START2 (AesCbc_Decode_HW) { - v128 *p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; v128 iv = *p; - const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; const v128 *dataEnd; p += 2; @@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) WOP_KEY (AES_XOR, 0) MM_XOR (m0, iv) WOP_M1 (XOR_data_M1) - iv = data[NUM_WAYS - 1]; + LOAD_data(iv, NUM_WAYS - 1) WOP (STORE_data) } WIDE_LOOP_END @@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) SINGLE_LOOP { const v128 *w = wStart; - v128 m = *data; + v128 m; LOAD_data(m, 0) AES_D_IMC_m (w[2]) do { @@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) AES_D_m (w[1]) MM_XOR_m (w[0]) MM_XOR_m (iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW) AES_FUNC_START2 (AesCtr_Code_HW) { - v128 *p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; uint64x2_t ctr = vreinterpretq_u64_u8(*p); - const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; const v128 *dataEnd; - uint64x2_t one = vdupq_n_u64(0); - // the bug in clang: // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); #if defined(__clang__) && (__clang_major__ <= 9) #pragma GCC diagnostic ignored "-Wvector-conversion" #endif - one = vsetq_lane_u64(1, one, 0); + const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); p += 2; WIDE_LOOP_START diff --git a/3rdparty/lzma/src/CpuArch.c b/3rdparty/lzma/src/CpuArch.c index e792f39deb..6e02551e2d 100644 --- a/3rdparty/lzma/src/CpuArch.c +++ b/3rdparty/lzma/src/CpuArch.c @@ -1,5 +1,5 @@ /* CpuArch.c -- CPU specific code -2024-07-04 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -17,7 +17,7 @@ /* cpuid instruction supports (subFunction) parameter in ECX, that is used only with 
some specific (function) parameter values. - But we always use only (subFunction==0). + most functions use only (subFunction==0). */ /* __cpuid(): MSVC and GCC/CLANG use same function/macro name @@ -49,43 +49,49 @@ #if defined(MY_CPU_AMD64) && defined(__PIC__) \ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) -#define x86_cpuid_MACRO(p, func) { \ + /* "=&r" selects free register. It can select even rbx, if that register is free. + "=&D" for (RDI) also works, but the code can be larger with "=&D" + "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */ + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "mov %%rbx, %q1" \ ASM_LN "cpuid" \ ASM_LN "xchg %%rbx, %q1" \ - : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } - - /* "=&r" selects free register. It can select even rbx, if that register is free. - "=&D" for (RDI) also works, but the code can be larger with "=&D" - "2"(0) means (subFunction = 0), - 2 is (zero-based) index in the output constraint list "=c" (ECX). */ + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #elif defined(MY_CPU_X86) && defined(__PIC__) \ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) -#define x86_cpuid_MACRO(p, func) { \ +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "mov %%ebx, %k1" \ ASM_LN "cpuid" \ ASM_LN "xchg %%ebx, %k1" \ - : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #else -#define x86_cpuid_MACRO(p, func) { \ +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "cpuid" \ - : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } + : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #endif +#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0) void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) { x86_cpuid_MACRO(p, func) } +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + x86_cpuid_MACRO_2(p, func, subFunc) +} + Z7_NO_INLINE UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) @@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) __asm ret 0 } +static +void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + UNUSED_VAR(p) + UNUSED_VAR(func) + UNUSED_VAR(subFunc) + __asm push ebx + __asm push edi + __asm mov edi, ecx // p + __asm mov eax, edx // func + __asm mov ecx, [esp + 12] // subFunc + __asm cpuid + __asm mov [edi ], eax + __asm mov [edi + 4], ebx + __asm mov [edi + 8], ecx + __asm mov [edi + 12], edx + __asm pop edi + __asm pop ebx + __asm ret 4 +} + #else // MY_CPU_AMD64 #if _MSC_VER >= 1600 #include #define MY_cpuidex __cpuidex + +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + __cpuidex((int *)p, func, subFunc); +} + #else /* __cpuid (func == (0 or 7)) requires subfunction number in ECX. @@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) We still can use __cpuid for low (func) values that don't require ECX, but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). 
So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, - where ECX value is first parameter for FASTCALL / NO_INLINE func, + where ECX value is first parameter for FASTCALL / NO_INLINE func. So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. @@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int } #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) #pragma message("======== MY_cpuidex_HACK WAS USED ========") +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + MY_cpuidex_HACK(subFunc, func, (Int32 *)p); +} #endif // _MSC_VER >= 1600 #if !defined(MY_CPU_AMD64) @@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void) } } + +BoolInt CPU_IsSupported_SHA512(void) +{ + if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here + + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + z7_x86_cpuid_subFunc(d, 7, 0); + if (d[0] < 1) // d[0] - is max supported subleaf value + return False; + z7_x86_cpuid_subFunc(d, 7, 1); + return (BoolInt)(d[0]) & 1; + } +} + /* MSVC: _xgetbv() intrinsic is available since VS2010SP1. MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in @@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void) return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); } +BoolInt CPU_IsSupported_SHA512(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512"); +} + +/* +BoolInt CPU_IsSupported_SHA3(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3"); +} +*/ + #ifdef MY_CPU_ARM64 #define APPLE_CRYPTO_SUPPORT_VAL 1 #else @@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32) MY_HWCAP_CHECK_FUNC (SHA1) MY_HWCAP_CHECK_FUNC (SHA2) MY_HWCAP_CHECK_FUNC (AES) +#ifdef MY_CPU_ARM64 +// supports HWCAP_SHA512 and HWCAP_SHA3 since 2017. +// we define them here, if they are not defined +#ifndef HWCAP_SHA3 +// #define HWCAP_SHA3 (1 << 17) +#endif +#ifndef HWCAP_SHA512 +// #pragma message("=== HWCAP_SHA512 define === ") +#define HWCAP_SHA512 (1 << 21) +#endif +MY_HWCAP_CHECK_FUNC (SHA512) +// MY_HWCAP_CHECK_FUNC (SHA3) +#endif #endif // __APPLE__ #endif // _WIN32 diff --git a/3rdparty/lzma/src/LzFind.c b/3rdparty/lzma/src/LzFind.c index 1ce404648e..6aba919d02 100644 --- a/3rdparty/lzma/src/LzFind.c +++ b/3rdparty/lzma/src/LzFind.c @@ -1,5 +1,5 @@ /* LzFind.c -- Match finder for LZ algorithms -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, const unsigned nbMax = (p->numHashBytes == 2 ? 16 : (p->numHashBytes == 3 ? 
24 : 32)); - if (numBits > nbMax) + if (numBits >= nbMax) numBits = nbMax; if (numBits >= 32) hs = (UInt32)0 - 1; @@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, hs |= (256 << kLzHash_CrcShift_2) - 1; { const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); - if (hs > hs2) + if (hs >= hs2) hs = hs2; } hsCur = hs; if (p->expectedDataSize < historySize) { const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); - if (hsCur > hs2) + if (hsCur >= hs2) hsCur = hs2; } } @@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, if (p->expectedDataSize < historySize) { hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); - if (hsCur > hs) // is it possible? + if (hsCur >= hs) // is it possible? hsCur = hs; } } @@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, return d; { const Byte *pb = cur - delta; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; if (pb[maxLen] == cur[maxLen] && *pb == *cur) { UInt32 len = 0; @@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, break; { ptrdiff_t diff; - curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; + curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; diff = (ptrdiff_t)0 - (ptrdiff_t)delta; if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) { @@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } cmCheck = (UInt32)(pos - _cyclicBufferSize); - if ((UInt32)pos <= _cyclicBufferSize) + if ((UInt32)pos < _cyclicBufferSize) cmCheck = 0; if (cmCheck < curMatch) @@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt { const UInt32 delta = pos - curMatch; { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); const UInt32 pair0 = pair[0]; @@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const UInt32 cmCheck; cmCheck = (UInt32)(pos - _cyclicBufferSize); - if ((UInt32)pos <= _cyclicBufferSize) + if ((UInt32)pos < _cyclicBufferSize) cmCheck = 0; if (// curMatch >= pos || // failure @@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const { const UInt32 delta = pos - curMatch; { - CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); + CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); const Byte *pb = cur - delta; unsigned len = (len0 < len1 ? len0 : len1); if (pb[len] == cur[len]) @@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) UInt32 pos = p->pos; \ UInt32 num2 = num; \ /* (p->pos == p->posLimit) is not allowed here !!! 
*/ \ - { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ + { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \ num -= num2; \ { const UInt32 cycPos = p->cyclicBufferPos; \ son = p->son + cycPos; \ diff --git a/3rdparty/lzma/src/LzFindMt.c b/3rdparty/lzma/src/LzFindMt.c index ac9d59d0fd..25fcc46517 100644 --- a/3rdparty/lzma/src/LzFindMt.c +++ b/3rdparty/lzma/src/LzFindMt.c @@ -1,5 +1,5 @@ /* LzFindMt.c -- multithreaded Match finder for LZ algorithms -2024-01-22 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes; Z7_NO_INLINE static void MtSync_Construct(CMtSync *p) { + p->affinityGroup = -1; + p->affinityInGroup = 0; p->affinity = 0; p->wasCreated = False; p->csWasInitialized = False; @@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void * // return ERROR_TOO_MANY_POSTS; // for debug // return EINVAL; // for debug +#ifdef _WIN32 + if (p->affinityGroup >= 0) + wres = Thread_Create_With_Group(&p->thread, startAddress, obj, + (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup); + else +#endif if (p->affinity != 0) wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); else diff --git a/3rdparty/lzma/src/Lzma2Enc.c b/3rdparty/lzma/src/Lzma2Enc.c index 703e146b57..72aec69533 100644 --- a/3rdparty/lzma/src/Lzma2Enc.c +++ b/3rdparty/lzma/src/Lzma2Enc.c @@ -1,5 +1,5 @@ /* Lzma2Enc.c -- LZMA2 Encoder -2023-04-13 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p) p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; p->numTotalThreads = -1; + p->numThreadGroups = 0; } void Lzma2EncProps_Normalize(CLzma2EncProps *p) @@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p, } p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; + p->mtCoder.numThreadGroups = p->props.numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; { diff --git a/3rdparty/lzma/src/LzmaEnc.c b/3rdparty/lzma/src/LzmaEnc.c index 37b2787db6..84a29a5c25 100644 --- a/3rdparty/lzma/src/LzmaEnc.c +++ b/3rdparty/lzma/src/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2024-01-24: Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p) p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; p->numHashOutBits = 0; p->writeEndMark = 0; + p->affinityGroup = -1; p->affinity = 0; + p->affinityInGroup = 0; } void LzmaEncProps_Normalize(CLzmaEncProps *p) @@ -72,11 +74,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) p->level = level; if (p->dictSize == 0) - p->dictSize = - ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : - ( level <= 6 ? ((UInt32)1 << (level + 19)) : - ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) - ))); + p->dictSize = (unsigned)level <= 4 ? + (UInt32)1 << (level * 2 + 16) : + (unsigned)level <= sizeof(size_t) / 2 + 4 ? + (UInt32)1 << (level + 20) : + (UInt32)1 << (sizeof(size_t) / 2 + 24); if (p->dictSize > p->reduceSize) { @@ -92,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) if (p->lp < 0) p->lp = 0; if (p->pb < 0) p->pb = 2; - if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); - if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); + if (p->algo < 0) p->algo = (unsigned)level < 5 ? 
0 : 1; + if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64; if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); @@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2) p->multiThread = (props.numThreads > 1); p->matchFinderMt.btSync.affinity = p->matchFinderMt.hashSync.affinity = props.affinity; + p->matchFinderMt.btSync.affinityGroup = + p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup; + p->matchFinderMt.btSync.affinityInGroup = + p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup; #endif return SZ_OK; diff --git a/3rdparty/lzma/src/MtCoder.c b/3rdparty/lzma/src/MtCoder.c index 03959b6cad..923b19ac48 100644 --- a/3rdparty/lzma/src/MtCoder.c +++ b/3rdparty/lzma/src/MtCoder.c @@ -1,5 +1,5 @@ /* MtCoder.c -- Multi-thread Coder -2023-09-07 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p) static THREAD_FUNC_DECL ThreadFunc(void *pp); -static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) +static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t +#ifdef _WIN32 + , CMtCoder * const mtc +#endif + ) { WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); + // printf("\n====== MtCoderThread_CreateAndStart : \n"); if (wres == 0) { t->stop = False; if (!Thread_WasCreated(&t->thread)) - wres = Thread_Create(&t->thread, ThreadFunc, t); + { +#ifdef _WIN32 + if (mtc->numThreadGroups) + wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t, + ThreadNextGroup_GetNext(&mtc->nextGroup), // group + 0); // affinityMask + else +#endif + wres = Thread_Create(&t->thread, ThreadFunc, t); + } if (wres == 0) wres = Event_Set(&t->startEvent); } @@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) } +Z7_FORCE_INLINE static void MtCoderThread_Destruct(CMtCoderThread *t) { if (Thread_WasCreated(&t->thread)) @@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t) static SRes ThreadFunc2(CMtCoderThread *t) { - CMtCoder *mtc = t->mtCoder; + CMtCoder * const mtc = t->mtCoder; for (;;) { @@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t) if (mtc->numStartedThreads < mtc->numStartedThreadsLimit && mtc->expectedDataSize != readProcessed) { - res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); + res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads] +#ifdef _WIN32 + , mtc +#endif + ); if (res == SZ_OK) mtc->numStartedThreads++; else @@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) } { - CMtCoderBlock *block = &mtc->blocks[bi]; + CMtCoderBlock * const block = &mtc->blocks[bi]; block->res = res; block->bufIndex = bufIndex; block->finished = finished; @@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) static THREAD_FUNC_DECL ThreadFunc(void *pp) { - CMtCoderThread *t = (CMtCoderThread *)pp; + CMtCoderThread * const t = (CMtCoderThread *)pp; for (;;) { if (Event_Wait(&t->startEvent) != 0) @@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) if (t->stop) return 0; { - SRes res = ThreadFunc2(t); + const SRes res = ThreadFunc2(t); CMtCoder *mtc = t->mtCoder; if (res != SZ_OK) { @@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) #ifndef MTCODER_USE_WRITE_THREAD { - unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); + 
const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); if (numFinished == mtc->numStartedThreads) if (Event_Set(&mtc->finishedEvent) != 0) return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; @@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p) p->blockSize = 0; p->numThreadsMax = 0; + p->numThreadGroups = 0; p->expectedDataSize = (UInt64)(Int64)-1; p->inStream = NULL; @@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p) unsigned i; SRes res = SZ_OK; + // printf("\n====== MtCoder_Code : \n"); + if (numThreads > MTCODER_THREADS_MAX) numThreads = MTCODER_THREADS_MAX; numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); @@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p) p->numStartedThreadsLimit = numThreads; p->numStartedThreads = 0; + ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup // for (i = 0; i < numThreads; i++) { + // here we create new thread for first block. + // And each new thread will create another new thread after block reading + // until numStartedThreadsLimit is reached. CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; - RINOK(MtCoderThread_CreateAndStart(nextThread)) + { + const SRes res2 = MtCoderThread_CreateAndStart(nextThread +#ifdef _WIN32 + , p +#endif + ); + RINOK(res2) + } } RINOK_THREAD(Event_Set(&p->readEvent)) @@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p) RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) { - const CMtCoderBlock *block = &p->blocks[bi]; - unsigned bufIndex = block->bufIndex; - BoolInt finished = block->finished; + const CMtCoderBlock * const block = &p->blocks[bi]; + const unsigned bufIndex = block->bufIndex; + const BoolInt finished = block->finished; if (res == SZ_OK && block->res != SZ_OK) res = block->res; @@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p) } #else { - WRes wres = Event_Wait(&p->finishedEvent); + const WRes wres = Event_Wait(&p->finishedEvent); res = MY_SRes_HRESULT_FROM_WRes(wres); } #endif diff --git a/3rdparty/lzma/src/Sha256.c b/3rdparty/lzma/src/Sha256.c index 14d3be9c6a..ea7ed8e751 100644 --- a/3rdparty/lzma/src/Sha256.c +++ b/3rdparty/lzma/src/Sha256.c @@ -1,18 +1,14 @@ /* Sha256.c -- SHA-256 Hash -2024-03-01 : Igor Pavlov : Public domain +: Igor Pavlov : Public domain This code is based on public domain code from Wei Dai's Crypto++ library. 
*/ #include "Precomp.h" #include -#include "CpuArch.h" -#include "RotateDefs.h" #include "Sha256.h" - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -// #define USE_MY_MM -#endif +#include "RotateDefs.h" +#include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ @@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; - #define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks + #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks #else #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks #endif @@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) return False; #endif - p->func_UpdateBlocks = func; + p->v.vars.func_UpdateBlocks = func; return True; } @@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) void Sha256_InitState(CSha256 *p) { - p->count = 0; + p->v.vars.count = 0; p->state[0] = 0x6a09e667; p->state[1] = 0xbb67ae85; p->state[2] = 0x3c6ef372; @@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p) p->state[7] = 0x5be0cd19; } + + + + + + + void Sha256_Init(CSha256 *p) { - p->func_UpdateBlocks = + p->v.vars.func_UpdateBlocks = #ifdef Z7_COMPILER_SHA256_SUPPORTED g_SHA256_FUNC_UPDATE_BLOCKS; #else @@ -133,10 +136,10 @@ void Sha256_Init(CSha256 *p) Sha256_InitState(p); } -#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) -#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) +#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22)) +#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25)) #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) -#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) +#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10)) #define Ch(x,y,z) (z^(x&(y^z))) #define Maj(x,y,z) ((x&y)|(z&(x|y))) @@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p) #endif -// static -extern MY_ALIGN(64) -const UInt32 SHA256_K_ARRAY[64]; -MY_ALIGN(64) -const UInt32 SHA256_K_ARRAY[64] = { +extern +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -248,27 +249,29 @@ const UInt32 SHA256_K_ARRAY[64] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; -#define K SHA256_K_ARRAY + + +#define K SHA256_K_ARRAY + Z7_NO_INLINE void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) { UInt32 W - #ifdef Z7_SHA256_BIG_W +#ifdef Z7_SHA256_BIG_W [64]; - #else +#else [16]; - #endif - +#endif unsigned j; - UInt32 a,b,c,d,e,f,g,h; - - #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) +#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) UInt32 tmp; - #endif +#endif + if (numBlocks == 0) return; + a = state[0]; b = state[1]; c = state[2]; @@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n g = state[6]; h = state[7]; - while (numBlocks) + do { for (j = 0; j < 16; j += STEP_PRE) @@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n g += state[6]; state[6] = g; h += state[7]; state[7] = h; - data += 64; - numBlocks--; + data += SHA256_BLOCK_SIZE; } - - /* Wipe variables */ - /* 
memset(W, 0, sizeof(W)); */ + while (--numBlocks); } -#undef S0 -#undef S1 -#undef s0 -#undef s1 -#undef K #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) @@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) { if (size == 0) return; - { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned num; - - p->count += size; - - num = 64 - pos; + const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); + const unsigned num = SHA256_BLOCK_SIZE - pos; + p->v.vars.count += size; if (num > size) { memcpy(p->buffer + pos, data, size); return; } - if (pos != 0) { size -= num; @@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) } } { - size_t numBlocks = size >> 6; + const size_t numBlocks = size >> 6; + // if (numBlocks) SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); - size &= 0x3F; + size &= SHA256_BLOCK_SIZE - 1; if (size == 0) return; data += (numBlocks << 6); @@ -408,82 +399,69 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) void Sha256_Final(CSha256 *p, Byte *digest) { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned i; - + unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); p->buffer[pos++] = 0x80; - - if (pos > (64 - 8)) + if (pos > (SHA256_BLOCK_SIZE - 4 * 2)) { - while (pos != 64) { p->buffer[pos++] = 0; } - // memset(&p->buf.buffer[pos], 0, 64 - pos); + while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos); Sha256_UpdateBlock(p); pos = 0; } - - /* - if (pos & 3) + memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos); { - p->buffer[pos] = 0; - p->buffer[pos + 1] = 0; - p->buffer[pos + 2] = 0; - pos += 3; - pos &= ~3; + const UInt64 numBits = p->v.vars.count << 3; + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) } - { - for (; pos < 64 - 8; pos += 4) - *(UInt32 *)(&p->buffer[pos]) = 0; - } - */ - - memset(&p->buffer[pos], 0, (64 - 8) - pos); - - { - UInt64 numBits = (p->count << 3); - SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) - SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) - } - Sha256_UpdateBlock(p); - - for (i = 0; i < 8; i += 2) +#if 1 && defined(MY_CPU_BE) + memcpy(digest, p->state, SHA256_DIGEST_SIZE); +#else { - UInt32 v0 = p->state[i]; - UInt32 v1 = p->state[(size_t)i + 1]; - SetBe32(digest , v0) - SetBe32(digest + 4, v1) - digest += 8; + unsigned i; + for (i = 0; i < 8; i += 2) + { + const UInt32 v0 = p->state[i]; + const UInt32 v1 = p->state[(size_t)i + 1]; + SetBe32(digest , v0) + SetBe32(digest + 4, v1) + digest += 4 * 2; + } } - + + + + +#endif Sha256_InitState(p); } void Sha256Prepare(void) { - #ifdef Z7_COMPILER_SHA256_SUPPORTED +#ifdef Z7_COMPILER_SHA256_SUPPORTED SHA256_FUNC_UPDATE_BLOCKS f, f_hw; f = Sha256_UpdateBlocks; f_hw = NULL; - #ifdef MY_CPU_X86_OR_AMD64 - #ifndef USE_MY_MM +#ifdef MY_CPU_X86_OR_AMD64 if (CPU_IsSupported_SHA() && CPU_IsSupported_SSSE3() - // && CPU_IsSupported_SSE41() ) - #endif - #else +#else if (CPU_IsSupported_SHA2()) - #endif +#endif { // printf("\n========== HW SHA256 ======== \n"); f = f_hw = Sha256_UpdateBlocks_HW; } g_SHA256_FUNC_UPDATE_BLOCKS = f; g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; - #endif +#endif } +#undef U64C +#undef K #undef S0 #undef S1 #undef s0 diff --git a/3rdparty/lzma/src/Sha256Opt.c b/3rdparty/lzma/src/Sha256Opt.c index eb38166646..1c6b50f8d3 100644 --- a/3rdparty/lzma/src/Sha256Opt.c +++ 
b/3rdparty/lzma/src/Sha256Opt.c @@ -1,18 +1,11 @@ /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Compiler.h" #include "CpuArch.h" -#if defined(_MSC_VER) -#if (_MSC_VER < 1900) && (_MSC_VER >= 1200) -// #define USE_MY_MM -#endif -#endif - // #define Z7_USE_HW_SHA_STUB // for debug - #ifdef MY_CPU_X86_OR_AMD64 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check #define USE_HW_SHA @@ -20,19 +13,14 @@ || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) #define USE_HW_SHA - #if !defined(_INTEL_COMPILER) + #if !defined(__INTEL_COMPILER) // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) #if !defined(__SHA__) || !defined(__SSSE3__) #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) #endif #endif #elif defined(_MSC_VER) - #ifdef USE_MY_MM - #define USE_VER_MIN 1300 - #else - #define USE_VER_MIN 1900 - #endif - #if (_MSC_VER >= USE_VER_MIN) + #if (_MSC_VER >= 1900) #define USE_HW_SHA #else #define Z7_USE_HW_SHA_STUB @@ -47,23 +35,20 @@ // #pragma message("Sha256 HW") + + + // sse/sse2/ssse3: #include // sha*: #include #if defined (__clang__) && defined(_MSC_VER) - // #if !defined(__SSSE3__) - // #endif #if !defined(__SHA__) #include #endif #else -#ifdef USE_MY_MM -#include "My_mm.h" -#endif - #endif /* @@ -91,60 +76,44 @@ SHA: extern MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; - #define K SHA256_K_ARRAY #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); -#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); - +#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); #define LOAD_SHUFFLE(m, k) \ m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ m = _mm_shuffle_epi8(m, mask); \ -#define SM1(g0, g1, g2, g3) \ - SHA256_MSG1(g3, g0); \ +#define NNN(m0, m1, m2, m3) -#define SM2(g0, g1, g2, g3) \ - tmp = _mm_alignr_epi8(g1, g0, 4); \ - ADD_EPI32(g2, tmp) \ - SHA25G_MSG2(g2, g1); \ - -// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) -// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1) - - -#define NNN(g0, g1, g2, g3) +#define SM1(m1, m2, m3, m0) \ + SHA256_MSG1(m0, m1); \ +#define SM2(m2, m3, m0, m1) \ + ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \ + SHA256_MSG2(m0, m3); \ #define RND2(t0, t1) \ t0 = _mm_sha256rnds2_epu32(t0, t1, msg); -#define RND2_0(m, k) \ - msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \ + + +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \ RND2(state0, state1); \ msg = _mm_shuffle_epi32(msg, 0x0E); \ - - -#define RND2_1 \ + OP0(m0, m1, m2, m3) \ RND2(state1, state0); \ - - -// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 - -#define R4(k, g0, g1, g2, g3, OP0, OP1) \ - RND2_0(g0, k) \ - OP0(g0, g1, g2, g3) \ - RND2_1 \ - OP1(g0, g1, g2, g3) \ + OP1(m0, m1, m2, m3) \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ - R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ - R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ - R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ - R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ + R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3,m0,m1,m2, OP6, 
OP7 ) \ #define PREPARE_STATE \ tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ @@ -161,8 +130,9 @@ ATTRIB_SHA void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) { const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); - __m128i tmp; - __m128i state0, state1; + + + __m128i tmp, state0, state1; if (numBlocks == 0) return; @@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #define _ARM_USE_NEW_NEON_INTRINSICS #endif - - - - #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else - - - - - - - - #if defined(__clang__) && __clang_major__ < 16 #if !defined(__ARM_FEATURE_SHA2) && \ !defined(__ARM_FEATURE_CRYPTO) @@ -324,41 +282,70 @@ typedef uint32x4_t v128; // typedef __n128 v128; // MSVC #ifdef MY_CPU_BE - #define MY_rev32_for_LE(x) + #define MY_rev32_for_LE(x) x #else - #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) + #define MY_rev32_for_LE(x) vrev32q_u8(x) #endif -#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) -#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) +#if 1 // 0 for debug +// for arm32: it works slower by some reason than direct code +/* +for arm32 it generates: +MSVC-2022, GCC-9: + vld1.32 {d18,d19}, [r10] + vst1.32 {d4,d5}, [r3] + vld1.8 {d20-d21}, [r4] +there is no align hint (like [r10:128]). So instruction allows unaligned access +*/ +#define LOAD_128_32(_p) vld1q_u32(_p) +#define LOAD_128_8(_p) vld1q_u8 (_p) +#define STORE_128_32(_p, _v) vst1q_u32(_p, _v) +#else +/* +for arm32: +MSVC-2022: + vldm r10,{d18,d19} + vstm r3,{d4,d5} + does it require strict alignment? +GCC-9: + vld1.64 {d30-d31}, [r0:64] + vldr d28, [r0, #16] + vldr d29, [r0, #24] + vst1.64 {d30-d31}, [r0:64] + vstr d28, [r0, #16] + vstr d29, [r0, #24] +there is hint [r0:64], so does it requires 64-bit alignment. +*/ +#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p)) +#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p)) +#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v) +#endif #define LOAD_SHUFFLE(m, k) \ - m = LOAD_128((data + (k) * 16)); \ - MY_rev32_for_LE(m); \ + m = vreinterpretq_u32_u8( \ + MY_rev32_for_LE( \ + LOAD_128_8(data + (k) * 16))); \ // K array must be aligned for 16-bytes at least. 
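/*
A minimal standalone sketch of what the LOAD_SHUFFLE macro above does on
little-endian targets, assuming <arm_neon.h>; the helper name load_be32x4()
is illustrative only and is not part of the upstream sources.
*/
#include <arm_neon.h>
#include <stdint.h>

/* Load 16 message bytes and convert the four big-endian 32-bit words to
   native lane order, as LOAD_SHUFFLE does via LOAD_128_8 + MY_rev32_for_LE
   + vreinterpretq_u32_u8. */
static inline uint32x4_t load_be32x4(const uint8_t *p)
{
  uint8x16_t b = vld1q_u8(p);      /* unaligned 16-byte load of message data */
  b = vrev32q_u8(b);               /* byte-swap inside each 32-bit lane      */
  return vreinterpretq_u32_u8(b);
}
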
extern MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; - #define K SHA256_K_ARRAY - #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); -#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); +#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); -#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) -#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) -#define NNN(g0, g1, g2, g3) +#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0) +#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1) +#define NNN(m0, m1, m2, m3) - -#define R4(k, g0, g1, g2, g3, OP0, OP1) \ - msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \ +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \ tmp = state0; \ state0 = vsha256hq_u32( state0, state1, msg ); \ state1 = vsha256h2q_u32( state1, tmp, msg ); \ - OP0(g0, g1, g2, g3); \ - OP1(g0, g1, g2, g3); \ + OP0(m0, m1, m2, m3); \ + OP1(m0, m1, m2, m3); \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ @@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ if (numBlocks == 0) return; - state0 = LOAD_128(&state[0]); - state1 = LOAD_128(&state[4]); + state0 = LOAD_128_32(&state[0]); + state1 = LOAD_128_32(&state[4]); do { @@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ } while (--numBlocks); - STORE_128(&state[0], state0); - STORE_128(&state[4], state1); + STORE_128_32(&state[0], state0); + STORE_128_32(&state[4], state1); } #endif // USE_HW_SHA @@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #endif - #undef K #undef RND2 -#undef RND2_0 -#undef RND2_1 - #undef MY_rev32_for_LE + #undef NNN #undef LOAD_128 #undef STORE_128 @@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #undef SM1 #undef SM2 -#undef NNN + #undef R4 #undef R16 #undef PREPARE_STATE diff --git a/3rdparty/lzma/src/Sort.c b/3rdparty/lzma/src/Sort.c index e1097e3806..20e3e69dc1 100644 --- a/3rdparty/lzma/src/Sort.c +++ b/3rdparty/lzma/src/Sort.c @@ -1,141 +1,268 @@ /* Sort.c -- Sort functions -2014-04-05 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Sort.h" +#include "CpuArch.h" -#define HeapSortDown(p, k, size, temp) \ - { for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && p[s + 1] > p[s]) s++; \ - if (temp >= p[s]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } +#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ + ) +// the code with prefetch is slow for small arrays on x86. +// So we disable prefetch for x86. +#ifndef MY_CPU_X86 + // #pragma message("Z7_PREFETCH : __builtin_prefetch") + #define Z7_PREFETCH(a) __builtin_prefetch((a)) +#endif -void HeapSort(UInt32 *p, size_t size) -{ - if (size <= 1) - return; - p--; - { - size_t i = size / 2; - do - { - UInt32 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do - { - size_t k = 1; - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt32 temp = p[size]; - size_t k = (p[3] > p[2]) ? 
3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) - } - { - UInt32 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) - { - p[1] = p[2]; - p[2] = temp; - } - else - p[1] = temp; - } -} +#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) -void HeapSort64(UInt64 *p, size_t size) -{ - if (size <= 1) - return; - p--; - { - size_t i = size / 2; - do - { - UInt64 temp = p[i]; - size_t k = i; - HeapSortDown(p, k, size, temp) - } - while (--i != 0); - } - /* - do - { - size_t k = 1; - UInt64 temp = p[size]; - p[size--] = p[1]; - HeapSortDown(p, k, size, temp) - } - while (size > 1); - */ - while (size > 3) - { - UInt64 temp = p[size]; - size_t k = (p[3] > p[2]) ? 3 : 2; - p[size--] = p[1]; - p[1] = p[k]; - HeapSortDown(p, k, size, temp) - } - { - UInt64 temp = p[size]; - p[size] = p[1]; - if (size > 2 && p[2] < temp) - { - p[1] = p[2]; - p[2] = temp; - } - else - p[1] = temp; - } -} +#include "7zWindows.h" + +// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. +// For example, clang-cl can generate "prefetcht2" instruction for +// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. +// But we want to generate "prefetcht0" instruction. +// So for CLANG/GCC we must use __builtin_prefetch() in code branch above +// instead of PreFetchCacheLine() / _mm_prefetch(). + +// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. +// But old x86 cpus don't support "prefetcht0". +// So we will use PreFetchCacheLine(), only if we are sure that +// generated instruction is supported by all cpus of that isa. +#if defined(MY_CPU_AMD64) \ + || defined(MY_CPU_ARM64) \ + || defined(MY_CPU_IA64) +// we need to use additional braces for (a) in PreFetchCacheLine call, because +// PreFetchCacheLine macro doesn't use braces: +// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) + // #pragma message("Z7_PREFETCH : PreFetchCacheLine") + #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) +#endif + +#endif // _WIN32 + + +#define PREFETCH_NO(p,k,s,size) + +#ifndef Z7_PREFETCH + #define SORT_PREFETCH(p,k,s,size) +#else + +// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes +#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) +// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) + +#if PREFETCH_LEVEL == 0 + + #define SORT_PREFETCH(p,k,s,size) + +#else // PREFETCH_LEVEL != 0 /* -#define HeapSortRefDown(p, vals, n, size, temp) \ - { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ - size_t s = (k << 1); \ - if (s > size) break; \ - if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ - if (val >= vals[p[s]]) break; \ - p[k] = p[s]; k = s; \ - } p[k] = temp; } +if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we prefetch one value per cache line. + Use it if array is aligned for cache line size (64 bytes) + or if array is small (less than L1 cache size). -void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) +if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + we perfetch all cache lines that can be required. + it can be faster for big unaligned arrays. 
+*/ + #define USE_PREFETCH_FOR_ALIGNED_ARRAY + +// s == k * 2 +#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) + // x86 supports (lea r1*8+offset) + #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) +#else + #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) +#endif + +#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) + #define PREFETCH_ADD_OFFSET 0 +#else + // last offset that can be reqiured in PREFETCH_LEVEL step: + #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) + #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 +#endif + +#if PREFETCH_LEVEL <= 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#else // PREFETCH_LEVEL > 3 + +#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - 16)); \ + Z7_PREFETCH((p + s2)); \ + }} +#else /* for unaligned array */ + #define SORT_PREFETCH(p,k,s,size) \ + { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ + if (s2 <= size) { \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ + Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ + Z7_PREFETCH((p + s2)); \ + }} +#endif + +#endif // PREFETCH_LEVEL > 3 +#endif // PREFETCH_LEVEL != 0 +#endif // Z7_PREFETCH + + +#if defined(MY_CPU_ARM64) \ + /* || defined(MY_CPU_AMD64) */ \ + /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ + // we want to use cmov, if cmov is very fast: + // - this cmov version is slower for clang-x64. + // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. + #define Z7_FAST_CMOV_SUPPORTED +#endif + +#ifdef Z7_FAST_CMOV_SUPPORTED + // we want to use cmov here, if cmov is fast: new arm64 cpus. + // we want the compiler to use conditional move for this branch + #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; +#else + // use this branch, if cpu doesn't support fast conditional move. + // it uses slow array access reading: + #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; +#endif + +#define HeapSortDown(p, k, size, temp, macro_prefetch) \ +{ \ + for (;;) { \ + UInt32 n0, n1; \ + size_t s = k * 2; \ + if (s >= size) { \ + if (s == size) { \ + n0 = p[s]; \ + p[k] = n0; \ + if (temp < n0) k = s; \ + } \ + break; \ + } \ + n0 = p[k * 2]; \ + n1 = p[k * 2 + 1]; \ + s += n0 < n1; \ + GET_MAX_VAL(n0, n1, p[s]) \ + if (temp >= n0) break; \ + macro_prefetch(p, k, s, size) \ + p[k] = n0; \ + k = s; \ + } \ + p[k] = temp; \ +} + + +/* +stage-1 : O(n) : + we generate intermediate partially sorted binary tree: + p[0] : it's additional item for better alignment of tree structure in memory. + p[1] + p[2] p[3] + p[4] p[5] p[6] p[7] + ... + p[x] >= p[x * 2] + p[x] >= p[x * 2 + 1] + +stage-2 : O(n)*log2(N): + we move largest item p[0] from head of tree to the end of array + and insert last item to sorted binary tree. 
+*/ + +// (p) must be aligned for cache line size (64-bytes) for best performance + +void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) { - if (size <= 1) + if (size < 2) return; - p--; + if (size == 2) { - size_t i = size / 2; + const UInt32 a0 = p[0]; + const UInt32 a1 = p[1]; + const unsigned k = a1 < a0; + p[k] = a0; + p[k ^ 1] = a1; + return; + } + { + // stage-1 : O(n) + // we transform array to partially sorted binary tree. + size_t i = --size / 2; + // (size) now is the index of the last item in tree, + // if (i) + { + do + { + const UInt32 temp = p[i]; + size_t k = i; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } + while (--i); + } + { + const UInt32 temp = p[0]; + const UInt32 a1 = p[1]; + if (temp < a1) + { + size_t k = 1; + p[0] = a1; + HeapSortDown(p, k, size, temp, PREFETCH_NO) + } + } + } + + if (size < 3) + { + // size == 2 + const UInt32 a0 = p[0]; + p[0] = p[2]; + p[2] = a0; + return; + } + if (size != 3) + { + // stage-2 : O(size) * log2(size): + // we move largest item p[0] from head to the end of array, + // and insert last item to sorted binary tree. do { - UInt32 temp = p[i]; - HeapSortRefDown(p, vals, i, size, temp); + const UInt32 temp = p[size]; + size_t k = p[2] < p[3] ? 3 : 2; + p[size--] = p[0]; + p[0] = p[1]; + p[1] = p[k]; + HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO } - while (--i != 0); + while (size != 3); } - do { - UInt32 temp = p[size]; - p[size--] = p[1]; - HeapSortRefDown(p, vals, 1, size, temp); + const UInt32 a2 = p[2]; + const UInt32 a3 = p[3]; + const size_t k = a2 < a3; + p[2] = p[1]; + p[3] = p[0]; + p[k] = a3; + p[k ^ 1] = a2; } - while (size > 1); } -*/ diff --git a/3rdparty/lzma/src/Threads.c b/3rdparty/lzma/src/Threads.c index 464efeca49..177d1d9343 100644 --- a/3rdparty/lzma/src/Threads.c +++ b/3rdparty/lzma/src/Threads.c @@ -1,5 +1,5 @@ /* Threads.c -- multithreading library -2024-03-28 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p) return (res != 0 ? res : res2); } +typedef struct MY_PROCESSOR_NUMBER { + WORD Group; + BYTE Number; + BYTE Reserved; +} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER; + +typedef struct MY_GROUP_AFFINITY { +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000) + // KAFFINITY is not defined in old mingw + ULONG_PTR +#else + KAFFINITY +#endif + Mask; + WORD Group; + WORD Reserved[3]; +} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY; + +typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)( + HANDLE hThread, + CONST MY_GROUP_AFFINITY *GroupAffinity, + MY_PGROUP_AFFINITY PreviousGroupAffinity); + +typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)( + HANDLE hThread, + MY_PGROUP_AFFINITY GroupAffinity); + +typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)( + HANDLE hProcess, + PUSHORT GroupCount, + PUSHORT GroupArray); + +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + +#if 0 +#include +#define PRF(x) x +/* +-- + before call of SetThreadGroupAffinity() + GetProcessGroupAffinity return one group. + after call of SetThreadGroupAffinity(): + GetProcessGroupAffinity return more than group, + if SetThreadGroupAffinity() was to another group. +-- + GetProcessAffinityMask MS DOCs: + { + If the calling process contains threads in multiple groups, + the function returns zero for both affinity masks. + } + but tests in win10 with 2 groups (less than 64 cores total): + GetProcessAffinityMask() still returns non-zero affinity masks + even after SetThreadGroupAffinity() calls. 
+*/ +static void PrintProcess_Info() +{ + { + const + Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity = + (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetProcessGroupAffinity"); + if (fn_GetProcessGroupAffinity) + { + unsigned i; + USHORT GroupCounts[64]; + USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts); + BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(), + &GroupCount, GroupCounts); + printf("\n====== GetProcessGroupAffinity : " + "boolRes=%u GroupCounts = %u :", + boolRes, (unsigned)GroupCount); + for (i = 0; i < GroupCount; i++) + printf(" %u", GroupCounts[i]); + printf("\n"); + } + } + { + DWORD_PTR processAffinityMask, systemAffinityMask; + if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask)) + { + PRF(printf("\n====== GetProcessAffinityMask : " + ": processAffinityMask=%x, systemAffinityMask=%x\n", + (UInt32)processAffinityMask, (UInt32)systemAffinityMask);) + } + else + printf("\n==GetProcessAffinityMask FAIL"); + } +} +#else +#ifndef USE_THREADS_CreateThread +// #define PRF(x) +#endif +#endif + WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) { /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ @@ -72,7 +166,43 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) unsigned threadId; *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); - + +#if 0 // 1 : for debug + { + DWORD_PTR prevMask; + DWORD_PTR affinity = 1 << 0; + prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity); + prevMask = prevMask; + } +#endif +#if 0 // 1 : for debug + { + /* win10: new thread will be created in same group that is assigned to parent thread + but affinity mask will contain all allowed threads of that group, + even if affinity mask of parent group is not full + win11: what group it will be created, if we have set + affinity of parent thread with ThreadGroupAffinity? + */ + const + Func_GetThreadGroupAffinity fn = + (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "GetThreadGroupAffinity"); + if (fn) + { + // BOOL wres2; + MY_GROUP_AFFINITY groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + /* wres2 = */ fn(*p, &groupAffinity); + PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): " + "wres2_BOOL = %u, group=%u mask=%x\n", + GetCurrentThreadId(), + wres2, + groupAffinity.Group, + (UInt32)groupAffinity.Mask);) + } + } +#endif + #endif /* maybe we must use errno here, but probably GetLastError() is also OK. */ @@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param */ } { - DWORD prevSuspendCount = ResumeThread(h); + const DWORD prevSuspendCount = ResumeThread(h); + /* ResumeThread() returns: + 0 : was_not_suspended + 1 : was_resumed + -1 : error + */ + if (prevSuspendCount == (DWORD)-1) + wres = GetError(); + } + } + + /* maybe we must use errno here, but probably GetLastError() is also OK. 
*/ + return wres; + + #endif +} + + +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask) +{ +#ifdef USE_THREADS_CreateThread + + UNUSED_VAR(group) + UNUSED_VAR(affinityMask) + return Thread_Create(p, func, param); + +#else + + /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ + HANDLE h; + WRes wres; + unsigned threadId; + h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); + *p = h; + wres = HandleToWRes(h); + if (h) + { + // PrintProcess_Info(); + { + const + Func_SetThreadGroupAffinity fn = + (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + "SetThreadGroupAffinity"); + if (fn) + { + // WRes wres2; + MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity; + memset(&groupAffinity, 0, sizeof(groupAffinity)); + // groupAffinity.Mask must use only bits that supported by current group + // (groupAffinity.Mask = 0) means all allowed bits + groupAffinity.Mask = affinityMask; + groupAffinity.Group = (WORD)group; + // wres2 = + fn(h, &groupAffinity, &prev_groupAffinity); + /* + if (groupAffinity.Group == prev_groupAffinity.Group) + wres2 = wres2; + else + wres2 = wres2; + if (wres2 == 0) + { + wres2 = GetError(); + PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);) + } + else + { + PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()" + " threadId = %6u" + " group=%u mask=%x\n", + threadId, + prev_groupAffinity.Group, + (UInt32)prev_groupAffinity.Mask);) + } + */ + } + } + { + const DWORD prevSuspendCount = ResumeThread(h); /* ResumeThread() returns: 0 : was_not_suspended 1 : was_resumed @@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) return Thread_Create_With_CpuSet(p, func, param, NULL); } +/* +WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity) +{ + UNUSED_VAR(group) + return Thread_Create_With_Affinity(p, func, param, affinity); +} +*/ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) { @@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) return AutoResetEvent_CreateNotSignaled(p); } +void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup) +{ + // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup); + if (numGroups == 0) + numGroups = 1; + p->NumGroups = numGroups; + p->NextGroup = startGroup % numGroups; +} + + +UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p) +{ + const UInt32 next = p->NextGroup; + p->NextGroup = (next + 1) % p->NumGroups; + return next; +} + #undef PRF #undef Print diff --git a/3rdparty/lzma/src/XzCrc64Opt.c b/3rdparty/lzma/src/XzCrc64Opt.c index 0c1fc2ffec..6eea4a3b6a 100644 --- a/3rdparty/lzma/src/XzCrc64Opt.c +++ b/3rdparty/lzma/src/XzCrc64Opt.c @@ -1,5 +1,5 @@ /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) -2023-12-08 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) v = Q32BE(1, w1) ^ Q32BE(0, w0); v ^= Q32BE(3, d1) ^ Q32BE(2, d0); #endif -#elif +#else #error Stop_Compiling_Bad_CRC64_NUM_TABLES #endif p += Z7_CRC64_NUM_TABLES_USE; diff --git a/3rdparty/lzma/src/XzDec.c b/3rdparty/lzma/src/XzDec.c index 3d1c98e631..2dac3247f9 100644 --- a/3rdparty/lzma/src/XzDec.c +++ 
b/3rdparty/lzma/src/XzDec.c @@ -1,5 +1,5 @@ /* XzDec.c -- Xz Decode -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) for (i = 0; i < limit;) { - Byte b = p[i]; + const unsigned b = p[i]; *value |= (UInt64)(b & 0x7F) << (7 * i++); if ((b & 0x80) == 0) return (b == 0 && i != 1) ? 0 : i; @@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf) static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) { - return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) - && GetUi32(buf) == CrcCalc(buf + 4, 6) - && flags == GetBe16(buf + 8) - && buf[10] == XZ_FOOTER_SIG_0 - && buf[11] == XZ_FOOTER_SIG_1; + return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2) + && GetUi32a(buf) == CrcCalc(buf + 4, 6) + && flags == GetBe16a(buf + 8) + && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)); } #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ @@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); p->indexPos = p->indexPreSize; p->indexSize += p->indexPreSize; - Sha256_Final(&p->sha, p->shaDigest); + Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32); Sha256_Init(&p->sha); p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); p->state = XZ_STATE_STREAM_INDEX; @@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, break; } { - Byte digest[XZ_CHECK_SIZE_MAX]; + UInt32 digest32[XZ_CHECK_SIZE_MAX / 4]; p->state = XZ_STATE_BLOCK_HEADER; p->pos = 0; - if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0) + if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0) return SZ_ERROR_CRC; if (p->decodeOnlyOneBlock) { @@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - Byte digest[SHA256_DIGEST_SIZE]; + UInt32 digest32[SHA256_DIGEST_SIZE / 4]; p->state = XZ_STATE_STREAM_INDEX_CRC; p->indexSize += 4; p->pos = 0; - Sha256_Final(&p->sha, digest); - if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) + Sha256_Final(&p->sha, (void *)digest32); + if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0) return SZ_ERROR_CRC; } } @@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, const Byte *ptr = p->buf; p->state = XZ_STATE_STREAM_FOOTER; p->pos = 0; - if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr)) + if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr)) return SZ_ERROR_CRC; } break; @@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, { if (*src != 0) { - if (((UInt32)p->padSize & 3) != 0) + if ((unsigned)p->padSize & 3) return SZ_ERROR_NO_ARCHIVE; p->pos = 0; p->state = XZ_STATE_STREAM_HEADER; diff --git a/3rdparty/lzma/src/XzEnc.c b/3rdparty/lzma/src/XzEnc.c index c1affadfa6..e40f0c88eb 100644 --- a/3rdparty/lzma/src/XzEnc.c +++ b/3rdparty/lzma/src/XzEnc.c @@ -1,5 +1,5 @@ /* XzEnc.c -- Xz Encode -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size) } } +Z7_FORCE_INLINE static void SeqInFilter_Construct(CSeqInFilter *p) { p->buf = NULL; @@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p) p->vt.Read = SeqInFilter_Read; } +Z7_FORCE_INLINE static void 
SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) { if (p->StateCoder.p) @@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p) void XzProps_Init(CXzProps *p) { p->checkId = XZ_CHECK_CRC32; + p->numThreadGroups = 0; p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; p->numBlockThreads_Reduced = -1; p->numBlockThreads_Max = -1; @@ -689,6 +692,7 @@ typedef struct } CLzma2WithFilters; +Z7_FORCE_INLINE static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) { p->lzma2 = NULL; @@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz } +Z7_FORCE_INLINE static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) { #ifdef USE_SUBBLOCK @@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in } p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; + p->mtCoder.numThreadGroups = props->numThreadGroups; p->mtCoder.expectedDataSize = p->expectedDataSize; RINOK(MtCoder_Code(&p->mtCoder)) diff --git a/3rdparty/lzma/src/XzIn.c b/3rdparty/lzma/src/XzIn.c index b68af965c1..ba316360d9 100644 --- a/3rdparty/lzma/src/XzIn.c +++ b/3rdparty/lzma/src/XzIn.c @@ -1,38 +1,39 @@ /* XzIn.c - Xz input -2023-09-07 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include #include "7zCrc.h" -#include "CpuArch.h" #include "Xz.h" +#include "CpuArch.h" -/* -#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) -*/ -#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1) - +#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \ + (GetUi16a((const Byte *)(const void *)(p) + 10) == \ + (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8))) SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) { - Byte sig[XZ_STREAM_HEADER_SIZE]; + UInt32 data32[XZ_STREAM_HEADER_SIZE / 4]; size_t processedSize = XZ_STREAM_HEADER_SIZE; - RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize)) + RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize)) if (processedSize != XZ_STREAM_HEADER_SIZE - || memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) + || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0) return SZ_ERROR_NO_ARCHIVE; - return Xz_ParseHeader(p, sig); + return Xz_ParseHeader(p, (const Byte *)(const void *)data32); } -#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ +#define READ_VARINT_AND_CHECK(buf, size, res) \ +{ const unsigned s = Xz_ReadVarInt(buf, size, res); \ if (s == 0) return SZ_ERROR_ARCHIVE; \ - pos += s; } + size -= s; \ + buf += s; \ +} SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) { + MY_ALIGN(4) Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; unsigned headerSize; *headerSizeRes = 0; @@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, return XzBlock_Parse(p, header); } + #define ADD_SIZE_CHECK(size, val) \ - { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } +{ const UInt64 newSize = size + (val); \ + if (newSize < size) return XZ_SIZE_OVERFLOW; \ + size = newSize; \ +} UInt64 Xz_GetUnpackSize(const CXzStream *p) { @@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p) return size; } -/* -SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream) -{ - return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f)); -} -*/ -static SRes Xz_ReadIndex2(CXzStream *p, const Byte 
*buf, size_t size, ISzAllocPtr alloc) +// input; +// CXzStream (p) is empty object. +// size != 0 +// (size & 3) == 0 +// (buf) is aligned for at least 4 bytes. +// output: +// p->numBlocks is number of allocated items in p->blocks +// p->blocks[*] values must be ignored, if function returns error. +static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) { - size_t numBlocks, pos = 1; - UInt32 crc; - + size_t numBlocks; if (size < 5 || buf[0] != 0) return SZ_ERROR_ARCHIVE; - size -= 4; - crc = CrcCalc(buf, size); - if (crc != GetUi32(buf + size)) - return SZ_ERROR_ARCHIVE; - + { + const UInt32 crc = CrcCalc(buf, size); + if (crc != GetUi32a(buf + size)) + return SZ_ERROR_ARCHIVE; + } + buf++; + size--; { UInt64 numBlocks64; - READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64) + READ_VARINT_AND_CHECK(buf, size, &numBlocks64) + // (numBlocks64) is 63-bit value, so we can calculate (numBlocks64 * 2): + if (numBlocks64 * 2 > size) + return SZ_ERROR_ARCHIVE; + if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes)) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE numBlocks = (size_t)numBlocks64; - if (numBlocks != numBlocks64 || numBlocks * 2 > size) - return SZ_ERROR_ARCHIVE; } - - Xz_Free(p, alloc); - if (numBlocks != 0) + // Xz_Free(p, alloc); // it's optional, because (p) is empty already + if (numBlocks) { - size_t i; - p->numBlocks = numBlocks; - p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); - if (!p->blocks) + CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); + if (!blocks) return SZ_ERROR_MEM; - for (i = 0; i < numBlocks; i++) + p->blocks = blocks; + p->numBlocks = numBlocks; + // the caller will call Xz_Free() in case of error + do { - CXzBlockSizes *block = &p->blocks[i]; - READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize) - READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize) - if (block->totalSize == 0) + READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize) + READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize) + if (blocks->totalSize == 0) return SZ_ERROR_ARCHIVE; + blocks++; } + while (--numBlocks); } - while ((pos & 3) != 0) - if (buf[pos++] != 0) + if (size >= 4) + return SZ_ERROR_ARCHIVE; + while (size) + if (buf[--size]) return SZ_ERROR_ARCHIVE; - return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; + return SZ_OK; } + +/* static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) { SRes res; size_t size; Byte *buf; - if (indexSize > ((UInt32)1 << 31)) - return SZ_ERROR_UNSUPPORTED; + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE size = (size_t)indexSize; - if (size != indexSize) - return SZ_ERROR_UNSUPPORTED; buf = (Byte *)ISzAlloc_Alloc(alloc, size); if (!buf) return SZ_ERROR_MEM; res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); if (res == SZ_OK) - res = Xz_ReadIndex2(p, buf, size, alloc); + res = Xz_ParseIndex(p, buf, size, alloc); ISzAlloc_Free(alloc, buf); return res; } +*/ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) { @@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ } + +/* +in: + (*startOffset) is position in (stream) where xz_stream must be finished. 
+out: + if returns SZ_OK, then (*startOffset) is position in stream that shows start of xz_stream. +*/ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) { - UInt64 indexSize; - Byte buf[XZ_STREAM_FOOTER_SIZE]; + #define TEMP_BUF_SIZE (1 << 10) + UInt32 buf32[TEMP_BUF_SIZE / 4]; UInt64 pos = (UInt64)*startOffset; - if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) + if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; - pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) { - UInt32 total = 0; pos += XZ_STREAM_FOOTER_SIZE; - for (;;) { - size_t i; - #define TEMP_BUF_SIZE (1 << 10) - Byte temp[TEMP_BUF_SIZE]; - - i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos; + // pos != 0 + // (pos & 3) == 0 + size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos; pos -= i; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)) - total += (UInt32)i; - for (; i != 0; i--) - if (temp[i - 1] != 0) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i)) + i /= 4; + do + if (buf32[i - 1] != 0) break; - if (i != 0) - { - if ((i & 3) != 0) - return SZ_ERROR_NO_ARCHIVE; - pos += i; - break; - } - if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16)) + while (--i); + + pos += i * 4; + #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16) + // here we don't support rare case with big padding for xz stream. + // so we have padding limit for backward reading. + if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX) return SZ_ERROR_NO_ARCHIVE; + if (i) + break; } - + // we try to open xz stream after skipping zero padding. + // ((UInt64)*startOffset == pos) is possible here! if (pos < XZ_STREAM_FOOTER_SIZE) return SZ_ERROR_NO_ARCHIVE; pos -= XZ_STREAM_FOOTER_SIZE; - RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) - if (!XZ_FOOTER_SIG_CHECK(buf + 10)) + RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) + if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) return SZ_ERROR_NO_ARCHIVE; } - p->flags = (CXzStreamFlags)GetBe16(buf + 8); - + p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2); if (!XzFlags_IsSupported(p->flags)) return SZ_ERROR_UNSUPPORTED; - { /* to eliminate GCC 6.3 warning: dereferencing type-punned pointer will break strict-aliasing rules */ - const Byte *buf_ptr = buf; - if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6)) + const UInt32 *buf_ptr = buf32; + if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6)) return SZ_ERROR_ARCHIVE; } - - indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2; - - if (pos < indexSize) - return SZ_ERROR_ARCHIVE; - - pos -= indexSize; - RINOK(LookInStream_SeekTo(stream, pos)) - RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) - { - UInt64 totalSize = Xz_GetPackSize(p); - if (totalSize == XZ_SIZE_OVERFLOW - || totalSize >= ((UInt64)1 << 63) - || pos < totalSize + XZ_STREAM_HEADER_SIZE) + const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2; + if (pos < indexSize) return SZ_ERROR_ARCHIVE; - pos -= (totalSize + XZ_STREAM_HEADER_SIZE); + pos -= indexSize; + // v25.00: relaxed indexSize check. We allow big index table. 
+ // if (indexSize > ((UInt32)1 << 31)) + if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) + return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE + RINOK(LookInStream_SeekTo(stream, pos)) + // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) + { + SRes res; + const size_t size = (size_t)indexSize; + // if (size != indexSize) return SZ_ERROR_UNSUPPORTED; + Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size); + if (!buf) + return SZ_ERROR_MEM; + res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); + if (res == SZ_OK) + res = Xz_ParseIndex(p, buf, size, alloc); + ISzAlloc_Free(alloc, buf); + RINOK(res) + } + } + { + UInt64 total = Xz_GetPackSize(p); + if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63)) + return SZ_ERROR_ARCHIVE; + total += XZ_STREAM_HEADER_SIZE; + if (pos < total) + return SZ_ERROR_ARCHIVE; + pos -= total; RINOK(LookInStream_SeekTo(stream, pos)) *startOffset = (Int64)pos; } @@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO CSecToRead secToRead; SecToRead_CreateVTable(&secToRead); secToRead.realStream = stream; - RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; } @@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO void Xzs_Construct(CXzs *p) { - p->num = p->numAllocated = 0; - p->streams = 0; + Xzs_CONSTRUCT(p) } void Xzs_Free(CXzs *p, ISzAllocPtr alloc) @@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc) Xz_Free(&p->streams[i], alloc); ISzAlloc_Free(alloc, p->streams); p->num = p->numAllocated = 0; - p->streams = 0; + p->streams = NULL; } UInt64 Xzs_GetNumBlocks(const CXzs *p) @@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p) SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) { Int64 endOffset = 0; + // it's supposed that CXzs object is empty here. + // if CXzs object is not empty, it will add new streams to that non-empty object. + // Xzs_Free(p, alloc); // it's optional call to empty CXzs object. RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) *startOffset = endOffset; for (;;) { CXzStream st; SRes res; - Xz_Construct(&st); + Xz_CONSTRUCT(&st) res = Xz_ReadBackward(&st, stream, startOffset, alloc); + // if (res == SZ_OK), then (*startOffset) is start offset of new stream if + // if (res != SZ_OK), then (*startOffset) is unchend or it's expected start offset of stream with error st.startOffset = (UInt64)*startOffset; - RINOK(res) + // we must store (st) object to array, or we must free (st) local object. + if (res != SZ_OK) + { + Xz_Free(&st, alloc); + return res; + } if (p->num == p->numAllocated) { const size_t newNum = p->num + p->num / 4 + 1; void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); if (!data) + { + Xz_Free(&st, alloc); return SZ_ERROR_MEM; + } p->numAllocated = newNum; if (p->num != 0) memcpy(data, p->streams, p->num * sizeof(CXzStream)); ISzAlloc_Free(alloc, p->streams); p->streams = (CXzStream *)data; } + // we use direct copying of raw data from local variable (st) to object in array. 
+ // so we don't need to call Xz_Free(&st, alloc) after copying and after p->num++ p->streams[p->num++] = st; if (*startOffset == 0) - break; - RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) + return SZ_OK; + // seek operation is optional: + // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) return SZ_ERROR_PROGRESS; } - return SZ_OK; }
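
/*
A minimal sketch of the 12-byte xz stream footer layout that Xz_ReadBackward()
above validates, assuming plain C99; the names rd32le() and parse_xz_footer()
are illustrative only, and the CRC verification step (upstream: CrcCalc() over
bytes 4..9 compared with the first 4 bytes) is left out here.
*/
#include <stdint.h>

static uint32_t rd32le(const uint8_t *p)
{
  return (uint32_t)p[0] | ((uint32_t)p[1] << 8)
       | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* footer bytes: [0..3]  CRC32 of bytes 4..9 (little-endian)
                 [4..7]  backward-size field (little-endian)
                 [8..9]  stream flags (read big-endian by the code above)
                 [10..11] footer signature "YZ" */
static int parse_xz_footer(const uint8_t f[12], uint64_t *indexSize, unsigned *flags)
{
  if (f[10] != 'Y' || f[11] != 'Z')
    return 0;                                      /* bad footer signature   */
  /* the stored field is (real_index_size / 4) - 1, so recover the byte count */
  *indexSize = ((uint64_t)rd32le(f + 4) + 1) << 2;
  *flags = ((unsigned)f[8] << 8) | f[9];           /* matches GetBe16a(buf + 8) */
  return 1;
}
/* The flags value recovered here is what gets compared against the copy in the
   stream header, which is why the code above re-reads the header after seeking
   back past the index and packed blocks. */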