3rdparty: Update LZMA/7zipSDK to 25.00
This commit is contained in:
parent: 97ea52a6c1
commit: a14c8eb7d5
3rdparty/lzma/include/7zVersion.h (vendored, 10 changes)

@@ -1,7 +1,7 @@
-#define MY_VER_MAJOR 24
-#define MY_VER_MINOR 8
+#define MY_VER_MAJOR 25
+#define MY_VER_MINOR 0
 #define MY_VER_BUILD 0
-#define MY_VERSION_NUMBERS "24.08"
+#define MY_VERSION_NUMBERS "25.00"
 #define MY_VERSION MY_VERSION_NUMBERS
 
 #ifdef MY_CPU_NAME
@@ -10,12 +10,12 @@
 #define MY_VERSION_CPU MY_VERSION
 #endif
 
-#define MY_DATE "2024-08-11"
+#define MY_DATE "2025-07-05"
 #undef MY_COPYRIGHT
 #undef MY_VERSION_COPYRIGHT_DATE
 #define MY_AUTHOR_NAME "Igor Pavlov"
 #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
-#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov"
+#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov"
 
 #ifdef USE_COPYRIGHT_CR
 #define MY_COPYRIGHT MY_COPYRIGHT_CR
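Note: downstream code that needs to gate on the vendored SDK version can test these macros at compile time. A minimal sketch; the LZMA_SDK_HAS_THREAD_GROUPS guard name is hypothetical, not part of the SDK:

    #include "7zVersion.h"

    /* hypothetical guard: thread groups and SHA-512 detection arrived with 25.00 */
    #if MY_VER_MAJOR >= 25
    #define LZMA_SDK_HAS_THREAD_GROUPS 1
    #else
    #define LZMA_SDK_HAS_THREAD_GROUPS 0
    #endif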
3rdparty/lzma/include/Compiler.h (vendored, 12 changes)

@@ -1,5 +1,5 @@
 /* Compiler.h : Compiler specific defines and pragmas
-2024-01-22 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_COMPILER_H
 #define ZIP7_INC_COMPILER_H
@@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void);
 #define Z7_ATTRIB_NO_VECTORIZE
 #endif
 
+#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
+#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE  _Pragma("optimize ( \"s\", on )")
+#define Z7_PRAGMA_OPTIMIZE_DEFAULT        _Pragma("optimize ( \"\", on )")
+#else
+#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
+#define Z7_PRAGMA_OPTIMIZE_DEFAULT
+#endif
+
+
+
 #if defined(MY_CPU_X86_OR_AMD64) && ( \
      defined(__clang__) && (__clang_major__ >= 4) \
   || defined(__GNUC__) && (__GNUC__ >= 5))
3rdparty/lzma/include/CpuArch.h (vendored, 41 changes)

@@ -1,5 +1,5 @@
 /* CpuArch.h -- CPU specific code
-2024-06-17 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_CPU_ARCH_H
 #define ZIP7_INC_CPU_ARCH_H
@@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
 #define MY_CPU_SIZEOF_POINTER 4
 #endif
 
+#if defined(__SSE2__) \
+    || defined(MY_CPU_AMD64) \
+    || defined(_M_IX86_FP) && (_M_IX86_FP >= 2)
+#define MY_CPU_SSE2
+#endif
+
 
 #if defined(_M_ARM64) \
     || defined(_M_ARM64EC) \
@@ -509,11 +515,19 @@ problem-4 : performace:
 
 #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
 
+#if 0
+// Z7_BSWAP16 can be slow for x86-msvc
+#define GetBe16_to32(p)  (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p)))
+#else
+#define GetBe16_to32(p)  (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16)
+#endif
+
+
 #define GetBe32(p)  Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
 #define SetBe32(p, v)  { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }
 
 #if defined(MY_CPU_LE_UNALIGN_64)
 #define GetBe64(p)  Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
 #define SetBe64(p, v)  { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); }
 #endif
 
 #else
@@ -536,21 +550,39 @@ problem-4 : performace:
 #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
 #endif
 
+#ifndef SetBe64
+#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \
+    _ppp_[0] = (Byte)(_vvv_ >> 56); \
+    _ppp_[1] = (Byte)(_vvv_ >> 48); \
+    _ppp_[2] = (Byte)(_vvv_ >> 40); \
+    _ppp_[3] = (Byte)(_vvv_ >> 32); \
+    _ppp_[4] = (Byte)(_vvv_ >> 24); \
+    _ppp_[5] = (Byte)(_vvv_ >> 16); \
+    _ppp_[6] = (Byte)(_vvv_ >> 8); \
+    _ppp_[7] = (Byte)_vvv_; }
+#endif
+
 #ifndef GetBe16
+#ifdef GetBe16_to32
+#define GetBe16(p) ( (UInt16) GetBe16_to32(p))
+#else
 #define GetBe16(p) ( (UInt16) ( \
     ((UInt16)((const Byte *)(p))[0] << 8) | \
              ((const Byte *)(p))[1] ))
 #endif
+#endif
 
 
 #if defined(MY_CPU_BE)
 #define Z7_CONV_BE_TO_NATIVE_CONST32(v)  (v)
 #define Z7_CONV_LE_TO_NATIVE_CONST32(v)  Z7_BSWAP32_CONST(v)
 #define Z7_CONV_NATIVE_TO_BE_32(v)       (v)
+// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8))
 #elif defined(MY_CPU_LE)
 #define Z7_CONV_BE_TO_NATIVE_CONST32(v)  Z7_BSWAP32_CONST(v)
 #define Z7_CONV_LE_TO_NATIVE_CONST32(v)  (v)
 #define Z7_CONV_NATIVE_TO_BE_32(v)       Z7_BSWAP32(v)
+// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8))
 #else
 #error Stop_Compiling_Unknown_Endian_CONV
 #endif
@@ -589,6 +621,11 @@ problem-4 : performace:
 #endif
 
 
+#ifndef GetBe16_to32
+#define GetBe16_to32(p)  GetBe16(p)
+#endif
+
 
 #if defined(MY_CPU_X86_OR_AMD64) \
  || defined(MY_CPU_ARM_OR_ARM64) \
 || defined(MY_CPU_PPC_OR_PPC64)
@@ -617,6 +654,7 @@ BoolInt CPU_IsSupported_SSE2(void);
 BoolInt CPU_IsSupported_SSSE3(void);
 BoolInt CPU_IsSupported_SSE41(void);
 BoolInt CPU_IsSupported_SHA(void);
+BoolInt CPU_IsSupported_SHA512(void);
 BoolInt CPU_IsSupported_PageGB(void);
 
 #elif defined(MY_CPU_ARM_OR_ARM64)
@@ -634,6 +672,7 @@ BoolInt CPU_IsSupported_SHA1(void);
 BoolInt CPU_IsSupported_SHA2(void);
 BoolInt CPU_IsSupported_AES(void);
 #endif
+BoolInt CPU_IsSupported_SHA512(void);
 
 #endif
 
3rdparty/lzma/include/LzFindMt.h (vendored, 6 changes)

@@ -1,5 +1,5 @@
 /* LzFindMt.h -- multithreaded Match finder for LZ algorithms
-2024-01-22 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_LZ_FIND_MT_H
 #define ZIP7_INC_LZ_FIND_MT_H
@@ -12,8 +12,10 @@ EXTERN_C_BEGIN
 typedef struct
 {
   UInt32 numProcessedBlocks;
-  CThread thread;
+  Int32 affinityGroup;
+  UInt64 affinityInGroup;
   UInt64 affinity;
+  CThread thread;
 
   BoolInt wasCreated;
   BoolInt needStart;
3rdparty/lzma/include/Lzma2Enc.h (vendored, 1 change)

@@ -18,6 +18,7 @@ typedef struct
   int numBlockThreads_Reduced;
   int numBlockThreads_Max;
   int numTotalThreads;
+  unsigned numThreadGroups; // 0 : no groups
 } CLzma2EncProps;
 
 void Lzma2EncProps_Init(CLzma2EncProps *p);
3rdparty/lzma/include/LzmaEnc.h (vendored, 4 changes)

@@ -1,5 +1,5 @@
 /* LzmaEnc.h -- LZMA Encoder
-2023-04-13 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_LZMA_ENC_H
 #define ZIP7_INC_LZMA_ENC_H
@@ -29,11 +29,13 @@ typedef struct
   int numThreads; /* 1 or 2, default = 2 */
 
   // int _pad;
+  Int32 affinityGroup;
 
   UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1.
                         Encoder uses this value to reduce dictionary size */
 
   UInt64 affinity;
+  UInt64 affinityInGroup;
 } CLzmaEncProps;
 
 void LzmaEncProps_Init(CLzmaEncProps *p);
3rdparty/lzma/include/MtCoder.h (vendored, 7 changes)

@@ -1,5 +1,5 @@
 /* MtCoder.h -- Multi-thread Coder
-2023-04-13 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_MT_CODER_H
 #define ZIP7_INC_MT_CODER_H
@@ -16,7 +16,7 @@ EXTERN_C_BEGIN
 
 #ifndef Z7_ST
 #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
-#define MTCODER_THREADS_MAX 64
+#define MTCODER_THREADS_MAX 256
 #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3)
 #else
 #define MTCODER_THREADS_MAX 1
@@ -77,6 +77,7 @@ typedef struct CMtCoder_
 
   size_t blockSize; /* size of input block */
   unsigned numThreadsMax;
+  unsigned numThreadGroups;
   UInt64 expectedDataSize;
 
   ISeqInStreamPtr inStream;
@@ -125,6 +126,8 @@ typedef struct CMtCoder_
   CMtProgress mtProgress;
   CMtCoderBlock blocks[MTCODER_BLOCKS_MAX];
   CMtCoderThread threads[MTCODER_THREADS_MAX];
+
+  CThreadNextGroup nextGroup;
 } CMtCoder;
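Raising MTCODER_THREADS_MAX from 64 to 256 also grows the derived block pool, since MTCODER_BLOCKS_MAX is computed from the thread cap. A quick standalone check of the arithmetic, reusing the macro as defined above:

    #include <stdio.h>

    #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)

    int main(void)
    {
      /* old cap:  64 threads ->  64 +  8 + 1 + 3 =  76 blocks */
      /* new cap: 256 threads -> 256 + 32 + 1 + 3 = 292 blocks */
      printf("blocks(64)  = %d\n", MTCODER_GET_NUM_BLOCKS_FROM_THREADS(64) + 3);
      printf("blocks(256) = %d\n", MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) + 3);
      return 0;
    }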
3rdparty/lzma/include/Sha256.h (vendored, 14 changes)

@@ -1,5 +1,5 @@
 /* Sha256.h -- SHA-256 Hash
-2023-04-02 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_SHA256_H
 #define ZIP7_INC_SHA256_H
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN
 #define SHA256_BLOCK_SIZE   (SHA256_NUM_BLOCK_WORDS * 4)
 #define SHA256_DIGEST_SIZE  (SHA256_NUM_DIGEST_WORDS * 4)
 
+
+
+
 typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
 
 /*
@@ -31,10 +34,17 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt
 */
 
 typedef struct
 {
-  SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
-  UInt64 count;
+  union
+  {
+    struct
+    {
+      SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
+      UInt64 count;
+      UInt64 _pad_2[2];
+    } vars;
+    UInt64 _pad_64bit[4];
+    void *_pad_align_ptr[2];
+  } v;
   UInt32 state[SHA256_NUM_DIGEST_WORDS];
 
   Byte buffer[SHA256_BLOCK_SIZE];
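The CSha256 layout change moves the function pointer and byte counter into the padded union v, but the public API is untouched; a one-shot hash still reads as below (a usage sketch, assuming the SDK headers and sources are on the build path):

    #include <stdio.h>
    #include "Sha256.h"

    int main(void)
    {
      CSha256 sha;
      Byte digest[SHA256_DIGEST_SIZE];
      const Byte msg[] = "abc";
      unsigned i;

      Sha256_Init(&sha);
      Sha256_Update(&sha, msg, sizeof(msg) - 1);
      Sha256_Final(&sha, digest);

      for (i = 0; i < SHA256_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);   /* prints the SHA-256 of "abc" */
      printf("\n");
      return 0;
    }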
3rdparty/lzma/include/Sort.h (vendored, 7 changes)

@@ -1,5 +1,5 @@
 /* Sort.h -- Sort functions
-2023-03-05 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_SORT_H
 #define ZIP7_INC_SORT_H
@@ -8,10 +8,7 @@
 
 EXTERN_C_BEGIN
 
-void HeapSort(UInt32 *p, size_t size);
-void HeapSort64(UInt64 *p, size_t size);
-
-/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */
+void Z7_FASTCALL HeapSort(UInt32 *p, size_t size);
 
 EXTERN_C_END
3rdparty/lzma/include/Threads.h (vendored, 12 changes)

@@ -1,5 +1,5 @@
 /* Threads.h -- multithreading library
-2024-03-28 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_THREADS_H
 #define ZIP7_INC_THREADS_H
@@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
 WRes Thread_Wait_Close(CThread *p);
 
 #ifdef _WIN32
+WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask);
 #define Thread_Create_With_CpuSet(p, func, param, cs) \
     Thread_Create_With_Affinity(p, func, param, *cs)
 #else
 WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet);
 #endif
 
+typedef struct
+{
+  unsigned NumGroups;
+  unsigned NextGroup;
+} CThreadNextGroup;
+
+void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup);
+unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p);
+
 
 #ifdef _WIN32
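The CThreadNextGroup bodies live in Threads.c, which this commit does not show; an illustrative round-robin implementation consistent with the declarations above might look like this (a sketch, not the SDK's actual code):

    /* illustrative only: cycles thread creation across processor groups */
    void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup)
    {
      p->NumGroups = numGroups ? numGroups : 1;
      p->NextGroup = startGroup % p->NumGroups;
    }

    unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p)
    {
      const unsigned group = p->NextGroup;
      p->NextGroup = (group + 1) % p->NumGroups;
      return group;
    }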
3rdparty/lzma/include/Xz.h (vendored, 12 changes)

@@ -1,5 +1,5 @@
 /* Xz.h - Xz interface
-2024-01-26 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_XZ_H
 #define ZIP7_INC_XZ_H
@@ -121,6 +121,7 @@ typedef struct
   UInt64 startOffset;
 } CXzStream;
 
+#define Xz_CONSTRUCT(p)  { (p)->numBlocks = 0;  (p)->blocks = NULL;  (p)->flags = 0; }
 void Xz_Construct(CXzStream *p);
 void Xz_Free(CXzStream *p, ISzAllocPtr alloc);
 
@@ -136,8 +137,13 @@ typedef struct
   CXzStream *streams;
 } CXzs;
 
+#define Xzs_CONSTRUCT(p)  { (p)->num = 0;  (p)->numAllocated = 0;  (p)->streams = NULL; }
 void Xzs_Construct(CXzs *p);
 void Xzs_Free(CXzs *p, ISzAllocPtr alloc);
+/*
+Xzs_ReadBackward() must be called for empty CXzs object.
+Xzs_ReadBackward() can return non empty object with (p->num != 0) even in case of error.
+*/
 SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc);
 
 UInt64 Xzs_GetNumBlocks(const CXzs *p);
@@ -268,8 +274,8 @@ typedef struct
   size_t outBufSize;
   size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked
 
-  Byte shaDigest[SHA256_DIGEST_SIZE];
-  Byte buf[XZ_BLOCK_HEADER_SIZE_MAX];
+  UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4];
+  Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned for 4-bytes
 } CXzUnpacker;
 
 /* alloc : aligned for cache line allocation is better */
3rdparty/lzma/include/XzEnc.h (vendored, 3 changes)

@@ -1,5 +1,5 @@
 /* XzEnc.h -- Xz Encode
-2023-04-13 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_XZ_ENC_H
 #define ZIP7_INC_XZ_ENC_H
@@ -31,6 +31,7 @@ typedef struct
   CLzma2EncProps lzma2Props;
   CXzFilterProps filterProps;
   unsigned checkId;
+  unsigned numThreadGroups; // 0 : no groups
   UInt64 blockSize;
   int numBlockThreads_Reduced;
   int numBlockThreads_Max;
3rdparty/lzma/src/7zDec.c (vendored, 5 changes)

@@ -1,5 +1,5 @@
 /* 7zDec.c -- Decoding from 7z folder
-2024-03-01 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -312,9 +312,10 @@ static BoolInt IS_MAIN_METHOD(UInt32 m)
     case k_PPMD:
 #endif
       return True;
-  }
-  return False;
+    default:
+      return False;
+  }
 }
 
 static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c)
 {
3rdparty/lzma/src/AesOpt.c (vendored, 203 changes)

@@ -1,5 +1,5 @@
 /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
-2024-03-01 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -80,7 +80,27 @@ AES_FUNC_START (name)
 
 #define MM_XOR( dest, src)  MM_OP(_mm_xor_si128, dest, src)
 
+#if 1
+// use aligned SSE load/store for data.
+// It is required for our Aes functions, that data is aligned for 16-bytes.
+// So we can use this branch of code.
+// and compiler can use fused load-op SSE instructions:
+//   xorps xmm0, XMMWORD PTR [rdx]
+#define LOAD_128(pp)       (*(__m128i *)(void *)(pp))
+#define STORE_128(pp, _v)  *(__m128i *)(void *)(pp) = _v
+// use aligned SSE load/store for data. Alternative code with direct access
+// #define LOAD_128(pp)       _mm_load_si128(pp)
+// #define STORE_128(pp, _v)  _mm_store_si128(pp, _v)
+#else
+// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
+#define LOAD_128(pp)       _mm_loadu_si128(pp)
+#define STORE_128(pp, _v)  _mm_storeu_si128(pp, _v)
+#endif
+
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
+  if (numBlocks == 0)
+    return;
+  {
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
@@ -88,11 +108,11 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
   const __m128i k0 = p[2];
   const __m128i k1 = p[3];
   const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
-  for (; numBlocks != 0; numBlocks--, data++)
+  do
   {
     UInt32 r = numRounds2;
    const __m128i *w = p + 4;
-    __m128i temp = *data;
+    __m128i temp = LOAD_128(data);
     MM_XOR (temp, k0)
     MM_XOR (m, temp)
     MM_OP_m (_mm_aesenc_si128, k1)
@@ -104,10 +124,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
     }
     while (--r);
     MM_OP_m (_mm_aesenclast_si128, w[0])
-    *data = m;
+    STORE_128(data, m);
+    data++;
   }
+  while (--numBlocks);
   *p = m;
+  }
 }
 
 
 #define WOP_1(op)
@@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 #define WOP(op)  op (m0, 0)  WOP_M1(op)
 
 
 #define DECLARE_VAR(reg, ii)  __m128i reg;
-#define LOAD_data(  reg, ii)  reg = data[ii];
-#define STORE_data( reg, ii)  data[ii] = reg;
+#define LOAD_data_ii(ii)      LOAD_128(data + (ii))
+#define LOAD_data(  reg, ii)  reg = LOAD_data_ii(ii);
+#define STORE_data( reg, ii)  STORE_128(data + (ii), reg);
 #if (NUM_WAYS > 1)
-#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1])
+#define XOR_data_M1(reg, ii)  MM_XOR (reg, LOAD_128(data + (ii- 1)))
 #endif
 
 #define MM_OP_key(op, reg)  MM_OP(op, reg, key);
@@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 #define AES_XOR( reg, ii)  MM_OP_key (_mm_xor_si128, reg)
 
 #define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
-#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)
+#define CTR_END(  reg, ii)  STORE_128(data + (ii), _mm_xor_si128(reg, \
+                            LOAD_128 (data + (ii))));
 
 #define WOP_KEY(op, n) { \
     const __m128i key = w[n]; \
-    WOP(op); }
+    WOP(op) }
 
 #define WIDE_LOOP_START  \
     dataEnd = data + numBlocks;  \
     if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \
 
 #define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS;  }  \
 
 #define SINGLE_LOOP  \
    for (; data < dataEnd; data++)
@@ -184,34 +204,54 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 #define AVX_XOR(dest, src)        MM_OP(_mm256_xor_si256, dest, src)
 #define AVX_DECLARE_VAR(reg, ii)  __m256i reg;
-#define AVX_LOAD_data(  reg, ii)  reg = ((const __m256i *)(const void *)data)[ii];
-#define AVX_STORE_data( reg, ii)  ((__m256i *)(void *)data)[ii] = reg;
+
+#if 1
+// use unaligned AVX load/store for data.
+// It is required for our Aes functions, that data is aligned for 16-bytes.
+// But we need 32-bytes reading.
+// So we use intrinsics for unaligned AVX load/store.
+// notes for _mm256_storeu_si256:
+// msvc2022: uses vmovdqu and keeps the order of instruction sequence.
+// new gcc11 uses vmovdqu
+// old gcc9 could use pair of instructions:
+//   vmovups      %xmm7, -224(%rax)
+//   vextracti128 $0x1, %ymm7, -208(%rax)
+#define AVX_LOAD(p)       _mm256_loadu_si256((const __m256i *)(const void *)(p))
+#define AVX_STORE(p, _v)  _mm256_storeu_si256((__m256i *)(void *)(p), _v);
+#else
+// use aligned AVX load/store for data.
+// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes.
+// msvc2022 uses vmovdqu still
+// gcc uses vmovdqa (that requires 32-bytes alignment)
+#define AVX_LOAD(p)       (*(const __m256i *)(const void *)(p))
+#define AVX_STORE(p, _v)  (*(__m256i *)(void *)(p)) = _v;
+#endif
+
+#define AVX_LOAD_data(  reg, ii)  reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
+#define AVX_STORE_data( reg, ii)  AVX_STORE((__m256i *)(void *)data + (ii), reg)
 /*
-AVX_XOR_data_M1() needs unaligned memory load
-if (we don't use _mm256_loadu_si256() here)
-{
-  Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
-  instruction that can load unaligned data.
-  But GCC and CLANG without -O2 or -O1 optimizations can generate separated
-  LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
-}
-Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
-v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
+AVX_XOR_data_M1() needs unaligned memory load, even if (data)
+is aligned for 256-bits, because we read 32-bytes chunk that
+crosses (data) position: from (data - 16bytes) to (data + 16bytes).
 */
-#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
+// for debug only: the following code will fail on execution, if compiled by some compilers:
+// #define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
+#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))
 
 #define AVX_AES_DEC(      reg, ii)  MM_OP_key (_mm256_aesdec_epi128,     reg)
 #define AVX_AES_DEC_LAST( reg, ii)  MM_OP_key (_mm256_aesdeclast_epi128, reg)
 #define AVX_AES_ENC(      reg, ii)  MM_OP_key (_mm256_aesenc_epi128,     reg)
 #define AVX_AES_ENC_LAST( reg, ii)  MM_OP_key (_mm256_aesenclast_epi128, reg)
 #define AVX_AES_XOR(      reg, ii)  MM_OP_key (_mm256_xor_si256,         reg)
-#define AVX_CTR_START(reg, ii)  MM_OP (_mm256_add_epi64, ctr2, two)  reg = _mm256_xor_si256(ctr2, key);
-#define AVX_CTR_END(  reg, ii)  AVX_XOR (((__m256i *)(void *)data)[ii], reg)
+#define AVX_CTR_START(reg, ii) \
+    MM_OP (_mm256_add_epi64, ctr2, two) \
+    reg = _mm256_xor_si256(ctr2, key);
+
+#define AVX_CTR_END(reg, ii) \
+    AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
+    AVX_LOAD ((__m256i *)(void *)data + (ii))));
+
 #define AVX_WOP_KEY(op, n) { \
     const __m256i key = w[n]; \
-    WOP(op); }
+    WOP(op) }
 
 #define NUM_AES_KEYS_MAX 15
 
@@ -219,12 +259,11 @@ v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any
     dataEnd = data + numBlocks;  \
     if (numBlocks >= NUM_WAYS * 2)  \
     { __m256i keys[NUM_AES_KEYS_MAX];  \
-      UInt32 ii;  \
       OP  \
-      for (ii = 0; ii < numRounds; ii++)  \
-        keys[ii] = _mm256_broadcastsi128_si256(p[ii]);  \
-      dataEnd -= NUM_WAYS * 2; do {  \
+      { UInt32 ii;  for (ii = 0; ii < numRounds; ii++)  \
+        keys[ii] = _mm256_broadcastsi128_si256(p[ii]); }  \
+      dataEnd -= NUM_WAYS * 2;  \
+      do {  \
 
 #define WIDE_LOOP_END_AVX(OP) \
     data += NUM_WAYS * 2;  \
@@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i iv = *p;
-  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
+  const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
   const __m128i *dataEnd;
   p += 2;
 
   WIDE_LOOP_START
   {
     const __m128i *w = wStart;
-
     WOP (DECLARE_VAR)
     WOP (LOAD_data)
     WOP_KEY (AES_XOR, 1)
     do
     {
       WOP_KEY (AES_DEC, 0)
       w--;
     }
     while (w != p);
@@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
 
     MM_XOR (m0, iv)
     WOP_M1 (XOR_data_M1)
-    iv = data[NUM_WAYS - 1];
+    LOAD_data(iv, NUM_WAYS - 1)
     WOP (STORE_data)
   }
   WIDE_LOOP_END
@@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   SINGLE_LOOP
   {
     const __m128i *w = wStart - 1;
-    __m128i m = _mm_xor_si128 (w[2], *data);
+    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
+
     do
     {
       MM_OP_m (_mm_aesdec_si128, w[1])
@@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     while (w != p);
     MM_OP_m (_mm_aesdec_si128, w[1])
     MM_OP_m (_mm_aesdeclast_si128, w[0])
-
     MM_XOR (m, iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i ctr = *p;
-  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
+  const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
   const __m128i *dataEnd;
-  __m128i one = _mm_cvtsi32_si128(1);
+  const __m128i one = _mm_cvtsi32_si128(1);
 
   p += 2;
 
@@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW)
     }
     while (--r);
     WOP_KEY (AES_ENC_LAST, 0)
-
     WOP (CTR_END)
   }
   WIDE_LOOP_END
@@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
     while (--numRounds2);
     MM_OP_m (_mm_aesenc_si128,     w[0])
     MM_OP_m (_mm_aesenclast_si128, w[1])
-    MM_XOR (*data, m)
+    CTR_END (m, 0)
   }
 
   p[-2] = ctr;
@@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
   __m128i *data = (__m128i *)(void *)data8;
   __m128i iv = *p;
   const __m128i *dataEnd;
-  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
+  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
   p += 2;
 
   WIDE_LOOP_START_AVX(;)
@@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
     while (w != keys);
     AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
 
-    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
+    AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
     WOP_M1 (AVX_XOR_data_M1)
-    iv = data[NUM_WAYS * 2 - 1];
+    LOAD_data (iv, NUM_WAYS * 2 - 1)
     WOP (AVX_STORE_data)
   }
   WIDE_LOOP_END_AVX(;)
 
   SINGLE_LOOP
   {
-    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
-    __m128i m = _mm_xor_si128 (w[2], *data);
+    const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
+    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
     do
     {
       MM_OP_m (_mm_aesdec_si128, w[1])
@@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
     MM_OP_m (_mm_aesdeclast_si128, w[0])
 
     MM_XOR (m, iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i ctr = *p;
-  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
+  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
   const __m128i *dataEnd;
-  __m128i one = _mm_cvtsi32_si128(1);
+  const __m128i one = _mm_cvtsi32_si128(1);
   __m256i ctr2, two;
   p += 2;
 
@@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
     while (--numRounds2);
     MM_OP_m (_mm_aesenc_si128,     w[0])
     MM_OP_m (_mm_aesenclast_si128, w[1])
-    MM_XOR (*data, m)
+    CTR_END (m, 0)
   }
 
   p[-2] = ctr;
@@ -730,10 +767,15 @@ AES_FUNC_START (name)
 
 
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
+  if (numBlocks == 0)
+    return;
+  {
   v128 * const p = (v128 *)(void *)ivAes;
   v128 *data = (v128 *)(void *)data8;
   v128 m = *p;
+  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
+  const v128 *w = p + (size_t)numRounds2 * 2;
   const v128 k0 = p[2];
   const v128 k1 = p[3];
   const v128 k2 = p[4];
@@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
   const v128 k7 = p[9];
   const v128 k8 = p[10];
   const v128 k9 = p[11];
-  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
-  const v128 *w = p + ((size_t)numRounds2 * 2);
+  const v128 k_z4 = w[-2];
+  const v128 k_z3 = w[-1];
+  const v128 k_z2 = w[0];
   const v128 k_z1 = w[1];
   const v128 k_z0 = w[2];
-  for (; numBlocks != 0; numBlocks--, data++)
+  // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle,
+  // because gcc/clang compilers are not good for that optimization.
+  do
   {
     MM_XOR_m (*data)
     AES_E_MC_m (k0)
@@ -757,25 +802,27 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
     AES_E_MC_m (k3)
     AES_E_MC_m (k4)
     AES_E_MC_m (k5)
-    AES_E_MC_m (k6)
-    AES_E_MC_m (k7)
-    AES_E_MC_m (k8)
     if (numRounds2 >= 6)
     {
-      AES_E_MC_m (k9)
-      AES_E_MC_m (p[12])
+      AES_E_MC_m (k6)
+      AES_E_MC_m (k7)
      if (numRounds2 != 6)
      {
-        AES_E_MC_m (p[13])
-        AES_E_MC_m (p[14])
+        AES_E_MC_m (k8)
+        AES_E_MC_m (k9)
      }
    }
+    AES_E_MC_m (k_z4)
+    AES_E_MC_m (k_z3)
+    AES_E_MC_m (k_z2)
    AES_E_m  (k_z1)
    MM_XOR_m (k_z0)
-    *data = m;
+    *data++ = m;
   }
+  while (--numBlocks);
   *p = m;
+  }
 }
 
 
 #define WOP_1(op)
@@ -837,7 +884,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   v128 *p = (v128 *)(void *)ivAes;
   v128 *data = (v128 *)(void *)data8;
   v128 iv = *p;
-  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
+  const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
   const v128 *dataEnd;
   p += 2;
 
@@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     WOP_KEY (AES_XOR, 0)
     MM_XOR (m0, iv)
     WOP_M1 (XOR_data_M1)
-    iv = data[NUM_WAYS - 1];
+    LOAD_data(iv, NUM_WAYS - 1)
     WOP (STORE_data)
   }
   WIDE_LOOP_END
@@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   SINGLE_LOOP
   {
     const v128 *w = wStart;
-    v128 m = *data;
+    v128 m;  LOAD_data(m, 0)
     AES_D_IMC_m (w[2])
     do
     {
@@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     AES_D_m  (w[1])
     MM_XOR_m (w[0])
     MM_XOR_m (iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -891,16 +938,14 @@ AES_FUNC_START2 (AesCtr_Code_HW)
   v128 *p = (v128 *)(void *)ivAes;
   v128 *data = (v128 *)(void *)data8;
   uint64x2_t ctr = vreinterpretq_u64_u8(*p);
-  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
+  const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
   const v128 *dataEnd;
-  uint64x2_t one = vdupq_n_u64(0);
-
   // the bug in clang:
   // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
 #if defined(__clang__) && (__clang_major__ <= 9)
 #pragma GCC diagnostic ignored "-Wvector-conversion"
 #endif
-  one = vsetq_lane_u64(1, one, 0);
+  const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
   p += 2;
 
   WIDE_LOOP_START
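The LOAD_128/STORE_128 and AVX_LOAD/AVX_STORE branches above encode one rule: the AES buffers are guaranteed 16-byte aligned, so 128-bit accesses may use aligned loads (which compilers can fold into load-op instructions such as xorps xmm0, [rdx]), while the 256-bit AVX path must use unaligned intrinsics because 32-byte alignment is not guaranteed. A standalone sketch of the two access styles:

    #include <emmintrin.h>  /* SSE2 intrinsics */

    /* Aligned form: valid only when both pointers are 16-byte aligned,
       as the AES block buffers are. */
    void xor_block_aligned(unsigned char *dst, const unsigned char *src)
    {
      const __m128i a = _mm_load_si128((const __m128i *)(const void *)dst);
      const __m128i b = _mm_load_si128((const __m128i *)(const void *)src);
      _mm_store_si128((__m128i *)(void *)dst, _mm_xor_si128(a, b));
    }

    /* Unaligned form (movdqu): always safe for any address. */
    void xor_block_unaligned(unsigned char *dst, const unsigned char *src)
    {
      const __m128i a = _mm_loadu_si128((const __m128i *)(const void *)dst);
      const __m128i b = _mm_loadu_si128((const __m128i *)(const void *)src);
      _mm_storeu_si128((__m128i *)(void *)dst, _mm_xor_si128(a, b));
    }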
3rdparty/lzma/src/CpuArch.c (vendored, 109 changes)

@@ -1,5 +1,5 @@
 /* CpuArch.c -- CPU specific code
-2024-07-04 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -17,7 +17,7 @@
 /*
   cpuid instruction supports (subFunction) parameter in ECX,
   that is used only with some specific (function) parameter values.
-  But we always use only (subFunction==0).
+  most functions use only (subFunction==0).
 */
 /*
   __cpuid(): MSVC and GCC/CLANG use same function/macro name
@@ -49,43 +49,49 @@
 #if defined(MY_CPU_AMD64) && defined(__PIC__) \
     && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
 
-#define x86_cpuid_MACRO(p, func) { \
+/* "=&r" selects free register. It can select even rbx, if that register is free.
+   "=&D" for (RDI) also works, but the code can be larger with "=&D"
+   "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */
+
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%rbx, %q1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%rbx, %q1"  \
-   : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
-
-/* "=&r" selects free register. It can select even rbx, if that register is free.
-   "=&D" for (RDI) also works, but the code can be larger with "=&D"
-   "2"(0) means (subFunction = 0),
-   2 is (zero-based) index in the output constraint list "=c" (ECX). */
+   : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
 
 #elif defined(MY_CPU_X86) && defined(__PIC__) \
     && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
 
-#define x86_cpuid_MACRO(p, func) { \
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "mov     %%ebx, %k1"  \
    ASM_LN   "cpuid"               \
    ASM_LN   "xchg    %%ebx, %k1"  \
-   : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
+   : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
 
 #else
 
-#define x86_cpuid_MACRO(p, func) { \
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
  __asm__ __volatile__ ( \
    ASM_LN   "cpuid"               \
-   : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
+   : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
 
 #endif
 
+#define x86_cpuid_MACRO(p, func)  x86_cpuid_MACRO_2(p, func, 0)
 
 void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
 {
   x86_cpuid_MACRO(p, func)
 }
 
+static
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  x86_cpuid_MACRO_2(p, func, subFunc)
+}
+
 
 Z7_NO_INLINE
 UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
@@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
   __asm   ret 0
 }
 
+static
+void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  UNUSED_VAR(p)
+  UNUSED_VAR(func)
+  UNUSED_VAR(subFunc)
+  __asm   push    ebx
+  __asm   push    edi
+  __asm   mov     edi, ecx         // p
+  __asm   mov     eax, edx         // func
+  __asm   mov     ecx, [esp + 12]  // subFunc
+  __asm   cpuid
+  __asm   mov     [edi     ], eax
+  __asm   mov     [edi +  4], ebx
+  __asm   mov     [edi +  8], ecx
+  __asm   mov     [edi + 12], edx
+  __asm   pop     edi
+  __asm   pop     ebx
+  __asm   ret     4
+}
+
 #else // MY_CPU_AMD64
 
 #if _MSC_VER >= 1600
 #include <intrin.h>
 #define MY_cpuidex  __cpuidex
+
+static
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  __cpuidex((int *)p, func, subFunc);
+}
+
 #else
 /*
  __cpuid (func == (0 or 7)) requires subfunction number in ECX.
@@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
  We still can use __cpuid for low (func) values that don't require ECX,
  but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
  So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,
- where ECX value is first parameter for FASTCALL / NO_INLINE func,
+ where ECX value is first parameter for FASTCALL / NO_INLINE func.
  So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and
  old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.
 
@@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int
 }
 #define MY_cpuidex(info, func, func2)  MY_cpuidex_HACK(func2, func, info)
 #pragma message("======== MY_cpuidex_HACK WAS USED ========")
+static
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
+}
 #endif // _MSC_VER >= 1600
 
 #if !defined(MY_CPU_AMD64)
@@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void)
   }
 }
 
+
+BoolInt CPU_IsSupported_SHA512(void)
+{
+  if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here
+
+  if (z7_x86_cpuid_GetMaxFunc() < 7)
+    return False;
+  {
+    UInt32 d[4];
+    z7_x86_cpuid_subFunc(d, 7, 0);
+    if (d[0] < 1) // d[0] - is max supported subleaf value
+      return False;
+    z7_x86_cpuid_subFunc(d, 7, 1);
+    return (BoolInt)(d[0]) & 1;
+  }
+}
+
 /*
 MSVC: _xgetbv() intrinsic is available since VS2010SP1.
 MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in
@@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void)
   return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
 }
 
+BoolInt CPU_IsSupported_SHA512(void)
+{
+  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
+}
+
+/*
+BoolInt CPU_IsSupported_SHA3(void)
+{
+  return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
+}
+*/
+
 #ifdef MY_CPU_ARM64
 #define APPLE_CRYPTO_SUPPORT_VAL 1
 #else
@@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32)
 MY_HWCAP_CHECK_FUNC (SHA1)
 MY_HWCAP_CHECK_FUNC (SHA2)
 MY_HWCAP_CHECK_FUNC (AES)
+#ifdef MY_CPU_ARM64
+// <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.
+// we define them here, if they are not defined
+#ifndef HWCAP_SHA3
+// #define HWCAP_SHA3 (1 << 17)
+#endif
+#ifndef HWCAP_SHA512
+// #pragma message("=== HWCAP_SHA512 define === ")
+#define HWCAP_SHA512 (1 << 21)
+#endif
+MY_HWCAP_CHECK_FUNC (SHA512)
+// MY_HWCAP_CHECK_FUNC (SHA3)
+#endif
 
 #endif // __APPLE__
 #endif // _WIN32
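The new x86 check reads CPUID leaf 7 twice: subleaf 0 EAX gives the maximum supported subleaf, and subleaf 1 EAX bit 0 reports the SHA-512 instruction extensions, which is exactly what z7_x86_cpuid_subFunc feeds into CPU_IsSupported_SHA512 above. The same probe written against GCC/Clang's cpuid helper, as an independent sketch (not SDK code):

    #include <cpuid.h>  /* GCC/Clang cpuid helpers, x86 only */

    int has_sha512(void)
    {
      unsigned eax, ebx, ecx, edx;
      /* leaf 7, subleaf 0: EAX = max supported subleaf */
      if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) || eax < 1)
        return 0;
      /* leaf 7, subleaf 1: EAX bit 0 = SHA-512 extensions */
      if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
        return 0;
      return (int)(eax & 1);
    }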
3rdparty/lzma/src/LzFind.c (vendored, 24 changes)

@@ -1,5 +1,5 @@
 /* LzFind.c -- Match finder for LZ algorithms
-2024-03-01 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
       const unsigned nbMax =
         (p->numHashBytes == 2 ? 16 :
         (p->numHashBytes == 3 ? 24 : 32));
-      if (numBits > nbMax)
+      if (numBits >= nbMax)
         numBits = nbMax;
       if (numBits >= 32)
         hs = (UInt32)0 - 1;
@@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
         hs |= (256 << kLzHash_CrcShift_2) - 1;
       {
         const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
-        if (hs > hs2)
+        if (hs >= hs2)
           hs = hs2;
       }
       hsCur = hs;
       if (p->expectedDataSize < historySize)
       {
         const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
-        if (hsCur > hs2)
+        if (hsCur >= hs2)
           hsCur = hs2;
       }
     }
@@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
       if (p->expectedDataSize < historySize)
       {
         hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
-        if (hsCur > hs) // is it possible?
+        if (hsCur >= hs) // is it possible?
          hsCur = hs;
       }
     }
@@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
       return d;
     {
       const Byte *pb = cur - delta;
-      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
+      curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
      if (pb[maxLen] == cur[maxLen] && *pb == *cur)
      {
        UInt32 len = 0;
@@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
         break;
       {
         ptrdiff_t diff;
-        curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
+        curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
        diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
        if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
        {
@@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
   // if (curMatch >= pos) {  *ptr0 = *ptr1 = kEmptyHashValue;  return NULL; }
 
   cmCheck = (UInt32)(pos - _cyclicBufferSize);
-  if ((UInt32)pos <= _cyclicBufferSize)
+  if ((UInt32)pos < _cyclicBufferSize)
     cmCheck = 0;
 
   if (cmCheck < curMatch)
@@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
   {
     const UInt32 delta = pos - curMatch;
     {
-      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
+      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
       const Byte *pb = cur - delta;
       unsigned len = (len0 < len1 ? len0 : len1);
       const UInt32 pair0 = pair[0];
@@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
   UInt32 cmCheck;
 
   cmCheck = (UInt32)(pos - _cyclicBufferSize);
-  if ((UInt32)pos <= _cyclicBufferSize)
+  if ((UInt32)pos < _cyclicBufferSize)
     cmCheck = 0;
 
   if (// curMatch >= pos ||  // failure
@@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
   {
     const UInt32 delta = pos - curMatch;
     {
-      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
+      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
       const Byte *pb = cur - delta;
       unsigned len = (len0 < len1 ? len0 : len1);
       if (pb[len] == cur[len])
@@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
   UInt32 pos = p->pos; \
   UInt32 num2 = num; \
   /* (p->pos == p->posLimit) is not allowed here !!! */ \
-  { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
+  { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \
  num -= num2; \
  { const UInt32 cycPos = p->cyclicBufferPos; \
    son = p->son + cycPos; \
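The repeated rewrite from (delta > _cyclicBufferPos) to (_cyclicBufferPos < delta) does not change behavior (the two comparisons are equivalent); it normalizes the guard used when stepping back delta slots in the circular son array, where the unsigned subtraction must be corrected by one buffer size whenever it would wrap. The indexing pattern in isolation, as a minimal sketch:

    #include <assert.h>

    /* Step back `delta` slots from `pos` in a circular buffer of `size` slots;
       the same pattern LzFind uses to index son[]. */
    static unsigned cyclic_back(unsigned pos, unsigned delta, unsigned size)
    {
      assert(delta >= 1 && delta <= size);
      return pos - delta + (pos < delta ? size : 0);
    }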
3rdparty/lzma/src/LzFindMt.c (vendored, 10 changes)

@@ -1,5 +1,5 @@
 /* LzFindMt.c -- multithreaded Match finder for LZ algorithms
-2024-01-22 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes;
 Z7_NO_INLINE
 static void MtSync_Construct(CMtSync *p)
 {
+  p->affinityGroup = -1;
+  p->affinityInGroup = 0;
   p->affinity = 0;
   p->wasCreated = False;
   p->csWasInitialized = False;
@@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
   // return ERROR_TOO_MANY_POSTS; // for debug
   // return EINVAL; // for debug
 
+  #ifdef _WIN32
+  if (p->affinityGroup >= 0)
+    wres = Thread_Create_With_Group(&p->thread, startAddress, obj,
+        (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
+  else
+  #endif
   if (p->affinity != 0)
     wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
   else
3rdparty/lzma/src/Lzma2Enc.c (vendored, 4 changes)

@@ -1,5 +1,5 @@
 /* Lzma2Enc.c -- LZMA2 Encoder
-2023-04-13 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p)
   p->numBlockThreads_Reduced = -1;
   p->numBlockThreads_Max = -1;
   p->numTotalThreads = -1;
+  p->numThreadGroups = 0;
 }
 
 void Lzma2EncProps_Normalize(CLzma2EncProps *p)
@@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
     }
 
     p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max;
+    p->mtCoder.numThreadGroups = p->props.numThreadGroups;
     p->mtCoder.expectedDataSize = p->expectedDataSize;
 
     {
3rdparty/lzma/src/LzmaEnc.c (vendored, 22 changes)

@@ -1,5 +1,5 @@
 /* LzmaEnc.c -- LZMA Encoder
-2024-01-24: Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p)
   p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1;
   p->numHashOutBits = 0;
   p->writeEndMark = 0;
+  p->affinityGroup = -1;
   p->affinity = 0;
+  p->affinityInGroup = 0;
 }
 
 void LzmaEncProps_Normalize(CLzmaEncProps *p)
@@ -72,11 +74,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
   p->level = level;
 
   if (p->dictSize == 0)
-    p->dictSize =
-      ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) :
-      ( level <= 6 ? ((UInt32)1 << (level + 19)) :
-      ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26)
-      )));
+    p->dictSize = (unsigned)level <= 4 ?
+        (UInt32)1 << (level * 2 + 16) :
+        (unsigned)level <= sizeof(size_t) / 2 + 4 ?
+          (UInt32)1 << (level + 20) :
+          (UInt32)1 << (sizeof(size_t) / 2 + 24);
 
   if (p->dictSize > p->reduceSize)
   {
@@ -92,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
   if (p->lp < 0) p->lp = 0;
   if (p->pb < 0) p->pb = 2;
 
-  if (p->algo < 0) p->algo = (level < 5 ? 0 : 1);
-  if (p->fb < 0) p->fb = (level < 7 ? 32 : 64);
+  if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1;
+  if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64;
   if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
   if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5);
   if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1);
@@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2)
   p->multiThread = (props.numThreads > 1);
   p->matchFinderMt.btSync.affinity =
   p->matchFinderMt.hashSync.affinity = props.affinity;
+  p->matchFinderMt.btSync.affinityGroup =
+  p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup;
+  p->matchFinderMt.btSync.affinityInGroup =
+  p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup;
 #endif
 
   return SZ_OK;
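Worked through, the new default-dictionary rule picks 1 << (level*2 + 16) up to level 4, then 1 << (level + 20) up to level sizeof(size_t)/2 + 4 (level 8 on 64-bit builds, level 6 on 32-bit), then a fixed cap. A sketch that tabulates the defaults on the host platform:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    int main(void)
    {
      int level;
      for (level = 0; level <= 9; level++)
      {
        const uint32_t dictSize = (unsigned)level <= 4 ?
            (uint32_t)1 << (level * 2 + 16) :
            (unsigned)level <= sizeof(size_t) / 2 + 4 ?
              (uint32_t)1 << (level + 20) :
              (uint32_t)1 << (sizeof(size_t) / 2 + 24);
        /* e.g. on 64-bit: level 5 -> 32 MiB, level 9 -> 256 MiB */
        printf("level %d: %u bytes\n", level, dictSize);
      }
      return 0;
    }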
3rdparty/lzma/src/MtCoder.c (vendored, 59 changes)

@@ -1,5 +1,5 @@
 /* MtCoder.c -- Multi-thread Coder
-2023-09-07 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p)
 static THREAD_FUNC_DECL ThreadFunc(void *pp);
 
 
-static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
+static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
+    #ifdef _WIN32
+    , CMtCoder * const mtc
+    #endif
+    )
 {
   WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent);
+  // printf("\n====== MtCoderThread_CreateAndStart : \n");
   if (wres == 0)
   {
     t->stop = False;
     if (!Thread_WasCreated(&t->thread))
+    {
+      #ifdef _WIN32
+      if (mtc->numThreadGroups)
+        wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t,
+            ThreadNextGroup_GetNext(&mtc->nextGroup), // group
+            0); // affinityMask
+      else
+      #endif
       wres = Thread_Create(&t->thread, ThreadFunc, t);
+    }
     if (wres == 0)
       wres = Event_Set(&t->startEvent);
   }
@@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
 }
 
 
+Z7_FORCE_INLINE
 static void MtCoderThread_Destruct(CMtCoderThread *t)
 {
   if (Thread_WasCreated(&t->thread))
@@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
 
 static SRes ThreadFunc2(CMtCoderThread *t)
 {
-  CMtCoder *mtc = t->mtCoder;
+  CMtCoder * const mtc = t->mtCoder;
 
   for (;;)
   {
@@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t)
         if (mtc->numStartedThreads < mtc->numStartedThreadsLimit
             && mtc->expectedDataSize != readProcessed)
         {
-          res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]);
+          res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]
+              #ifdef _WIN32
+              , mtc
+              #endif
+              );
          if (res == SZ_OK)
            mtc->numStartedThreads++;
          else
@@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
     }
 
     {
-      CMtCoderBlock *block = &mtc->blocks[bi];
+      CMtCoderBlock * const block = &mtc->blocks[bi];
       block->res = res;
      block->bufIndex = bufIndex;
      block->finished = finished;
@@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
 
 static THREAD_FUNC_DECL ThreadFunc(void *pp)
 {
-  CMtCoderThread *t = (CMtCoderThread *)pp;
+  CMtCoderThread * const t = (CMtCoderThread *)pp;
   for (;;)
   {
     if (Event_Wait(&t->startEvent) != 0)
@@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
     if (t->stop)
       return 0;
     {
-      SRes res = ThreadFunc2(t);
+      const SRes res = ThreadFunc2(t);
       CMtCoder *mtc = t->mtCoder;
       if (res != SZ_OK)
       {
@@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
 
       #ifndef MTCODER_USE_WRITE_THREAD
       {
-        unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
+        const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
        if (numFinished == mtc->numStartedThreads)
          if (Event_Set(&mtc->finishedEvent) != 0)
            return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD;
@@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p)
 
   p->blockSize = 0;
   p->numThreadsMax = 0;
+  p->numThreadGroups = 0;
   p->expectedDataSize = (UInt64)(Int64)-1;
 
   p->inStream = NULL;
@@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p)
   unsigned i;
   SRes res = SZ_OK;
 
+  // printf("\n====== MtCoder_Code : \n");
+
   if (numThreads > MTCODER_THREADS_MAX)
     numThreads = MTCODER_THREADS_MAX;
   numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
@@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p)
 
   p->numStartedThreadsLimit = numThreads;
   p->numStartedThreads = 0;
+  ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup
 
   // for (i = 0; i < numThreads; i++)
   {
+    // here we create new thread for first block.
+    // And each new thread will create another new thread after block reading
+    // until numStartedThreadsLimit is reached.
    CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++];
-    RINOK(MtCoderThread_CreateAndStart(nextThread))
+    {
+      const SRes res2 = MtCoderThread_CreateAndStart(nextThread
+          #ifdef _WIN32
+          , p
+          #endif
+          );
+      RINOK(res2)
+    }
   }
 
   RINOK_THREAD(Event_Set(&p->readEvent))
@@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p)
     RINOK_THREAD(Event_Wait(&p->writeEvents[bi]))
 
     {
-      const CMtCoderBlock *block = &p->blocks[bi];
-      unsigned bufIndex = block->bufIndex;
-      BoolInt finished = block->finished;
+      const CMtCoderBlock * const block = &p->blocks[bi];
+      const unsigned bufIndex = block->bufIndex;
+      const BoolInt finished = block->finished;
      if (res == SZ_OK && block->res != SZ_OK)
        res = block->res;
 
@@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p)
   }
   #else
   {
-    WRes wres = Event_Wait(&p->finishedEvent);
+    const WRes wres = Event_Wait(&p->finishedEvent);
     res = MY_SRes_HRESULT_FROM_WRes(wres);
   }
   #endif
134
3rdparty/lzma/src/Sha256.c
vendored
134
3rdparty/lzma/src/Sha256.c
vendored
@ -1,18 +1,14 @@
/* Sha256.c -- SHA-256 Hash
2024-03-01 : Igor Pavlov : Public domain
: Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */

#include "Precomp.h"

#include <string.h>

#include "CpuArch.h"
#include "RotateDefs.h"
#include "Sha256.h"

#if defined(_MSC_VER) && (_MSC_VER < 1900)
// #define USE_MY_MM
#endif
#include "RotateDefs.h"
#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64
#if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW;

#define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks
#define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#else
#define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks
#endif
@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
return False;
#endif

p->func_UpdateBlocks = func;
p->v.vars.func_UpdateBlocks = func;
return True;
}

@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)

void Sha256_InitState(CSha256 *p)
{
p->count = 0;
p->v.vars.count = 0;
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p)
p->state[7] = 0x5be0cd19;
}








void Sha256_Init(CSha256 *p)
{
p->func_UpdateBlocks =
p->v.vars.func_UpdateBlocks =
#ifdef Z7_COMPILER_SHA256_SUPPORTED
g_SHA256_FUNC_UPDATE_BLOCKS;
#else
@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p)

#endif

// static
extern MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64] = {
extern
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64];
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@ -248,9 +249,12 @@ const UInt32 SHA256_K_ARRAY[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

#define K SHA256_K_ARRAY




#define K SHA256_K_ARRAY

Z7_NO_INLINE
void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
{
@ -260,15 +264,14 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
#else
[16];
#endif

unsigned j;

UInt32 a,b,c,d,e,f,g,h;

#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt32 tmp;
#endif

if (numBlocks == 0) return;

a = state[0];
b = state[1];
c = state[2];
@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g = state[6];
h = state[7];

while (numBlocks)
do
{

for (j = 0; j < 16; j += STEP_PRE)
@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g += state[6]; state[6] = g;
h += state[7]; state[7] = h;

data += 64;
numBlocks--;
data += SHA256_BLOCK_SIZE;
}
while (--numBlocks);
}

/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
}

#undef S0
#undef S1
#undef s0
#undef s1
#undef K

#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)

@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
{
if (size == 0)
return;

{
unsigned pos = (unsigned)p->count & 0x3F;
unsigned num;

p->count += size;

num = 64 - pos;
const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
const unsigned num = SHA256_BLOCK_SIZE - pos;
p->v.vars.count += size;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}

if (pos != 0)
{
size -= num;
@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
}
}
{
size_t numBlocks = size >> 6;
const size_t numBlocks = size >> 6;
// if (numBlocks)
SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= 0x3F;
size &= SHA256_BLOCK_SIZE - 1;
if (size == 0)
return;
data += (numBlocks << 6);
@ -408,53 +399,41 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)

void Sha256_Final(CSha256 *p, Byte *digest)
{
unsigned pos = (unsigned)p->count & 0x3F;
unsigned i;

unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80;

if (pos > (64 - 8))
if (pos > (SHA256_BLOCK_SIZE - 4 * 2))
{
while (pos != 64) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, 64 - pos);
while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos);
Sha256_UpdateBlock(p);
pos = 0;
}

/*
if (pos & 3)
memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos);
{
p->buffer[pos] = 0;
p->buffer[pos + 1] = 0;
p->buffer[pos + 2] = 0;
pos += 3;
pos &= ~3;
const UInt64 numBits = p->v.vars.count << 3;
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
}
{
for (; pos < 64 - 8; pos += 4)
*(UInt32 *)(&p->buffer[pos]) = 0;
}
*/

memset(&p->buffer[pos], 0, (64 - 8) - pos);

{
UInt64 numBits = (p->count << 3);
SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32))
SetBe32(p->buffer + 64 - 4, (UInt32)(numBits))
}

Sha256_UpdateBlock(p);

#if 1 && defined(MY_CPU_BE)
memcpy(digest, p->state, SHA256_DIGEST_SIZE);
#else
{
unsigned i;
for (i = 0; i < 8; i += 2)
{
UInt32 v0 = p->state[i];
UInt32 v1 = p->state[(size_t)i + 1];
const UInt32 v0 = p->state[i];
const UInt32 v1 = p->state[(size_t)i + 1];
SetBe32(digest    , v0)
SetBe32(digest + 4, v1)
digest += 8;
digest += 4 * 2;
}
}




#endif
Sha256_InitState(p);
}

@ -466,12 +445,9 @@ void Sha256Prepare(void)
f = Sha256_UpdateBlocks;
f_hw = NULL;
#ifdef MY_CPU_X86_OR_AMD64
#ifndef USE_MY_MM
if (CPU_IsSupported_SHA()
&& CPU_IsSupported_SSSE3()
// && CPU_IsSupported_SSE41()
)
#endif
#else
if (CPU_IsSupported_SHA2())
#endif
@ -484,6 +460,8 @@ void Sha256Prepare(void)
#endif
}

#undef U64C
#undef K
#undef S0
#undef S1
#undef s0

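Editor's note: the Sha256_Final rework above is the standard SHA-256 tail padding, now expressed through SHA256_BLOCK_SIZE: append 0x80, zero-fill, and store the message length in bits as a big-endian 64-bit value in the last 8 bytes. A minimal standalone sketch of that rule follows (plain C; this is not the library's code, and the function name and buffer layout are illustrative):

#include <stdint.h>
#include <string.h>

#define BLOCK 64

/* count = total bytes hashed so far; buf[0..(count & 63)-1] already holds
   the partial last block. Returns the padded length (64 or 128 bytes). */
static size_t pad_tail(uint8_t buf[BLOCK * 2], uint64_t count)
{
  size_t pos = (size_t)(count & (BLOCK - 1));
  const uint64_t numBits = count << 3;
  buf[pos++] = 0x80;
  /* if fewer than 8 bytes remain, the length spills into an extra block */
  size_t total = (pos > BLOCK - 8) ? BLOCK * 2 : BLOCK;
  memset(buf + pos, 0, total - 8 - pos);
  for (int i = 0; i < 8; i++)
    buf[total - 1 - i] = (uint8_t)(numBits >> (8 * i)); /* big-endian */
  return total; /* number of padded bytes to feed to the block function */
}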
164
3rdparty/lzma/src/Sha256Opt.c
vendored
@ -1,18 +1,11 @@
/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif

// #define Z7_USE_HW_SHA_STUB // for debug

#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#define USE_HW_SHA
@ -20,19 +13,14 @@
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(_INTEL_COMPILER)
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA__) || !defined(__SSSE3__)
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#endif
#endif
#elif defined(_MSC_VER)
#ifdef USE_MY_MM
#define USE_VER_MIN 1300
#else
#define USE_VER_MIN 1900
#endif
#if (_MSC_VER >= USE_VER_MIN)
#if (_MSC_VER >= 1900)
#define USE_HW_SHA
#else
#define Z7_USE_HW_SHA_STUB
@ -47,23 +35,20 @@

// #pragma message("Sha256 HW")




// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
// #if !defined(__SSSE3__)
// #endif
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else

#ifdef USE_MY_MM
#include "My_mm.h"
#endif

#endif

/*
@ -91,54 +76,38 @@ SHA:
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);

#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);

#define LOAD_SHUFFLE(m, k) \
m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
m = _mm_shuffle_epi8(m, mask); \

#define SM1(g0, g1, g2, g3) \
SHA256_MSG1(g3, g0); \
#define NNN(m0, m1, m2, m3)

#define SM2(g0, g1, g2, g3) \
tmp = _mm_alignr_epi8(g1, g0, 4); \
ADD_EPI32(g2, tmp) \
SHA25G_MSG2(g2, g1); \

// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)


#define NNN(g0, g1, g2, g3)
#define SM1(m1, m2, m3, m0) \
SHA256_MSG1(m0, m1); \

#define SM2(m2, m3, m0, m1) \
ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
SHA256_MSG2(m0, m3); \

#define RND2(t0, t1) \
t0 = _mm_sha256rnds2_epu32(t0, t1, msg);

#define RND2_0(m, k) \
msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \


#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
RND2(state0, state1); \
msg = _mm_shuffle_epi32(msg, 0x0E); \


#define RND2_1 \
OP0(m0, m1, m2, m3) \
RND2(state1, state0); \


// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2

#define R4(k, g0, g1, g2, g3, OP0, OP1) \
RND2_0(g0, k) \
OP0(g0, g1, g2, g3) \
RND2_1 \
OP1(g0, g1, g2, g3) \
OP1(m0, m1, m2, m3) \

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
@ -161,8 +130,9 @@ ATTRIB_SHA
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
__m128i tmp;
__m128i state0, state1;


__m128i tmp, state0, state1;

if (numBlocks == 0)
return;
@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif





#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else









#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
@ -324,41 +282,70 @@ typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x)
#define MY_rev32_for_LE(x) x
#else
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#define MY_rev32_for_LE(x) vrev32q_u8(x)
#endif

#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
#if 1 // 0 for debug
// for arm32: it works slower for some reason than direct code
/*
for arm32 it generates:
MSVC-2022, GCC-9:
vld1.32 {d18,d19}, [r10]
vst1.32 {d4,d5}, [r3]
vld1.8 {d20-d21}, [r4]
there is no align hint (like [r10:128]). So the instruction allows unaligned access
*/
#define LOAD_128_32(_p)      vld1q_u32(_p)
#define LOAD_128_8(_p)       vld1q_u8 (_p)
#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
#else
/*
for arm32:
MSVC-2022:
vldm r10,{d18,d19}
vstm r3,{d4,d5}
does it require strict alignment?
GCC-9:
vld1.64 {d30-d31}, [r0:64]
vldr d28, [r0, #16]
vldr d29, [r0, #24]
vst1.64 {d30-d31}, [r0:64]
vstr d28, [r0, #16]
vstr d29, [r0, #24]
there is a hint [r0:64], so does it require 64-bit alignment?
*/
#define LOAD_128_32(_p)      (*(const v128 *)(const void *)(_p))
#define LOAD_128_8(_p)       vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
#endif

#define LOAD_SHUFFLE(m, k) \
m = LOAD_128((data + (k) * 16)); \
MY_rev32_for_LE(m); \
m = vreinterpretq_u32_u8( \
    MY_rev32_for_LE( \
    LOAD_128_8(data + (k) * 16))); \

// K array must be aligned for 16-bytes at least.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)
#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
#define NNN(m0, m1, m2, m3)


#define R4(k, g0, g1, g2, g3, OP0, OP1) \
msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
tmp = state0; \
state0 = vsha256hq_u32( state0, state1, msg ); \
state1 = vsha256h2q_u32( state1, tmp, msg ); \
OP0(g0, g1, g2, g3); \
OP1(g0, g1, g2, g3); \
OP0(m0, m1, m2, m3); \
OP1(m0, m1, m2, m3); \


#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
if (numBlocks == 0)
return;

state0 = LOAD_128(&state[0]);
state1 = LOAD_128(&state[4]);
state0 = LOAD_128_32(&state[0]);
state1 = LOAD_128_32(&state[4]);

do
{
@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
}
while (--numBlocks);

STORE_128(&state[0], state0);
STORE_128(&state[4], state1);
STORE_128_32(&state[0], state0);
STORE_128_32(&state[4], state1);
}

#endif // USE_HW_SHA
@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#endif



#undef K
#undef RND2
#undef RND2_0
#undef RND2_1

#undef MY_rev32_for_LE

#undef NNN
#undef LOAD_128
#undef STORE_128
@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#undef SM1
#undef SM2

#undef NNN

#undef R4
#undef R16
#undef PREPARE_STATE

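Editor's note: on x86 the kernel above drives _mm_sha256rnds2_epu32, which performs two SHA-256 rounds per call; the comment in the diff notes that message-schedule updates run "3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2". A simplified sketch of one 4-round step, without that scheduling overlap (assumes a compiler with SHA extensions enabled, e.g. -msha -mssse3; not the tuned code above):

#include <immintrin.h>

/* One 4-round SHA-NI step: each _mm_sha256rnds2_epu32 call consumes the
   low two 32-bit lanes of (msg), so the high pair is shuffled down. */
static inline void sha256_4rounds(__m128i *state0, __m128i *state1,
                                  __m128i m, const __m128i k4)
{
  __m128i msg = _mm_add_epi32(m, k4);                     /* W[i..i+3] + K[i..i+3] */
  *state0 = _mm_sha256rnds2_epu32(*state0, *state1, msg); /* rounds i, i+1 */
  msg = _mm_shuffle_epi32(msg, 0x0E);                     /* lanes 2,3 -> 0,1 */
  *state1 = _mm_sha256rnds2_epu32(*state1, *state0, msg); /* rounds i+2, i+3 */
}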
359
3rdparty/lzma/src/Sort.c
vendored
@ -1,141 +1,268 @@
/* Sort.c -- Sort functions
2014-04-05 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Sort.h"
#include "CpuArch.h"

#define HeapSortDown(p, k, size, temp) \
  { for (;;) { \
    size_t s = (k << 1); \
    if (s > size) break; \
    if (s < size && p[s + 1] > p[s]) s++; \
    if (temp >= p[s]) break; \
    p[k] = p[s]; k = s; \
  } p[k] = temp; }
#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
   || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
    )
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
// #pragma message("Z7_PREFETCH : __builtin_prefetch")
#define Z7_PREFETCH(a) __builtin_prefetch((a))
#endif

#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)

#include "7zWindows.h"

// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate "prefetcht2" instruction for
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().

// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we will use PreFetchCacheLine(), only if we are sure that
// generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
 || defined(MY_CPU_ARM64) \
 || defined(MY_CPU_IA64)
// we need to use additional braces for (a) in PreFetchCacheLine call, because
// PreFetchCacheLine macro doesn't use braces:
// #define PreFetchCacheLine(l, a)  _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif

#endif // _WIN32


#define PREFETCH_NO(p,k,s,size)

#ifndef Z7_PREFETCH
#define SORT_PREFETCH(p,k,s,size)
#else

// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)

#if PREFETCH_LEVEL == 0

#define SORT_PREFETCH(p,k,s,size)

#else // PREFETCH_LEVEL != 0

void HeapSort(UInt32 *p, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (--i != 0);
}
/*
if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch one value per cache line.
Use it if array is aligned for cache line size (64 bytes)
or if array is small (less than L1 cache size).

if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch all cache lines that can be required.
it can be faster for big unaligned arrays.
*/
#define USE_PREFETCH_FOR_ALIGNED_ARRAY

// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif

#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// last offset that can be required in PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
#endif

#if PREFETCH_LEVEL <= 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2)); \
  }}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
      Z7_PREFETCH((p + s2)); \
  }}
#endif

#else // PREFETCH_LEVEL > 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2 - 16)); \
      Z7_PREFETCH((p + s2)); \
  }}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
  { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
    if (s2 <= size) { \
      Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
      Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
      Z7_PREFETCH((p + s2)); \
  }}
#endif

#endif // PREFETCH_LEVEL > 3
#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH
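Editor's note: SORT_PREFETCH hides memory latency by prefetching the region the down-sift visits one level later (with s == k * 2, the next iteration touches indices near s * 2). A minimal sketch of the idea using GCC/Clang builtins, applied to the simple pre-25.00 sift loop (1-based heap layout as in the code above; this is not the library's tuned macro):

static void sift_down_prefetch(unsigned *p, size_t k, size_t size, unsigned temp)
{
  /* p is treated as 1-based here: valid items are p[1..size] */
  for (;;)
  {
    size_t s = k * 2;
    if (s > size)
      break;
    if (s < size && p[s + 1] > p[s])
      s++;
    if (temp >= p[s])
      break;
#if defined(__GNUC__) || defined(__clang__)
    if (s * 2 <= size)
      __builtin_prefetch(&p[s * 2]); /* children of the next node visited */
#endif
    p[k] = p[s];
    k = s;
  }
  p[k] = temp;
}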


#if defined(MY_CPU_ARM64) \
  /* || defined(MY_CPU_AMD64) */ \
  /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov, if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif

#ifdef Z7_FAST_CMOV_SUPPORTED
// we want to use cmov here, if cmov is fast: new arm64 cpus.
// we want the compiler to use conditional move for this branch
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// use this branch, if cpu doesn't support fast conditional move.
// it uses slow array access reading:
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif

#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
  for (;;) { \
    UInt32 n0, n1; \
    size_t s = k * 2; \
    if (s >= size) { \
      if (s == size) { \
        n0 = p[s]; \
        p[k] = n0; \
        if (temp < n0) k = s; \
      } \
      break; \
    } \
    n0 = p[k * 2]; \
    n1 = p[k * 2 + 1]; \
    s += n0 < n1; \
    GET_MAX_VAL(n0, n1, p[s]) \
    if (temp >= n0) break; \
    macro_prefetch(p, k, s, size) \
    p[k] = n0; \
    k = s; \
  } \
  p[k] = temp; \
}


/*
stage-1 : O(n) :
we generate intermediate partially sorted binary tree:
p[0] : it's additional item for better alignment of tree structure in memory.
p[1]
p[2] p[3]
p[4] p[5] p[6] p[7]
...
p[x] >= p[x * 2]
p[x] >= p[x * 2 + 1]

stage-2 : O(n)*log2(N):
we move largest item p[0] from head of tree to the end of array
and insert last item to sorted binary tree.
*/

// (p) must be aligned for cache line size (64-bytes) for best performance

void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
if (size < 2)
return;
if (size == 2)
{
const UInt32 a0 = p[0];
const UInt32 a1 = p[1];
const unsigned k = a1 < a0;
p[k] = a0;
p[k ^ 1] = a1;
return;
}
{
// stage-1 : O(n)
// we transform array to partially sorted binary tree.
size_t i = --size / 2;
// (size) now is the index of the last item in tree,
// if (i)
{
do
{
const UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
while (--i);
}
{
const UInt32 temp = p[0];
const UInt32 a1 = p[1];
if (temp < a1)
{
size_t k = 1;
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
p[0] = a1;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
while (size > 1);
*/
while (size > 3)
{
UInt32 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
}
{
UInt32 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
}
}

void HeapSort64(UInt64 *p, size_t size)
if (size < 3)
{
if (size <= 1)
// size == 2
const UInt32 a0 = p[0];
p[0] = p[2];
p[2] = a0;
return;
p--;
}
if (size != 3)
{
size_t i = size / 2;
// stage-2 : O(size) * log2(size):
// we move largest item p[0] from head to the end of array,
// and insert last item to sorted binary tree.
do
{
UInt64 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (--i != 0);
}
/*
do
{
size_t k = 1;
UInt64 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
}
while (size > 1);
*/
while (size > 3)
{
UInt64 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
const UInt32 temp = p[size];
size_t k = p[2] < p[3] ? 3 : 2;
p[size--] = p[0];
p[0] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
}
while (size != 3);
}
{
UInt64 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
const UInt32 a2 = p[2];
const UInt32 a3 = p[3];
const size_t k = a2 < a3;
p[2] = p[1];
p[3] = p[0];
p[k] = a3;
p[k ^ 1] = a2;
}
}

/*
#define HeapSortRefDown(p, vals, n, size, temp) \
  { size_t k = n; UInt32 val = vals[temp]; for (;;) { \
    size_t s = (k << 1); \
    if (s > size) break; \
    if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
    if (val >= vals[p[s]]) break; \
    p[k] = p[s]; k = s; \
  } p[k] = temp; }

void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
HeapSortRefDown(p, vals, i, size, temp);
}
while (--i != 0);
}
do
{
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortRefDown(p, vals, 1, size, temp);
}
while (size > 1);
}
*/

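Editor's note: the stage-1/stage-2 comment above is easier to follow against a plain reference version of the same two-phase algorithm (0-based indexing here, no cmov/prefetch tuning; this is a sketch, not the optimized code in the diff):

#include <stddef.h>
#include <stdint.h>

static void sift(uint32_t *a, size_t root, size_t n)
{
  const uint32_t v = a[root];
  size_t k = root;
  for (;;)
  {
    size_t s = k * 2 + 1;                 /* left child in 0-based layout */
    if (s >= n) break;
    if (s + 1 < n && a[s + 1] > a[s]) s++;
    if (v >= a[s]) break;
    a[k] = a[s];
    k = s;
  }
  a[k] = v;
}

static void heap_sort_ref(uint32_t *a, size_t n)
{
  for (size_t i = n / 2; i-- > 0; )       /* stage-1: O(n) heapify */
    sift(a, i, n);
  while (n > 1)                           /* stage-2: O(n log n) extraction */
  {
    const uint32_t top = a[0];            /* move the maximum to the tail */
    a[0] = a[--n];
    a[n] = top;
    sift(a, 0, n);                        /* re-insert the old tail item */
  }
}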
235
3rdparty/lzma/src/Threads.c
vendored
@ -1,5 +1,5 @@
/* Threads.c -- multithreading library
2024-03-28 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */

#include "Precomp.h"

@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p)
return (res != 0 ? res : res2);
}

typedef struct MY_PROCESSOR_NUMBER {
WORD Group;
BYTE Number;
BYTE Reserved;
} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER;

typedef struct MY_GROUP_AFFINITY {
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000)
// KAFFINITY is not defined in old mingw
ULONG_PTR
#else
KAFFINITY
#endif
  Mask;
WORD Group;
WORD Reserved[3];
} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY;

typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)(
    HANDLE hThread,
    CONST MY_GROUP_AFFINITY *GroupAffinity,
    MY_PGROUP_AFFINITY PreviousGroupAffinity);

typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)(
    HANDLE hThread,
    MY_PGROUP_AFFINITY GroupAffinity);

typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)(
    HANDLE hProcess,
    PUSHORT GroupCount,
    PUSHORT GroupArray);

Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION

#if 0
#include <stdio.h>
#define PRF(x) x
/*
--
before call of SetThreadGroupAffinity():
GetProcessGroupAffinity returns one group.
after call of SetThreadGroupAffinity():
GetProcessGroupAffinity returns more than one group,
if SetThreadGroupAffinity() was called with another group.
--
GetProcessAffinityMask MS DOCs:
{
  If the calling process contains threads in multiple groups,
  the function returns zero for both affinity masks.
}
but tests in win10 with 2 groups (less than 64 cores total):
GetProcessAffinityMask() still returns non-zero affinity masks
even after SetThreadGroupAffinity() calls.
*/
static void PrintProcess_Info()
{
{
const
Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity =
    (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
    "GetProcessGroupAffinity");
if (fn_GetProcessGroupAffinity)
{
unsigned i;
USHORT GroupCounts[64];
USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts);
BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(),
    &GroupCount, GroupCounts);
printf("\n====== GetProcessGroupAffinity : "
    "boolRes=%u GroupCounts = %u :",
    boolRes, (unsigned)GroupCount);
for (i = 0; i < GroupCount; i++)
  printf(" %u", GroupCounts[i]);
printf("\n");
}
}
{
DWORD_PTR processAffinityMask, systemAffinityMask;
if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask))
{
PRF(printf("\n====== GetProcessAffinityMask : "
    ": processAffinityMask=%x, systemAffinityMask=%x\n",
    (UInt32)processAffinityMask, (UInt32)systemAffinityMask);)
}
else
printf("\n==GetProcessAffinityMask FAIL");
}
}
#else
#ifndef USE_THREADS_CreateThread
// #define PRF(x)
#endif
#endif

WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
{
/* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
@ -73,6 +167,42 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
unsigned threadId;
*p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId));

#if 0 // 1 : for debug
{
DWORD_PTR prevMask;
DWORD_PTR affinity = 1 << 0;
prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity);
prevMask = prevMask;
}
#endif
#if 0 // 1 : for debug
{
/* win10: new thread will be created in same group that is assigned to parent thread,
   but affinity mask will contain all allowed threads of that group,
   even if affinity mask of parent group is not full.
   win11: in what group will it be created, if we have set
   the affinity of the parent thread with ThreadGroupAffinity?
*/
const
Func_GetThreadGroupAffinity fn =
    (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
    "GetThreadGroupAffinity");
if (fn)
{
// BOOL wres2;
MY_GROUP_AFFINITY groupAffinity;
memset(&groupAffinity, 0, sizeof(groupAffinity));
/* wres2 = */ fn(*p, &groupAffinity);
PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): "
    "wres2_BOOL = %u, group=%u mask=%x\n",
    GetCurrentThreadId(),
    wres2,
    groupAffinity.Group,
    (UInt32)groupAffinity.Mask);)
}
}
#endif

#endif

/* maybe we must use errno here, but probably GetLastError() is also OK. */
@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
*/
}
{
DWORD prevSuspendCount = ResumeThread(h);
const DWORD prevSuspendCount = ResumeThread(h);
/* ResumeThread() returns:
   0 : was_not_suspended
   1 : was_resumed
  -1 : error
*/
if (prevSuspendCount == (DWORD)-1)
wres = GetError();
}
}

/* maybe we must use errno here, but probably GetLastError() is also OK. */
return wres;

#endif
}


WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask)
{
#ifdef USE_THREADS_CreateThread

UNUSED_VAR(group)
UNUSED_VAR(affinityMask)
return Thread_Create(p, func, param);

#else

/* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
HANDLE h;
WRes wres;
unsigned threadId;
h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId));
*p = h;
wres = HandleToWRes(h);
if (h)
{
// PrintProcess_Info();
{
const
Func_SetThreadGroupAffinity fn =
    (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
    "SetThreadGroupAffinity");
if (fn)
{
// WRes wres2;
MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity;
memset(&groupAffinity, 0, sizeof(groupAffinity));
// groupAffinity.Mask must use only bits that are supported by the current group
// (groupAffinity.Mask = 0) means all allowed bits
groupAffinity.Mask = affinityMask;
groupAffinity.Group = (WORD)group;
// wres2 =
fn(h, &groupAffinity, &prev_groupAffinity);
/*
if (groupAffinity.Group == prev_groupAffinity.Group)
  wres2 = wres2;
else
  wres2 = wres2;
if (wres2 == 0)
{
  wres2 = GetError();
  PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);)
}
else
{
  PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()"
      " threadId = %6u"
      " group=%u mask=%x\n",
      threadId,
      prev_groupAffinity.Group,
      (UInt32)prev_groupAffinity.Mask);)
}
*/
}
}
{
const DWORD prevSuspendCount = ResumeThread(h);
/* ResumeThread() returns:
   0 : was_not_suspended
   1 : was_resumed
@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
return Thread_Create_With_CpuSet(p, func, param, NULL);
}

/*
WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity)
{
UNUSED_VAR(group)
return Thread_Create_With_Affinity(p, func, param, affinity);
}
*/

WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity)
{
@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
return AutoResetEvent_CreateNotSignaled(p);
}

void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup)
{
// printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup);
if (numGroups == 0)
    numGroups = 1;
p->NumGroups = numGroups;
p->NextGroup = startGroup % numGroups;
}


UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p)
{
const UInt32 next = p->NextGroup;
p->NextGroup = (next + 1) % p->NumGroups;
return next;
}

#undef PRF
#undef Print

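Editor's note: ThreadNextGroup_Init / ThreadNextGroup_GetNext above are a simple round-robin dispenser of processor-group indices, meant to pair with the new Thread_Create_With_Group. A hedged usage sketch (the library types and functions are those shown in the diff; the worker function, contexts, and counts are illustrative):

static WRes StartWorkers(CThread *threads, void **contexts, unsigned numThreads,
                         THREAD_FUNC_TYPE workerFunc, UInt32 numGroups)
{
  unsigned i;
  CThreadNextGroup nextGroup;
  ThreadNextGroup_Init(&nextGroup, numGroups, /* startGroup */ 0);
  for (i = 0; i < numThreads; i++)
  {
    /* affinityMask == 0 means "all allowed bits" (see the comment above) */
    const WRes wres = Thread_Create_With_Group(&threads[i], workerFunc,
        contexts[i], (unsigned)ThreadNextGroup_GetNext(&nextGroup), /* groups cycle 0,1,...,numGroups-1 */
        /* affinityMask */ 0);
    if (wres != 0)
      return wres;
  }
  return 0;
}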
4
3rdparty/lzma/src/XzCrc64Opt.c
vendored
@ -1,5 +1,5 @@
/* XzCrc64Opt.c -- CRC64 calculation (optimized functions)
2023-12-08 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */

#include "Precomp.h"

@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE)
v = Q32BE(1, w1) ^ Q32BE(0, w0);
v ^= Q32BE(3, d1) ^ Q32BE(2, d0);
#endif
#elif
#else
#error Stop_Compiling_Bad_CRC64_NUM_TABLES
#endif
p += Z7_CRC64_NUM_TABLES_USE;

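Editor's note: the one-line fix above replaces a bare `#elif` with `#else` before the `#error` guard; an `#elif` with no condition is rejected by the preprocessor once that branch is reached. A minimal sketch of the dispatch-with-fallback pattern being fixed (the specific table counts here are illustrative, not the library's actual set):

#if   Z7_CRC64_NUM_TABLES_USE == 1
  /* byte-at-a-time variant */
#elif Z7_CRC64_NUM_TABLES_USE == 4
  /* 4-table variant */
#else
  /* the fallback must be #else, never a bare #elif */
  #error Stop_Compiling_Bad_CRC64_NUM_TABLES
#endif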
29
3rdparty/lzma/src/XzDec.c
vendored
@ -1,5 +1,5 @@
/* XzDec.c -- Xz Decode
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */

#include "Precomp.h"

@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value)

for (i = 0; i < limit;)
{
Byte b = p[i];
const unsigned b = p[i];
*value |= (UInt64)(b & 0x7F) << (7 * i++);
if ((b & 0x80) == 0)
return (b == 0 && i != 1) ? 0 : i;
@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf)

static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf)
{
return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2)
    && GetUi32(buf) == CrcCalc(buf + 4, 6)
    && flags == GetBe16(buf + 8)
    && buf[10] == XZ_FOOTER_SIG_0
    && buf[11] == XZ_FOOTER_SIG_1;
return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2)
    && GetUi32a(buf) == CrcCalc(buf + 4, 6)
    && flags == GetBe16a(buf + 8)
    && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8));
}

#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks);
p->indexPos = p->indexPreSize;
p->indexSize += p->indexPreSize;
Sha256_Final(&p->sha, p->shaDigest);
Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32);
Sha256_Init(&p->sha);
p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize);
p->state = XZ_STATE_STREAM_INDEX;
@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
break;
}
{
Byte digest[XZ_CHECK_SIZE_MAX];
UInt32 digest32[XZ_CHECK_SIZE_MAX / 4];
p->state = XZ_STATE_BLOCK_HEADER;
p->pos = 0;
if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0)
if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0)
return SZ_ERROR_CRC;
if (p->decodeOnlyOneBlock)
{
@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
}
else
{
Byte digest[SHA256_DIGEST_SIZE];
UInt32 digest32[SHA256_DIGEST_SIZE / 4];
p->state = XZ_STATE_STREAM_INDEX_CRC;
p->indexSize += 4;
p->pos = 0;
Sha256_Final(&p->sha, digest);
if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0)
Sha256_Final(&p->sha, (void *)digest32);
if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0)
return SZ_ERROR_CRC;
}
}
@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
const Byte *ptr = p->buf;
p->state = XZ_STATE_STREAM_FOOTER;
p->pos = 0;
if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr))
if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr))
return SZ_ERROR_CRC;
}
break;
@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
{
if (*src != 0)
{
if (((UInt32)p->padSize & 3) != 0)
if ((unsigned)p->padSize & 3)
return SZ_ERROR_NO_ARCHIVE;
p->pos = 0;
p->state = XZ_STATE_STREAM_HEADER;

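Editor's note: the Xz_ReadVarInt loop shown above decodes xz's variable-length integers: 7 payload bits per byte, least-significant group first, with bit 0x80 set on every byte except the last. A standalone sketch of the same coding (the 9-byte cap and the "b == 0 && i != 1" rejection of non-minimal encodings mirror the loop in the diff):

#include <stddef.h>
#include <stdint.h>

/* Returns the number of bytes consumed, or 0 on error. */
static unsigned read_varint(const uint8_t *p, size_t maxSize, uint64_t *value)
{
  unsigned i;
  const unsigned limit = maxSize > 9 ? 9 : (unsigned)maxSize;
  *value = 0;
  for (i = 0; i < limit;)
  {
    const unsigned b = p[i];
    *value |= (uint64_t)(b & 0x7F) << (7 * i++);
    if ((b & 0x80) == 0)
      return (b == 0 && i != 1) ? 0 : i; /* reject non-minimal encodings */
  }
  return 0; /* ran past the buffer or the 9-byte limit */
}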
8
3rdparty/lzma/src/XzEnc.c
vendored
@ -1,5 +1,5 @@
/* XzEnc.c -- Xz Encode
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */

#include "Precomp.h"

@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size)
}
}

Z7_FORCE_INLINE
static void SeqInFilter_Construct(CSeqInFilter *p)
{
p->buf = NULL;
@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p)
p->vt.Read = SeqInFilter_Read;
}

Z7_FORCE_INLINE
static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc)
{
if (p->StateCoder.p)
@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p)
void XzProps_Init(CXzProps *p)
{
p->checkId = XZ_CHECK_CRC32;
p->numThreadGroups = 0;
p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO;
p->numBlockThreads_Reduced = -1;
p->numBlockThreads_Max = -1;
@ -689,6 +692,7 @@ typedef struct
} CLzma2WithFilters;


Z7_FORCE_INLINE
static void Lzma2WithFilters_Construct(CLzma2WithFilters *p)
{
p->lzma2 = NULL;
@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz
}


Z7_FORCE_INLINE
static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc)
{
#ifdef USE_SUBBLOCK
@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in
}

p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max;
p->mtCoder.numThreadGroups = props->numThreadGroups;
p->mtCoder.expectedDataSize = p->expectedDataSize;

RINOK(MtCoder_Code(&p->mtCoder))

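Editor's note: the new numThreadGroups property defaults to 0 in XzProps_Init, and XzEnc_Encode forwards it to mtCoder.numThreadGroups. A hedged setup sketch (the field names are from the diff; the values chosen here are purely illustrative, not recommendations):

static void SetupXzProps(CXzProps *props)
{
  XzProps_Init(props);            /* props->numThreadGroups == 0 by default */
  props->numBlockThreads_Max = 8; /* illustrative thread count */
  props->numThreadGroups = 2;     /* spread block threads over 2 processor groups */
}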
257
3rdparty/lzma/src/XzIn.c
vendored
@ -1,38 +1,39 @@
/* XzIn.c - Xz input
2023-09-07 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */

#include "Precomp.h"

#include <string.h>

#include "7zCrc.h"
#include "CpuArch.h"
#include "Xz.h"
#include "CpuArch.h"

/*
#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0)
*/
#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1)

#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \
    (GetUi16a((const Byte *)(const void *)(p) + 10) == \
    (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)))

SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream)
{
Byte sig[XZ_STREAM_HEADER_SIZE];
UInt32 data32[XZ_STREAM_HEADER_SIZE / 4];
size_t processedSize = XZ_STREAM_HEADER_SIZE;
RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize))
RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize))
if (processedSize != XZ_STREAM_HEADER_SIZE
    || memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0)
    || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0)
  return SZ_ERROR_NO_ARCHIVE;
return Xz_ParseHeader(p, sig);
return Xz_ParseHeader(p, (const Byte *)(const void *)data32);
}

#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
  { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \
#define READ_VARINT_AND_CHECK(buf, size, res) \
  { const unsigned s = Xz_ReadVarInt(buf, size, res); \
    if (s == 0) return SZ_ERROR_ARCHIVE; \
    pos += s; }
    size -= s; \
    buf += s; \
  }

SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes)
{
MY_ALIGN(4)
Byte header[XZ_BLOCK_HEADER_SIZE_MAX];
unsigned headerSize;
*headerSizeRes = 0;
@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
return XzBlock_Parse(p, header);
}


#define ADD_SIZE_CHECK(size, val) \
  { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; }
  { const UInt64 newSize = size + (val); \
    if (newSize < size) return XZ_SIZE_OVERFLOW; \
    size = newSize; \
  }

UInt64 Xz_GetUnpackSize(const CXzStream *p)
{
@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p)
return size;
}

/*
SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream)
{
return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f));
}
*/

static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
// input:
// CXzStream (p) is empty object.
// size != 0
// (size & 3) == 0
// (buf) is aligned for at least 4 bytes.
// output:
// p->numBlocks is number of allocated items in p->blocks
// p->blocks[*] values must be ignored, if function returns error.
static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
{
size_t numBlocks, pos = 1;
UInt32 crc;

size_t numBlocks;
if (size < 5 || buf[0] != 0)
  return SZ_ERROR_ARCHIVE;

size -= 4;
crc = CrcCalc(buf, size);
if (crc != GetUi32(buf + size))
{
const UInt32 crc = CrcCalc(buf, size);
if (crc != GetUi32a(buf + size))
  return SZ_ERROR_ARCHIVE;

}
buf++;
size--;
{
UInt64 numBlocks64;
READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64)
READ_VARINT_AND_CHECK(buf, size, &numBlocks64)
// (numBlocks64) is 63-bit value, so we can calculate (numBlocks64 * 2):
if (numBlocks64 * 2 > size)
  return SZ_ERROR_ARCHIVE;
if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes))
  return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
numBlocks = (size_t)numBlocks64;
if (numBlocks != numBlocks64 || numBlocks * 2 > size)
  return SZ_ERROR_ARCHIVE;
}

Xz_Free(p, alloc);
if (numBlocks != 0)
// Xz_Free(p, alloc); // it's optional, because (p) is empty already
if (numBlocks)
{
size_t i;
p->numBlocks = numBlocks;
p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
if (!p->blocks)
CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
if (!blocks)
  return SZ_ERROR_MEM;
for (i = 0; i < numBlocks; i++)
p->blocks = blocks;
p->numBlocks = numBlocks;
// the caller will call Xz_Free() in case of error
do
{
CXzBlockSizes *block = &p->blocks[i];
READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize)
READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize)
if (block->totalSize == 0)
READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize)
READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize)
if (blocks->totalSize == 0)
  return SZ_ERROR_ARCHIVE;
blocks++;
}
while (--numBlocks);
}
while ((pos & 3) != 0)
  if (buf[pos++] != 0)
if (size >= 4)
  return SZ_ERROR_ARCHIVE;
return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE;
while (size)
  if (buf[--size])
    return SZ_ERROR_ARCHIVE;
return SZ_OK;
}


/*
static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc)
{
SRes res;
size_t size;
Byte *buf;
if (indexSize > ((UInt32)1 << 31))
  return SZ_ERROR_UNSUPPORTED;
if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
  return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
size = (size_t)indexSize;
if (size != indexSize)
  return SZ_ERROR_UNSUPPORTED;
buf = (Byte *)ISzAlloc_Alloc(alloc, size);
if (!buf)
  return SZ_ERROR_MEM;
res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
if (res == SZ_OK)
  res = Xz_ReadIndex2(p, buf, size, alloc);
  res = Xz_ParseIndex(p, buf, size, alloc);
ISzAlloc_Free(alloc, buf);
return res;
}
*/

static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size)
{
@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset,
/* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */
}


/*
in:
  (*startOffset) is the position in (stream) where the xz_stream must be finished.
out:
  if it returns SZ_OK, then (*startOffset) is the position in stream that shows the start of the xz_stream.
*/
static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc)
{
UInt64 indexSize;
Byte buf[XZ_STREAM_FOOTER_SIZE];
#define TEMP_BUF_SIZE (1 << 10)
UInt32 buf32[TEMP_BUF_SIZE / 4];
UInt64 pos = (UInt64)*startOffset;

if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE)
if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE)
  return SZ_ERROR_NO_ARCHIVE;

pos -= XZ_STREAM_FOOTER_SIZE;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE))
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))

if (!XZ_FOOTER_SIG_CHECK(buf + 10))
if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
{
UInt32 total = 0;
pos += XZ_STREAM_FOOTER_SIZE;

for (;;)
{
size_t i;
#define TEMP_BUF_SIZE (1 << 10)
Byte temp[TEMP_BUF_SIZE];

i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos;
// pos != 0
// (pos & 3) == 0
size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos;
pos -= i;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i))
total += (UInt32)i;
for (; i != 0; i--)
  if (temp[i - 1] != 0)
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i))
i /= 4;
do
  if (buf32[i - 1] != 0)
    break;
if (i != 0)
{
  if ((i & 3) != 0)
    return SZ_ERROR_NO_ARCHIVE;
  pos += i;
  break;
}
if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16))
  return SZ_ERROR_NO_ARCHIVE;
}
while (--i);

pos += i * 4;
#define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16)
// here we don't support the rare case with big padding for the xz stream.
// so we have a padding limit for backward reading.
if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX)
  return SZ_ERROR_NO_ARCHIVE;
if (i)
  break;
}
// we try to open the xz stream after skipping zero padding.
// ((UInt64)*startOffset == pos) is possible here!
if (pos < XZ_STREAM_FOOTER_SIZE)
  return SZ_ERROR_NO_ARCHIVE;
pos -= XZ_STREAM_FOOTER_SIZE;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_SIG_CHECK(buf + 10))
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
  return SZ_ERROR_NO_ARCHIVE;
}

p->flags = (CXzStreamFlags)GetBe16(buf + 8);

p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2);
if (!XzFlags_IsSupported(p->flags))
  return SZ_ERROR_UNSUPPORTED;

{
/* to eliminate GCC 6.3 warning:
   dereferencing type-punned pointer will break strict-aliasing rules */
const Byte *buf_ptr = buf;
if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6))
const UInt32 *buf_ptr = buf32;
if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6))
  return SZ_ERROR_ARCHIVE;
}

indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2;

{
const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2;
if (pos < indexSize)
  return SZ_ERROR_ARCHIVE;

pos -= indexSize;
// v25.00: relaxed indexSize check. We allow big index table.
// if (indexSize > ((UInt32)1 << 31))
if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
  return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
RINOK(LookInStream_SeekTo(stream, pos))
RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))

// RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
{
UInt64 totalSize = Xz_GetPackSize(p);
if (totalSize == XZ_SIZE_OVERFLOW
    || totalSize >= ((UInt64)1 << 63)
    || pos < totalSize + XZ_STREAM_HEADER_SIZE)
SRes res;
const size_t size = (size_t)indexSize;
// if (size != indexSize) return SZ_ERROR_UNSUPPORTED;
Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size);
if (!buf)
  return SZ_ERROR_MEM;
res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
if (res == SZ_OK)
  res = Xz_ParseIndex(p, buf, size, alloc);
ISzAlloc_Free(alloc, buf);
RINOK(res)
}
}
{
UInt64 total = Xz_GetPackSize(p);
if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63))
  return SZ_ERROR_ARCHIVE;
pos -= (totalSize + XZ_STREAM_HEADER_SIZE);
total += XZ_STREAM_HEADER_SIZE;
if (pos < total)
  return SZ_ERROR_ARCHIVE;
pos -= total;
RINOK(LookInStream_SeekTo(stream, pos))
*startOffset = (Int64)pos;
}
@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
CSecToRead secToRead;
SecToRead_CreateVTable(&secToRead);
secToRead.realStream = stream;

RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt))
return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE;
}
@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO

void Xzs_Construct(CXzs *p)
{
p->num = p->numAllocated = 0;
p->streams = 0;
Xzs_CONSTRUCT(p)
}

void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
  Xz_Free(&p->streams[i], alloc);
ISzAlloc_Free(alloc, p->streams);
p->num = p->numAllocated = 0;
p->streams = 0;
p->streams = NULL;
}

UInt64 Xzs_GetNumBlocks(const CXzs *p)
@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p)
SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc)
{
Int64 endOffset = 0;
// it's supposed that the CXzs object is empty here.
// if the CXzs object is not empty, it will add new streams to that non-empty object.
// Xzs_Free(p, alloc); // it's an optional call to empty the CXzs object.
RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END))
*startOffset = endOffset;
for (;;)
{
CXzStream st;
SRes res;
Xz_Construct(&st);
Xz_CONSTRUCT(&st)
res = Xz_ReadBackward(&st, stream, startOffset, alloc);
// if (res == SZ_OK), then (*startOffset) is the start offset of the new stream
// if (res != SZ_OK), then (*startOffset) is unchanged or it's the expected start offset of the stream with the error
st.startOffset = (UInt64)*startOffset;
RINOK(res)
// we must store the (st) object to the array, or we must free the (st) local object.
if (res != SZ_OK)
{
Xz_Free(&st, alloc);
return res;
}
if (p->num == p->numAllocated)
{
const size_t newNum = p->num + p->num / 4 + 1;
void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream));
if (!data)
{
Xz_Free(&st, alloc);
return SZ_ERROR_MEM;
}
p->numAllocated = newNum;
if (p->num != 0)
  memcpy(data, p->streams, p->num * sizeof(CXzStream));
ISzAlloc_Free(alloc, p->streams);
p->streams = (CXzStream *)data;
}
// we use direct copying of raw data from the local variable (st) to the object in the array.
// so we don't need to call Xz_Free(&st, alloc) after copying and after p->num++
p->streams[p->num++] = st;
if (*startOffset == 0)
  break;
RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
return SZ_OK;
// seek operation is optional:
// RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK)
  return SZ_ERROR_PROGRESS;
}
return SZ_OK;
}

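Editor's note: Xz_ReadBackward above locates a stream from its end by parsing the 12-byte xz stream footer; the layout implied by the GetUi32a/GetBe16a calls is the standard one. A hedged standalone sketch using plain byte reads (the helper names here are illustrative):

/* 12-byte xz stream footer:
     bytes 0..3  : CRC32 of bytes 4..9
     bytes 4..7  : backwardSize b, little-endian; index size = ((UInt64)b + 1) << 2
     bytes 8..9  : stream flags
     bytes 10..11: footer magic 'Y','Z' */
#include <stdint.h>

static int footer_sig_ok(const uint8_t *f12)
{
  return f12[10] == 'Y' && f12[11] == 'Z';
}

static uint64_t footer_index_size(const uint8_t *f12)
{
  const uint32_t b = (uint32_t)f12[4] | ((uint32_t)f12[5] << 8)
                   | ((uint32_t)f12[6] << 16) | ((uint32_t)f12[7] << 24);
  return ((uint64_t)b + 1) << 2;
}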