3rdparty: Update LZMA/7zipSDK to 25.00

JordanTheToaster authored on 2025-07-06 00:40:37 +01:00; committed by lightningterror
parent 97ea52a6c1
commit a14c8eb7d5
28 changed files with 1238 additions and 611 deletions

View File

@ -1,7 +1,7 @@
#define MY_VER_MAJOR 24
#define MY_VER_MINOR 8
#define MY_VER_MAJOR 25
#define MY_VER_MINOR 0
#define MY_VER_BUILD 0
#define MY_VERSION_NUMBERS "24.08"
#define MY_VERSION_NUMBERS "25.00"
#define MY_VERSION MY_VERSION_NUMBERS
#ifdef MY_CPU_NAME
@ -10,12 +10,12 @@
#define MY_VERSION_CPU MY_VERSION
#endif
#define MY_DATE "2024-08-11"
#define MY_DATE "2025-07-05"
#undef MY_COPYRIGHT
#undef MY_VERSION_COPYRIGHT_DATE
#define MY_AUTHOR_NAME "Igor Pavlov"
#define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov"
#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov"
#ifdef USE_COPYRIGHT_CR
#define MY_COPYRIGHT MY_COPYRIGHT_CR

View File

@ -1,5 +1,5 @@
/* Compiler.h : Compiler specific defines and pragmas
2024-01-22 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_COMPILER_H
#define ZIP7_INC_COMPILER_H
@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void);
#define Z7_ATTRIB_NO_VECTORIZE
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )")
#define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )")
#else
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
#define Z7_PRAGMA_OPTIMIZE_DEFAULT
#endif
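The pragma wrappers added above expand to MSVC's optimize pragma on MSVC 2019+ (Z7_MSC_VER_ORIGINAL >= 1920) and to nothing elsewhere. A sketch of how such wrappers are typically used; the helper function below is invented for the example and is not part of the SDK:

Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
static void PrintUsage_coldPath(void)   /* hypothetical rarely-executed helper */
{
  /* compiled for minimal size on MSVC; other compilers see plain code */
}
Z7_PRAGMA_OPTIMIZE_DEFAULT   /* restore the default optimization settings */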
#if defined(MY_CPU_X86_OR_AMD64) && ( \
defined(__clang__) && (__clang_major__ >= 4) \
|| defined(__GNUC__) && (__GNUC__ >= 5))

View File

@ -1,5 +1,5 @@
/* CpuArch.h -- CPU specific code
2024-06-17 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#ifndef ZIP7_INC_CPU_ARCH_H
#define ZIP7_INC_CPU_ARCH_H
@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#define MY_CPU_SIZEOF_POINTER 4
#endif
#if defined(__SSE2__) \
|| defined(MY_CPU_AMD64) \
|| defined(_M_IX86_FP) && (_M_IX86_FP >= 2)
#define MY_CPU_SSE2
#endif
#if defined(_M_ARM64) \
|| defined(_M_ARM64EC) \
@ -509,11 +515,19 @@ problem-4 : performace:
#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
#if 0
// Z7_BSWAP16 can be slow for x86-msvc
#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p)))
#else
#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16)
#endif
#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }
#if defined(MY_CPU_LE_UNALIGN_64)
#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); }
#endif
#else
@ -536,21 +550,39 @@ problem-4 : performace:
#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
#endif
#ifndef SetBe64
#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \
_ppp_[0] = (Byte)(_vvv_ >> 56); \
_ppp_[1] = (Byte)(_vvv_ >> 48); \
_ppp_[2] = (Byte)(_vvv_ >> 40); \
_ppp_[3] = (Byte)(_vvv_ >> 32); \
_ppp_[4] = (Byte)(_vvv_ >> 24); \
_ppp_[5] = (Byte)(_vvv_ >> 16); \
_ppp_[6] = (Byte)(_vvv_ >> 8); \
_ppp_[7] = (Byte)_vvv_; }
#endif
#ifndef GetBe16
#ifdef GetBe16_to32
#define GetBe16(p) ( (UInt16) GetBe16_to32(p))
#else
#define GetBe16(p) ( (UInt16) ( \
((UInt16)((const Byte *)(p))[0] << 8) | \
((const Byte *)(p))[1] ))
#endif
#endif
#if defined(MY_CPU_BE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_NATIVE_TO_BE_32(v) (v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8))
#elif defined(MY_CPU_LE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8))
#else
#error Stop_Compiling_Unknown_Endian_CONV
#endif
@ -589,6 +621,11 @@ problem-4 : performace:
#endif
#ifndef GetBe16_to32
#define GetBe16_to32(p) GetBe16(p)
#endif
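As a small orientation example for the byte-order helpers above (GetBe32 / GetBe16_to32 come from this header; the 6-byte field layout is invented for the example):

#include "CpuArch.h"

static UInt32 ReadTag_example(const Byte *buf)
{
  const UInt32 magic = GetBe32(buf);           /* bytes 0..3, big-endian */
  const UInt32 flags = GetBe16_to32(buf + 4);  /* bytes 4..5, zero-extended to 32-bit */
  return magic ^ flags;
}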
#if defined(MY_CPU_X86_OR_AMD64) \
|| defined(MY_CPU_ARM_OR_ARM64) \
|| defined(MY_CPU_PPC_OR_PPC64)
@ -617,6 +654,7 @@ BoolInt CPU_IsSupported_SSE2(void);
BoolInt CPU_IsSupported_SSSE3(void);
BoolInt CPU_IsSupported_SSE41(void);
BoolInt CPU_IsSupported_SHA(void);
BoolInt CPU_IsSupported_SHA512(void);
BoolInt CPU_IsSupported_PageGB(void);
#elif defined(MY_CPU_ARM_OR_ARM64)
@ -634,6 +672,7 @@ BoolInt CPU_IsSupported_SHA1(void);
BoolInt CPU_IsSupported_SHA2(void);
BoolInt CPU_IsSupported_AES(void);
#endif
BoolInt CPU_IsSupported_SHA512(void);
#endif

View File

@ -1,5 +1,5 @@
/* LzFindMt.h -- multithreaded Match finder for LZ algorithms
2024-01-22 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZ_FIND_MT_H
#define ZIP7_INC_LZ_FIND_MT_H
@ -12,8 +12,10 @@ EXTERN_C_BEGIN
typedef struct
{
UInt32 numProcessedBlocks;
CThread thread;
Int32 affinityGroup;
UInt64 affinityInGroup;
UInt64 affinity;
CThread thread;
BoolInt wasCreated;
BoolInt needStart;

View File

@ -18,6 +18,7 @@ typedef struct
int numBlockThreads_Reduced;
int numBlockThreads_Max;
int numTotalThreads;
unsigned numThreadGroups; // 0 : no groups
} CLzma2EncProps;
void Lzma2EncProps_Init(CLzma2EncProps *p);
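A sketch of how a caller opts in to the new field (thread counts are arbitrary; 0 keeps the old ungrouped behavior, as the comment above says):

#include "Lzma2Enc.h"

static void InitProps_example(CLzma2EncProps *props)
{
  Lzma2EncProps_Init(props);
  props->numTotalThreads = 16;   /* arbitrary */
  props->numThreadGroups = 2;    /* 0 : no groups (the default after Init) */
}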

View File

@ -1,5 +1,5 @@
/* LzmaEnc.h -- LZMA Encoder
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA_ENC_H
#define ZIP7_INC_LZMA_ENC_H
@ -29,11 +29,13 @@ typedef struct
int numThreads; /* 1 or 2, default = 2 */
// int _pad;
Int32 affinityGroup;
UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1.
Encoder uses this value to reduce dictionary size */
UInt64 affinity;
UInt64 affinityInGroup;
} CLzmaEncProps;
void LzmaEncProps_Init(CLzmaEncProps *p);
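The companion fields here let the match-finder threads be pinned to a Windows processor group; per the LzmaEnc.c hunk later in this commit, the defaults after LzmaEncProps_Init() are affinityGroup = -1 and affinityInGroup = 0. A sketch with made-up values:

#include "LzmaEnc.h"

static void InitAffinity_example(CLzmaEncProps *props)
{
  LzmaEncProps_Init(props);
  props->numThreads = 2;
  props->affinityGroup = 1;       /* -1 keeps the old ungrouped behavior */
  props->affinityInGroup = 0xFF;  /* CPU mask within the chosen group */
}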

View File

@ -1,5 +1,5 @@
/* MtCoder.h -- Multi-thread Coder
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_MT_CODER_H
#define ZIP7_INC_MT_CODER_H
@ -16,7 +16,7 @@ EXTERN_C_BEGIN
#ifndef Z7_ST
#define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
#define MTCODER_THREADS_MAX 64
#define MTCODER_THREADS_MAX 256
#define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3)
#else
#define MTCODER_THREADS_MAX 1
@ -77,6 +77,7 @@ typedef struct CMtCoder_
size_t blockSize; /* size of input block */
unsigned numThreadsMax;
unsigned numThreadGroups;
UInt64 expectedDataSize;
ISeqInStreamPtr inStream;
@ -125,6 +126,8 @@ typedef struct CMtCoder_
CMtProgress mtProgress;
CMtCoderBlock blocks[MTCODER_BLOCKS_MAX];
CMtCoderThread threads[MTCODER_THREADS_MAX];
CThreadNextGroup nextGroup;
} CMtCoder;
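With MTCODER_THREADS_MAX raised from 64 to 256, the derived block counts grow accordingly; working the macros above through:

/* MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) = 256 + 256/8 + 1 = 289
   MTCODER_BLOCKS_MAX                       = 289 + 3         = 292
   (previously, with 64 threads: 64 + 8 + 1 + 3 = 76 blocks)  */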

View File

@ -1,5 +1,5 @@
/* Sha256.h -- SHA-256 Hash
2023-04-02 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA256_H
#define ZIP7_INC_SHA256_H
@ -14,6 +14,9 @@ EXTERN_C_BEGIN
#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4)
#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4)
typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
/*
@ -31,10 +34,17 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt
*/
typedef struct
{
union
{
struct
{
SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
UInt64 _pad_2[2];
} vars;
UInt64 _pad_64bit[4];
void *_pad_align_ptr[2];
} v;
UInt32 state[SHA256_NUM_DIGEST_WORDS];
Byte buffer[SHA256_BLOCK_SIZE];
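The padding members of the union appear intended to keep this leading block the same size and alignment in 32-bit and 64-bit builds; fields are now reached through the nested names, as the Sha256.c hunks later in this commit show:

p->v.vars.func_UpdateBlocks = func;   /* previously p->func_UpdateBlocks */
p->v.vars.count = 0;                  /* previously p->count */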

View File

@ -1,5 +1,5 @@
/* Sort.h -- Sort functions
2023-03-05 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SORT_H
#define ZIP7_INC_SORT_H
@ -8,10 +8,7 @@
EXTERN_C_BEGIN
void HeapSort(UInt32 *p, size_t size);
void HeapSort64(UInt64 *p, size_t size);
/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */
void Z7_FASTCALL HeapSort(UInt32 *p, size_t size);
EXTERN_C_END
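HeapSort keeps its signature apart from the added Z7_FASTCALL, so callers are unchanged. A minimal usage example with arbitrary values:

#include "Sort.h"

static void HeapSort_example(void)
{
  UInt32 a[5] = { 42, 7, 7, 100, 1 };
  HeapSort(a, 5);   /* sorts in place, ascending: 1, 7, 7, 42, 100 */
}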

View File

@ -1,5 +1,5 @@
/* Threads.h -- multithreading library
2024-03-28 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_THREADS_H
#define ZIP7_INC_THREADS_H
@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
WRes Thread_Wait_Close(CThread *p);
#ifdef _WIN32
WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask);
#define Thread_Create_With_CpuSet(p, func, param, cs) \
Thread_Create_With_Affinity(p, func, param, *cs)
#else
WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet);
#endif
typedef struct
{
unsigned NumGroups;
unsigned NextGroup;
} CThreadNextGroup;
void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup);
unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p);
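CThreadNextGroup is the small helper the multi-thread coder uses to spread new worker threads across processor groups. Its implementation isn't shown in this excerpt, so the behavior sketched here (round-robin starting at startGroup) is inferred from the MtCoder.c call sites later in this commit:

CThreadNextGroup ng;
ThreadNextGroup_Init(&ng, 2, 0);               /* numGroups = 2, startGroup = 0 */
unsigned group = ThreadNextGroup_GetNext(&ng); /* expected: 0, then 1, 0, 1, ... */
/* group is then passed as the (group) argument of Thread_Create_With_Group() */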
#ifdef _WIN32

View File

@ -1,5 +1,5 @@
/* Xz.h - Xz interface
2024-01-26 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#ifndef ZIP7_INC_XZ_H
#define ZIP7_INC_XZ_H
@ -121,6 +121,7 @@ typedef struct
UInt64 startOffset;
} CXzStream;
#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; }
void Xz_Construct(CXzStream *p);
void Xz_Free(CXzStream *p, ISzAllocPtr alloc);
@ -136,8 +137,13 @@ typedef struct
CXzStream *streams;
} CXzs;
#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; }
void Xzs_Construct(CXzs *p);
void Xzs_Free(CXzs *p, ISzAllocPtr alloc);
/*
Xzs_ReadBackward() must be called for empty CXzs object.
Xzs_ReadBackward() can return non empty object with (p->num != 0) even in case of error.
*/
SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc);
UInt64 Xzs_GetNumBlocks(const CXzs *p);
@ -268,8 +274,8 @@ typedef struct
size_t outBufSize;
size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked
Byte shaDigest[SHA256_DIGEST_SIZE];
Byte buf[XZ_BLOCK_HEADER_SIZE_MAX];
UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4];
Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned for 4-bytes
} CXzUnpacker;
/* alloc : aligned for cache line allocation is better */
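A lifecycle sketch matching the new comment on Xzs_ReadBackward(). Here lookStream, alloc, archiveSize and totalBlocks stand for caller-provided variables; starting the scan from the end of the archive and passing NULL for progress are assumptions, not stated by the header:

CXzs xzs;
Int64 startOffset = archiveSize;   /* assumed: scan backward from the end */
Xzs_Construct(&xzs);               /* ReadBackward requires an empty object */
if (Xzs_ReadBackward(&xzs, lookStream, &startOffset, NULL, alloc) == SZ_OK)
  totalBlocks = Xzs_GetNumBlocks(&xzs);
Xzs_Free(&xzs, alloc);             /* may hold streams even after an error */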

View File

@ -1,5 +1,5 @@
/* XzEnc.h -- Xz Encode
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_XZ_ENC_H
#define ZIP7_INC_XZ_ENC_H
@ -31,6 +31,7 @@ typedef struct
CLzma2EncProps lzma2Props;
CXzFilterProps filterProps;
unsigned checkId;
unsigned numThreadGroups; // 0 : no groups
UInt64 blockSize;
int numBlockThreads_Reduced;
int numBlockThreads_Max;

View File

@ -1,5 +1,5 @@
/* 7zDec.c -- Decoding from 7z folder
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -312,9 +312,10 @@ static BoolInt IS_MAIN_METHOD(UInt32 m)
case k_PPMD:
#endif
return True;
}
default:
return False;
}
}
static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c)
{

View File

@ -1,5 +1,5 @@
/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2024-03-01 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#include "Precomp.h"
@ -80,7 +80,27 @@ AES_FUNC_START (name)
#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
#if 1
// use aligned SSE load/store for data.
// It is required for our Aes functions, that data is aligned for 16-bytes.
// So we can use this branch of code.
// and compiler can use fused load-op SSE instructions:
// xorps xmm0, XMMWORD PTR [rdx]
#define LOAD_128(pp) (*(__m128i *)(void *)(pp))
#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v
// use aligned SSE load/store for data. Alternative code with direct access
// #define LOAD_128(pp) _mm_load_si128(pp)
// #define STORE_128(pp, _v) _mm_store_si128(pp, _v)
#else
// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
#define LOAD_128(pp) _mm_loadu_si128(pp)
#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v)
#endif
AES_FUNC_START2 (AesCbc_Encode_HW)
{
if (numBlocks == 0)
return;
{
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
@ -88,11 +108,11 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
const __m128i k0 = p[2];
const __m128i k1 = p[3];
const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
for (; numBlocks != 0; numBlocks--, data++)
do
{
UInt32 r = numRounds2;
const __m128i *w = p + 4;
__m128i temp = *data;
__m128i temp = LOAD_128(data);
MM_XOR (temp, k0)
MM_XOR (m, temp)
MM_OP_m (_mm_aesenc_si128, k1)
@ -104,10 +124,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
}
while (--r);
MM_OP_m (_mm_aesenclast_si128, w[0])
*data = m;
STORE_128(data, m);
data++;
}
while (--numBlocks);
*p = m;
}
}
#define WOP_1(op)
@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
#define WOP(op) op (m0, 0) WOP_M1(op)
#define DECLARE_VAR(reg, ii) __m128i reg;
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#define LOAD_data_ii(ii) LOAD_128(data + (ii))
#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii);
#define STORE_data( reg, ii) STORE_128(data + (ii), reg);
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1)))
#endif
#define MM_OP_key(op, reg) MM_OP(op, reg, key);
@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \
LOAD_128 (data + (ii))));
#define WOP_KEY(op, n) { \
const __m128i key = w[n]; \
WOP(op); }
WOP(op) }
#define WIDE_LOOP_START \
dataEnd = data + numBlocks; \
if (numBlocks >= NUM_WAYS) \
{ dataEnd -= NUM_WAYS; do { \
#define WIDE_LOOP_END \
data += NUM_WAYS; \
} while (data <= dataEnd); \
dataEnd += NUM_WAYS; } \
#define SINGLE_LOOP \
for (; data < dataEnd; data++)
@ -184,34 +204,54 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
#if 1
// use unaligned AVX load/store for data.
// It is required for our Aes functions, that data is aligned for 16-bytes.
// But we need 32-bytes reading.
// So we use intrinsics for unaligned AVX load/store.
// notes for _mm256_storeu_si256:
// msvc2022: uses vmovdqu and keeps the order of instruction sequence.
// new gcc11 uses vmovdqu
// old gcc9 could use pair of instructions:
// vmovups %xmm7, -224(%rax)
// vextracti128 $0x1, %ymm7, -208(%rax)
#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v);
#else
// use aligned AVX load/store for data.
// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes.
// msvc2022 uses vmovdqu still
// gcc uses vmovdqa (that requires 32-bytes alignment)
#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v;
#endif
#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg)
/*
AVX_XOR_data_M1() needs unaligned memory load
if (we don't use _mm256_loadu_si256() here)
{
Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
instruction that can load unaligned data.
But GCC and CLANG without -O2 or -O1 optimizations can generate separated
LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
}
Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
AVX_XOR_data_M1() needs unaligned memory load, even if (data)
is aligned for 256-bits, because we read 32-bytes chunk that
crosses (data) position: from (data - 16bytes) to (data + 16bytes).
*/
#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
// for debug only: the following code will fail on execution, if compiled by some compilers:
// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))
#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
#define AVX_CTR_START(reg, ii) \
MM_OP (_mm256_add_epi64, ctr2, two) \
reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END(reg, ii) \
AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
AVX_LOAD ((__m256i *)(void *)data + (ii))));
#define AVX_WOP_KEY(op, n) { \
const __m256i key = w[n]; \
WOP(op); }
WOP(op) }
#define NUM_AES_KEYS_MAX 15
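Restating the AVX_XOR_data_M1 comment above with concrete offsets (no new behavior, just the memory layout it describes):

/* the 32-byte read starts one 16-byte block before (data):
     bytes [-16 .. -1] : previous ciphertext block (the CBC chaining value)
     bytes [  0 .. 15] : current ciphertext block
   that address is only guaranteed 16-byte alignment, so _mm256_loadu_si256 is
   required; a plain dereference could let the compiler emit vmovdqa, which
   faults on a 16-byte-aligned (but not 32-byte-aligned) address. */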
@ -219,12 +259,11 @@ v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any
dataEnd = data + numBlocks; \
if (numBlocks >= NUM_WAYS * 2) \
{ __m256i keys[NUM_AES_KEYS_MAX]; \
UInt32 ii; \
OP \
for (ii = 0; ii < numRounds; ii++) \
keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
dataEnd -= NUM_WAYS * 2; do { \
{ UInt32 ii; for (ii = 0; ii < numRounds; ii++) \
keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \
dataEnd -= NUM_WAYS * 2; \
do { \
#define WIDE_LOOP_END_AVX(OP) \
data += NUM_WAYS * 2; \
@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
__m128i iv = *p;
const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
const __m128i *dataEnd;
p += 2;
WIDE_LOOP_START
{
const __m128i *w = wStart;
WOP (DECLARE_VAR)
WOP (LOAD_data)
WOP_KEY (AES_XOR, 1)
do
{
WOP_KEY (AES_DEC, 0)
w--;
}
while (w != p);
@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
MM_XOR (m0, iv)
WOP_M1 (XOR_data_M1)
iv = data[NUM_WAYS - 1];
LOAD_data(iv, NUM_WAYS - 1)
WOP (STORE_data)
}
WIDE_LOOP_END
@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
SINGLE_LOOP
{
const __m128i *w = wStart - 1;
__m128i m = _mm_xor_si128 (w[2], *data);
__m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
do
{
MM_OP_m (_mm_aesdec_si128, w[1])
@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
while (w != p);
MM_OP_m (_mm_aesdec_si128, w[1])
MM_OP_m (_mm_aesdeclast_si128, w[0])
MM_XOR (m, iv)
iv = *data;
*data = m;
LOAD_data(iv, 0)
STORE_data(m, 0)
}
p[-2] = iv;
@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW)
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
__m128i ctr = *p;
UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
const __m128i *dataEnd;
__m128i one = _mm_cvtsi32_si128(1);
const __m128i one = _mm_cvtsi32_si128(1);
p += 2;
@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW)
}
while (--r);
WOP_KEY (AES_ENC_LAST, 0)
WOP (CTR_END)
}
WIDE_LOOP_END
@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
while (--numRounds2);
MM_OP_m (_mm_aesenc_si128, w[0])
MM_OP_m (_mm_aesenclast_si128, w[1])
MM_XOR (*data, m)
CTR_END (m, 0)
}
p[-2] = ctr;
@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
__m128i *data = (__m128i *)(void *)data8;
__m128i iv = *p;
const __m128i *dataEnd;
UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
p += 2;
WIDE_LOOP_START_AVX(;)
@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
while (w != keys);
AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
WOP_M1 (AVX_XOR_data_M1)
iv = data[NUM_WAYS * 2 - 1];
LOAD_data (iv, NUM_WAYS * 2 - 1)
WOP (AVX_STORE_data)
}
WIDE_LOOP_END_AVX(;)
SINGLE_LOOP
{
const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
__m128i m = _mm_xor_si128 (w[2], *data);
const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
__m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
do
{
MM_OP_m (_mm_aesdec_si128, w[1])
@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
MM_OP_m (_mm_aesdeclast_si128, w[0])
MM_XOR (m, iv)
iv = *data;
*data = m;
LOAD_data(iv, 0)
STORE_data(m, 0)
}
p[-2] = iv;
@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
__m128i *p = (__m128i *)(void *)ivAes;
__m128i *data = (__m128i *)(void *)data8;
__m128i ctr = *p;
UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
const __m128i *dataEnd;
__m128i one = _mm_cvtsi32_si128(1);
const __m128i one = _mm_cvtsi32_si128(1);
__m256i ctr2, two;
p += 2;
@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
while (--numRounds2);
MM_OP_m (_mm_aesenc_si128, w[0])
MM_OP_m (_mm_aesenclast_si128, w[1])
MM_XOR (*data, m)
CTR_END (m, 0)
}
p[-2] = ctr;
@ -730,10 +767,15 @@ AES_FUNC_START (name)
AES_FUNC_START2 (AesCbc_Encode_HW)
{
if (numBlocks == 0)
return;
{
v128 * const p = (v128 *)(void *)ivAes;
v128 *data = (v128 *)(void *)data8;
v128 m = *p;
const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
const v128 *w = p + (size_t)numRounds2 * 2;
const v128 k0 = p[2];
const v128 k1 = p[3];
const v128 k2 = p[4];
@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
const v128 k7 = p[9];
const v128 k8 = p[10];
const v128 k9 = p[11];
const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
const v128 *w = p + ((size_t)numRounds2 * 2);
const v128 k_z4 = w[-2];
const v128 k_z3 = w[-1];
const v128 k_z2 = w[0];
const v128 k_z1 = w[1];
const v128 k_z0 = w[2];
for (; numBlocks != 0; numBlocks--, data++)
// we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle,
// because gcc/clang compilers are not good for that optimization.
do
{
MM_XOR_m (*data)
AES_E_MC_m (k0)
@ -757,25 +802,27 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
AES_E_MC_m (k3)
AES_E_MC_m (k4)
AES_E_MC_m (k5)
AES_E_MC_m (k6)
AES_E_MC_m (k7)
AES_E_MC_m (k8)
if (numRounds2 >= 6)
{
AES_E_MC_m (k9)
AES_E_MC_m (p[12])
AES_E_MC_m (k6)
AES_E_MC_m (k7)
if (numRounds2 != 6)
{
AES_E_MC_m (p[13])
AES_E_MC_m (p[14])
AES_E_MC_m (k8)
AES_E_MC_m (k9)
}
}
AES_E_MC_m (k_z4)
AES_E_MC_m (k_z3)
AES_E_MC_m (k_z2)
AES_E_m (k_z1)
MM_XOR_m (k_z0)
*data = m;
*data++ = m;
}
while (--numBlocks);
*p = m;
}
}
#define WOP_1(op)
@ -837,7 +884,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
v128 *p = (v128 *)(void *)ivAes;
v128 *data = (v128 *)(void *)data8;
v128 iv = *p;
const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
const v128 *dataEnd;
p += 2;
@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
WOP_KEY (AES_XOR, 0)
MM_XOR (m0, iv)
WOP_M1 (XOR_data_M1)
iv = data[NUM_WAYS - 1];
LOAD_data(iv, NUM_WAYS - 1)
WOP (STORE_data)
}
WIDE_LOOP_END
@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
SINGLE_LOOP
{
const v128 *w = wStart;
v128 m = *data;
v128 m; LOAD_data(m, 0)
AES_D_IMC_m (w[2])
do
{
@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
AES_D_m (w[1])
MM_XOR_m (w[0])
MM_XOR_m (iv)
iv = *data;
*data = m;
LOAD_data(iv, 0)
STORE_data(m, 0)
}
p[-2] = iv;
@ -891,16 +938,14 @@ AES_FUNC_START2 (AesCtr_Code_HW)
v128 *p = (v128 *)(void *)ivAes;
v128 *data = (v128 *)(void *)data8;
uint64x2_t ctr = vreinterpretq_u64_u8(*p);
const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
const v128 *dataEnd;
uint64x2_t one = vdupq_n_u64(0);
// the bug in clang:
// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
one = vsetq_lane_u64(1, one, 0);
const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
p += 2;
WIDE_LOOP_START

View File

@ -1,5 +1,5 @@
/* CpuArch.c -- CPU specific code
2024-07-04 : Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#include "Precomp.h"
@ -17,7 +17,7 @@
/*
cpuid instruction supports (subFunction) parameter in ECX,
that is used only with some specific (function) parameter values.
But we always use only (subFunction==0).
most functions use only (subFunction==0).
*/
/*
__cpuid(): MSVC and GCC/CLANG use same function/macro name
@ -49,43 +49,49 @@
#if defined(MY_CPU_AMD64) && defined(__PIC__) \
&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
#define x86_cpuid_MACRO(p, func) { \
/* "=&r" selects free register. It can select even rbx, if that register is free.
"=&D" for (RDI) also works, but the code can be larger with "=&D"
"2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */
#define x86_cpuid_MACRO_2(p, func, subFunc) { \
__asm__ __volatile__ ( \
ASM_LN "mov %%rbx, %q1" \
ASM_LN "cpuid" \
ASM_LN "xchg %%rbx, %q1" \
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
/* "=&r" selects free register. It can select even rbx, if that register is free.
"=&D" for (RDI) also works, but the code can be larger with "=&D"
"2"(0) means (subFunction = 0),
2 is (zero-based) index in the output constraint list "=c" (ECX). */
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
#elif defined(MY_CPU_X86) && defined(__PIC__) \
&& ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
#define x86_cpuid_MACRO(p, func) { \
#define x86_cpuid_MACRO_2(p, func, subFunc) { \
__asm__ __volatile__ ( \
ASM_LN "mov %%ebx, %k1" \
ASM_LN "cpuid" \
ASM_LN "xchg %%ebx, %k1" \
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
: "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
#else
#define x86_cpuid_MACRO(p, func) { \
#define x86_cpuid_MACRO_2(p, func, subFunc) { \
__asm__ __volatile__ ( \
ASM_LN "cpuid" \
: "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
: "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
#endif
#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0)
void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
{
x86_cpuid_MACRO(p, func)
}
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
x86_cpuid_MACRO_2(p, func, subFunc)
}
Z7_NO_INLINE
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
__asm ret 0
}
static
void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
UNUSED_VAR(p)
UNUSED_VAR(func)
UNUSED_VAR(subFunc)
__asm push ebx
__asm push edi
__asm mov edi, ecx // p
__asm mov eax, edx // func
__asm mov ecx, [esp + 12] // subFunc
__asm cpuid
__asm mov [edi ], eax
__asm mov [edi + 4], ebx
__asm mov [edi + 8], ecx
__asm mov [edi + 12], edx
__asm pop edi
__asm pop ebx
__asm ret 4
}
#else // MY_CPU_AMD64
#if _MSC_VER >= 1600
#include <intrin.h>
#define MY_cpuidex __cpuidex
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
__cpuidex((int *)p, func, subFunc);
}
#else
/*
__cpuid (func == (0 or 7)) requires subfunction number in ECX.
@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
We still can use __cpuid for low (func) values that don't require ECX,
but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,
where ECX value is first parameter for FASTCALL / NO_INLINE func,
where ECX value is first parameter for FASTCALL / NO_INLINE func.
So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and
old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.
@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int
}
#define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info)
#pragma message("======== MY_cpuidex_HACK WAS USED ========")
static
void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
{
MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
}
#endif // _MSC_VER >= 1600
#if !defined(MY_CPU_AMD64)
@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void)
}
}
BoolInt CPU_IsSupported_SHA512(void)
{
if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here
if (z7_x86_cpuid_GetMaxFunc() < 7)
return False;
{
UInt32 d[4];
z7_x86_cpuid_subFunc(d, 7, 0);
if (d[0] < 1) // d[0] - is max supported subleaf value
return False;
z7_x86_cpuid_subFunc(d, 7, 1);
return (BoolInt)(d[0]) & 1;
}
}
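Spelled out, the new x86 check above performs two cpuid queries; the leaf-7 layout is the usual CPUID convention and is assumed here rather than restated by the source:

/* 1. cpuid(EAX=7, ECX=0): EAX returns the highest supported subleaf,
      which must be >= 1, otherwise subleaf 1 does not exist.
   2. cpuid(EAX=7, ECX=1): EAX bit 0 reports the SHA512 extension.
   The CPU_IsSupported_AVX2() pre-check stays because the SHA512 instructions
   operate on 256-bit (YMM) registers; the inline comment notes that plain
   AVX support might already be sufficient. */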
/*
MSVC: _xgetbv() intrinsic is available since VS2010SP1.
MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in
@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void)
return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
}
BoolInt CPU_IsSupported_SHA512(void)
{
return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
}
/*
BoolInt CPU_IsSupported_SHA3(void)
{
return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
}
*/
#ifdef MY_CPU_ARM64
#define APPLE_CRYPTO_SUPPORT_VAL 1
#else
@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32)
MY_HWCAP_CHECK_FUNC (SHA1)
MY_HWCAP_CHECK_FUNC (SHA2)
MY_HWCAP_CHECK_FUNC (AES)
#ifdef MY_CPU_ARM64
// <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.
// we define them here, if they are not defined
#ifndef HWCAP_SHA3
// #define HWCAP_SHA3 (1 << 17)
#endif
#ifndef HWCAP_SHA512
// #pragma message("=== HWCAP_SHA512 define === ")
#define HWCAP_SHA512 (1 << 21)
#endif
MY_HWCAP_CHECK_FUNC (SHA512)
// MY_HWCAP_CHECK_FUNC (SHA3)
#endif
#endif // __APPLE__
#endif // _WIN32

View File

@ -1,5 +1,5 @@
/* LzFind.c -- Match finder for LZ algorithms
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
const unsigned nbMax =
(p->numHashBytes == 2 ? 16 :
(p->numHashBytes == 3 ? 24 : 32));
if (numBits > nbMax)
if (numBits >= nbMax)
numBits = nbMax;
if (numBits >= 32)
hs = (UInt32)0 - 1;
@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
hs |= (256 << kLzHash_CrcShift_2) - 1;
{
const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
if (hs > hs2)
if (hs >= hs2)
hs = hs2;
}
hsCur = hs;
if (p->expectedDataSize < historySize)
{
const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
if (hsCur > hs2)
if (hsCur >= hs2)
hsCur = hs2;
}
}
@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
if (p->expectedDataSize < historySize)
{
hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
if (hsCur > hs) // is it possible?
if (hsCur >= hs) // is it possible?
hsCur = hs;
}
}
@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
return d;
{
const Byte *pb = cur - delta;
curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
if (pb[maxLen] == cur[maxLen] && *pb == *cur)
{
UInt32 len = 0;
@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
break;
{
ptrdiff_t diff;
curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
{
@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
// if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
cmCheck = (UInt32)(pos - _cyclicBufferSize);
if ((UInt32)pos <= _cyclicBufferSize)
if ((UInt32)pos < _cyclicBufferSize)
cmCheck = 0;
if (cmCheck < curMatch)
@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
{
const UInt32 delta = pos - curMatch;
{
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
const Byte *pb = cur - delta;
unsigned len = (len0 < len1 ? len0 : len1);
const UInt32 pair0 = pair[0];
@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
UInt32 cmCheck;
cmCheck = (UInt32)(pos - _cyclicBufferSize);
if ((UInt32)pos <= _cyclicBufferSize)
if ((UInt32)pos < _cyclicBufferSize)
cmCheck = 0;
if (// curMatch >= pos || // failure
@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
{
const UInt32 delta = pos - curMatch;
{
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
const Byte *pb = cur - delta;
unsigned len = (len0 < len1 ? len0 : len1);
if (pb[len] == cur[len])
@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
UInt32 pos = p->pos; \
UInt32 num2 = num; \
/* (p->pos == p->posLimit) is not allowed here !!! */ \
{ const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
{ const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \
num -= num2; \
{ const UInt32 cycPos = p->cyclicBufferPos; \
son = p->son + cycPos; \

View File

@ -1,5 +1,5 @@
/* LzFindMt.c -- multithreaded Match finder for LZ algorithms
2024-01-22 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes;
Z7_NO_INLINE
static void MtSync_Construct(CMtSync *p)
{
p->affinityGroup = -1;
p->affinityInGroup = 0;
p->affinity = 0;
p->wasCreated = False;
p->csWasInitialized = False;
@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
// return ERROR_TOO_MANY_POSTS; // for debug
// return EINVAL; // for debug
#ifdef _WIN32
if (p->affinityGroup >= 0)
wres = Thread_Create_With_Group(&p->thread, startAddress, obj,
(unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
else
#endif
if (p->affinity != 0)
wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
else

View File

@ -1,5 +1,5 @@
/* Lzma2Enc.c -- LZMA2 Encoder
2023-04-13 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p)
p->numBlockThreads_Reduced = -1;
p->numBlockThreads_Max = -1;
p->numTotalThreads = -1;
p->numThreadGroups = 0;
}
void Lzma2EncProps_Normalize(CLzma2EncProps *p)
@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
}
p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max;
p->mtCoder.numThreadGroups = p->props.numThreadGroups;
p->mtCoder.expectedDataSize = p->expectedDataSize;
{

View File

@ -1,5 +1,5 @@
/* LzmaEnc.c -- LZMA Encoder
2024-01-24: Igor Pavlov : Public domain */
Igor Pavlov : Public domain */
#include "Precomp.h"
@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p)
p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1;
p->numHashOutBits = 0;
p->writeEndMark = 0;
p->affinityGroup = -1;
p->affinity = 0;
p->affinityInGroup = 0;
}
void LzmaEncProps_Normalize(CLzmaEncProps *p)
@ -72,11 +74,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
p->level = level;
if (p->dictSize == 0)
p->dictSize =
( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) :
( level <= 6 ? ((UInt32)1 << (level + 19)) :
( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26)
)));
p->dictSize = (unsigned)level <= 4 ?
(UInt32)1 << (level * 2 + 16) :
(unsigned)level <= sizeof(size_t) / 2 + 4 ?
(UInt32)1 << (level + 20) :
(UInt32)1 << (sizeof(size_t) / 2 + 24);
if (p->dictSize > p->reduceSize)
{
@ -92,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
if (p->lp < 0) p->lp = 0;
if (p->pb < 0) p->pb = 2;
if (p->algo < 0) p->algo = (level < 5 ? 0 : 1);
if (p->fb < 0) p->fb = (level < 7 ? 32 : 64);
if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1;
if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64;
if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5);
if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1);
@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2)
p->multiThread = (props.numThreads > 1);
p->matchFinderMt.btSync.affinity =
p->matchFinderMt.hashSync.affinity = props.affinity;
p->matchFinderMt.btSync.affinityGroup =
p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup;
p->matchFinderMt.btSync.affinityInGroup =
p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup;
#endif
return SZ_OK;
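For reference, the reworked dictionary-size defaults near the top of this file's diff evaluate as follows on a 64-bit build, where sizeof(size_t)/2 + 4 == 8:

/* level 1..4 : 1 << (level*2 + 16)  ->  256 KiB, 1 MiB, 4 MiB, 16 MiB
   level 5..8 : 1 << (level + 20)    ->  32 MiB, 64 MiB, 128 MiB, 256 MiB
   level 9    : 1 << (4 + 24)        ->  256 MiB
   on a 32-bit build the middle branch stops at level 6 (64 MiB) and the
   cap becomes 1 << 26 = 64 MiB. */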

View File

@ -1,5 +1,5 @@
/* MtCoder.c -- Multi-thread Coder
2023-09-07 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p)
static THREAD_FUNC_DECL ThreadFunc(void *pp);
static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
#ifdef _WIN32
, CMtCoder * const mtc
#endif
)
{
WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent);
// printf("\n====== MtCoderThread_CreateAndStart : \n");
if (wres == 0)
{
t->stop = False;
if (!Thread_WasCreated(&t->thread))
{
#ifdef _WIN32
if (mtc->numThreadGroups)
wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t,
ThreadNextGroup_GetNext(&mtc->nextGroup), // group
0); // affinityMask
else
#endif
wres = Thread_Create(&t->thread, ThreadFunc, t);
}
if (wres == 0)
wres = Event_Set(&t->startEvent);
}
@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
}
Z7_FORCE_INLINE
static void MtCoderThread_Destruct(CMtCoderThread *t)
{
if (Thread_WasCreated(&t->thread))
@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
static SRes ThreadFunc2(CMtCoderThread *t)
{
CMtCoder *mtc = t->mtCoder;
CMtCoder * const mtc = t->mtCoder;
for (;;)
{
@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t)
if (mtc->numStartedThreads < mtc->numStartedThreadsLimit
&& mtc->expectedDataSize != readProcessed)
{
res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]);
res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]
#ifdef _WIN32
, mtc
#endif
);
if (res == SZ_OK)
mtc->numStartedThreads++;
else
@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
}
{
CMtCoderBlock *block = &mtc->blocks[bi];
CMtCoderBlock * const block = &mtc->blocks[bi];
block->res = res;
block->bufIndex = bufIndex;
block->finished = finished;
@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
static THREAD_FUNC_DECL ThreadFunc(void *pp)
{
CMtCoderThread *t = (CMtCoderThread *)pp;
CMtCoderThread * const t = (CMtCoderThread *)pp;
for (;;)
{
if (Event_Wait(&t->startEvent) != 0)
@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
if (t->stop)
return 0;
{
SRes res = ThreadFunc2(t);
const SRes res = ThreadFunc2(t);
CMtCoder *mtc = t->mtCoder;
if (res != SZ_OK)
{
@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
#ifndef MTCODER_USE_WRITE_THREAD
{
unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
if (numFinished == mtc->numStartedThreads)
if (Event_Set(&mtc->finishedEvent) != 0)
return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD;
@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p)
p->blockSize = 0;
p->numThreadsMax = 0;
p->numThreadGroups = 0;
p->expectedDataSize = (UInt64)(Int64)-1;
p->inStream = NULL;
@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p)
unsigned i;
SRes res = SZ_OK;
// printf("\n====== MtCoder_Code : \n");
if (numThreads > MTCODER_THREADS_MAX)
numThreads = MTCODER_THREADS_MAX;
numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p)
p->numStartedThreadsLimit = numThreads;
p->numStartedThreads = 0;
ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup
// for (i = 0; i < numThreads; i++)
{
// here we create new thread for first block.
// And each new thread will create another new thread after block reading
// until numStartedThreadsLimit is reached.
CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++];
RINOK(MtCoderThread_CreateAndStart(nextThread))
{
const SRes res2 = MtCoderThread_CreateAndStart(nextThread
#ifdef _WIN32
, p
#endif
);
RINOK(res2)
}
}
RINOK_THREAD(Event_Set(&p->readEvent))
@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p)
RINOK_THREAD(Event_Wait(&p->writeEvents[bi]))
{
const CMtCoderBlock *block = &p->blocks[bi];
unsigned bufIndex = block->bufIndex;
BoolInt finished = block->finished;
const CMtCoderBlock * const block = &p->blocks[bi];
const unsigned bufIndex = block->bufIndex;
const BoolInt finished = block->finished;
if (res == SZ_OK && block->res != SZ_OK)
res = block->res;
@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p)
}
#else
{
WRes wres = Event_Wait(&p->finishedEvent);
const WRes wres = Event_Wait(&p->finishedEvent);
res = MY_SRes_HRESULT_FROM_WRes(wres);
}
#endif

View File

@ -1,18 +1,14 @@
/* Sha256.c -- SHA-256 Hash
2024-03-01 : Igor Pavlov : Public domain
: Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "CpuArch.h"
#include "RotateDefs.h"
#include "Sha256.h"
#if defined(_MSC_VER) && (_MSC_VER < 1900)
// #define USE_MY_MM
#endif
#include "RotateDefs.h"
#include "CpuArch.h"
#ifdef MY_CPU_X86_OR_AMD64
#if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW;
#define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks
#define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#else
#define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks
#endif
@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
return False;
#endif
p->func_UpdateBlocks = func;
p->v.vars.func_UpdateBlocks = func;
return True;
}
@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
void Sha256_InitState(CSha256 *p)
{
p->count = 0;
p->v.vars.count = 0;
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p)
p->state[7] = 0x5be0cd19;
}
void Sha256_Init(CSha256 *p)
{
p->func_UpdateBlocks =
p->v.vars.func_UpdateBlocks =
#ifdef Z7_COMPILER_SHA256_SUPPORTED
g_SHA256_FUNC_UPDATE_BLOCKS;
#else
@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p)
#endif
// static
extern MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64] = {
extern
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64];
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@ -248,9 +249,12 @@ const UInt32 SHA256_K_ARRAY[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
#define K SHA256_K_ARRAY
#define K SHA256_K_ARRAY
Z7_NO_INLINE
void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
{
@ -260,15 +264,14 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
#else
[16];
#endif
unsigned j;
UInt32 a,b,c,d,e,f,g,h;
#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt32 tmp;
#endif
if (numBlocks == 0) return;
a = state[0];
b = state[1];
c = state[2];
@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g = state[6];
h = state[7];
while (numBlocks)
do
{
for (j = 0; j < 16; j += STEP_PRE)
@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g += state[6]; state[6] = g;
h += state[7]; state[7] = h;
data += 64;
numBlocks--;
data += SHA256_BLOCK_SIZE;
}
while (--numBlocks);
}
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
#undef K
#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
unsigned pos = (unsigned)p->count & 0x3F;
unsigned num;
p->count += size;
num = 64 - pos;
const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
const unsigned num = SHA256_BLOCK_SIZE - pos;
p->v.vars.count += size;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
}
}
{
size_t numBlocks = size >> 6;
const size_t numBlocks = size >> 6;
// if (numBlocks)
SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= 0x3F;
size &= SHA256_BLOCK_SIZE - 1;
if (size == 0)
return;
data += (numBlocks << 6);
@ -408,53 +399,41 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
void Sha256_Final(CSha256 *p, Byte *digest)
{
unsigned pos = (unsigned)p->count & 0x3F;
unsigned i;
unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80;
if (pos > (64 - 8))
if (pos > (SHA256_BLOCK_SIZE - 4 * 2))
{
while (pos != 64) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, 64 - pos);
while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos);
Sha256_UpdateBlock(p);
pos = 0;
}
/*
if (pos & 3)
memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos);
{
p->buffer[pos] = 0;
p->buffer[pos + 1] = 0;
p->buffer[pos + 2] = 0;
pos += 3;
pos &= ~3;
const UInt64 numBits = p->v.vars.count << 3;
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
}
{
for (; pos < 64 - 8; pos += 4)
*(UInt32 *)(&p->buffer[pos]) = 0;
}
*/
memset(&p->buffer[pos], 0, (64 - 8) - pos);
{
UInt64 numBits = (p->count << 3);
SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32))
SetBe32(p->buffer + 64 - 4, (UInt32)(numBits))
}
Sha256_UpdateBlock(p);
#if 1 && defined(MY_CPU_BE)
memcpy(digest, p->state, SHA256_DIGEST_SIZE);
#else
{
unsigned i;
for (i = 0; i < 8; i += 2)
{
UInt32 v0 = p->state[i];
UInt32 v1 = p->state[(size_t)i + 1];
const UInt32 v0 = p->state[i];
const UInt32 v1 = p->state[(size_t)i + 1];
SetBe32(digest , v0)
SetBe32(digest + 4, v1)
digest += 8;
digest += 4 * 2;
}
}
#endif
Sha256_InitState(p);
}
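A worked example of the padding arithmetic in Sha256_Final above, using an arbitrary 100-byte message:

/* count = 100, so pos = 100 & 63 = 36
   buffer[36] = 0x80 and pos becomes 37, which is <= 56, so no extra block;
   memset() zero-fills bytes 37..55;
   numBits = 100 << 3 = 800, stored big-endian in bytes 56..63;
   one final Sha256_UpdateBlock() then yields the digest. */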
@ -466,12 +445,9 @@ void Sha256Prepare(void)
f = Sha256_UpdateBlocks;
f_hw = NULL;
#ifdef MY_CPU_X86_OR_AMD64
#ifndef USE_MY_MM
if (CPU_IsSupported_SHA()
&& CPU_IsSupported_SSSE3()
// && CPU_IsSupported_SSE41()
)
#endif
#else
if (CPU_IsSupported_SHA2())
#endif
@ -484,6 +460,8 @@ void Sha256Prepare(void)
#endif
}
#undef U64C
#undef K
#undef S0
#undef S1
#undef s0

View File

@ -1,18 +1,11 @@
/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"
#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#define USE_HW_SHA
@ -20,19 +13,14 @@
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(_INTEL_COMPILER)
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA__) || !defined(__SSSE3__)
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#endif
#endif
#elif defined(_MSC_VER)
#ifdef USE_MY_MM
#define USE_VER_MIN 1300
#else
#define USE_VER_MIN 1900
#endif
#if (_MSC_VER >= USE_VER_MIN)
#if (_MSC_VER >= 1900)
#define USE_HW_SHA
#else
#define Z7_USE_HW_SHA_STUB
@ -47,23 +35,20 @@
// #pragma message("Sha256 HW")
// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
// #if !defined(__SSSE3__)
// #endif
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else
#ifdef USE_MY_MM
#include "My_mm.h"
#endif
#endif
/*
@ -91,54 +76,38 @@ SHA:
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
#define LOAD_SHUFFLE(m, k) \
m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
m = _mm_shuffle_epi8(m, mask); \
#define SM1(g0, g1, g2, g3) \
SHA256_MSG1(g3, g0); \
#define NNN(m0, m1, m2, m3)
#define SM2(g0, g1, g2, g3) \
tmp = _mm_alignr_epi8(g1, g0, 4); \
ADD_EPI32(g2, tmp) \
SHA25G_MSG2(g2, g1); \
// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
#define NNN(g0, g1, g2, g3)
#define SM1(m1, m2, m3, m0) \
SHA256_MSG1(m0, m1); \
#define SM2(m2, m3, m0, m1) \
ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
SHA256_MSG2(m0, m3); \
#define RND2(t0, t1) \
t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
#define RND2_0(m, k) \
msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
RND2(state0, state1); \
msg = _mm_shuffle_epi32(msg, 0x0E); \
#define RND2_1 \
OP0(m0, m1, m2, m3) \
RND2(state1, state0); \
// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
RND2_0(g0, k) \
OP0(g0, g1, g2, g3) \
RND2_1 \
OP1(g0, g1, g2, g3) \
OP1(m0, m1, m2, m3) \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
@ -161,8 +130,9 @@ ATTRIB_SHA
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
__m128i tmp;
__m128i state0, state1;
__m128i tmp, state0, state1;
if (numBlocks == 0)
return;
@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
@ -324,41 +282,70 @@ typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC
#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x)
#define MY_rev32_for_LE(x) x
#else
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#define MY_rev32_for_LE(x) vrev32q_u8(x)
#endif
#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
#if 1 // 0 for debug
// for arm32: it works slower by some reason than direct code
/*
for arm32 it generates:
MSVC-2022, GCC-9:
vld1.32 {d18,d19}, [r10]
vst1.32 {d4,d5}, [r3]
vld1.8 {d20-d21}, [r4]
there is no align hint (like [r10:128]). So instruction allows unaligned access
*/
#define LOAD_128_32(_p) vld1q_u32(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
#else
/*
for arm32:
MSVC-2022:
vldm r10,{d18,d19}
vstm r3,{d4,d5}
does it require strict alignment?
GCC-9:
vld1.64 {d30-d31}, [r0:64]
vldr d28, [r0, #16]
vldr d29, [r0, #24]
vst1.64 {d30-d31}, [r0:64]
vstr d28, [r0, #16]
vstr d29, [r0, #24]
there is hint [r0:64], so does it requires 64-bit alignment.
*/
#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p))
#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
#endif
#define LOAD_SHUFFLE(m, k) \
m = LOAD_128((data + (k) * 16)); \
MY_rev32_for_LE(m); \
m = vreinterpretq_u32_u8( \
MY_rev32_for_LE( \
LOAD_128_8(data + (k) * 16))); \
// K array must be aligned for 16-bytes at least.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY
#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)
#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
#define NNN(m0, m1, m2, m3)
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
tmp = state0; \
state0 = vsha256hq_u32( state0, state1, msg ); \
state1 = vsha256h2q_u32( state1, tmp, msg ); \
OP0(g0, g1, g2, g3); \
OP1(g0, g1, g2, g3); \
OP0(m0, m1, m2, m3); \
OP1(m0, m1, m2, m3); \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
if (numBlocks == 0)
return;
state0 = LOAD_128(&state[0]);
state1 = LOAD_128(&state[4]);
state0 = LOAD_128_32(&state[0]);
state1 = LOAD_128_32(&state[4]);
do
{
@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
}
while (--numBlocks);
STORE_128(&state[0], state0);
STORE_128(&state[4], state1);
STORE_128_32(&state[0], state0);
STORE_128_32(&state[4], state1);
}
#endif // USE_HW_SHA
@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#endif
#undef K
#undef RND2
#undef RND2_0
#undef RND2_1
#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#undef SM1
#undef SM2
#undef NNN
#undef R4
#undef R16
#undef PREPARE_STATE

View File

@ -1,141 +1,268 @@
/* Sort.c -- Sort functions
2014-04-05 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Sort.h"
#include "CpuArch.h"
#define HeapSortDown(p, k, size, temp) \
{ for (;;) { \
size_t s = (k << 1); \
if (s > size) break; \
if (s < size && p[s + 1] > p[s]) s++; \
if (temp >= p[s]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
|| (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
)
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
// #pragma message("Z7_PREFETCH : __builtin_prefetch")
#define Z7_PREFETCH(a) __builtin_prefetch((a))
#endif
#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)
#include "7zWindows.h"
// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate "prefetcht2" instruction for
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().
// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we will use PreFetchCacheLine(), only if we are sure that
// generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
|| defined(MY_CPU_ARM64) \
|| defined(MY_CPU_IA64)
// we need to use additional parentheses around (a) in the PreFetchCacheLine call, because
// the PreFetchCacheLine macro doesn't parenthesize its arguments:
// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif
#endif // _WIN32
#define PREFETCH_NO(p,k,s,size)
#ifndef Z7_PREFETCH
#define SORT_PREFETCH(p,k,s,size)
#else
// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)
#if PREFETCH_LEVEL == 0
#define SORT_PREFETCH(p,k,s,size)
#else // PREFETCH_LEVEL != 0
void HeapSort(UInt32 *p, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (--i != 0);
}
/*
if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch one value per cache line.
Use it if the array is aligned to the cache line size (64 bytes)
or if the array is small (less than the L1 cache size).
if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch all cache lines that can be required.
it can be faster for big unaligned arrays.
*/
#define USE_PREFETCH_FOR_ALIGNED_ARRAY
// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif
#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// last offset that can be required in the PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
#endif
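/* Illustrative note (added comment, not from the upstream source):
   with PREFETCH_LEVEL == 3 and the generic offset branch above,
   PREFETCH_OFFSET(k,s) == (k << 4) == k * 16.
   the descendants of node (k) four levels below it occupy indices
   [k * 16, k * 16 + 15]: 16 UInt32 values == 64 bytes == one cache line
   when (p) is 64-byte aligned, so a single prefetch at the start of that
   span (PREFETCH_ADD_OFFSET == 0) covers all of them;
   PREFETCH_RANGE == (2 << 3) - 1 == 15 is the last offset inside the span. */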
#if PREFETCH_LEVEL <= 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
Z7_PREFETCH((p + s2)); \
}}
#endif
#else // PREFETCH_LEVEL > 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - 16)); \
Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
Z7_PREFETCH((p + s2)); \
}}
#endif
#endif // PREFETCH_LEVEL > 3
#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH
#if defined(MY_CPU_ARM64) \
/* || defined(MY_CPU_AMD64) */ \
/* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov, if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif
#ifdef Z7_FAST_CMOV_SUPPORTED
// we want to use cmov here, if cmov is fast: new arm64 cpus.
// we want the compiler to use conditional move for this branch
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// use this branch, if cpu doesn't support fast conditional move.
// it uses slow array access reading:
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif
#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
for (;;) { \
UInt32 n0, n1; \
size_t s = k * 2; \
if (s >= size) { \
if (s == size) { \
n0 = p[s]; \
p[k] = n0; \
if (temp < n0) k = s; \
} \
break; \
} \
n0 = p[k * 2]; \
n1 = p[k * 2 + 1]; \
s += n0 < n1; \
GET_MAX_VAL(n0, n1, p[s]) \
if (temp >= n0) break; \
macro_prefetch(p, k, s, size) \
p[k] = n0; \
k = s; \
} \
p[k] = temp; \
}
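/* Readability sketch (added for illustration, not part of the upstream source):
   the HeapSortDown macro above expanded as a plain function, using the
   GET_MAX_VAL branch that relies on a conditional move and with
   macro_prefetch == PREFETCH_NO.
   (p) is used as a 1-based heap: children of node (k) are p[k*2] and p[k*2+1];
   (size) is the index of the last item of the tree. */
static void HeapSortDown_Sketch(UInt32 *p, size_t k, size_t size, UInt32 temp)
{
  for (;;)
  {
    UInt32 n0, n1;
    size_t s = k * 2;
    if (s >= size)
    {
      if (s == size)
      {
        // only one child exists
        n0 = p[s];
        p[k] = n0;
        if (temp < n0)
          k = s;
      }
      break;
    }
    n0 = p[s];
    n1 = p[s + 1];
    s += n0 < n1;   // index of the larger child
    if (n0 < n1)
      n0 = n1;      // n0 = max(p[k*2], p[k*2+1])
    if (temp >= n0)
      break;
    p[k] = n0;      // move the larger child up
    k = s;
  }
  p[k] = temp;      // final position of the sifted value
}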
/*
stage-1 : O(n) :
we generate an intermediate, partially sorted binary tree:
p[0] : it's an additional item for better alignment of the tree structure in memory.
p[1]
p[2] p[3]
p[4] p[5] p[6] p[7]
...
p[x] >= p[x * 2]
p[x] >= p[x * 2 + 1]
stage-2 : O(n * log2(n)):
we move the largest item p[0] from the head of the tree to the end of the array
and insert the last item into the sorted binary tree.
*/
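/* Worked example (added for illustration; it shows the two stages on a plain
   1-based heap and ignores the special handling of p[0] and of small sizes):
   input, nodes p[1..5]:        { 1, 5, 3, 4, 2 }
   stage-1 (build heap):        { 5, 4, 3, 1, 2 }   p[x] >= p[2x], p[x] >= p[2x+1]
   stage-2 (extract maximum):
     swap p[1] and p[5], sift:  { 4, 2, 3, 1 | 5 }
     swap p[1] and p[4], sift:  { 3, 2, 1 | 4, 5 }
     swap p[1] and p[3], sift:  { 2, 1 | 3, 4, 5 }
     swap p[1] and p[2]:        { 1 | 2, 3, 4, 5 }  sorted
*/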
// (p) must be aligned for cache line size (64-bytes) for best performance
void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
if (size < 2)
return;
if (size == 2)
{
const UInt32 a0 = p[0];
const UInt32 a1 = p[1];
const unsigned k = a1 < a0;
p[k] = a0;
p[k ^ 1] = a1;
return;
}
{
// stage-1 : O(n)
// we transform array to partially sorted binary tree.
size_t i = --size / 2;
// (size) is now the index of the last item in the tree,
// if (i)
{
do
{
const UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
while (--i);
}
{
const UInt32 temp = p[0];
const UInt32 a1 = p[1];
if (temp < a1)
{
size_t k = 1;
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
p[0] = a1;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
while (size > 1);
*/
while (size > 3)
{
UInt32 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
}
{
UInt32 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
}
}
void HeapSort64(UInt64 *p, size_t size)
if (size < 3)
{
if (size <= 1)
// size == 2
const UInt32 a0 = p[0];
p[0] = p[2];
p[2] = a0;
return;
p--;
}
if (size != 3)
{
size_t i = size / 2;
// stage-2 : O(size * log2(size)):
// we move the largest item p[0] from the head to the end of the array,
// and insert the last item into the sorted binary tree.
do
{
UInt64 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (--i != 0);
}
/*
do
{
size_t k = 1;
UInt64 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
}
while (size > 1);
*/
while (size > 3)
{
UInt64 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
const UInt32 temp = p[size];
size_t k = p[2] < p[3] ? 3 : 2;
p[size--] = p[0];
p[0] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
}
while (size != 3);
}
{
UInt64 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
const UInt32 a2 = p[2];
const UInt32 a3 = p[3];
const size_t k = a2 < a3;
p[2] = p[1];
p[3] = p[0];
p[k] = a3;
p[k ^ 1] = a2;
}
}
/*
#define HeapSortRefDown(p, vals, n, size, temp) \
{ size_t k = n; UInt32 val = vals[temp]; for (;;) { \
size_t s = (k << 1); \
if (s > size) break; \
if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
if (val >= vals[p[s]]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
HeapSortRefDown(p, vals, i, size, temp);
}
while (--i != 0);
}
do
{
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortRefDown(p, vals, 1, size, temp);
}
while (size > 1);
}
*/

View File

@ -1,5 +1,5 @@
/* Threads.c -- multithreading library
2024-03-28 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p)
return (res != 0 ? res : res2);
}
typedef struct MY_PROCESSOR_NUMBER {
WORD Group;
BYTE Number;
BYTE Reserved;
} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER;
typedef struct MY_GROUP_AFFINITY {
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000)
// KAFFINITY is not defined in old mingw
ULONG_PTR
#else
KAFFINITY
#endif
Mask;
WORD Group;
WORD Reserved[3];
} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY;
typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)(
HANDLE hThread,
CONST MY_GROUP_AFFINITY *GroupAffinity,
MY_PGROUP_AFFINITY PreviousGroupAffinity);
typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)(
HANDLE hThread,
MY_PGROUP_AFFINITY GroupAffinity);
typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)(
HANDLE hProcess,
PUSHORT GroupCount,
PUSHORT GroupArray);
Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
#if 0
#include <stdio.h>
#define PRF(x) x
/*
--
before the call of SetThreadGroupAffinity(),
GetProcessGroupAffinity() returns one group.
after the call of SetThreadGroupAffinity(),
GetProcessGroupAffinity() returns more than one group,
if SetThreadGroupAffinity() was called for another group.
--
GetProcessAffinityMask MS DOCs:
{
If the calling process contains threads in multiple groups,
the function returns zero for both affinity masks.
}
but tests in win10 with 2 groups (less than 64 cores total):
GetProcessAffinityMask() still returns non-zero affinity masks
even after SetThreadGroupAffinity() calls.
*/
static void PrintProcess_Info()
{
{
const
Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity =
(Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"GetProcessGroupAffinity");
if (fn_GetProcessGroupAffinity)
{
unsigned i;
USHORT GroupCounts[64];
USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts);
BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(),
&GroupCount, GroupCounts);
printf("\n====== GetProcessGroupAffinity : "
"boolRes=%u GroupCounts = %u :",
boolRes, (unsigned)GroupCount);
for (i = 0; i < GroupCount; i++)
printf(" %u", GroupCounts[i]);
printf("\n");
}
}
{
DWORD_PTR processAffinityMask, systemAffinityMask;
if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask))
{
PRF(printf("\n====== GetProcessAffinityMask : "
": processAffinityMask=%x, systemAffinityMask=%x\n",
(UInt32)processAffinityMask, (UInt32)systemAffinityMask);)
}
else
printf("\n==GetProcessAffinityMask FAIL");
}
}
#else
#ifndef USE_THREADS_CreateThread
// #define PRF(x)
#endif
#endif
WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
{
/* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
@ -73,6 +167,42 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
unsigned threadId;
*p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId));
#if 0 // 1 : for debug
{
DWORD_PTR prevMask;
DWORD_PTR affinity = 1 << 0;
prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity);
prevMask = prevMask;
}
#endif
#if 0 // 1 : for debug
{
/* win10: a new thread will be created in the same group that is assigned to the parent thread,
but its affinity mask will contain all allowed processors of that group,
even if the affinity mask of the parent group is not full.
win11: in what group will it be created, if we have set
the affinity of the parent thread with ThreadGroupAffinity?
*/
const
Func_GetThreadGroupAffinity fn =
(Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"GetThreadGroupAffinity");
if (fn)
{
// BOOL wres2;
MY_GROUP_AFFINITY groupAffinity;
memset(&groupAffinity, 0, sizeof(groupAffinity));
/* wres2 = */ fn(*p, &groupAffinity);
PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): "
"wres2_BOOL = %u, group=%u mask=%x\n",
GetCurrentThreadId(),
wres2,
groupAffinity.Group,
(UInt32)groupAffinity.Mask);)
}
}
#endif
#endif
/* maybe we must use errno here, but probably GetLastError() is also OK. */
@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
*/
}
{
DWORD prevSuspendCount = ResumeThread(h);
const DWORD prevSuspendCount = ResumeThread(h);
/* ResumeThread() returns:
0 : was_not_suspended
1 : was_resumed
-1 : error
*/
if (prevSuspendCount == (DWORD)-1)
wres = GetError();
}
}
/* maybe we must use errno here, but probably GetLastError() is also OK. */
return wres;
#endif
}
WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask)
{
#ifdef USE_THREADS_CreateThread
UNUSED_VAR(group)
UNUSED_VAR(affinityMask)
return Thread_Create(p, func, param);
#else
/* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
HANDLE h;
WRes wres;
unsigned threadId;
h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId));
*p = h;
wres = HandleToWRes(h);
if (h)
{
// PrintProcess_Info();
{
const
Func_SetThreadGroupAffinity fn =
(Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"SetThreadGroupAffinity");
if (fn)
{
// WRes wres2;
MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity;
memset(&groupAffinity, 0, sizeof(groupAffinity));
// groupAffinity.Mask must use only bits that are supported by the current group
// (groupAffinity.Mask = 0) means all allowed bits
groupAffinity.Mask = affinityMask;
groupAffinity.Group = (WORD)group;
// wres2 =
fn(h, &groupAffinity, &prev_groupAffinity);
/*
if (groupAffinity.Group == prev_groupAffinity.Group)
wres2 = wres2;
else
wres2 = wres2;
if (wres2 == 0)
{
wres2 = GetError();
PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);)
}
else
{
PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()"
" threadId = %6u"
" group=%u mask=%x\n",
threadId,
prev_groupAffinity.Group,
(UInt32)prev_groupAffinity.Mask);)
}
*/
}
}
{
const DWORD prevSuspendCount = ResumeThread(h);
/* ResumeThread() returns:
0 : was_not_suspended
1 : was_resumed
@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
return Thread_Create_With_CpuSet(p, func, param, NULL);
}
/*
WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity)
{
UNUSED_VAR(group)
return Thread_Create_With_Affinity(p, func, param, affinity);
}
*/
WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity)
{
@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
return AutoResetEvent_CreateNotSignaled(p);
}
void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup)
{
// printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup);
if (numGroups == 0)
numGroups = 1;
p->NumGroups = numGroups;
p->NextGroup = startGroup % numGroups;
}
UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p)
{
const UInt32 next = p->NextGroup;
p->NextGroup = (next + 1) % p->NumGroups;
return next;
}
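/* Usage sketch (added for illustration, not part of the upstream source):
   distributing worker threads across processor groups in round-robin order.
   The names StartWorkers/params are hypothetical; affinityMask == 0 means
   "all allowed bits of the selected group" (see Thread_Create_With_Group). */
/*
static WRes StartWorkers(CThread *threads, unsigned numThreads,
    THREAD_FUNC_TYPE func, void **params, UInt32 numGroups)
{
  CThreadNextGroup nextGroup;
  unsigned i;
  ThreadNextGroup_Init(&nextGroup, numGroups, 0);  // start from group 0
  for (i = 0; i < numThreads; i++)
  {
    const UInt32 group = ThreadNextGroup_GetNext(&nextGroup);
    const WRes wres = Thread_Create_With_Group(&threads[i], func, params[i],
        (unsigned)group, 0);
    if (wres != 0)
      return wres;
  }
  return 0;
}
*/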
#undef PRF
#undef Print

View File

@ -1,5 +1,5 @@
/* XzCrc64Opt.c -- CRC64 calculation (optimized functions)
2023-12-08 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE)
v = Q32BE(1, w1) ^ Q32BE(0, w0);
v ^= Q32BE(3, d1) ^ Q32BE(2, d0);
#endif
#elif
#else
#error Stop_Compiling_Bad_CRC64_NUM_TABLES
#endif
p += Z7_CRC64_NUM_TABLES_USE;

View File

@ -1,5 +1,5 @@
/* XzDec.c -- Xz Decode
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value)
for (i = 0; i < limit;)
{
Byte b = p[i];
const unsigned b = p[i];
*value |= (UInt64)(b & 0x7F) << (7 * i++);
if ((b & 0x80) == 0)
return (b == 0 && i != 1) ? 0 : i;
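/* Illustrative counterpart (added, not part of this diff): the xz "multibyte
   integer" format decoded above stores 7 payload bits per byte, least
   significant group first, with bit 0x80 set on every byte except the last.
   Example: 300 == 0x12C is stored as the two bytes { 0xAC, 0x02 }.
   A minimal encoder sketch (the real SDK provides Xz_WriteVarInt): */
/*
static unsigned Xz_WriteVarInt_Sketch(Byte *buf, UInt64 v)
{
  unsigned i = 0;
  do
  {
    buf[i++] = (Byte)((v & 0x7F) | 0x80);  // 7 payload bits + continuation bit
    v >>= 7;
  }
  while (v != 0);
  buf[i - 1] &= 0x7F;  // the last byte has no continuation bit
  return i;            // number of bytes written
}
*/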
@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf)
static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf)
{
return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2)
&& GetUi32(buf) == CrcCalc(buf + 4, 6)
&& flags == GetBe16(buf + 8)
&& buf[10] == XZ_FOOTER_SIG_0
&& buf[11] == XZ_FOOTER_SIG_1;
return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2)
&& GetUi32a(buf) == CrcCalc(buf + 4, 6)
&& flags == GetBe16a(buf + 8)
&& GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8));
}
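/* Footer layout reminder (added for illustration; see the .xz format spec):
     buf[0..3]   CRC32 of buf[4..9]
     buf[4..7]   stored backward size: (index_size / 4) - 1
     buf[8..9]   stream flags (must match the stream header flags)
     buf[10..11] footer magic "YZ"
   so the real index size is ((GetUi32a(buf + 4) + 1) << 2) bytes. */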
#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks);
p->indexPos = p->indexPreSize;
p->indexSize += p->indexPreSize;
Sha256_Final(&p->sha, p->shaDigest);
Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32);
Sha256_Init(&p->sha);
p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize);
p->state = XZ_STATE_STREAM_INDEX;
@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
break;
}
{
Byte digest[XZ_CHECK_SIZE_MAX];
UInt32 digest32[XZ_CHECK_SIZE_MAX / 4];
p->state = XZ_STATE_BLOCK_HEADER;
p->pos = 0;
if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0)
if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0)
return SZ_ERROR_CRC;
if (p->decodeOnlyOneBlock)
{
@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
}
else
{
Byte digest[SHA256_DIGEST_SIZE];
UInt32 digest32[SHA256_DIGEST_SIZE / 4];
p->state = XZ_STATE_STREAM_INDEX_CRC;
p->indexSize += 4;
p->pos = 0;
Sha256_Final(&p->sha, digest);
if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0)
Sha256_Final(&p->sha, (void *)digest32);
if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0)
return SZ_ERROR_CRC;
}
}
@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
const Byte *ptr = p->buf;
p->state = XZ_STATE_STREAM_FOOTER;
p->pos = 0;
if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr))
if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr))
return SZ_ERROR_CRC;
}
break;
@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
{
if (*src != 0)
{
if (((UInt32)p->padSize & 3) != 0)
if ((unsigned)p->padSize & 3)
return SZ_ERROR_NO_ARCHIVE;
p->pos = 0;
p->state = XZ_STATE_STREAM_HEADER;

View File

@ -1,5 +1,5 @@
/* XzEnc.c -- Xz Encode
2024-03-01 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size)
}
}
Z7_FORCE_INLINE
static void SeqInFilter_Construct(CSeqInFilter *p)
{
p->buf = NULL;
@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p)
p->vt.Read = SeqInFilter_Read;
}
Z7_FORCE_INLINE
static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc)
{
if (p->StateCoder.p)
@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p)
void XzProps_Init(CXzProps *p)
{
p->checkId = XZ_CHECK_CRC32;
p->numThreadGroups = 0;
p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO;
p->numBlockThreads_Reduced = -1;
p->numBlockThreads_Max = -1;
@ -689,6 +692,7 @@ typedef struct
} CLzma2WithFilters;
Z7_FORCE_INLINE
static void Lzma2WithFilters_Construct(CLzma2WithFilters *p)
{
p->lzma2 = NULL;
@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz
}
Z7_FORCE_INLINE
static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc)
{
#ifdef USE_SUBBLOCK
@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in
}
p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max;
p->mtCoder.numThreadGroups = props->numThreadGroups;
p->mtCoder.expectedDataSize = p->expectedDataSize;
RINOK(MtCoder_Code(&p->mtCoder))

View File

@ -1,38 +1,39 @@
/* XzIn.c - Xz input
2023-09-07 : Igor Pavlov : Public domain */
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include <string.h>
#include "7zCrc.h"
#include "CpuArch.h"
#include "Xz.h"
#include "CpuArch.h"
/*
#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0)
*/
#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1)
#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \
(GetUi16a((const Byte *)(const void *)(p) + 10) == \
(XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)))
SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream)
{
Byte sig[XZ_STREAM_HEADER_SIZE];
UInt32 data32[XZ_STREAM_HEADER_SIZE / 4];
size_t processedSize = XZ_STREAM_HEADER_SIZE;
RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize))
RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize))
if (processedSize != XZ_STREAM_HEADER_SIZE
|| memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0)
|| memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0)
return SZ_ERROR_NO_ARCHIVE;
return Xz_ParseHeader(p, sig);
return Xz_ParseHeader(p, (const Byte *)(const void *)data32);
}
#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
{ const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \
#define READ_VARINT_AND_CHECK(buf, size, res) \
{ const unsigned s = Xz_ReadVarInt(buf, size, res); \
if (s == 0) return SZ_ERROR_ARCHIVE; \
pos += s; }
size -= s; \
buf += s; \
}
SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes)
{
MY_ALIGN(4)
Byte header[XZ_BLOCK_HEADER_SIZE_MAX];
unsigned headerSize;
*headerSizeRes = 0;
@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
return XzBlock_Parse(p, header);
}
#define ADD_SIZE_CHECK(size, val) \
{ const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; }
{ const UInt64 newSize = size + (val); \
if (newSize < size) return XZ_SIZE_OVERFLOW; \
size = newSize; \
}
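/* Note (added for illustration): for unsigned 64-bit addition the sum wraps
   around exactly when the result is smaller than either operand, so the
   (newSize < size) test above detects overflow without needing a wider type. */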
UInt64 Xz_GetUnpackSize(const CXzStream *p)
{
@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p)
return size;
}
/*
SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream)
{
return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f));
}
*/
static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
// input:
// CXzStream (p) is an empty object.
// size != 0
// (size & 3) == 0
// (buf) is aligned for at least 4 bytes.
// output:
// p->numBlocks is the number of allocated items in p->blocks
// p->blocks[*] values must be ignored if the function returns an error.
static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
{
size_t numBlocks, pos = 1;
UInt32 crc;
size_t numBlocks;
if (size < 5 || buf[0] != 0)
return SZ_ERROR_ARCHIVE;
size -= 4;
crc = CrcCalc(buf, size);
if (crc != GetUi32(buf + size))
{
const UInt32 crc = CrcCalc(buf, size);
if (crc != GetUi32a(buf + size))
return SZ_ERROR_ARCHIVE;
}
buf++;
size--;
{
UInt64 numBlocks64;
READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64)
READ_VARINT_AND_CHECK(buf, size, &numBlocks64)
// (numBlocks64) is a 63-bit value, so we can calculate (numBlocks64 * 2):
if (numBlocks64 * 2 > size)
return SZ_ERROR_ARCHIVE;
if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes))
return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
numBlocks = (size_t)numBlocks64;
if (numBlocks != numBlocks64 || numBlocks * 2 > size)
return SZ_ERROR_ARCHIVE;
}
Xz_Free(p, alloc);
if (numBlocks != 0)
// Xz_Free(p, alloc); // it's optional, because (p) is empty already
if (numBlocks)
{
size_t i;
p->numBlocks = numBlocks;
p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
if (!p->blocks)
CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
if (!blocks)
return SZ_ERROR_MEM;
for (i = 0; i < numBlocks; i++)
p->blocks = blocks;
p->numBlocks = numBlocks;
// the caller will call Xz_Free() in case of error
do
{
CXzBlockSizes *block = &p->blocks[i];
READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize)
READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize)
if (block->totalSize == 0)
READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize)
READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize)
if (blocks->totalSize == 0)
return SZ_ERROR_ARCHIVE;
blocks++;
}
while (--numBlocks);
}
while ((pos & 3) != 0)
if (buf[pos++] != 0)
if (size >= 4)
return SZ_ERROR_ARCHIVE;
return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE;
while (size)
if (buf[--size])
return SZ_ERROR_ARCHIVE;
return SZ_OK;
}
/*
static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc)
{
SRes res;
size_t size;
Byte *buf;
if (indexSize > ((UInt32)1 << 31))
return SZ_ERROR_UNSUPPORTED;
if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
size = (size_t)indexSize;
if (size != indexSize)
return SZ_ERROR_UNSUPPORTED;
buf = (Byte *)ISzAlloc_Alloc(alloc, size);
if (!buf)
return SZ_ERROR_MEM;
res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
if (res == SZ_OK)
res = Xz_ReadIndex2(p, buf, size, alloc);
res = Xz_ParseIndex(p, buf, size, alloc);
ISzAlloc_Free(alloc, buf);
return res;
}
*/
static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size)
{
@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset,
/* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */
}
/*
in:
(*startOffset) is the position in (stream) where the xz_stream must be finished.
out:
if it returns SZ_OK, then (*startOffset) is the position in the stream at which the xz_stream starts.
*/
static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc)
{
UInt64 indexSize;
Byte buf[XZ_STREAM_FOOTER_SIZE];
#define TEMP_BUF_SIZE (1 << 10)
UInt32 buf32[TEMP_BUF_SIZE / 4];
UInt64 pos = (UInt64)*startOffset;
if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE)
if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE)
return SZ_ERROR_NO_ARCHIVE;
pos -= XZ_STREAM_FOOTER_SIZE;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE))
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_SIG_CHECK(buf + 10))
if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
{
UInt32 total = 0;
pos += XZ_STREAM_FOOTER_SIZE;
for (;;)
{
size_t i;
#define TEMP_BUF_SIZE (1 << 10)
Byte temp[TEMP_BUF_SIZE];
i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos;
// pos != 0
// (pos & 3) == 0
size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos;
pos -= i;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i))
total += (UInt32)i;
for (; i != 0; i--)
if (temp[i - 1] != 0)
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i))
i /= 4;
do
if (buf32[i - 1] != 0)
break;
if (i != 0)
{
if ((i & 3) != 0)
return SZ_ERROR_NO_ARCHIVE;
pos += i;
break;
}
if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16))
return SZ_ERROR_NO_ARCHIVE;
}
while (--i);
pos += i * 4;
#define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16)
// here we don't support the rare case of big padding after the xz stream,
// so we have a padding limit for backward reading.
if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX)
return SZ_ERROR_NO_ARCHIVE;
if (i)
break;
}
// we try to open xz stream after skipping zero padding.
// ((UInt64)*startOffset == pos) is possible here!
if (pos < XZ_STREAM_FOOTER_SIZE)
return SZ_ERROR_NO_ARCHIVE;
pos -= XZ_STREAM_FOOTER_SIZE;
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_SIG_CHECK(buf + 10))
RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
return SZ_ERROR_NO_ARCHIVE;
}
p->flags = (CXzStreamFlags)GetBe16(buf + 8);
p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2);
if (!XzFlags_IsSupported(p->flags))
return SZ_ERROR_UNSUPPORTED;
{
/* to eliminate GCC 6.3 warning:
dereferencing type-punned pointer will break strict-aliasing rules */
const Byte *buf_ptr = buf;
if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6))
const UInt32 *buf_ptr = buf32;
if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6))
return SZ_ERROR_ARCHIVE;
}
indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2;
{
const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2;
if (pos < indexSize)
return SZ_ERROR_ARCHIVE;
pos -= indexSize;
// v25.00: relaxed indexSize check. We allow big index table.
// if (indexSize > ((UInt32)1 << 31))
if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
RINOK(LookInStream_SeekTo(stream, pos))
RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
// RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
{
UInt64 totalSize = Xz_GetPackSize(p);
if (totalSize == XZ_SIZE_OVERFLOW
|| totalSize >= ((UInt64)1 << 63)
|| pos < totalSize + XZ_STREAM_HEADER_SIZE)
SRes res;
const size_t size = (size_t)indexSize;
// if (size != indexSize) return SZ_ERROR_UNSUPPORTED;
Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size);
if (!buf)
return SZ_ERROR_MEM;
res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
if (res == SZ_OK)
res = Xz_ParseIndex(p, buf, size, alloc);
ISzAlloc_Free(alloc, buf);
RINOK(res)
}
}
{
UInt64 total = Xz_GetPackSize(p);
if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63))
return SZ_ERROR_ARCHIVE;
pos -= (totalSize + XZ_STREAM_HEADER_SIZE);
total += XZ_STREAM_HEADER_SIZE;
if (pos < total)
return SZ_ERROR_ARCHIVE;
pos -= total;
RINOK(LookInStream_SeekTo(stream, pos))
*startOffset = (Int64)pos;
}
@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
CSecToRead secToRead;
SecToRead_CreateVTable(&secToRead);
secToRead.realStream = stream;
RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt))
return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE;
}
@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
void Xzs_Construct(CXzs *p)
{
p->num = p->numAllocated = 0;
p->streams = 0;
Xzs_CONSTRUCT(p)
}
void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
Xz_Free(&p->streams[i], alloc);
ISzAlloc_Free(alloc, p->streams);
p->num = p->numAllocated = 0;
p->streams = 0;
p->streams = NULL;
}
UInt64 Xzs_GetNumBlocks(const CXzs *p)
@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p)
SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc)
{
Int64 endOffset = 0;
// it's supposed that the CXzs object is empty here.
// if the CXzs object is not empty, new streams will be added to that non-empty object.
// Xzs_Free(p, alloc); // optional call to empty the CXzs object.
RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END))
*startOffset = endOffset;
for (;;)
{
CXzStream st;
SRes res;
Xz_Construct(&st);
Xz_CONSTRUCT(&st)
res = Xz_ReadBackward(&st, stream, startOffset, alloc);
// if (res == SZ_OK), then (*startOffset) is the start offset of the new stream.
// if (res != SZ_OK), then (*startOffset) is unchanged, or it's the expected start offset of the stream with the error.
st.startOffset = (UInt64)*startOffset;
RINOK(res)
// we must store the (st) object into the array, or we must free the local (st) object.
if (res != SZ_OK)
{
Xz_Free(&st, alloc);
return res;
}
if (p->num == p->numAllocated)
{
const size_t newNum = p->num + p->num / 4 + 1;
void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream));
if (!data)
{
Xz_Free(&st, alloc);
return SZ_ERROR_MEM;
}
p->numAllocated = newNum;
if (p->num != 0)
memcpy(data, p->streams, p->num * sizeof(CXzStream));
ISzAlloc_Free(alloc, p->streams);
p->streams = (CXzStream *)data;
}
// we copy raw data directly from the local variable (st) to the object in the array,
// so we don't need to call Xz_Free(&st, alloc) after copying and after p->num++
p->streams[p->num++] = st;
if (*startOffset == 0)
break;
RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
return SZ_OK;
// seek operation is optional:
// RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK)
return SZ_ERROR_PROGRESS;
}
return SZ_OK;
}
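/* Usage sketch (added for illustration, not part of the upstream source):
   scanning all concatenated xz streams of a seekable source from its end.
   (lookStream) stands for any initialized ILookInStream (for example a
   CLookToRead2 over an open file) and (alloc) for an ISzAllocPtr. */
/*
  CXzs xzs;
  Int64 startOffset = 0;  // receives the start offset of the first stream
  Xzs_Construct(&xzs);
  if (Xzs_ReadBackward(&xzs, lookStream, &startOffset, NULL, alloc) == SZ_OK)
  {
    const UInt64 numBlocks = Xzs_GetNumBlocks(&xzs);
    // ... use the parsed index information (sizes, number of blocks) ...
  }
  Xzs_Free(&xzs, alloc);
*/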