diff --git a/3rdparty/xbyak/xbyak/xbyak.h b/3rdparty/xbyak/xbyak/xbyak.h index 41b5cc0684..01fddf7505 100644 --- a/3rdparty/xbyak/xbyak/xbyak.h +++ b/3rdparty/xbyak/xbyak/xbyak.h @@ -77,7 +77,11 @@ #endif #include #include - #define XBYAK_TLS __declspec(thread) + #ifdef _MSC_VER + #define XBYAK_TLS __declspec(thread) + #else + #define XBYAK_TLS __thread + #endif #elif defined(__GNUC__) #include #include @@ -95,7 +99,9 @@ #include #endif -#if !defined(MFD_CLOEXEC) // defined only linux 3.17 or later +// MFD_CLOEXEC defined only linux 3.17 or later. +// Android wraps the memfd_create syscall from API version 30. +#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30) #undef XBYAK_USE_MEMFD #endif @@ -112,7 +118,7 @@ #endif #endif -#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1800) +#if (__cplusplus >= 201103) || (defined(_MSC_VER) && _MSC_VER >= 1900) #undef XBYAK_TLS #define XBYAK_TLS thread_local #define XBYAK_VARIADIC_TEMPLATE @@ -138,11 +144,18 @@ #pragma warning(disable : 4127) /* constant expresison */ #endif +// disable -Warray-bounds because it may be a bug of gcc. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104603 +#if defined(__GNUC__) && !defined(__clang__) + #define XBYAK_DISABLE_WARNING_ARRAY_BOUNDS + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Warray-bounds" +#endif + namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x6000 /* 0xABCD = A.BC(D) */ + VERSION = 0x6730 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -291,10 +304,10 @@ inline void SetError(int err) { inline void ClearError() { local::GetErrorRef() = 0; } -inline int GetError() { return local::GetErrorRef(); } +inline int GetError() { return Xbyak::local::GetErrorRef(); } -#define XBYAK_THROW(err) { local::SetError(err); return; } -#define XBYAK_THROW_RET(err, r) { local::SetError(err); return r; } +#define XBYAK_THROW(err) { Xbyak::local::SetError(err); return; } +#define XBYAK_THROW_RET(err, r) { Xbyak::local::SetError(err); return r; } #else class Error : public std::exception { @@ -358,14 +371,36 @@ inline const To CastTo(From p) XBYAK_NOEXCEPT } namespace inner { -static const size_t ALIGN_PAGE_SIZE = 4096; +#ifdef _WIN32 +struct SystemInfo { + SYSTEM_INFO info; + SystemInfo() + { + GetSystemInfo(&info); + } +}; +#endif +//static const size_t ALIGN_PAGE_SIZE = 4096; +inline size_t getPageSize() +{ +#ifdef _WIN32 + static const SystemInfo si; + return si.info.dwPageSize; +#elif defined(__GNUC__) + static const long pageSize = sysconf(_SC_PAGESIZE); + if (pageSize > 0) { + return (size_t)pageSize; + } +#endif + return 4096; +} inline bool IsInDisp8(uint32_t x) { return 0xFFFFFF80 <= x || x <= 0x7F; } inline bool IsInInt32(uint64_t x) { return ~uint64_t(0x7fffffffu) <= x || x <= 0x7FFFFFFFU; } inline uint32_t VerifyInInt32(uint64_t x) { -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) if (!IsInInt32(x)) XBYAK_THROW_RET(ERR_OFFSET_IS_TOO_BIG, 0) #endif return static_cast(x); @@ -383,7 +418,8 @@ enum LabelMode { custom allocator */ struct Allocator { - virtual uint8_t *alloc(size_t size) { return reinterpret_cast(AlignedMalloc(size, inner::ALIGN_PAGE_SIZE)); } + explicit Allocator(const std::string& = "") {} // same interface with MmapAllocator + virtual uint8_t *alloc(size_t size) { return reinterpret_cast(AlignedMalloc(size, inner::getPageSize())); } virtual void free(uint8_t *p) { AlignedFree(p); } virtual ~Allocator() {} /* override to return false if you call protect() manually */ @@ -414,13 +450,24 @@ inline int getMacOsVersion() } // util #endif -class MmapAllocator : Allocator { - typedef XBYAK_STD_UNORDERED_MAP SizeList; - SizeList sizeList_; +class MmapAllocator : public Allocator { + struct Allocation { + size_t size; +#if defined(XBYAK_USE_MEMFD) + // fd_ is only used with XBYAK_USE_MEMFD. We keep the file open + // during the lifetime of each allocation in order to support + // checkpoint/restore by unprivileged users. + int fd; +#endif + }; + const std::string name_; // only used with XBYAK_USE_MEMFD + typedef XBYAK_STD_UNORDERED_MAP AllocationList; + AllocationList allocList_; public: + explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {} uint8_t *alloc(size_t size) { - const size_t alignedSizeM1 = inner::ALIGN_PAGE_SIZE - 1; + const size_t alignedSizeM1 = inner::getPageSize() - 1; size = (size + alignedSizeM1) & ~alignedSizeM1; #if defined(MAP_ANONYMOUS) int mode = MAP_PRIVATE | MAP_ANONYMOUS; @@ -435,30 +482,42 @@ public: #endif int fd = -1; #if defined(XBYAK_USE_MEMFD) - fd = memfd_create("xbyak", MFD_CLOEXEC); + fd = memfd_create(name_.c_str(), MFD_CLOEXEC); if (fd != -1) { mode = MAP_SHARED; - if (ftruncate(fd, size) != 0) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0) + if (ftruncate(fd, size) != 0) { + close(fd); + XBYAK_THROW_RET(ERR_CANT_ALLOC, 0) + } } #endif void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0); -#if defined(XBYAK_USE_MEMFD) - if (fd != -1) close(fd); -#endif - if (p == MAP_FAILED) XBYAK_THROW_RET(ERR_CANT_ALLOC, 0) + if (p == MAP_FAILED) { + if (fd != -1) close(fd); + XBYAK_THROW_RET(ERR_CANT_ALLOC, 0) + } assert(p); - sizeList_[(uintptr_t)p] = size; + Allocation &alloc = allocList_[(uintptr_t)p]; + alloc.size = size; +#if defined(XBYAK_USE_MEMFD) + alloc.fd = fd; +#endif return (uint8_t*)p; } void free(uint8_t *p) { if (p == 0) return; - SizeList::iterator i = sizeList_.find((uintptr_t)p); - if (i == sizeList_.end()) XBYAK_THROW(ERR_BAD_PARAMETER) - if (munmap((void*)i->first, i->second) < 0) XBYAK_THROW(ERR_MUNMAP) - sizeList_.erase(i); + AllocationList::iterator i = allocList_.find((uintptr_t)p); + if (i == allocList_.end()) XBYAK_THROW(ERR_BAD_PARAMETER) + if (munmap((void*)i->first, i->second.size) < 0) XBYAK_THROW(ERR_MUNMAP) +#if defined(XBYAK_USE_MEMFD) + if (i->second.fd != -1) close(i->second.fd); +#endif + allocList_.erase(i); } }; +#else +typedef Allocator MmapAllocator; #endif class Address; @@ -1176,9 +1235,6 @@ public: size_t pageSize = sysconf(_SC_PAGESIZE); size_t iaddr = reinterpret_cast(addr); size_t roundAddr = iaddr & ~(pageSize - static_cast(1)); -#ifndef NDEBUG - if (pageSize != 4096) fprintf(stderr, "large page(%zd) is used. not tested enough.\n", pageSize); -#endif return mprotect(reinterpret_cast(roundAddr), size + (iaddr - roundAddr), mode) == 0; #else return true; @@ -1448,7 +1504,6 @@ public: clabelDefList_.clear(); clabelUndefList_.clear(); resetLabelPtrList(); - ClearError(); } void enterLocal() { @@ -1574,6 +1629,7 @@ public: enum LabelType { T_SHORT, T_NEAR, + T_FAR, // far jump T_AUTO // T_SHORT if possible }; private: @@ -1622,6 +1678,11 @@ private: { return op1.isREG(i32e) && ((op2.isREG(i32e) && op1.getBit() == op2.getBit()) || op2.isMEM()); } + static inline bool isValidSSE(const Operand& op1) + { + // SSE instructions do not support XMM16 - XMM31 + return !(op1.isXMM() && op1.getIdx() >= 16); + } void rex(const Operand& op1, const Operand& op2 = Operand()) { uint8_t rex = 0; @@ -1784,9 +1845,16 @@ private: void setSIB(const RegExp& e, int reg, int disp8N = 0) { uint64_t disp64 = e.getDisp(); -#ifdef XBYAK64 +#if defined(XBYAK64) && !defined(__ILP32__) +#ifdef XBYAK_OLD_DISP_CHECK + // treat 0xffffffff as 0xffffffffffffffff uint64_t high = disp64 >> 32; if (high != 0 && high != 0xFFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG) +#else + // displacement should be a signed 32-bit value, so also check sign bit + uint64_t high = disp64 >> 31; + if (high != 0 && high != 0x1FFFFFFFF) XBYAK_THROW(ERR_OFFSET_IS_TOO_BIG) +#endif #endif uint32_t disp = static_cast(disp64); const Reg& base = e.getBase(); @@ -1887,6 +1955,7 @@ private: template void opJmp(T& label, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref) { + if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED) if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory(); /* avoid splitting code of jmp */ size_t offset = 0; if (labelMgr_.getOffset(&offset, label)) { /* label exists */ @@ -1907,6 +1976,7 @@ private: } void opJmpAbs(const void *addr, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref = 0) { + if (type == T_FAR) XBYAK_THROW(ERR_NOT_SUPPORTED) if (isAutoGrow()) { if (!isNEAR(type)) XBYAK_THROW(ERR_ONLY_T_NEAR_IS_SUPPORTED_IN_AUTO_GROW) if (size_ + 16 >= maxSize_) growMemory(); @@ -1919,6 +1989,16 @@ private: } } + void opJmpOp(const Operand& op, LabelType type, int ext) + { + const int bit = 16|i32e; + if (type == T_FAR) { + if (!op.isMEM(bit)) XBYAK_THROW(ERR_NOT_SUPPORTED) + opR_ModM(op, bit, ext + 1, 0xFF, NONE, NONE, false); + } else { + opR_ModM(op, bit, ext, 0xFF, NONE, NONE, true); + } + } // reg is reg field of ModRM // immSize is the size for immediate value // disp8N = 0(normal), disp8N = 1(force disp32), disp8N = {2, 4, 8} ; compressed displacement @@ -1945,6 +2025,7 @@ private: void opGen(const Operand& reg, const Operand& op, int code, int pref, bool isValid(const Operand&, const Operand&), int imm8 = NONE, int preCode = NONE) { if (isValid && !isValid(reg, op)) XBYAK_THROW(ERR_BAD_COMBINATION) + if (!isValidSSE(reg) || !isValidSSE(op)) XBYAK_THROW(ERR_NOT_SUPPORTED) if (pref != NONE) db(pref); if (op.isMEM()) { opModM(op.getAddress(), reg.getReg(), 0x0F, preCode, code, (imm8 != NONE) ? 1 : 0); @@ -1955,6 +2036,7 @@ private: } void opMMX_IMM(const Mmx& mmx, int imm8, int code, int ext) { + if (!isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED) if (mmx.isXMM()) db(0x66); opModR(Reg32(ext), mmx, 0x0F, code); db(imm8); @@ -1965,6 +2047,7 @@ private: } void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref) { + if (!isValidSSE(op1) || !isValidSSE(op2)) XBYAK_THROW(ERR_NOT_SUPPORTED) if (pref != NONE) db(pref); if (op1.isXMM() && op2.isMEM()) { opModM(op2.getAddress(), op1.getReg(), 0x0F, code); @@ -1976,6 +2059,7 @@ private: } void opExt(const Operand& op, const Mmx& mmx, int code, int imm, bool hasMMX2 = false) { + if (!isValidSSE(op) || !isValidSSE(mmx)) XBYAK_THROW(ERR_NOT_SUPPORTED) if (hasMMX2 && op.isREG(i32e)) { /* pextrw is special */ if (mmx.isXMM()) db(0x66); opModR(op.getReg(), mmx, 0x0F, 0xC5); db(imm); @@ -2132,9 +2216,6 @@ private: { if (op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) int w = op.isBit(16); -#ifdef XBYAK64 - if (op.isHigh8bit()) XBYAK_THROW(ERR_BAD_COMBINATION) -#endif bool cond = reg.isREG() && (reg.getBit() > op.getBit()); opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w); } @@ -2356,18 +2437,21 @@ private: if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opVnni(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, int type, int code0, PreferredEncoding encoding) { + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code0); + } + int orEvexIf(PreferredEncoding encoding) { if (encoding == DefaultEncoding) { - encoding = EvexEncoding; + encoding = defaultEncoding_; } if (encoding == EvexEncoding) { #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - type |= T_MUST_EVEX; + return T_MUST_EVEX; } - opAVX_X_X_XM(x1, x2, op, type, code0); + return 0; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -2452,6 +2536,7 @@ public: #endif private: bool isDefaultJmpNEAR_; + PreferredEncoding defaultEncoding_; public: void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -2474,13 +2559,13 @@ public: // set default type of `jmp` of undefined label to T_NEAR void setDefaultJmpNEAR(bool isNear) { isDefaultJmpNEAR_ = isNear; } - void jmp(const Operand& op) { opR_ModM(op, BIT, 4, 0xFF, NONE, NONE, true); } + void jmp(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 4); } void jmp(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); } void jmp(const char *label, LabelType type = T_AUTO) { jmp(std::string(label), type); } void jmp(const Label& label, LabelType type = T_AUTO) { opJmp(label, type, 0xEB, 0xE9, 0); } void jmp(const void *addr, LabelType type = T_AUTO) { opJmpAbs(addr, type, 0xEB, 0xE9); } - void call(const Operand& op) { opR_ModM(op, 16 | i32e, 2, 0xFF, NONE, NONE, true); } + void call(const Operand& op, LabelType type = T_AUTO) { opJmpOp(op, type, 2); } // call(string label), not const std::string& void call(std::string label) { opJmp(label, T_NEAR, 0, 0xE8, 0); } void call(const char *label) { call(std::string(label)); } @@ -2731,11 +2816,13 @@ public: , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) + , defaultEncoding_(EvexEncoding) { labelMgr_.set(this); } void reset() { + ClearError(); resetSize(); labelMgr_.reset(); labelMgr_.set(this); @@ -2767,6 +2854,9 @@ public: #undef jnl #endif + // set default encoding to select Vex or Evex + void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } + /* use single byte nop if useMultiByteNop = false */ @@ -2813,7 +2903,7 @@ public: { if (x == 1) return; if (x < 1 || (x & (x - 1))) XBYAK_THROW(ERR_BAD_ALIGN) - if (isAutoGrow() && x > inner::ALIGN_PAGE_SIZE) fprintf(stderr, "warning:autoGrow mode does not support %d align\n", (int)x); + if (isAutoGrow()) XBYAK_THROW(ERR_BAD_ALIGN) size_t remain = size_t(getCurr()) % x; if (remain) { nop(x - remain, useMultiByteNop); @@ -2871,6 +2961,10 @@ static const XBYAK_CONSTEXPR Segment es(Segment::es), cs(Segment::cs), ss(Segmen #pragma warning(pop) #endif +#if defined(__GNUC__) && !defined(__clang__) + #pragma GCC diagnostic pop +#endif + } // end of namespace #endif // XBYAK_XBYAK_H_ diff --git a/3rdparty/xbyak/xbyak/xbyak_bin2hex.h b/3rdparty/xbyak/xbyak/xbyak_bin2hex.h new file mode 100644 index 0000000000..69ecdbfedc --- /dev/null +++ b/3rdparty/xbyak/xbyak/xbyak_bin2hex.h @@ -0,0 +1,258 @@ +enum { + B00000000= 0, + B00000001= 1, + B00000010= 2, + B00000011= 3, + B00000100= 4, + B00000101= 5, + B00000110= 6, + B00000111= 7, + B00001000= 8, + B00001001= 9, + B00001010= 10, + B00001011= 11, + B00001100= 12, + B00001101= 13, + B00001110= 14, + B00001111= 15, + B00010000= 16, + B00010001= 17, + B00010010= 18, + B00010011= 19, + B00010100= 20, + B00010101= 21, + B00010110= 22, + B00010111= 23, + B00011000= 24, + B00011001= 25, + B00011010= 26, + B00011011= 27, + B00011100= 28, + B00011101= 29, + B00011110= 30, + B00011111= 31, + B00100000= 32, + B00100001= 33, + B00100010= 34, + B00100011= 35, + B00100100= 36, + B00100101= 37, + B00100110= 38, + B00100111= 39, + B00101000= 40, + B00101001= 41, + B00101010= 42, + B00101011= 43, + B00101100= 44, + B00101101= 45, + B00101110= 46, + B00101111= 47, + B00110000= 48, + B00110001= 49, + B00110010= 50, + B00110011= 51, + B00110100= 52, + B00110101= 53, + B00110110= 54, + B00110111= 55, + B00111000= 56, + B00111001= 57, + B00111010= 58, + B00111011= 59, + B00111100= 60, + B00111101= 61, + B00111110= 62, + B00111111= 63, + B01000000= 64, + B01000001= 65, + B01000010= 66, + B01000011= 67, + B01000100= 68, + B01000101= 69, + B01000110= 70, + B01000111= 71, + B01001000= 72, + B01001001= 73, + B01001010= 74, + B01001011= 75, + B01001100= 76, + B01001101= 77, + B01001110= 78, + B01001111= 79, + B01010000= 80, + B01010001= 81, + B01010010= 82, + B01010011= 83, + B01010100= 84, + B01010101= 85, + B01010110= 86, + B01010111= 87, + B01011000= 88, + B01011001= 89, + B01011010= 90, + B01011011= 91, + B01011100= 92, + B01011101= 93, + B01011110= 94, + B01011111= 95, + B01100000= 96, + B01100001= 97, + B01100010= 98, + B01100011= 99, + B01100100= 100, + B01100101= 101, + B01100110= 102, + B01100111= 103, + B01101000= 104, + B01101001= 105, + B01101010= 106, + B01101011= 107, + B01101100= 108, + B01101101= 109, + B01101110= 110, + B01101111= 111, + B01110000= 112, + B01110001= 113, + B01110010= 114, + B01110011= 115, + B01110100= 116, + B01110101= 117, + B01110110= 118, + B01110111= 119, + B01111000= 120, + B01111001= 121, + B01111010= 122, + B01111011= 123, + B01111100= 124, + B01111101= 125, + B01111110= 126, + B01111111= 127, + B10000000= 128, + B10000001= 129, + B10000010= 130, + B10000011= 131, + B10000100= 132, + B10000101= 133, + B10000110= 134, + B10000111= 135, + B10001000= 136, + B10001001= 137, + B10001010= 138, + B10001011= 139, + B10001100= 140, + B10001101= 141, + B10001110= 142, + B10001111= 143, + B10010000= 144, + B10010001= 145, + B10010010= 146, + B10010011= 147, + B10010100= 148, + B10010101= 149, + B10010110= 150, + B10010111= 151, + B10011000= 152, + B10011001= 153, + B10011010= 154, + B10011011= 155, + B10011100= 156, + B10011101= 157, + B10011110= 158, + B10011111= 159, + B10100000= 160, + B10100001= 161, + B10100010= 162, + B10100011= 163, + B10100100= 164, + B10100101= 165, + B10100110= 166, + B10100111= 167, + B10101000= 168, + B10101001= 169, + B10101010= 170, + B10101011= 171, + B10101100= 172, + B10101101= 173, + B10101110= 174, + B10101111= 175, + B10110000= 176, + B10110001= 177, + B10110010= 178, + B10110011= 179, + B10110100= 180, + B10110101= 181, + B10110110= 182, + B10110111= 183, + B10111000= 184, + B10111001= 185, + B10111010= 186, + B10111011= 187, + B10111100= 188, + B10111101= 189, + B10111110= 190, + B10111111= 191, + B11000000= 192, + B11000001= 193, + B11000010= 194, + B11000011= 195, + B11000100= 196, + B11000101= 197, + B11000110= 198, + B11000111= 199, + B11001000= 200, + B11001001= 201, + B11001010= 202, + B11001011= 203, + B11001100= 204, + B11001101= 205, + B11001110= 206, + B11001111= 207, + B11010000= 208, + B11010001= 209, + B11010010= 210, + B11010011= 211, + B11010100= 212, + B11010101= 213, + B11010110= 214, + B11010111= 215, + B11011000= 216, + B11011001= 217, + B11011010= 218, + B11011011= 219, + B11011100= 220, + B11011101= 221, + B11011110= 222, + B11011111= 223, + B11100000= 224, + B11100001= 225, + B11100010= 226, + B11100011= 227, + B11100100= 228, + B11100101= 229, + B11100110= 230, + B11100111= 231, + B11101000= 232, + B11101001= 233, + B11101010= 234, + B11101011= 235, + B11101100= 236, + B11101101= 237, + B11101110= 238, + B11101111= 239, + B11110000= 240, + B11110001= 241, + B11110010= 242, + B11110011= 243, + B11110100= 244, + B11110101= 245, + B11110110= 246, + B11110111= 247, + B11111000= 248, + B11111001= 249, + B11111010= 250, + B11111011= 251, + B11111100= 252, + B11111101= 253, + B11111110= 254, + B11111111= 255 +}; diff --git a/3rdparty/xbyak/xbyak/xbyak_mnemonic.h b/3rdparty/xbyak/xbyak/xbyak_mnemonic.h index 72bcb22ee5..edc76c5bb8 100644 --- a/3rdparty/xbyak/xbyak/xbyak_mnemonic.h +++ b/3rdparty/xbyak/xbyak/xbyak_mnemonic.h @@ -1,4 +1,6 @@ -const char *getVersionString() const { return "6.00"; } +const char *getVersionString() const { return "6.73"; } +void aadd(const Address& addr, const Reg32e ®) { opModM(addr, reg, 0x0F, 0x38, 0x0FC); } +void aand(const Address& addr, const Reg32e ®) { db(0x66); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void adc(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x10, 2); } void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); } void adcx(const Reg32e& reg, const Operand& op) { opGen(reg, op, 0xF6, 0x66, isREG32_REG32orMEM, NONE, 0x38); } @@ -24,6 +26,8 @@ void andnpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x66, isXM void andnps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x55, 0x100, isXMM_XMMorMEM); } void andpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x66, isXMM_XMMorMEM); } void andps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x54, 0x100, isXMM_XMMorMEM); } +void aor(const Address& addr, const Reg32e ®) { db(0xF2); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } +void axor(const Address& addr, const Reg32e ®) { db(0xF3); opModM(addr, reg, 0x0F, 0x38, 0x0FC); } void bextr(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op, r2, T_0F38, 0xf7, false); } void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } @@ -57,9 +61,11 @@ void cbw() { db(0x66); db(0x98); } void cdq() { db(0x99); } void clc() { db(0xF8); } void cld() { db(0xFC); } +void cldemote(const Address& addr) { opMIB(addr, eax, 0x0F, 0x1C); } void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); } void clflushopt(const Address& addr) { db(0x66); opModM(addr, Reg32(7), 0x0F, 0xAE); } void cli() { db(0xFA); } +void clwb(const Address& addr) { db(0x66); opMIB(addr, esi, 0x0F, 0xAE); } void clzero() { db(0x0F); db(0x01); db(0xFC); } void cmc() { db(0xF5); } void cmova(const Reg& reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0x40 | 7); }//-V524 @@ -323,6 +329,7 @@ void gf2p8affineqb(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, void gf2p8mulb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xCF, 0x66, isXMM_XMMorMEM, NONE, 0x38); } void haddpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0x66, isXMM_XMMorMEM); } void haddps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7C, 0xF2, isXMM_XMMorMEM); } +void hlt() { db(0xF4); } void hsubpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0x66, isXMM_XMMorMEM); } void hsubps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x7D, 0xF2, isXMM_XMMorMEM); } void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); } @@ -500,6 +507,8 @@ void movd(const Mmx& mmx, const Address& addr) { if (mmx.isXMM()) db(0x66); opMo void movd(const Mmx& mmx, const Reg32& reg) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x6E); } void movd(const Reg32& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); } void movddup(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x12, 0xF2, isXMM_XMMorMEM, NONE, NONE); } +void movdir64b(const Reg& reg, const Address& addr) { db(0x66); opModM(addr, reg.cvt32(), 0x0F, 0x38, 0xF8); } +void movdiri(const Address& addr, const Reg32e& reg) { opModM(addr, reg, 0x0F, 0x38, 0xF9); } void movdq2q(const Mmx& mmx, const Xmm& xmm) { db(0xF2); opModR(mmx, xmm, 0x0F, 0xD6); } void movdqa(const Address& addr, const Xmm& xmm) { db(0x66); opModM(addr, xmm, 0x0F, 0x7F); } void movdqa(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, 0x66); } @@ -580,9 +589,9 @@ void pavgb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE0); } void pavgw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xE3); } void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); } void pblendw(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0E, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } -void pclmulhqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); } +void pclmulhqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x11); } void pclmulhqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x01); } -void pclmullqhdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); } +void pclmullqhqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x10); } void pclmullqlqdq(const Xmm& xmm, const Operand& op) { pclmulqdq(xmm, op, 0x00); } void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast(imm), 0x3A); } void pcmpeqb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x74); } @@ -649,6 +658,8 @@ void pmuludq(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xF4); } void popcnt(const Reg®, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xB8); } void popf() { db(0x9D); } void por(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEB); } +void prefetchit0(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0x18); } +void prefetchit1(const Address& addr) { opModM(addr, Reg32(6), 0x0F, 0x18); } void prefetchnta(const Address& addr) { opModM(addr, Reg32(0), 0x0F, 0x18); } void prefetcht0(const Address& addr) { opModM(addr, Reg32(1), 0x0F, 0x18); } void prefetcht1(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0x18); } @@ -719,6 +730,7 @@ void repne() { db(0xF2); } void repnz() { db(0xF2); } void repz() { db(0xF3); } void ret(int imm = 0) { if (imm) { db(0xC2); dw(imm); } else { db(0xC3); } } +void retf(int imm = 0) { if (imm) { db(0xCA); dw(imm); } else { db(0xCB); } } void rol(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 0); } void rol(const Operand& op, int imm) { opShift(op, imm, 0); } void ror(const Operand& op, const Reg8& _cl) { opShift(op, _cl, 1); } @@ -741,6 +753,7 @@ void sbb(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x18); } void scasb() { db(0xAE); } void scasd() { db(0xAF); } void scasw() { db(0x66); db(0xAF); } +void serialize() { db(0x0F); db(0x01); db(0xE8); } void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 7); }//-V524 void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 3); }//-V524 void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, 0x90 | 2); }//-V524 @@ -811,10 +824,13 @@ void subsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF2, isXMM void subss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x5C, 0xF3, isXMM_XMMorMEM); } void sysenter() { db(0x0F); db(0x34); } void sysexit() { db(0x0F); db(0x35); } +void tpause(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0x66); db(0x0F); db(0xAE); setModRM(3, 6, idx); } void tzcnt(const Reg®, const Operand& op) { opSp1(reg, op, 0xF3, 0x0F, 0xBC); } void ucomisd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x66, isXMM_XMMorMEM); } void ucomiss(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2E, 0x100, isXMM_XMMorMEM); } void ud2() { db(0x0F); db(0x0B); } +void umonitor(const Reg& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) int bit = r.getBit(); if (BIT != bit) { if ((BIT == 32 && bit == 16) || (BIT == 64 && bit == 32)) { db(0x67); } else { XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) } } db(0xF3); db(0x0F); db(0xAE); setModRM(3, 6, idx); } +void umwait(const Reg32& r) { int idx = r.getIdx(); if (idx > 7) XBYAK_THROW(ERR_BAD_PARAMETER) db(0xF2); db(0x0F); db(0xAE); setModRM(3, 6, idx); } void unpckhpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM); } void unpckhps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x100, isXMM_XMMorMEM); } void unpcklpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM); } @@ -835,6 +851,8 @@ void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } +void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } +void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM | T_B16, 0xB1); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0D, imm); } void vblendps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM, 0x0C, imm); } void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, T_0F3A | T_66 | T_YMM, 0x4B, x4.getIdx() << 4); } @@ -979,6 +997,11 @@ void vcomisd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N8 | T void vcomiss(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_N4 | T_0F | T_EW0 | T_EVEX | T_SAE_X, 0x2F); } void vcvtdq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F | T_F3 | T_YMM | T_EVEX | T_EW0 | T_B32 | T_N8 | T_N_VL, 0xE6); } void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5B); } +void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2 | T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38 | T_W0 | T_YMM, 0xB0); } +void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32 | orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -1169,6 +1192,10 @@ void vpbroadcastb(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isME void vpbroadcastd(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N4 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x58); } void vpbroadcastq(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N8 | T_66 | T_0F38 | T_W0 | T_EW1 | T_YMM | T_EVEX, 0x59); } void vpbroadcastw(const Xmm& x, const Operand& op) { if (!(op.isXMM() || op.isMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_W0 | T_YMM | T_EVEX, 0x79); } +void vpclmulhqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x11); } +void vpclmulhqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x01); } +void vpclmullqhqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x10); } +void vpclmullqlqdq(const Xmm& x1, const Xmm& x2, const Operand& op) { vpclmulqdq(x1, x2, op, 0x00); } void vpclmulqdq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_YMM | T_EVEX, 0x44, imm); } void vpcmpeqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x74); } void vpcmpeqd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x76); } @@ -1182,10 +1209,22 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F3A, 0x62, imm); } -void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } -void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } -void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } -void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opVnni(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } +void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x50, encoding); } +void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x51, encoding); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x50); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0x51); } +void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x52, encoding); } +void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_B32, 0x53, encoding); } +void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD2); } +void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_YMM, 0xD3); } +void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD2); } +void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_YMM, 0xD3); } +void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD2); } +void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_YMM, 0xD3); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_YMM | T_EVEX | T_B32, 0x36); } @@ -1217,6 +1256,8 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } +void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB5, encoding); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_YMM | T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_YMM | T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -1313,8 +1354,16 @@ void vroundsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { op void vroundss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0, 0x0A, imm); } void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_YMM, 0x52); } void vrsqrtss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F, 0x52); } +void vsha512msg1(const Ymm& y, const Xmm& x) { if (!(y.isYMM() && x.isXMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y, 0, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCC); } +void vsha512msg2(const Ymm& y1, const Ymm& y2) { if (!(y1.isYMM() && y2.isYMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, 0, y2, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCD); } +void vsha512rnds2(const Ymm& y1, const Ymm& y2, const Xmm& x) { if (!(y1.isYMM() && y2.isYMM() && x.isXMM())) XBYAK_THROW(ERR_BAD_PARAMETER) opVex(y1, &y2, x, T_F2 | T_0F38 | T_W0 | T_YMM, 0xCB); } void vshufpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_B64, 0xC6, imm); } void vshufps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0xC6, imm); } +void vsm3msg1(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); } +void vsm3msg2(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); } +void vsm3rnds2(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F3A | T_W0 | T_EW0 | T_EVEX, 0xDE, imm); } +void vsm4key4(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); } +void vsm4rnds4(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_W0 | T_EW0 | T_EVEX, 0xDA); } void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66 | T_0F | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x51); } void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x51); } void vsqrtsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8 | T_F2 | T_0F | T_EW1 | T_EVEX | T_ER_X, 0x51); } @@ -1339,7 +1388,10 @@ void vzeroupper() { db(0xC5); db(0xF8); db(0x77); } void wait() { db(0x9B); } void wbinvd() { db(0x0F); db(0x09); } void wrmsr() { db(0x0F); db(0x30); } +void xabort(uint8_t imm) { db(0xC6); db(0xF8); db(imm); } void xadd(const Operand& op, const Reg& reg) { opModRM(reg, op, (op.isREG() && reg.isREG() && op.getBit() == reg.getBit()), op.isMEM(), 0x0F, 0xC0 | (reg.isBit(8) ? 0 : 1)); } +void xbegin(uint32_t rel) { db(0xC7); db(0xF8); dd(rel); } +void xend() { db(0x0F); db(0x01); db(0xD5); } void xgetbv() { db(0x0F); db(0x01); db(0xD0); } void xlatb() { db(0xD7); } void xor_(const Operand& op, uint32_t imm) { opRM_I(op, imm, 0x30, 6); } @@ -1620,6 +1672,10 @@ void scasq() { db(0x48); db(0xAF); } void stosq() { db(0x48); db(0xAB); } void syscall() { db(0x0F); db(0x05); } void sysret() { db(0x0F); db(0x07); } +void clui() { db(0xF3); db(0x0F); db(0x01); db(0xEE); } +void stui() { db(0xF3); db(0x0F); db(0x01); db(0xEF); } +void testui() { db(0xF3); db(0x0F); db(0x01); db(0xED); } +void uiret() { db(0xF3); db(0x0F); db(0x01); db(0xEC); } void cmpxchg16b(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xC7); } void fxrstor64(const Address& addr) { opModM(addr, Reg64(1), 0x0F, 0xAE); } void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opModR(mmx, reg, 0x0F, 0x7E); } @@ -1627,12 +1683,29 @@ void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opModR( void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opModRM(reg, op, op.isREG(), op.isMEM(), 0x63); } void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, 0x3A); } void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, 0x3A); } +void senduipi(const Reg64& r) { db(0xF3); opModR(Reg32(6), r.cvt32(), 0x0F, 0xC7); } void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_ER_X | T_N8, 0x2D); } void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W1 | T_EVEX | T_EW1 | T_SAE_X | T_N8, 0x2C); } void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_ER_X, 0x2D); } void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W1 | T_EVEX | T_EW1 | T_N4 | T_SAE_X, 0x2C); } void vmovq(const Xmm& x, const Reg64& r) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x6E); } void vmovq(const Reg64& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(r.getIdx()), T_66 | T_0F | T_W1 | T_EVEX | T_EW1, 0x7E); } +void cmpbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE6, false); } +void cmpbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE2, false); } +void cmplexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEE, false); } +void cmplxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEC, false); } +void cmpnbexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE7, false); } +void cmpnbxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE3, false); } +void cmpnlexadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEF, false); } +void cmpnlxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xED, false); } +void cmpnoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE1, false); } +void cmpnpxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEB, false); } +void cmpnsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE9, false); } +void cmpnzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE5, false); } +void cmpoxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE0, false); } +void cmppxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xEA, false); } +void cmpsxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE8, false); } +void cmpzxadd(const Address& addr, const Reg32e& r1, const Reg32e& r2) { opGpr(r1, addr, r2, T_66 | T_0F38, 0xE4, false); } void ldtilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_0F38 | T_W0, 0x49); } void sttilecfg(const Address& addr) { opVex(tmm0, &tmm0, addr, T_66 | T_0F38 | T_W0, 0x49); } void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2 | T_0F38 | T_W0, 0x4b); } @@ -1644,6 +1717,7 @@ void tdpbssd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T void tdpbsud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5e); } void tdpbusd(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_66 | T_0F38 | T_W0, 0x5e); } void tdpbuud(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_0F38 | T_W0, 0x5e); } +void tdpfp16ps(const Tmm &x1, const Tmm &x2, const Tmm &x3) { opVex(x1, &x3, x2, T_F2 | T_0F38 | T_W0, 0x5c); } void tdpbf16ps(const Tmm& x1, const Tmm& x2, const Tmm& x3) { opVex(x1, &x3, x2, T_F3 | T_0F38 | T_W0, 0x5c); } #else void jcxz(std::string label) { db(0x67); opJmp(label, T_SHORT, 0xe3, 0, 0); } @@ -1898,7 +1972,6 @@ void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4 | void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x63); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16 | T_N_VL | T_MAP5 | T_EW0 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B32, 0x5B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } -void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX | T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16 | T_N_VL | T_66 | T_MAP5 | T_EW1 | T_ER_Z | T_MUST_EVEX | T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_EW1 | T_YMM | T_ER_Z | T_MUST_EVEX | T_B64, 0x79); } @@ -2132,38 +2205,36 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4 | T void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8 | T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x44); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB5); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x39); } void vpminuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x3B); } void vpmovb2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x29); } void vpmovd2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x39); } -void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x31, false); } -void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x33, true); } +void vpmovdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x31, false); } +void vpmovdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x33, true); } void vpmovm2b(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x28); } void vpmovm2d(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, 0x38); } void vpmovm2q(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x38); } void vpmovm2w(const Xmm& x, const Opmask& k) { opVex(x, 0, k, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x28); } void vpmovq2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x39); } -void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x32, false); } -void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x35, true); } -void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x34, false); } -void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x21, false); } -void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x23, true); } -void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x22, false); } -void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x25, true); } -void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x24, false); } -void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x20, true); } -void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x11, false); } -void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x13, true); } -void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x12, false); } -void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x15, true); } -void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x14, false); } -void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x10, true); } +void vpmovqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x32, false); } +void vpmovqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x35, true); } +void vpmovqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x34, false); } +void vpmovsdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x21, false); } +void vpmovsdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x23, true); } +void vpmovsqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x22, false); } +void vpmovsqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x25, true); } +void vpmovsqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x24, false); } +void vpmovswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x20, true); } +void vpmovusdb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x11, false); } +void vpmovusdw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x13, true); } +void vpmovusqb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N2 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x12, false); } +void vpmovusqd(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x15, true); } +void vpmovusqw(const Operand& op, const Xmm& x) { opVmov(op, x, T_N4 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x14, false); } +void vpmovuswb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x10, true); } void vpmovw2m(const Opmask& k, const Xmm& x) { opVex(k, 0, x, T_F3 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, 0x29); } -void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x30, true); } +void vpmovwb(const Operand& op, const Xmm& x) { opVmov(op, x, T_N8 | T_N_VL | T_F3 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_M_K, 0x30, true); } void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); } void vpmultishiftqb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x83); } void vpopcntb(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_SAE_Z | T_MUST_EVEX, 0x54); } diff --git a/3rdparty/xbyak/xbyak/xbyak_util.h b/3rdparty/xbyak/xbyak/xbyak_util.h index f2f651f266..0bcc134994 100644 --- a/3rdparty/xbyak/xbyak/xbyak_util.h +++ b/3rdparty/xbyak/xbyak/xbyak_util.h @@ -4,12 +4,18 @@ #ifdef XBYAK_ONLY_CLASS_CPU #include #include -#include #include #ifndef XBYAK_THROW #define XBYAK_THROW(x) ; #define XBYAK_THROW_RET(x, y) return y; #endif +#ifndef XBYAK_CONSTEXPR +#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910) + #define XBYAK_CONSTEXPR constexpr +#else + #define XBYAK_CONSTEXPR +#endif +#endif #else #include @@ -17,7 +23,6 @@ utility class and functions for Xbyak Xbyak::util::Clock ; rdtsc timer Xbyak::util::Cpu ; detect CPU - @note this header is UNDER CONSTRUCTION! */ #include "xbyak.h" #endif // XBYAK_ONLY_CLASS_CPU @@ -27,8 +32,8 @@ #endif #ifdef XBYAK_INTEL_CPU_SPECIFIC -#ifdef _MSC_VER - #if (_MSC_VER < 1400) && defined(XBYAK32) +#ifdef _WIN32 + #if defined(_MSC_VER) && (_MSC_VER < 1400) && defined(XBYAK32) static inline __declspec(naked) void __cpuid(int[4], int) { __asm { @@ -88,32 +93,69 @@ typedef enum { CoreLevel = 2 } IntelCpuTopologyLevel; +namespace local { + +template +struct TypeT { +}; + +template +XBYAK_CONSTEXPR TypeT operator|(TypeT, TypeT) { return TypeT(); } + +template +inline T max_(T x, T y) { return x >= y ? x : y; } +template +inline T min_(T x, T y) { return x < y ? x : y; } + +} // local + /** CPU detection class + @note static inline const member is supported by c++17 or later, so use template hack */ class Cpu { - uint64_t type_; +public: + class Type { + uint64_t L; + uint64_t H; + public: + Type(uint64_t L = 0, uint64_t H = 0) : L(L), H(H) { } + template + Type(local::TypeT) : L(L_), H(H_) {} + Type& operator&=(const Type& rhs) { L &= rhs.L; H &= rhs.H; return *this; } + Type& operator|=(const Type& rhs) { L |= rhs.L; H |= rhs.H; return *this; } + Type operator&(const Type& rhs) const { Type t = *this; t &= rhs; return t; } + Type operator|(const Type& rhs) const { Type t = *this; t |= rhs; return t; } + bool operator==(const Type& rhs) const { return H == rhs.H && L == rhs.L; } + bool operator!=(const Type& rhs) const { return !operator==(rhs); } + // without explicit because backward compatilibity + operator bool() const { return (H | L) != 0; } + uint64_t getL() const { return L; } + uint64_t getH() const { return H; } + }; +private: + Type type_; //system topology bool x2APIC_supported_; static const size_t maxTopologyLevels = 2; - unsigned int numCores_[maxTopologyLevels]; + uint32_t numCores_[maxTopologyLevels]; - static const unsigned int maxNumberCacheLevels = 10; - unsigned int dataCacheSize_[maxNumberCacheLevels]; - unsigned int coresSharignDataCache_[maxNumberCacheLevels]; - unsigned int dataCacheLevels_; + static const uint32_t maxNumberCacheLevels = 10; + uint32_t dataCacheSize_[maxNumberCacheLevels]; + uint32_t coresSharignDataCache_[maxNumberCacheLevels]; + uint32_t dataCacheLevels_; - unsigned int get32bitAsBE(const char *x) const + uint32_t get32bitAsBE(const char *x) const { return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24); } - unsigned int mask(int n) const + uint32_t mask(int n) const { return (1U << n) - 1; } void setFamily() { - unsigned int data[4] = {}; + uint32_t data[4] = {}; getCpuid(1, data); stepping = data[0] & mask(4); model = (data[0] >> 4) & mask(4); @@ -132,17 +174,15 @@ class Cpu { displayModel = model; } } - unsigned int extractBit(unsigned int val, unsigned int base, unsigned int end) + uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end) { return (val >> base) & ((1u << (end - base)) - 1); } void setNumCores() { - if ((type_ & tINTEL) == 0) return; + if (!has(tINTEL) && !has(tAMD)) return; - unsigned int data[4] = {}; - - /* CAUTION: These numbers are configuration as shipped by Intel. */ + uint32_t data[4] = {}; getCpuidEx(0x0, 0, data); if (data[0] >= 0xB) { /* @@ -152,7 +192,7 @@ class Cpu { leaf 0xB can be zeroed-out by a hypervisor */ x2APIC_supported_ = true; - for (unsigned int i = 0; i < maxTopologyLevels; i++) { + for (uint32_t i = 0; i < maxTopologyLevels; i++) { getCpuidEx(0xB, i, data); IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); if (level == SmtLevel || level == CoreLevel) { @@ -162,8 +202,8 @@ class Cpu { /* Fallback values in case a hypervisor has 0xB leaf zeroed-out. */ - numCores_[SmtLevel - 1] = (std::max)(1u, numCores_[SmtLevel - 1]); - numCores_[CoreLevel - 1] = (std::max)(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); + numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]); + numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); } else { /* Failed to deremine num of cores without x2APIC support. @@ -176,14 +216,55 @@ class Cpu { } void setCacheHierarchy() { - if ((type_ & tINTEL) == 0) return; - const unsigned int NO_CACHE = 0; - const unsigned int DATA_CACHE = 1; -// const unsigned int INSTRUCTION_CACHE = 2; - const unsigned int UNIFIED_CACHE = 3; - unsigned int smt_width = 0; - unsigned int logical_cores = 0; - unsigned int data[4] = {}; + if (!has(tINTEL) && !has(tAMD)) return; + + // https://github.com/amd/ZenDNN/blob/a08bf9a9efc160a69147cdecfb61cc85cc0d4928/src/cpu/x64/xbyak/xbyak_util.h#L236-L288 + if (has(tAMD)) { + // There are 3 Data Cache Levels (L1, L2, L3) + dataCacheLevels_ = 3; + const uint32_t leaf = 0x8000001D; // for modern AMD CPus + // Sub leaf value ranges from 0 to 3 + // Sub leaf value 0 refers to L1 Data Cache + // Sub leaf value 1 refers to L1 Instruction Cache + // Sub leaf value 2 refers to L2 Cache + // Sub leaf value 3 refers to L3 Cache + // For legacy AMD CPU, use leaf 0x80000005 for L1 cache + // and 0x80000006 for L2 and L3 cache + int cache_index = 0; + for (uint32_t sub_leaf = 0; sub_leaf <= dataCacheLevels_; sub_leaf++) { + // Skip sub_leaf = 1 as it refers to + // L1 Instruction Cache (not required) + if (sub_leaf == 1) { + continue; + } + uint32_t data[4] = {}; + getCpuidEx(leaf, sub_leaf, data); + // Cache Size = Line Size * Partitions * Associativity * Cache Sets + dataCacheSize_[cache_index] = + (extractBit(data[1], 22, 31) + 1) // Associativity-1 + * (extractBit(data[1], 12, 21) + 1) // Partitions-1 + * (extractBit(data[1], 0, 11) + 1) // Line Size + * (data[2] + 1); + // Calculate the number of cores sharing the current data cache + int smt_width = numCores_[0]; + int logical_cores = numCores_[1]; + int actual_logical_cores = extractBit(data[0], 14, 25) /* # of cores * # of threads */ + 1; + if (logical_cores != 0) { + actual_logical_cores = local::min_(actual_logical_cores, logical_cores); + } + coresSharignDataCache_[cache_index] = local::max_(actual_logical_cores / smt_width, 1); + ++cache_index; + } + return; + } + // intel + const uint32_t NO_CACHE = 0; + const uint32_t DATA_CACHE = 1; +// const uint32_t INSTRUCTION_CACHE = 2; + const uint32_t UNIFIED_CACHE = 3; + uint32_t smt_width = 0; + uint32_t logical_cores = 0; + uint32_t data[4] = {}; if (x2APIC_supported_) { smt_width = numCores_[0]; @@ -201,12 +282,12 @@ class Cpu { */ for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) { getCpuidEx(0x4, i, data); - unsigned int cacheType = extractBit(data[0], 0, 4); + uint32_t cacheType = extractBit(data[0], 0, 4); if (cacheType == NO_CACHE) break; if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) { - unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1; + uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1; if (logical_cores != 0) { // true only if leaf 0xB is supported and valid - actual_logical_cores = (std::min)(actual_logical_cores, logical_cores); + actual_logical_cores = local::min_(actual_logical_cores, logical_cores); } assert(actual_logical_cores != 0); dataCacheSize_[dataCacheLevels_] = @@ -216,7 +297,7 @@ class Cpu { * (data[2] + 1); if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; assert(smt_width != 0); - coresSharignDataCache_[dataCacheLevels_] = (std::max)(actual_logical_cores / smt_width, 1u); + coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u); dataCacheLevels_++; } } @@ -231,7 +312,7 @@ public: int displayFamily; // family + extFamily int displayModel; // model + extModel - unsigned int getNumCores(IntelCpuTopologyLevel level) const { + uint32_t getNumCores(IntelCpuTopologyLevel level) const { if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0) switch (level) { case SmtLevel: return numCores_[level - 1]; @@ -240,13 +321,13 @@ public: } } - unsigned int getDataCacheLevels() const { return dataCacheLevels_; } - unsigned int getCoresSharingDataCache(unsigned int i) const + uint32_t getDataCacheLevels() const { return dataCacheLevels_; } + uint32_t getCoresSharingDataCache(uint32_t i) const { if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0) return coresSharignDataCache_[i]; } - unsigned int getDataCacheSize(unsigned int i) const + uint32_t getDataCacheSize(uint32_t i) const { if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0) return dataCacheSize_[i]; @@ -255,10 +336,10 @@ public: /* data[] = { eax, ebx, ecx, edx } */ - static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) + static inline void getCpuid(uint32_t eaxIn, uint32_t data[4]) { #ifdef XBYAK_INTEL_CPU_SPECIFIC - #ifdef _MSC_VER + #ifdef _WIN32 __cpuid(reinterpret_cast(data), eaxIn); #else __cpuid(eaxIn, data[0], data[1], data[2], data[3]); @@ -268,10 +349,10 @@ public: (void)data; #endif } - static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4]) + static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4]) { #ifdef XBYAK_INTEL_CPU_SPECIFIC - #ifdef _MSC_VER + #ifdef _WIN32 __cpuidex(reinterpret_cast(data), eaxIn, ecxIn); #else __cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]); @@ -288,7 +369,7 @@ public: #ifdef _MSC_VER return _xgetbv(0); #else - unsigned int eax, edx; + uint32_t eax, edx; // xgetvb is not support on gcc 4.2 // __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); @@ -298,93 +379,116 @@ public: return 0; #endif } - typedef uint64_t Type; - static const Type NONE = 0; - static const Type tMMX = 1 << 0; - static const Type tMMX2 = 1 << 1; - static const Type tCMOV = 1 << 2; - static const Type tSSE = 1 << 3; - static const Type tSSE2 = 1 << 4; - static const Type tSSE3 = 1 << 5; - static const Type tSSSE3 = 1 << 6; - static const Type tSSE41 = 1 << 7; - static const Type tSSE42 = 1 << 8; - static const Type tPOPCNT = 1 << 9; - static const Type tAESNI = 1 << 10; - static const Type tOSXSAVE = 1 << 12; - static const Type tPCLMULQDQ = 1 << 13; - static const Type tAVX = 1 << 14; - static const Type tFMA = 1 << 15; +#define XBYAK_SPLIT_ID(id) ((0 <= id && id < 64) ? (1ull << (id % 64)) : 0), (id >= 64 ? (1ull << (id % 64)) : 0) +#if (__cplusplus >= 201103) || (defined(_MSC_VER) && (_MSC_VER >= 1700)) /* VS2012 */ + #define XBYAK_DEFINE_TYPE(id, NAME) static const constexpr local::TypeT NAME{} +#else + #define XBYAK_DEFINE_TYPE(id, NAME) static const local::TypeT NAME +#endif + XBYAK_DEFINE_TYPE(0, tMMX); + XBYAK_DEFINE_TYPE(1, tMMX2); + XBYAK_DEFINE_TYPE(2, tCMOV); + XBYAK_DEFINE_TYPE(3, tSSE); + XBYAK_DEFINE_TYPE(4, tSSE2); + XBYAK_DEFINE_TYPE(5, tSSE3); + XBYAK_DEFINE_TYPE(6, tSSSE3); + XBYAK_DEFINE_TYPE(7, tSSE41); + XBYAK_DEFINE_TYPE(8, tSSE42); + XBYAK_DEFINE_TYPE(9, tPOPCNT); + XBYAK_DEFINE_TYPE(10, tAESNI); + XBYAK_DEFINE_TYPE(11, tAVX512_FP16); + XBYAK_DEFINE_TYPE(12, tOSXSAVE); + XBYAK_DEFINE_TYPE(13, tPCLMULQDQ); + XBYAK_DEFINE_TYPE(14, tAVX); + XBYAK_DEFINE_TYPE(15, tFMA); + XBYAK_DEFINE_TYPE(16, t3DN); + XBYAK_DEFINE_TYPE(17, tE3DN); + XBYAK_DEFINE_TYPE(18, tWAITPKG); + XBYAK_DEFINE_TYPE(19, tRDTSCP); + XBYAK_DEFINE_TYPE(20, tAVX2); + XBYAK_DEFINE_TYPE(21, tBMI1); // andn, bextr, blsi, blsmsk, blsr, tzcnt + XBYAK_DEFINE_TYPE(22, tBMI2); // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx + XBYAK_DEFINE_TYPE(23, tLZCNT); + XBYAK_DEFINE_TYPE(24, tINTEL); + XBYAK_DEFINE_TYPE(25, tAMD); + XBYAK_DEFINE_TYPE(26, tENHANCED_REP); // enhanced rep movsb/stosb + XBYAK_DEFINE_TYPE(27, tRDRAND); + XBYAK_DEFINE_TYPE(28, tADX); // adcx, adox + XBYAK_DEFINE_TYPE(29, tRDSEED); // rdseed + XBYAK_DEFINE_TYPE(30, tSMAP); // stac + XBYAK_DEFINE_TYPE(31, tHLE); // xacquire, xrelease, xtest + XBYAK_DEFINE_TYPE(32, tRTM); // xbegin, xend, xabort + XBYAK_DEFINE_TYPE(33, tF16C); // vcvtph2ps, vcvtps2ph + XBYAK_DEFINE_TYPE(34, tMOVBE); // mobve + XBYAK_DEFINE_TYPE(35, tAVX512F); + XBYAK_DEFINE_TYPE(36, tAVX512DQ); + XBYAK_DEFINE_TYPE(37, tAVX512_IFMA); + XBYAK_DEFINE_TYPE(37, tAVX512IFMA);// = tAVX512_IFMA; + XBYAK_DEFINE_TYPE(38, tAVX512PF); + XBYAK_DEFINE_TYPE(39, tAVX512ER); + XBYAK_DEFINE_TYPE(40, tAVX512CD); + XBYAK_DEFINE_TYPE(41, tAVX512BW); + XBYAK_DEFINE_TYPE(42, tAVX512VL); + XBYAK_DEFINE_TYPE(43, tAVX512_VBMI); + XBYAK_DEFINE_TYPE(43, tAVX512VBMI); // = tAVX512_VBMI; // changed by Intel's manual + XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW); + XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS); + XBYAK_DEFINE_TYPE(46, tPREFETCHWT1); + XBYAK_DEFINE_TYPE(47, tPREFETCHW); + XBYAK_DEFINE_TYPE(48, tSHA); + XBYAK_DEFINE_TYPE(49, tMPX); + XBYAK_DEFINE_TYPE(50, tAVX512_VBMI2); + XBYAK_DEFINE_TYPE(51, tGFNI); + XBYAK_DEFINE_TYPE(52, tVAES); + XBYAK_DEFINE_TYPE(53, tVPCLMULQDQ); + XBYAK_DEFINE_TYPE(54, tAVX512_VNNI); + XBYAK_DEFINE_TYPE(55, tAVX512_BITALG); + XBYAK_DEFINE_TYPE(56, tAVX512_VPOPCNTDQ); + XBYAK_DEFINE_TYPE(57, tAVX512_BF16); + XBYAK_DEFINE_TYPE(58, tAVX512_VP2INTERSECT); + XBYAK_DEFINE_TYPE(59, tAMX_TILE); + XBYAK_DEFINE_TYPE(60, tAMX_INT8); + XBYAK_DEFINE_TYPE(61, tAMX_BF16); + XBYAK_DEFINE_TYPE(62, tAVX_VNNI); + XBYAK_DEFINE_TYPE(63, tCLFLUSHOPT); + XBYAK_DEFINE_TYPE(64, tCLDEMOTE); + XBYAK_DEFINE_TYPE(65, tMOVDIRI); + XBYAK_DEFINE_TYPE(66, tMOVDIR64B); + XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen + XBYAK_DEFINE_TYPE(68, tAMX_FP16); + XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8); + XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT); + XBYAK_DEFINE_TYPE(71, tAVX_IFMA); + XBYAK_DEFINE_TYPE(72, tRAO_INT); + XBYAK_DEFINE_TYPE(73, tCMPCCXADD); + XBYAK_DEFINE_TYPE(74, tPREFETCHITI); + XBYAK_DEFINE_TYPE(75, tSERIALIZE); + XBYAK_DEFINE_TYPE(76, tUINTR); + XBYAK_DEFINE_TYPE(77, tXSAVE); + XBYAK_DEFINE_TYPE(78, tSHA512); + XBYAK_DEFINE_TYPE(79, tSM3); + XBYAK_DEFINE_TYPE(80, tSM4); + XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16); - static const Type t3DN = 1 << 16; - static const Type tE3DN = 1 << 17; - static const Type tRDTSCP = 1 << 19; - static const Type tAVX2 = 1 << 20; - static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt - static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx - static const Type tLZCNT = 1 << 23; - - static const Type tINTEL = 1 << 24; - static const Type tAMD = 1 << 25; - - static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb - static const Type tRDRAND = 1 << 27; - static const Type tADX = 1 << 28; // adcx, adox - static const Type tRDSEED = 1 << 29; // rdseed - static const Type tSMAP = 1 << 30; // stac - static const Type tHLE = uint64_t(1) << 31; // xacquire, xrelease, xtest - static const Type tRTM = uint64_t(1) << 32; // xbegin, xend, xabort - static const Type tF16C = uint64_t(1) << 33; // vcvtph2ps, vcvtps2ph - static const Type tMOVBE = uint64_t(1) << 34; // mobve - static const Type tAVX512F = uint64_t(1) << 35; - static const Type tAVX512DQ = uint64_t(1) << 36; - static const Type tAVX512_IFMA = uint64_t(1) << 37; - static const Type tAVX512IFMA = tAVX512_IFMA; - static const Type tAVX512PF = uint64_t(1) << 38; - static const Type tAVX512ER = uint64_t(1) << 39; - static const Type tAVX512CD = uint64_t(1) << 40; - static const Type tAVX512BW = uint64_t(1) << 41; - static const Type tAVX512VL = uint64_t(1) << 42; - static const Type tAVX512_VBMI = uint64_t(1) << 43; - static const Type tAVX512VBMI = tAVX512_VBMI; // changed by Intel's manual - static const Type tAVX512_4VNNIW = uint64_t(1) << 44; - static const Type tAVX512_4FMAPS = uint64_t(1) << 45; - static const Type tPREFETCHWT1 = uint64_t(1) << 46; - static const Type tPREFETCHW = uint64_t(1) << 47; - static const Type tSHA = uint64_t(1) << 48; - static const Type tMPX = uint64_t(1) << 49; - static const Type tAVX512_VBMI2 = uint64_t(1) << 50; - static const Type tGFNI = uint64_t(1) << 51; - static const Type tVAES = uint64_t(1) << 52; - static const Type tVPCLMULQDQ = uint64_t(1) << 53; - static const Type tAVX512_VNNI = uint64_t(1) << 54; - static const Type tAVX512_BITALG = uint64_t(1) << 55; - static const Type tAVX512_VPOPCNTDQ = uint64_t(1) << 56; - static const Type tAVX512_BF16 = uint64_t(1) << 57; - static const Type tAVX512_VP2INTERSECT = uint64_t(1) << 58; - static const Type tAMX_TILE = uint64_t(1) << 59; - static const Type tAMX_INT8 = uint64_t(1) << 60; - static const Type tAMX_BF16 = uint64_t(1) << 61; - static const Type tAVX_VNNI = uint64_t(1) << 62; - static const Type tAVX512_FP16 = uint64_t(1) << 11; - // 18, 63 +#undef XBYAK_SPLIT_ID +#undef XBYAK_DEFINE_TYPE Cpu() - : type_(NONE) + : type_() , x2APIC_supported_(false) , numCores_() , dataCacheSize_() , coresSharignDataCache_() , dataCacheLevels_(0) { - unsigned int data[4] = {}; - const unsigned int& EAX = data[0]; - const unsigned int& EBX = data[1]; - const unsigned int& ECX = data[2]; - const unsigned int& EDX = data[3]; + uint32_t data[4] = {}; + const uint32_t& EAX = data[0]; + const uint32_t& EBX = data[1]; + const uint32_t& ECX = data[2]; + const uint32_t& EDX = data[3]; getCpuid(0, data); - const unsigned int maxNum = EAX; + const uint32_t maxNum = EAX; static const char intel[] = "ntel"; static const char amd[] = "cAMD"; if (ECX == get32bitAsBE(amd)) { @@ -407,7 +511,8 @@ public: // Extended flags information getCpuid(0x80000000, data); - if (EAX >= 0x80000001) { + const uint32_t maxExtendedNum = EAX; + if (maxExtendedNum >= 0x80000001) { getCpuid(0x80000001, data); if (EDX & (1U << 31)) type_ |= t3DN; @@ -419,15 +524,21 @@ public: if (ECX & (1U << 8)) type_ |= tPREFETCHW; } + if (maxExtendedNum >= 0x80000008) { + getCpuid(0x80000008, data); + if (EBX & (1U << 0)) type_ |= tCLZERO; + } + getCpuid(1, data); if (ECX & (1U << 0)) type_ |= tSSE3; + if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; if (ECX & (1U << 9)) type_ |= tSSSE3; if (ECX & (1U << 19)) type_ |= tSSE41; if (ECX & (1U << 20)) type_ |= tSSE42; if (ECX & (1U << 22)) type_ |= tMOVBE; if (ECX & (1U << 23)) type_ |= tPOPCNT; if (ECX & (1U << 25)) type_ |= tAESNI; - if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; + if (ECX & (1U << 26)) type_ |= tXSAVE; if (ECX & (1U << 27)) type_ |= tOSXSAVE; if (ECX & (1U << 30)) type_ |= tRDRAND; if (ECX & (1U << 29)) type_ |= tF16C; @@ -460,9 +571,6 @@ public: if (EBX & (1U << 31)) type_ |= tAVX512VL; if (ECX & (1U << 1)) type_ |= tAVX512_VBMI; if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2; - if (ECX & (1U << 8)) type_ |= tGFNI; - if (ECX & (1U << 9)) type_ |= tVAES; - if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; if (ECX & (1U << 11)) type_ |= tAVX512_VNNI; if (ECX & (1U << 12)) type_ |= tAVX512_BITALG; if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; @@ -484,20 +592,41 @@ public: if (EBX & (1U << 18)) type_ |= tRDSEED; if (EBX & (1U << 19)) type_ |= tADX; if (EBX & (1U << 20)) type_ |= tSMAP; + if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT; if (EBX & (1U << 4)) type_ |= tHLE; if (EBX & (1U << 11)) type_ |= tRTM; if (EBX & (1U << 14)) type_ |= tMPX; if (EBX & (1U << 29)) type_ |= tSHA; if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; + if (ECX & (1U << 5)) type_ |= tWAITPKG; + if (ECX & (1U << 8)) type_ |= tGFNI; + if (ECX & (1U << 9)) type_ |= tVAES; + if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; + if (ECX & (1U << 25)) type_ |= tCLDEMOTE; + if (ECX & (1U << 27)) type_ |= tMOVDIRI; + if (ECX & (1U << 28)) type_ |= tMOVDIR64B; + if (EDX & (1U << 5)) type_ |= tUINTR; + if (EDX & (1U << 14)) type_ |= tSERIALIZE; + if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (EDX & (1U << 24)) type_ |= tAMX_TILE; if (EDX & (1U << 25)) type_ |= tAMX_INT8; - if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (maxNumSubLeaves >= 1) { getCpuidEx(7, 1, data); + if (EAX & (1U << 0)) type_ |= tSHA512; + if (EAX & (1U << 1)) type_ |= tSM3; + if (EAX & (1U << 2)) type_ |= tSM4; + if (EAX & (1U << 3)) type_ |= tRAO_INT; if (EAX & (1U << 4)) type_ |= tAVX_VNNI; if (type_ & tAVX512F) { if (EAX & (1U << 5)) type_ |= tAVX512_BF16; } + if (EAX & (1U << 7)) type_ |= tCMPCCXADD; + if (EAX & (1U << 21)) type_ |= tAMX_FP16; + if (EAX & (1U << 23)) type_ |= tAVX_IFMA; + if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; + if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; + if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16; + if (EDX & (1U << 14)) type_ |= tPREFETCHITI; } } setFamily(); @@ -512,9 +641,9 @@ public: printf("display:family=%X, model=%X\n", displayFamily, displayModel); #endif } - bool has(Type type) const + bool has(const Type& type) const { - return (type & type_) != 0; + return (type & type_) == type; } }; @@ -527,7 +656,7 @@ public: #ifdef _MSC_VER return __rdtsc(); #else - unsigned int eax, edx; + uint32_t eax, edx; __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx)); return ((uint64_t)edx << 32) | eax; #endif @@ -564,7 +693,7 @@ const int UseRDX = 1 << 7; class Pack { static const size_t maxTblNum = 15; - const Xbyak::Reg64 *tbl_[maxTblNum]; + Xbyak::Reg64 tbl_[maxTblNum]; size_t n_; public: Pack() : tbl_(), n_(0) {} @@ -581,32 +710,36 @@ public: return *this; } Pack(const Xbyak::Reg64& t0) - { n_ = 1; tbl_[0] = &t0; } + { n_ = 1; tbl_[0] = t0; } Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; } + { n_ = 2; tbl_[0] = t0; tbl_[1] = t1; } Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; } + { n_ = 3; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; } Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; } + { n_ = 4; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; } Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; } + { n_ = 5; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; } Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; } + { n_ = 6; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; } Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; } + { n_ = 7; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; } Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; } + { n_ = 8; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; } Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; } + { n_ = 9; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; } Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) - { n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; } + { n_ = 10; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; } + Pack(const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) + { n_ = 11; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; } + Pack(const Xbyak::Reg64& tb, const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0) + { n_ = 12; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; tbl_[11] = tb; } Pack& append(const Xbyak::Reg64& t) { if (n_ == maxTblNum) { fprintf(stderr, "ERR Pack::can't append\n"); XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this) } - tbl_[n_++] = &t; + tbl_[n_++] = t; return *this; } void init(const Xbyak::Reg64 *tbl, size_t n) @@ -617,7 +750,7 @@ public: } n_ = n; for (size_t i = 0; i < n; i++) { - tbl_[i] = &tbl[i]; + tbl_[i] = tbl[i]; } } const Xbyak::Reg64& operator[](size_t n) const @@ -626,7 +759,7 @@ public: fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_); XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax) } - return *tbl_[n]; + return tbl_[n]; } size_t size() const { return n_; } /* @@ -649,7 +782,7 @@ public: void put() const { for (size_t i = 0; i < n_; i++) { - printf("%s ", tbl_[i]->toString()); + printf("%s ", tbl_[i].toString()); } printf("\n"); } @@ -716,7 +849,7 @@ public: const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0); if (tNum_ < 0 || allRegNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM) const Reg64& _rsp = code->rsp; - saveNum_ = (std::max)(0, allRegNum - noSaveNum); + saveNum_ = local::max_(0, allRegNum - noSaveNum); const int *tbl = getOrderTbl() + noSaveNum; for (int i = 0; i < saveNum_; i++) { code->push(Reg64(tbl[i]));