diff --git a/3rdparty/xbyak/xbyak/xbyak.h b/3rdparty/xbyak/xbyak/xbyak.h index 5cf18086ab..81f08d2258 100644 --- a/3rdparty/xbyak/xbyak/xbyak.h +++ b/3rdparty/xbyak/xbyak/xbyak.h @@ -123,8 +123,10 @@ #define XBYAK_TLS thread_local #define XBYAK_VARIADIC_TEMPLATE #define XBYAK_NOEXCEPT noexcept + #define XBYAK_OVERRIDE override #else #define XBYAK_NOEXCEPT throw() + #define XBYAK_OVERRIDE #endif // require c++14 or later @@ -161,7 +163,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7270 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7300 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -340,7 +342,7 @@ public: } } operator int() const { return err_; } - const char *what() const XBYAK_NOEXCEPT + const char *what() const XBYAK_NOEXCEPT XBYAK_OVERRIDE { return ConvertErrorToString(err_); } @@ -384,11 +386,6 @@ inline void AlignedFree(void *p) #endif } -template -inline const To CastTo(From p) XBYAK_NOEXCEPT -{ - return (const To)(size_t)(p); -} namespace inner { #ifdef _WIN32 @@ -434,6 +431,14 @@ enum LabelMode { LaddTop // (addr + top) for mov(reg, label) with AutoGrow }; +enum AddressMode { + M_none, + M_ModRM, + M_64bitDisp, + M_rip, + M_ripAddr +}; + } // inner /* @@ -487,7 +492,7 @@ class MmapAllocator : public Allocator { AllocationList allocList_; public: explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {} - uint8_t *alloc(size_t size) + uint8_t *alloc(size_t size) XBYAK_OVERRIDE { const size_t alignedSizeM1 = inner::getPageSize() - 1; size = (size + alignedSizeM1) & ~alignedSizeM1; @@ -526,7 +531,7 @@ public: #endif return (uint8_t*)p; } - void free(uint8_t *p) + void free(uint8_t *p) XBYAK_OVERRIDE { if (p == 0) return; AllocationList::iterator i = allocList_.find((uintptr_t)p); @@ -903,30 +908,6 @@ struct Reg64 : public Reg32e { explicit XBYAK_CONSTEXPR Reg64(int idx = 0) : Reg32e(idx, 64) {} }; struct RegRip { - int64_t disp_; - const Label* label_; - bool isAddr_; - explicit XBYAK_CONSTEXPR 
RegRip(int64_t disp = 0, const Label* label = 0, bool isAddr = false) : disp_(disp), label_(label), isAddr_(isAddr) {} - friend const RegRip operator+(const RegRip& r, int disp) { - return RegRip(r.disp_ + disp, r.label_, r.isAddr_); - } - friend const RegRip operator-(const RegRip& r, int disp) { - return RegRip(r.disp_ - disp, r.label_, r.isAddr_); - } - friend const RegRip operator+(const RegRip& r, int64_t disp) { - return RegRip(r.disp_ + disp, r.label_, r.isAddr_); - } - friend const RegRip operator-(const RegRip& r, int64_t disp) { - return RegRip(r.disp_ - disp, r.label_, r.isAddr_); - } - friend const RegRip operator+(const RegRip& r, const Label& label) { - if (r.label_ || r.isAddr_) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegRip()); - return RegRip(r.disp_, &label); - } - friend const RegRip operator+(const RegRip& r, const void *addr) { - if (r.label_ || r.isAddr_) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegRip()); - return RegRip(r.disp_ + (int64_t)addr, 0, true); - } }; #endif @@ -987,17 +968,30 @@ public: }; #endif +/* + pattern + [base]? [+index[*scale]]? [+/-disp]* [+label]? + rip [+/-disp]* [+label]? 
+ rip+disp if backward reference then use label.getAddress() + rip+label if forward reference + [&var]?[+/-disp]* +*/ class RegExp { + friend class Address; public: #ifdef XBYAK64 enum { i32e = 32 | 64 }; #else enum { i32e = 32 }; #endif - XBYAK_CONSTEXPR RegExp(size_t disp = 0) : scale_(0), disp_(disp) { } + XBYAK_CONSTEXPR RegExp() : scale_(0), disp_(0), label_(0), rip_(false), setLabel_(false) { } + XBYAK_CONSTEXPR RegExp(size_t disp) : scale_(0), disp_(disp), label_(0), rip_(false), setLabel_(false) { } XBYAK_CONSTEXPR RegExp(const Reg& r, int scale = 1) : scale_(scale) , disp_(0) + , label_(0) + , rip_(false) + , setLabel_(false) { if (!r.isREG(i32e) && !r.is(Reg::XMM|Reg::YMM|Reg::ZMM|Reg::TMM)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) if (scale == 0) return; @@ -1008,6 +1002,26 @@ public: base_ = r; } } + RegExp(Label& label); + + RegExp(const void *addr) + : scale_(1) + , disp_(size_t(addr)) + , label_(0) + , rip_(false) + , setLabel_(true) + { + } +#ifdef XBYAK64 + RegExp(const RegRip& /*rip*/) + : scale_(0) + , disp_(0) + , label_(0) + , rip_(true) + , setLabel_(false) + { + } +#endif bool isVsib(int bit = 128 | 256 | 512) const { return index_.isBit(bit); } RegExp optimize() const { @@ -1025,6 +1039,8 @@ public: } const Reg& getBase() const { return base_; } const Reg& getIndex() const { return index_; } + const Label *getLabel() const { return label_; } + bool isOnlyDisp() const { return !base_.getBit() && !index_.getBit(); } // for mov eax int getScale() const { return scale_; } size_t getDisp() const { return disp_; } XBYAK_CONSTEXPR void verify() const @@ -1045,13 +1061,22 @@ private: Reg base_; Reg index_; int scale_; - size_t disp_; + size_t disp_; // absolute address + Label *label_; + bool rip_; + bool setLabel_; // disp_ contains the address of label }; inline RegExp operator+(const RegExp& a, const RegExp& b) { if (a.index_.getBit() && b.index_.getBit()) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegExp()) + if (a.label_ && b.label_) 
XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegExp()) + if (b.rip_) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegExp()) + if (a.rip_ && !b.isOnlyDisp()) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegExp()) + if (a.setLabel_ && b.setLabel_) XBYAK_THROW_RET(ERR_BAD_ADDRESSING, RegExp()) RegExp ret = a; + if (ret.label_ == 0) ret.label_ = b.label_; + if (ret.setLabel_ == 0) ret.setLabel_ = b.setLabel_; if (!ret.index_.getBit()) { ret.index_ = b.index_; ret.scale_ = b.scale_; } if (b.base_.getBit()) { if (ret.base_.getBit()) { @@ -1076,6 +1101,9 @@ inline RegExp operator*(int scale, const Reg& r) { return r * scale; } +// backward compatibility for eax+0 +inline RegExp operator+(const RegExp& a, size_t b) { return a + RegExp(b); } + inline RegExp operator-(const RegExp& e, size_t disp) { RegExp ret = e; @@ -1323,33 +1351,34 @@ public: class Address : public Operand { public: - enum Mode { - M_ModRM, - M_64bitDisp, - M_rip, - M_ripAddr - }; XBYAK_CONSTEXPR Address(uint32_t sizeBit, bool broadcast, const RegExp& e) - : Operand(0, MEM, sizeBit), e_(e), label_(0), mode_(M_ModRM), immSize(0), disp8N(0), permitVsib(false), broadcast_(broadcast), optimize_(true) + : Operand(0, MEM, sizeBit), e_(e), label_(e.label_), mode_(), immSize(0), disp8N(0), permitVsib(false), broadcast_(broadcast), optimize_(true) { + if (e.rip_) { + mode_ = (e.label_ || e.setLabel_) ? 
inner::M_ripAddr : inner::M_rip; + } else { +#ifdef XBYAK64 + uint64_t disp = e.getDisp(); + if (e.isOnlyDisp() && ((0x80000000 <= disp && disp <= 0xffffffff80000000) || e.getLabel())) { + mode_ = inner::M_64bitDisp; + } else +#endif + { + mode_ = inner::M_ModRM; + } + } e_.verify(); } -#ifdef XBYAK64 - explicit XBYAK_CONSTEXPR Address(size_t disp) - : Operand(0, MEM, 64), e_(disp), label_(0), mode_(M_64bitDisp), immSize(0), disp8N(0), permitVsib(false), broadcast_(false), optimize_(true) { } - XBYAK_CONSTEXPR Address(uint32_t sizeBit, bool broadcast, const RegRip& addr) - : Operand(0, MEM, sizeBit), e_(addr.disp_), label_(addr.label_), mode_(addr.isAddr_ ? M_ripAddr : M_rip), immSize(0), disp8N(0), permitVsib(false), broadcast_(broadcast), optimize_(true) { } -#endif RegExp getRegExp() const { return optimize_ ? e_.optimize() : e_; } Address cloneNoOptimize() const { Address addr = *this; addr.optimize_ = false; return addr; } - Mode getMode() const { return mode_; } + inner::AddressMode getMode() const { return mode_; } bool is32bit() const { return e_.getBase().getBit() == 32 || e_.getIndex().getBit() == 32; } - bool isOnlyDisp() const { return !e_.getBase().getBit() && !e_.getIndex().getBit(); } // for mov eax + bool isOnlyDisp() const { return e_.isOnlyDisp(); } size_t getDisp() const { return e_.getDisp(); } - bool is64bitDisp() const { return mode_ == M_64bitDisp; } // for moffset + bool is64bitDisp() const { return mode_ == inner::M_64bitDisp; } // for moffset bool isBroadcast() const { return broadcast_; } bool hasRex2() const { return e_.getBase().hasRex2() || e_.getIndex().hasRex2(); } const Label* getLabel() const { return label_; } @@ -1362,7 +1391,7 @@ public: private: RegExp e_; const Label* label_; - Mode mode_; + inner::AddressMode mode_; public: int immSize; // the size of immediate value of nmemonics (0, 1, 2, 4) int disp8N; // 0(normal), 1(force disp32), disp8N = {2, 4, 8} @@ -1406,21 +1435,13 @@ public: { return Address(bit_, broadcast_, e); } 
- Address operator[](const void *disp) const - { - return Address(bit_, broadcast_, RegExp(reinterpret_cast(disp))); - } -#ifdef XBYAK64 - Address operator[](uint64_t disp) const { return Address(disp); } - Address operator[](const RegRip& addr) const { return Address(bit_, broadcast_, addr); } -#endif }; struct JmpLabel { size_t endOfJmp; /* offset from top to the end address of jmp */ int jmpSize; inner::LabelMode mode; - size_t disp; // disp for [rip + disp] + size_t disp; // disp for [rip + disp] or [forward ref label + disp] explicit JmpLabel(size_t endOfJmp = 0, int jmpSize = 0, inner::LabelMode mode = inner::LasIs, size_t disp = 0) : endOfJmp(endOfJmp), jmpSize(jmpSize), mode(mode), disp(disp) { @@ -1440,6 +1461,7 @@ public: ~Label(); void clear() { mgr = 0; id = 0; } int getId() const { return id; } + bool isDefined() const; const uint8_t *getAddress() const; // backward compatibility @@ -1456,6 +1478,22 @@ public: } }; +inline RegExp::RegExp(Label& label) + : scale_(1) + , disp_(0) + , label_(0) + , rip_(false) + , setLabel_(true) +{ + const uint8_t *addr = label.getAddress(); + if (addr) { + disp_ = size_t(addr); + label_ = 0; + } else { + label_ = &label; + } +} + class LabelManager { // for string label struct SlabelVal { @@ -1517,6 +1555,9 @@ class LabelManager { #endif if (jmp->jmpSize == 1 && !inner::IsInDisp8((uint32_t)disp)) XBYAK_THROW(ERR_LABEL_IS_TOO_FAR) } + if (jmp->mode != inner::LasIs) { + disp += jmp->disp; + } if (base_->isAutoGrow()) { base_->save(offset, disp, jmp->jmpSize, jmp->mode); } else { @@ -1673,8 +1714,13 @@ public: bool hasUndefClabel() const { return hasUndefinedLabel_inner(clabelUndefList_); } const uint8_t *getCode() const { return base_->getCode(); } bool isReady() const { return !base_->isAutoGrow() || base_->isCalledCalcJmpAddress(); } + bool isDefined(const Label& label) const { return clabelDefList_.find(label.id) != clabelDefList_.end(); } }; +inline bool Label::isDefined() const +{ + return mgr && 
mgr->isDefined(*this); +} inline Label::Label(const Label& rhs) { id = rhs.id; @@ -2010,8 +2056,11 @@ private: { db(static_cast((mod << 6) | ((r1 & 7) << 3) | (r2 & 7))); } - void setSIB(const RegExp& e, int reg, int disp8N = 0) + void setSIB(const Address& addr, int reg) { + const RegExp& e = addr.getRegExp(); + const Label *label = e.getLabel(); + int disp8N = addr.disp8N; uint64_t disp64 = e.getDisp(); #if defined(XBYAK64) && !defined(__ILP32__) #ifdef XBYAK_OLD_DISP_CHECK @@ -2034,8 +2083,10 @@ private: mod00 = 0, mod01 = 1, mod10 = 2 }; int mod = mod10; // disp32 - if (!baseBit || ((baseIdx & 7) != Operand::EBP && disp == 0)) { + if (!baseBit || ((baseIdx & 7) != Operand::EBP && (label == 0 && disp == 0))) { mod = mod00; + } else if (label) { + // always disp32 } else { if (disp8N == 0) { if (inner::IsInDisp8(disp)) { @@ -2069,7 +2120,11 @@ private: if (mod == mod01) { db(disp); } else if (mod == mod10 || (mod == mod00 && !baseBit)) { - dd(disp); + if (label) { + putL_inner(*label, false, e.getDisp() - addr.immSize, 4); + } else { + dd(disp); + } } } LabelManager labelMgr_; @@ -2119,7 +2174,7 @@ private: // for only MPX(bnd*) void opMIB(const Address& addr, const Reg& reg, uint64_t type, int code) { - if (addr.getMode() != Address::M_ModRM) XBYAK_THROW(ERR_INVALID_MIB_ADDRESS) + if (addr.getMode() != inner::M_ModRM) XBYAK_THROW(ERR_INVALID_MIB_ADDRESS) opMR(addr.cloneNoOptimize(), reg, type, code); } void makeJmp(uint32_t disp, LabelType type, uint8_t shortCode, uint8_t longCode, uint8_t longPref) @@ -2188,15 +2243,15 @@ private: void opAddr(const Address &addr, int reg) { if (!addr.permitVsib && addr.isVsib()) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) - if (addr.getMode() == Address::M_ModRM) { - setSIB(addr.getRegExp(), reg, addr.disp8N); - } else if (addr.getMode() == Address::M_rip || addr.getMode() == Address::M_ripAddr) { + if (addr.getMode() == inner::M_ModRM) { + setSIB(addr, reg); + } else if (addr.getMode() == inner::M_rip || addr.getMode() == 
inner::M_ripAddr) { setModRM(0, reg, 5); if (addr.getLabel()) { // [rip + Label] - putL_inner(*addr.getLabel(), true, addr.getDisp() - addr.immSize); + putL_inner(*addr.getLabel(), true, addr.getDisp() - addr.immSize, 4); } else { size_t disp = addr.getDisp(); - if (addr.getMode() == Address::M_ripAddr) { + if (addr.getMode() == inner::M_ripAddr) { if (isAutoGrow()) XBYAK_THROW(ERR_INVALID_RIP_IN_AUTO_GROW) disp -= (size_t)getCurr() + 4 + addr.immSize; } @@ -2448,9 +2503,9 @@ private: return bit / 8; } template - void putL_inner(T& label, bool relative = false, size_t disp = 0) + void putL_inner(T& label, bool relative = false, size_t disp = 0, int jmpSize = (int)sizeof(size_t)) { - const int jmpSize = relative ? 4 : (int)sizeof(size_t); + if (relative) jmpSize = 4; if (isAutoGrow() && size_ + 16 >= maxSize_) growMemory(); size_t offset = 0; if (labelMgr_.getOffset(&offset, label)) { @@ -3028,7 +3083,11 @@ public: if (code) { rex(*reg); db(op1.isREG(8) ? 0xA0 : op1.isREG() ? 0xA1 : op2.isREG(8) ? 0xA2 : 0xA3); - db(addr->getDisp(), 8); + if (addr->getLabel()) { + putL_inner(*addr->getLabel(), false, addr->getDisp() - addr->immSize, 8); + } else { + db(addr->getDisp(), 8); + } } else { XBYAK_THROW(ERR_BAD_COMBINATION) } @@ -3037,7 +3096,11 @@ public: if (code && addr->isOnlyDisp()) { rex(*reg, *addr); db(code | (reg->isBit(8) ? 
0 : 1)); - dd(static_cast(addr->getDisp())); + if (addr->getLabel()) { + putL_inner(*addr->getLabel(), false, addr->getDisp() - addr->immSize); + } else { + dd(static_cast(addr->getDisp())); + } } else #endif { diff --git a/3rdparty/xbyak/xbyak/xbyak_mnemonic.h b/3rdparty/xbyak/xbyak/xbyak_mnemonic.h index f2b161e32b..5f75098f97 100644 --- a/3rdparty/xbyak/xbyak/xbyak_mnemonic.h +++ b/3rdparty/xbyak/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.27"; } +const char *getVersionString() const { return "7.30"; } void aadd(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e &reg) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } @@ -1878,6 +1878,7 @@ void cmpxchg16b(const Address& addr) { opMR(addr, Reg64(1), T_0F, 0xC7); } void fxrstor64(const Address& addr) { opMR(addr, Reg64(1), T_0F, 0xAE); } void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opSSE(mmx, reg, T_0F, 0x7E); } void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opSSE(mmx, reg, T_0F, 0x6E); } +void movrs(const Reg& reg, const Address& addr) { opMR(addr, reg, T_0F38, reg.isBit(8) ? 
0x8A : 0x8B); } void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opRO(reg, op, T_ALLOW_DIFF_SIZE, 0x63); } void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opSSE(Reg64(xmm.getIdx()), op, T_66 | T_0F3A, 0x16, 0, imm); } void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opSSE(Reg64(xmm.getIdx()), op, T_66 | T_0F3A, 0x22, 0, imm); } @@ -2684,6 +2685,8 @@ void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3 void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_W0|T_SAE_X|T_MUST_EVEX, 0x2E); } #ifdef XBYAK64 void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); } +void tcvtrowd2ps(const Zmm& z, const Tmm& t, const Reg32& r) { opVex(z, &r, t, T_F3|T_0F38|T_W0|T_MUST_EVEX, 0x4A); } +void tcvtrowd2ps(const Zmm& z, const Tmm& t, uint8_t imm) { opVex(z, 0, t, T_F3|T_0F3A|T_W0|T_MUST_EVEX, 0x07, imm); } void tcvtrowps2bf16h(const Zmm& z, const Tmm& t, const Reg32& r) { opVex(z, &r, t, T_F2|T_0F38|T_W0|T_MUST_EVEX, 0x6D); } void tcvtrowps2bf16h(const Zmm& z, const Tmm& t, uint8_t imm) { opVex(z, 0, t, T_F2|T_0F3A|T_W0|T_MUST_EVEX, 0x07, imm); } void tcvtrowps2bf16l(const Zmm& z, const Tmm& t, const Reg32& r) { opVex(z, &r, t, T_F3|T_0F38|T_W0|T_MUST_EVEX, 0x6D); } @@ -2694,6 +2697,10 @@ void tcvtrowps2phl(const Zmm& z, const Tmm& t, const Reg32& r) { opVex(z, &r, t, void tcvtrowps2phl(const Zmm& z, const Tmm& t, uint8_t imm) { opVex(z, 0, t, T_F2|T_0F3A|T_W0|T_MUST_EVEX, 0x77, imm); } void tilemovrow(const Zmm& z, const Tmm& t, const Reg32& r) { opVex(z, &r, t, T_66|T_0F38|T_W0|T_MUST_EVEX, 0x4A); } void tilemovrow(const Zmm& z, const Tmm& t, uint8_t imm) { opVex(z, 0, t, T_66|T_0F3A|T_W0|T_MUST_EVEX, 0x07, imm); } +void vmovrsb(const Xmm& x, const Address& addr) { opVex(x, 0, addr, 
T_F2|T_MAP5|T_W0|T_MUST_EVEX, 0x6F); } +void vmovrsd(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3|T_MAP5|T_W0|T_MUST_EVEX, 0x6F); } +void vmovrsq(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3|T_MAP5|T_EW1|T_MUST_EVEX, 0x6F); } +void vmovrsw(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2|T_MAP5|T_EW1|T_MUST_EVEX, 0x6F); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); } #endif #endif diff --git a/3rdparty/xbyak/xbyak/xbyak_util.h b/3rdparty/xbyak/xbyak/xbyak_util.h index 3f883d0712..3cee6c1395 100644 --- a/3rdparty/xbyak/xbyak/xbyak_util.h +++ b/3rdparty/xbyak/xbyak/xbyak_util.h @@ -114,6 +114,10 @@ inline T min_(T x, T y) { return x < y ? x : y; } CPU detection class @note static inline const member is supported by c++17 or later, so use template hack */ +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable : 4459) +#endif class Cpu { public: class Type { @@ -154,10 +158,10 @@ private: { return (1U << n) - 1; } - // [EBX:ECX:EDX] == s? - bool isEqualStr(uint32_t EBX, uint32_t ECX, uint32_t EDX, const char s[12]) const + // [ebx:ecx:edx] == s? 
+ bool isEqualStr(uint32_t ebx, uint32_t ecx, uint32_t edx, const char s[12]) const { - return get32bitAsBE(&s[0]) == EBX && get32bitAsBE(&s[4]) == EDX && get32bitAsBE(&s[8]) == ECX; + return get32bitAsBE(&s[0]) == ebx && get32bitAsBE(&s[4]) == edx && get32bitAsBE(&s[8]) == ecx; } uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end) const { @@ -567,172 +571,172 @@ public: , avx10version_(0) { uint32_t data[4] = {}; - const uint32_t& EAX = data[0]; - const uint32_t& EBX = data[1]; - const uint32_t& ECX = data[2]; - const uint32_t& EDX = data[3]; + const uint32_t& eax = data[0]; + const uint32_t& ebx = data[1]; + const uint32_t& ecx = data[2]; + const uint32_t& edx = data[3]; getCpuid(0, data); - const uint32_t maxNum = EAX; - if (isEqualStr(EBX, ECX, EDX, "AuthenticAMD")) { + const uint32_t maxNum = eax; + if (isEqualStr(ebx, ecx, edx, "AuthenticAMD")) { type_ |= tAMD; getCpuid(0x80000001, data); - if (EDX & (1U << 31)) { + if (edx & (1U << 31)) { type_ |= t3DN; // 3DNow! implies support for PREFETCHW on AMD type_ |= tPREFETCHW; } - if (EDX & (1U << 29)) { + if (edx & (1U << 29)) { // Long mode implies support for PREFETCHW on AMD type_ |= tPREFETCHW; } - } else if (isEqualStr(EBX, ECX, EDX, "GenuineIntel")) { + } else if (isEqualStr(ebx, ecx, edx, "GenuineIntel")) { type_ |= tINTEL; } // Extended flags information getCpuid(0x80000000, data); - const uint32_t maxExtendedNum = EAX; + const uint32_t maxExtendedNum = eax; if (maxExtendedNum >= 0x80000001) { getCpuid(0x80000001, data); - if (ECX & (1U << 5)) type_ |= tLZCNT; - if (ECX & (1U << 6)) type_ |= tSSE4a; - if (ECX & (1U << 8)) type_ |= tPREFETCHW; - if (EDX & (1U << 15)) type_ |= tCMOV; - if (EDX & (1U << 22)) type_ |= tMMX2; - if (EDX & (1U << 27)) type_ |= tRDTSCP; - if (EDX & (1U << 30)) type_ |= tE3DN; - if (EDX & (1U << 31)) type_ |= t3DN; + if (ecx & (1U << 5)) type_ |= tLZCNT; + if (ecx & (1U << 6)) type_ |= tSSE4a; + if (ecx & (1U << 8)) type_ |= tPREFETCHW; + if (edx & (1U << 15)) type_ |= 
tCMOV; + if (edx & (1U << 22)) type_ |= tMMX2; + if (edx & (1U << 27)) type_ |= tRDTSCP; + if (edx & (1U << 30)) type_ |= tE3DN; + if (edx & (1U << 31)) type_ |= t3DN; } if (maxExtendedNum >= 0x80000008) { getCpuid(0x80000008, data); - if (EBX & (1U << 0)) type_ |= tCLZERO; + if (ebx & (1U << 0)) type_ |= tCLZERO; } getCpuid(1, data); - if (ECX & (1U << 0)) type_ |= tSSE3; - if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; - if (ECX & (1U << 9)) type_ |= tSSSE3; - if (ECX & (1U << 19)) type_ |= tSSE41; - if (ECX & (1U << 20)) type_ |= tSSE42; - if (ECX & (1U << 22)) type_ |= tMOVBE; - if (ECX & (1U << 23)) type_ |= tPOPCNT; - if (ECX & (1U << 25)) type_ |= tAESNI; - if (ECX & (1U << 26)) type_ |= tXSAVE; - if (ECX & (1U << 27)) type_ |= tOSXSAVE; - if (ECX & (1U << 29)) type_ |= tF16C; - if (ECX & (1U << 30)) type_ |= tRDRAND; + if (ecx & (1U << 0)) type_ |= tSSE3; + if (ecx & (1U << 1)) type_ |= tPCLMULQDQ; + if (ecx & (1U << 9)) type_ |= tSSSE3; + if (ecx & (1U << 19)) type_ |= tSSE41; + if (ecx & (1U << 20)) type_ |= tSSE42; + if (ecx & (1U << 22)) type_ |= tMOVBE; + if (ecx & (1U << 23)) type_ |= tPOPCNT; + if (ecx & (1U << 25)) type_ |= tAESNI; + if (ecx & (1U << 26)) type_ |= tXSAVE; + if (ecx & (1U << 27)) type_ |= tOSXSAVE; + if (ecx & (1U << 29)) type_ |= tF16C; + if (ecx & (1U << 30)) type_ |= tRDRAND; - if (EDX & (1U << 15)) type_ |= tCMOV; - if (EDX & (1U << 23)) type_ |= tMMX; - if (EDX & (1U << 25)) type_ |= tMMX2 | tSSE; - if (EDX & (1U << 26)) type_ |= tSSE2; + if (edx & (1U << 15)) type_ |= tCMOV; + if (edx & (1U << 23)) type_ |= tMMX; + if (edx & (1U << 25)) type_ |= tMMX2 | tSSE; + if (edx & (1U << 26)) type_ |= tSSE2; if (type_ & tOSXSAVE) { // check XFEATURE_ENABLED_MASK[2:1] = '11b' uint64_t bv = getXfeature(); if ((bv & 6) == 6) { - if (ECX & (1U << 12)) type_ |= tFMA; - if (ECX & (1U << 28)) type_ |= tAVX; + if (ecx & (1U << 12)) type_ |= tFMA; + if (ecx & (1U << 28)) type_ |= tAVX; // do *not* check AVX-512 state on macOS because it has on-demand 
AVX-512 support #if !defined(__APPLE__) if (((bv >> 5) & 7) == 7) #endif { getCpuidEx(7, 0, data); - if (EBX & (1U << 16)) type_ |= tAVX512F; + if (ebx & (1U << 16)) type_ |= tAVX512F; if (type_ & tAVX512F) { - if (EBX & (1U << 17)) type_ |= tAVX512DQ; - if (EBX & (1U << 21)) type_ |= tAVX512_IFMA; - if (EBX & (1U << 26)) type_ |= tAVX512PF; - if (EBX & (1U << 27)) type_ |= tAVX512ER; - if (EBX & (1U << 28)) type_ |= tAVX512CD; - if (EBX & (1U << 30)) type_ |= tAVX512BW; - if (EBX & (1U << 31)) type_ |= tAVX512VL; - if (ECX & (1U << 1)) type_ |= tAVX512_VBMI; - if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2; - if (ECX & (1U << 11)) type_ |= tAVX512_VNNI; - if (ECX & (1U << 12)) type_ |= tAVX512_BITALG; - if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; - if (EDX & (1U << 2)) type_ |= tAVX512_4VNNIW; - if (EDX & (1U << 3)) type_ |= tAVX512_4FMAPS; - if (EDX & (1U << 8)) type_ |= tAVX512_VP2INTERSECT; - if ((type_ & tAVX512BW) && (EDX & (1U << 23))) type_ |= tAVX512_FP16; + if (ebx & (1U << 17)) type_ |= tAVX512DQ; + if (ebx & (1U << 21)) type_ |= tAVX512_IFMA; + if (ebx & (1U << 26)) type_ |= tAVX512PF; + if (ebx & (1U << 27)) type_ |= tAVX512ER; + if (ebx & (1U << 28)) type_ |= tAVX512CD; + if (ebx & (1U << 30)) type_ |= tAVX512BW; + if (ebx & (1U << 31)) type_ |= tAVX512VL; + if (ecx & (1U << 1)) type_ |= tAVX512_VBMI; + if (ecx & (1U << 6)) type_ |= tAVX512_VBMI2; + if (ecx & (1U << 11)) type_ |= tAVX512_VNNI; + if (ecx & (1U << 12)) type_ |= tAVX512_BITALG; + if (ecx & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ; + if (edx & (1U << 2)) type_ |= tAVX512_4VNNIW; + if (edx & (1U << 3)) type_ |= tAVX512_4FMAPS; + if (edx & (1U << 8)) type_ |= tAVX512_VP2INTERSECT; + if ((type_ & tAVX512BW) && (edx & (1U << 23))) type_ |= tAVX512_FP16; } } } } if (maxNum >= 7) { getCpuidEx(7, 0, data); - const uint32_t maxNumSubLeaves = EAX; - if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; - if (EBX & (1U << 3)) type_ |= tBMI1; - if (EBX & (1U << 4)) type_ |= tHLE; - if (EBX & (1U 
<< 8)) type_ |= tBMI2; - if (EBX & (1U << 9)) type_ |= tENHANCED_REP; - if (EBX & (1U << 11)) type_ |= tRTM; - if (EBX & (1U << 14)) type_ |= tMPX; - if (EBX & (1U << 18)) type_ |= tRDSEED; - if (EBX & (1U << 19)) type_ |= tADX; - if (EBX & (1U << 20)) type_ |= tSMAP; - if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT; - if (EBX & (1U << 24)) type_ |= tCLWB; - if (EBX & (1U << 29)) type_ |= tSHA; - if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; - if (ECX & (1U << 5)) type_ |= tWAITPKG; - if (ECX & (1U << 8)) type_ |= tGFNI; - if (ECX & (1U << 9)) type_ |= tVAES; - if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; - if (ECX & (1U << 23)) type_ |= tKEYLOCKER; - if (ECX & (1U << 25)) type_ |= tCLDEMOTE; - if (ECX & (1U << 27)) type_ |= tMOVDIRI; - if (ECX & (1U << 28)) type_ |= tMOVDIR64B; - if (EDX & (1U << 5)) type_ |= tUINTR; - if (EDX & (1U << 14)) type_ |= tSERIALIZE; - if (EDX & (1U << 16)) type_ |= tTSXLDTRK; - if (EDX & (1U << 22)) type_ |= tAMX_BF16; - if (EDX & (1U << 24)) type_ |= tAMX_TILE; - if (EDX & (1U << 25)) type_ |= tAMX_INT8; + const uint32_t maxNumSubLeaves = eax; + if (type_ & tAVX && (ebx & (1U << 5))) type_ |= tAVX2; + if (ebx & (1U << 3)) type_ |= tBMI1; + if (ebx & (1U << 4)) type_ |= tHLE; + if (ebx & (1U << 8)) type_ |= tBMI2; + if (ebx & (1U << 9)) type_ |= tENHANCED_REP; + if (ebx & (1U << 11)) type_ |= tRTM; + if (ebx & (1U << 14)) type_ |= tMPX; + if (ebx & (1U << 18)) type_ |= tRDSEED; + if (ebx & (1U << 19)) type_ |= tADX; + if (ebx & (1U << 20)) type_ |= tSMAP; + if (ebx & (1U << 23)) type_ |= tCLFLUSHOPT; + if (ebx & (1U << 24)) type_ |= tCLWB; + if (ebx & (1U << 29)) type_ |= tSHA; + if (ecx & (1U << 0)) type_ |= tPREFETCHWT1; + if (ecx & (1U << 5)) type_ |= tWAITPKG; + if (ecx & (1U << 8)) type_ |= tGFNI; + if (ecx & (1U << 9)) type_ |= tVAES; + if (ecx & (1U << 10)) type_ |= tVPCLMULQDQ; + if (ecx & (1U << 23)) type_ |= tKEYLOCKER; + if (ecx & (1U << 25)) type_ |= tCLDEMOTE; + if (ecx & (1U << 27)) type_ |= tMOVDIRI; + if (ecx & (1U << 28)) 
type_ |= tMOVDIR64B; + if (edx & (1U << 5)) type_ |= tUINTR; + if (edx & (1U << 14)) type_ |= tSERIALIZE; + if (edx & (1U << 16)) type_ |= tTSXLDTRK; + if (edx & (1U << 22)) type_ |= tAMX_BF16; + if (edx & (1U << 24)) type_ |= tAMX_TILE; + if (edx & (1U << 25)) type_ |= tAMX_INT8; if (maxNumSubLeaves >= 1) { getCpuidEx(7, 1, data); - if (EAX & (1U << 0)) type_ |= tSHA512; - if (EAX & (1U << 1)) type_ |= tSM3; - if (EAX & (1U << 2)) type_ |= tSM4; - if (EAX & (1U << 3)) type_ |= tRAO_INT; - if (EAX & (1U << 4)) type_ |= tAVX_VNNI; + if (eax & (1U << 0)) type_ |= tSHA512; + if (eax & (1U << 1)) type_ |= tSM3; + if (eax & (1U << 2)) type_ |= tSM4; + if (eax & (1U << 3)) type_ |= tRAO_INT; + if (eax & (1U << 4)) type_ |= tAVX_VNNI; if (type_ & tAVX512F) { - if (EAX & (1U << 5)) type_ |= tAVX512_BF16; + if (eax & (1U << 5)) type_ |= tAVX512_BF16; } - if (EAX & (1U << 7)) type_ |= tCMPCCXADD; - if (EAX & (1U << 21)) type_ |= tAMX_FP16; - if (EAX & (1U << 23)) type_ |= tAVX_IFMA; - if (EAX & (1U << 31)) type_ |= tMOVRS; - if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; - if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; - if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16; - if (EDX & (1U << 14)) type_ |= tPREFETCHITI; - if (EDX & (1U << 19)) type_ |= tAVX10; - if (EDX & (1U << 21)) type_ |= tAPX_F; + if (eax & (1U << 7)) type_ |= tCMPCCXADD; + if (eax & (1U << 21)) type_ |= tAMX_FP16; + if (eax & (1U << 23)) type_ |= tAVX_IFMA; + if (eax & (1U << 31)) type_ |= tMOVRS; + if (edx & (1U << 4)) type_ |= tAVX_VNNI_INT8; + if (edx & (1U << 5)) type_ |= tAVX_NE_CONVERT; + if (edx & (1U << 10)) type_ |= tAVX_VNNI_INT16; + if (edx & (1U << 14)) type_ |= tPREFETCHITI; + if (edx & (1U << 19)) type_ |= tAVX10; + if (edx & (1U << 21)) type_ |= tAPX_F; getCpuidEx(0x1e, 1, data); - if (EAX & (1U << 4)) type_ |= tAMX_FP8; - if (EAX & (1U << 5)) type_ |= tAMX_TRANSPOSE; - if (EAX & (1U << 6)) type_ |= tAMX_TF32; - if (EAX & (1U << 7)) type_ |= tAMX_AVX512; - if (EAX & (1U << 8)) type_ |= 
tAMX_MOVRS; + if (eax & (1U << 4)) type_ |= tAMX_FP8; + if (eax & (1U << 5)) type_ |= tAMX_TRANSPOSE; + if (eax & (1U << 6)) type_ |= tAMX_TF32; + if (eax & (1U << 7)) type_ |= tAMX_AVX512; + if (eax & (1U << 8)) type_ |= tAMX_MOVRS; } } if (maxNum >= 0x19) { getCpuidEx(0x19, 0, data); - if (EBX & (1U << 0)) type_ |= tAESKLE; - if (EBX & (1U << 2)) type_ |= tWIDE_KL; + if (ebx & (1U << 0)) type_ |= tAESKLE; + if (ebx & (1U << 2)) type_ |= tWIDE_KL; if (type_ & (tKEYLOCKER|tAESKLE|tWIDE_KL)) type_ |= tKEYLOCKER_WIDE; } if (has(tAVX10) && maxNum >= 0x24) { getCpuidEx(0x24, 0, data); - avx10version_ = EBX & mask(7); + avx10version_ = ebx & mask(7); } setFamily(); setNumCores(); @@ -752,6 +756,9 @@ public: } int getAVX10version() const { return avx10version_; } }; +#ifdef _MSC_VER + #pragma warning(pop) +#endif #ifndef XBYAK_ONLY_CLASS_CPU class Clock {