// blob: a75d3f937bd97efdcd7a8f8988b11ec600add9e2 [file] [edit]
#ifndef XBYAK_XBYAK_UTIL_H_
#define XBYAK_XBYAK_UTIL_H_
#ifdef XBYAK_ONLY_CLASS_CPU
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#ifndef XBYAK_THROW
#define XBYAK_THROW(x) ;
#define XBYAK_THROW_RET(x, y) return y;
#endif
#ifndef XBYAK_CONSTEXPR
#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
#define XBYAK_CONSTEXPR constexpr
#else
#define XBYAK_CONSTEXPR
#endif
#define XBYAK_CPUMASK_COMPACT 0
#endif
#else
#include <string.h>
#include <stdio.h>
/**
utility class and functions for Xbyak
Xbyak::util::Clock ; rdtsc timer
Xbyak::util::Cpu ; detect CPU
*/
#include "xbyak.h"
#endif // XBYAK_ONLY_CLASS_CPU
#if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) || defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
#define XBYAK_INTEL_CPU_SPECIFIC
#endif
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _WIN32
#if defined(_MSC_VER) && (_MSC_VER < 1400) && defined(XBYAK32)
// Replacement for the __cpuid intrinsic, used only for 32-bit MSVC older than
// _MSC_VER 1400 (see the enclosing #if); newer compilers get it from <intrin.h>.
// Signature: __cpuid(int data[4], int eaxIn); stores { eax, ebx, ecx, edx } into data.
// The function is naked, so there is no prologue/epilogue and the arguments are
// addressed relative to esp. After the two pushes below the stack holds:
//   [esp + 0] saved esi, [esp + 4] saved ebx, [esp + 8] return address,
//   [esp + 12] data, [esp + 16] eaxIn
static inline __declspec(naked) void __cpuid(int[4], int)
{
__asm {
push ebx
push esi
mov eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
cpuid
mov esi, dword ptr [esp + 4 * 2 + 4] // data
mov dword ptr [esi], eax
mov dword ptr [esi + 4], ebx
mov dword ptr [esi + 8], ecx
mov dword ptr [esi + 12], edx
pop esi
pop ebx
ret
}
}
#else
#include <intrin.h> // for __cpuid
#endif
#else
#ifndef __GNUC_PREREQ
#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
#endif
#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
#include <cpuid.h>
#else
#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
// ebx cannot be used as an asm output here (see the error quoted above), so it is
// saved on the stack, the ebx result of cpuid is copied to esi ("=S" -> b) and
// ebx is restored afterwards.
#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
#else
// Generic fallback when <cpuid.h> is not used: the four result registers are bound directly.
#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
#endif
#endif
#endif
#endif
#ifdef XBYAK_USE_VTUNE
// -I /opt/intel/vtune_amplifier/include/ -L /opt/intel/vtune_amplifier/lib64 -ljitprofiling -ldl
#include <jitprofiling.h>
#ifdef _MSC_VER
#pragma comment(lib, "libittnotify.lib")
#endif
#ifdef __linux__
#include <dlfcn.h>
#endif
#endif
#ifdef __linux__
#define XBYAK_USE_PERF
#endif
#ifndef XBYAK_CPU_CACHE
#define XBYAK_CPU_CACHE 1
#endif
#if XBYAK_CPU_CACHE == 1
#include <vector>
#ifndef XBYAK_CPUMASK_COMPACT
#define XBYAK_CPUMASK_COMPACT 1
#endif
#if XBYAK_CPUMASK_COMPACT == 0
#include <set>
#endif
#ifdef _WIN32
#include <windows.h>
#else
#include <sched.h>
#endif
namespace Xbyak { namespace util {
// Forward declarations needed by the CPU cache/topology support (XBYAK_CPU_CACHE == 1).
class CpuTopology;
class Cpu;
namespace impl {
// Declared here, defined elsewhere.
// NOTE(review): presumably fills cpuTopo and returns true on success - confirm against the definition.
bool initCpuTopology(CpuTopology& cpuTopo);
} // Xbyak::util::impl
} } // Xbyak::util
#endif // XBYAK_CPU_CACHE
namespace Xbyak { namespace util {
// Topology level selector for Cpu::getNumCores().
// The numeric values are used as (index + 1) into Cpu::numCores_ and are compared
// with the level-type field (ecx[15:8]) read from CPUID leaf 0xB in Cpu::setNumCores().
typedef enum {
SmtLevel = 1,
CoreLevel = 2
} CpuTopologyLevel;
typedef CpuTopologyLevel IntelCpuTopologyLevel; // for backward compatibility
namespace local {
// Compile-time feature tag: L carries the bits of feature ids 0..63, H those of ids 64..127.
template<uint64_t L, uint64_t H = 0>
struct TypeT {
};
// OR of two tags; the combination is done in the template arguments, so the
// result is again a (stateless) tag type.
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
XBYAK_CONSTEXPR TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>)
{
	return TypeT<L1 | L2, H1 | H2>();
}
// Larger of x and y; x is returned when both compare equal.
template<typename T>
inline T max_(T x, T y)
{
	if (x >= y) return x;
	return y;
}
// Smaller of x and y; y is returned when both compare equal.
template<typename T>
inline T min_(T x, T y)
{
	if (x < y) return x;
	return y;
}
} // local
/**
CPU detection class
@note static inline const member is supported by c++17 or later, so use template hack
*/
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4459)
#endif
class Cpu {
public:
// Run-time set of feature bits, kept as two 64-bit words
// (L: feature ids 0..63, H: feature ids 64..127).
class Type {
	uint64_t L;
	uint64_t H;
public:
	Type(uint64_t L = 0, uint64_t H = 0) : L(L), H(H) { }
	// implicit conversion from a compile-time tag such as Cpu::tAVX
	template<uint64_t L_, uint64_t H_>
	Type(local::TypeT<L_, H_>) : L(L_), H(H_) {}
	Type& operator&=(const Type& rhs)
	{
		L &= rhs.L;
		H &= rhs.H;
		return *this;
	}
	Type& operator|=(const Type& rhs)
	{
		L |= rhs.L;
		H |= rhs.H;
		return *this;
	}
	Type operator&(const Type& rhs) const { return Type(L & rhs.L, H & rhs.H); }
	Type operator|(const Type& rhs) const { return Type(L | rhs.L, H | rhs.H); }
	bool operator==(const Type& rhs) const { return L == rhs.L && H == rhs.H; }
	bool operator!=(const Type& rhs) const { return L != rhs.L || H != rhs.H; }
	// not explicit, for backward compatibility; true if at least one bit is set
	operator bool() const { return L != 0 || H != 0; }
	uint64_t getL() const { return L; }
	uint64_t getH() const { return H; }
};
private:
// feature bits detected by the constructor
Type type_;
//system topology
static const size_t maxTopologyLevels = 2;
// indexed by (CpuTopologyLevel - 1); filled by setNumCores(), left zero when detection is skipped
uint32_t numCores_[maxTopologyLevels];
static const uint32_t maxNumberCacheLevels = 10;
// per data/unified cache level, filled by setCacheHierarchy(); only the first dataCacheLevels_ entries are valid
uint32_t dataCacheSize_[maxNumberCacheLevels];
uint32_t coresSharingDataCache_[maxNumberCacheLevels];
uint32_t dataCacheLevels_;
// version field read from CPUID leaf 0x24 when tAVX10 is present, otherwise 0
uint32_t avx10version_;
// Pack x[0..3] into a 32-bit value with x[0] in the least significant byte.
// NOTE: despite the "BE" in the (historical) name, x[0] ends up in the low byte;
// isEqualStr() compares the result directly with the CPUID vendor registers.
uint32_t get32bitAsBE(const char *x) const
{
	// Go through unsigned char: with a signed plain char, a byte >= 0x80 would be
	// sign-extended (setting all upper bits) and shifted as a negative int.
	const uint32_t b0 = static_cast<unsigned char>(x[0]);
	const uint32_t b1 = static_cast<unsigned char>(x[1]);
	const uint32_t b2 = static_cast<unsigned char>(x[2]);
	const uint32_t b3 = static_cast<unsigned char>(x[3]);
	return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
}
// Return a value with the lowest n bits set.
// n <= 0 gives 0 and n >= 32 gives 0xFFFFFFFF (shifting a 32-bit value by a
// negative amount or by 32 or more is undefined, so those cases are handled explicitly).
uint32_t mask(int n) const
{
	if (n <= 0) return 0;
	if (n >= 32) return 0xFFFFFFFFu;
	return (1U << n) - 1;
}
// Does the 12-character string s match the vendor registers?
// The characters are compared in the order s[0..3] == ebx, s[4..7] == edx, s[8..11] == ecx.
bool isEqualStr(uint32_t ebx, uint32_t ecx, uint32_t edx, const char s[12]) const
{
	if (get32bitAsBE(s) != ebx) return false;
	if (get32bitAsBE(s + 4) != edx) return false;
	return get32bitAsBE(s + 8) == ecx;
}
// Return bits [base, end] (both inclusive) of val, shifted down to bit 0.
// Requires base <= end <= 31.
uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end) const
{
	const uint32_t width = end + 1 - base;
	// a full-width field would need "1u << 32", which is undefined
	const uint32_t fieldMask = width >= 32 ? 0xFFFFFFFFu : ((1u << width) - 1);
	return (val >> base) & fieldMask;
}
// Decode eax of CPUID leaf 1 into stepping/model/family and derive the display values.
// Relies on type_ already carrying tINTEL when the CPU is an Intel one.
void setFamily()
{
	uint32_t regs[4] = {};
	getCpuid(1, regs);
	const uint32_t sig = regs[0];
	stepping = extractBit(sig, 0, 3);
	model = extractBit(sig, 4, 7);
	family = extractBit(sig, 8, 11);
	//type = extractBit(sig, 12, 13);
	extModel = extractBit(sig, 16, 19);
	extFamily = extractBit(sig, 20, 27);
	// the extended family is only added for family 0x0f
	displayFamily = (family == 0x0f) ? family + extFamily : family;
	// the extended model is used for family 0x0f and for Intel family 6
	const bool useExtModel = (has(tINTEL) && family == 6) || family == 0x0f;
	displayModel = useExtModel ? (extModel << 4) + model : model;
}
/*
Fill numCores_[SmtLevel - 1] and numCores_[CoreLevel - 1].
Only Intel and AMD CPUs are handled; for any other vendor numCores_ is left as it is
(zero after construction).
CPUID leaf 0xB is preferred when it is present and not zeroed out;
otherwise a vendor specific legacy method based on leaf 1 is used.
*/
void setNumCores()
{
if (!has(tINTEL) && !has(tAMD)) return;
uint32_t data[4] = {};
getCpuid(0x0, data);
if (data[0] >= 0xB) {
// Check if "Extended Topology Enumeration" is implemented.
getCpuidEx(0xB, 0, data);
if (data[0] != 0 || data[1] != 0) {
/*
if leaf 11 exists(x2APIC is supported),
we use it to get the number of smt cores and cores on socket
leaf 0xB can be zeroed-out by a hypervisor
*/
// only sub-leaves 0 .. maxTopologyLevels - 1 are inspected
for (uint32_t i = 0; i < maxTopologyLevels; i++) {
getCpuidEx(0xB, i, data);
// ecx[15:8] is the level type, ebx[15:0] the count stored for that level
CpuTopologyLevel level = (CpuTopologyLevel)extractBit(data[2], 8, 15);
if (level == SmtLevel || level == CoreLevel) {
numCores_[level - 1] = extractBit(data[1], 0, 15);
}
}
/*
Fallback values in case a hypervisor has the leaf zeroed-out.
*/
numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
return;
}
}
// "Extended Topology Enumeration" is not supported.
if (has(tAMD)) {
/*
AMD - Legacy Method
*/
int physicalThreadCount = 0;
getCpuid(0x1, data);
// leaf 1: ebx[23:16] and edx bit 28
int logicalProcessorCount = extractBit(data[1], 16, 23);
int htt = extractBit(data[3], 28, 28); // Hyper-threading technology.
getCpuid(0x80000000, data);
uint32_t highestExtendedLeaf = data[0];
if (highestExtendedLeaf >= 0x80000008) {
getCpuid(0x80000008, data);
physicalThreadCount = extractBit(data[2], 0, 7) + 1;
}
if (htt == 0) {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = 1;
} else if (physicalThreadCount > 1) {
if ((displayFamily >= 0x17) && (highestExtendedLeaf >= 0x8000001E)) {
// Zen overreports its core count by a factor of two.
// NOTE(review): displayFamily is assigned by setFamily(), so this relies on setFamily() having run first.
getCpuid(0x8000001E, data);
int threadsPerComputeUnit = extractBit(data[1], 8, 15) + 1;
physicalThreadCount /= threadsPerComputeUnit;
}
numCores_[SmtLevel - 1] = logicalProcessorCount / physicalThreadCount;
numCores_[CoreLevel - 1] = logicalProcessorCount;
} else {
numCores_[SmtLevel - 1] = 1;
// NOTE(review): falls back to 2 here while the Intel path below falls back to 1 - confirm this is intended.
numCores_[CoreLevel - 1] = logicalProcessorCount > 1 ? logicalProcessorCount : 2;
}
} else {
/*
Intel - Legacy Method
*/
int physicalThreadCount = 0;
getCpuid(0x1, data);
// leaf 1: ebx[23:16] and edx bit 28
int logicalProcessorCount = extractBit(data[1], 16, 23);
int htt = extractBit(data[3], 28, 28); // Hyper-threading technology.
getCpuid(0, data);
if (data[0] >= 0x4) {
getCpuid(0x4, data);
physicalThreadCount = extractBit(data[0], 26, 31) + 1;
}
if (htt == 0) {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = 1;
} else if (physicalThreadCount > 1) {
numCores_[SmtLevel - 1] = logicalProcessorCount / physicalThreadCount;
numCores_[CoreLevel - 1] = logicalProcessorCount;
} else {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = logicalProcessorCount > 0 ? logicalProcessorCount : 1;
}
}
}
/*
Fill dataCacheLevels_, dataCacheSize_[] and coresSharingDataCache_[].
Only data and unified caches are recorded (instruction caches are skipped),
at most maxNumberCacheLevels of them.
AMD: leaf 0x8000001D when available, otherwise leaves 0x80000005/0x80000006.
Intel: leaf 4.
Uses numCores_[], so setNumCores() has to run first.
*/
void setCacheHierarchy()
{
uint32_t data[4] = {};
if (has(tAMD)) {
getCpuid(0x80000000, data);
if (data[0] >= 0x8000001D) {
// For modern AMD CPUs.
dataCacheLevels_ = 0;
for (uint32_t subLeaf = 0; dataCacheLevels_ < maxNumberCacheLevels; subLeaf++) {
getCpuidEx(0x8000001D, subLeaf, data);
int cacheType = extractBit(data[0], 0, 4);
/*
cacheType
00h - Null; no more caches
01h - Data cache
02h - Instruction cache
03h - Unified cache
04h-1Fh - Reserved
*/
if (cacheType == 0) break; // No more caches.
if (cacheType == 0x2) continue; // Skip instruction cache.
int fullyAssociative = extractBit(data[0], 9, 9);
int numSharingCache = extractBit(data[0], 14, 25) + 1;
int cacheNumWays = extractBit(data[1], 22, 31) + 1;
int cachePhysPartitions = extractBit(data[1], 12, 21) + 1;
int cacheLineSize = extractBit(data[1], 0, 11) + 1;
int cacheNumSets = data[2] + 1;
// size = line size * partitions * ways (* sets unless the cache is fully associative)
dataCacheSize_[dataCacheLevels_] =
cacheLineSize * cachePhysPartitions * cacheNumWays;
if (fullyAssociative == 0) {
dataCacheSize_[dataCacheLevels_] *= cacheNumSets;
}
if (subLeaf > 0) {
// clamp to the core-level count and scale by the value recorded for the first cache
numSharingCache = local::min_(numSharingCache, (int)numCores_[1]);
numSharingCache /= local::max_(1u, coresSharingDataCache_[0]);
}
coresSharingDataCache_[dataCacheLevels_] = numSharingCache;
dataCacheLevels_ += 1;
}
// NOTE(review): min_ caps the first entry at 1 (0 stays 0); confirm whether max_ was intended.
coresSharingDataCache_[0] = local::min_(1u, coresSharingDataCache_[0]);
} else if (data[0] >= 0x80000006) {
// For legacy AMD CPUs, use leaf 0x80000005 for L1 cache
// and 0x80000006 for L2 and L3 cache.
dataCacheLevels_ = 1;
getCpuid(0x80000005, data);
int l1dc_size = extractBit(data[2], 24, 31);
dataCacheSize_[0] = l1dc_size * 1024;
coresSharingDataCache_[0] = 1;
getCpuid(0x80000006, data);
// L2 cache
int l2_assoc = extractBit(data[2], 12, 15);
if (l2_assoc > 0) {
dataCacheLevels_ = 2;
int l2_size = extractBit(data[2], 16, 31);
dataCacheSize_[1] = l2_size * 1024;
coresSharingDataCache_[1] = 1;
}
// L3 cache
int l3_assoc = extractBit(data[3], 12, 15);
if (l3_assoc > 0) {
dataCacheLevels_ = 3;
int l3_size = extractBit(data[3], 18, 31);
dataCacheSize_[2] = l3_size * 512 * 1024;
coresSharingDataCache_[2] = numCores_[1];
}
}
} else if (has(tINTEL)) {
// Use the "Deterministic Cache Parameters" leaf (CPUID leaf 4).
const uint32_t NO_CACHE = 0;
const uint32_t DATA_CACHE = 1;
//const uint32_t INSTRUCTION_CACHE = 2;
const uint32_t UNIFIED_CACHE = 3;
uint32_t smt_width = 0;
uint32_t logical_cores = 0;
smt_width = numCores_[0];
logical_cores = numCores_[1];
/*
Assumptions:
the first level of data cache is not shared (which is the
case for every existing architecture) and use this to
determine the SMT width for arch not supporting leaf 11.
when leaf 4 reports a number of core less than numCores_
on socket reported by leaf 11, then it is a correct number
of cores not an upperbound.
*/
// dataCacheLevels_ starts from the value set by the constructor (0)
for (int i = 0; dataCacheLevels_ < maxNumberCacheLevels; i++) {
getCpuidEx(0x4, i, data);
uint32_t cacheType = extractBit(data[0], 0, 4);
if (cacheType == NO_CACHE) break;
if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
uint32_t actual_logical_cores = extractBit(data[0], 14, 25) + 1;
if (logical_cores != 0) { // true only if leaf 0xB is supported and valid
actual_logical_cores = local::min_(actual_logical_cores, logical_cores);
}
assert(actual_logical_cores != 0);
// size = (ebx[31:22] + 1) * (ebx[21:12] + 1) * (ebx[11:0] + 1) * (ecx + 1)
dataCacheSize_[dataCacheLevels_] =
(extractBit(data[1], 22, 31) + 1)
* (extractBit(data[1], 12, 21) + 1)
* (extractBit(data[1], 0, 11) + 1)
* (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0);
coresSharingDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++;
}
}
}
}
public:
// Fields decoded from eax of CPUID leaf 1 by setFamily()
int model; // bits 4..7
int family; // bits 8..11
int stepping; // bits 0..3
int extModel; // bits 16..19
int extFamily; // bits 20..27
int displayFamily; // family + extFamily if family == 0x0f, otherwise family
int displayModel; // (extModel << 4) + model if the extended model applies (see setFamily()), otherwise model
/*
	SmtLevel  : the value recorded for the SMT level
	CoreLevel : the value recorded for the core level divided by the SMT level value
	Returns 0 for CoreLevel when no topology was detected
	(setNumCores() leaves numCores_ zeroed for CPUs it does not handle).
	Any other level is reported through XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0).
*/
uint32_t getNumCores(CpuTopologyLevel level) const {
	switch (level) {
	case SmtLevel: return numCores_[SmtLevel - 1];
	case CoreLevel:
		{
			const uint32_t smt = numCores_[SmtLevel - 1];
			// avoid an integer division by zero when the topology is unknown
			if (smt == 0) return 0;
			return numCores_[CoreLevel - 1] / smt;
		}
	default: XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
	}
}
uint32_t getDataCacheLevels() const { return dataCacheLevels_; }
uint32_t getCoresSharingDataCache(uint32_t i) const
{
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return coresSharingDataCache_[i];
}
uint32_t getDataCacheSize(uint32_t i) const
{
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return dataCacheSize_[i];
}
/*
	Run CPUID with eax = eaxIn and ecx = ecxIn.
	data[] = { eax, ebx, ecx, edx }
	Without XBYAK_INTEL_CPU_SPECIFIC nothing is executed and data[] is left untouched.
*/
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
{
#if !defined(XBYAK_INTEL_CPU_SPECIFIC)
	(void)eaxIn;
	(void)ecxIn;
	(void)data;
#elif defined(_MSC_VER)
	__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
#else
	__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
#endif
}
// Same as getCpuidEx() with ecx = 0.
static inline void getCpuid(uint32_t eaxIn, uint32_t data[4])
{
	const uint32_t subLeaf = 0;
	getCpuidEx(eaxIn, subLeaf, data);
}
// Read extended control register 0 with XGETBV (ecx = 0).
// Returns 0 when XBYAK_INTEL_CPU_SPECIFIC is not defined.
static inline uint64_t getXfeature()
{
#if !defined(XBYAK_INTEL_CPU_SPECIFIC)
	return 0;
#elif defined(_MSC_VER)
	return _xgetbv(0);
#else
	uint32_t lo, hi;
	// the xgetbv mnemonic is not supported by gcc 4.2, so the opcode bytes are emitted directly
	// __asm__ volatile("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
	__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c"(0));
	const uint64_t upper = (uint64_t)hi << 32;
	return upper | lo;
#endif
}
// XBYAK_SPLIT_ID(id) expands to the two template arguments "low, high" of local::TypeT:
// ids 0..63 set bit id of the low word, ids >= 64 set bit (id % 64) of the high word.
#define XBYAK_SPLIT_ID(id) ((0 <= id && id < 64) ? (1ull << (id % 64)) : 0), (id >= 64 ? (1ull << (id % 64)) : 0)
// XBYAK_DEFINE_TYPE(id, NAME) declares the static feature tag NAME for feature id;
// it is constexpr where the compiler is known to support that.
#if (__cplusplus >= 201103) || (defined(_MSC_VER) && (_MSC_VER >= 1700)) /* VS2012 */
#define XBYAK_DEFINE_TYPE(id, NAME) static const constexpr local::TypeT<XBYAK_SPLIT_ID(id)> NAME{}
#else
#define XBYAK_DEFINE_TYPE(id, NAME) static const local::TypeT<XBYAK_SPLIT_ID(id)> NAME
#endif
XBYAK_DEFINE_TYPE(0, tMMX);
XBYAK_DEFINE_TYPE(1, tMMX2);
XBYAK_DEFINE_TYPE(2, tCMOV);
XBYAK_DEFINE_TYPE(3, tSSE);
XBYAK_DEFINE_TYPE(4, tSSE2);
XBYAK_DEFINE_TYPE(5, tSSE3);
XBYAK_DEFINE_TYPE(6, tSSSE3);
XBYAK_DEFINE_TYPE(7, tSSE41);
XBYAK_DEFINE_TYPE(8, tSSE42);
XBYAK_DEFINE_TYPE(9, tPOPCNT);
XBYAK_DEFINE_TYPE(10, tAESNI);
XBYAK_DEFINE_TYPE(11, tAVX512_FP16);
XBYAK_DEFINE_TYPE(12, tOSXSAVE);
XBYAK_DEFINE_TYPE(13, tPCLMULQDQ);
XBYAK_DEFINE_TYPE(14, tAVX);
XBYAK_DEFINE_TYPE(15, tFMA);
XBYAK_DEFINE_TYPE(16, t3DN);
XBYAK_DEFINE_TYPE(17, tE3DN);
XBYAK_DEFINE_TYPE(18, tWAITPKG);
XBYAK_DEFINE_TYPE(19, tRDTSCP);
XBYAK_DEFINE_TYPE(20, tAVX2);
XBYAK_DEFINE_TYPE(21, tBMI1); // andn, bextr, blsi, blsmsk, blsr, tzcnt
XBYAK_DEFINE_TYPE(22, tBMI2); // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
XBYAK_DEFINE_TYPE(23, tLZCNT);
XBYAK_DEFINE_TYPE(24, tINTEL);
XBYAK_DEFINE_TYPE(25, tAMD);
XBYAK_DEFINE_TYPE(26, tENHANCED_REP); // enhanced rep movsb/stosb
XBYAK_DEFINE_TYPE(27, tRDRAND);
XBYAK_DEFINE_TYPE(28, tADX); // adcx, adox
XBYAK_DEFINE_TYPE(29, tRDSEED); // rdseed
XBYAK_DEFINE_TYPE(30, tSMAP); // stac
XBYAK_DEFINE_TYPE(31, tHLE); // xacquire, xrelease, xtest
XBYAK_DEFINE_TYPE(32, tRTM); // xbegin, xend, xabort
XBYAK_DEFINE_TYPE(33, tF16C); // vcvtph2ps, vcvtps2ph
XBYAK_DEFINE_TYPE(34, tMOVBE); // movbe
XBYAK_DEFINE_TYPE(35, tAVX512F);
XBYAK_DEFINE_TYPE(36, tAVX512DQ);
XBYAK_DEFINE_TYPE(37, tAVX512_IFMA);
XBYAK_DEFINE_TYPE(37, tAVX512IFMA);// = tAVX512_IFMA;
// XBYAK_DEFINE_TYPE(38, tAVX512PF); // Xeon Phi only
// XBYAK_DEFINE_TYPE(39, tAVX512ER);
XBYAK_DEFINE_TYPE(40, tAVX512CD);
XBYAK_DEFINE_TYPE(41, tAVX512BW);
XBYAK_DEFINE_TYPE(42, tAVX512VL);
XBYAK_DEFINE_TYPE(43, tAVX512_VBMI);
XBYAK_DEFINE_TYPE(43, tAVX512VBMI); // = tAVX512_VBMI; // changed by Intel's manual
// XBYAK_DEFINE_TYPE(44, tAVX512_4VNNIW);
// XBYAK_DEFINE_TYPE(45, tAVX512_4FMAPS);
// XBYAK_DEFINE_TYPE(46, tPREFETCHWT1);
XBYAK_DEFINE_TYPE(47, tPREFETCHW);
XBYAK_DEFINE_TYPE(48, tSHA);
XBYAK_DEFINE_TYPE(49, tMPX);
XBYAK_DEFINE_TYPE(50, tAVX512_VBMI2);
XBYAK_DEFINE_TYPE(51, tGFNI);
XBYAK_DEFINE_TYPE(52, tVAES);
XBYAK_DEFINE_TYPE(53, tVPCLMULQDQ);
XBYAK_DEFINE_TYPE(54, tAVX512_VNNI);
XBYAK_DEFINE_TYPE(55, tAVX512_BITALG);
XBYAK_DEFINE_TYPE(56, tAVX512_VPOPCNTDQ);
XBYAK_DEFINE_TYPE(57, tAVX512_BF16);
XBYAK_DEFINE_TYPE(58, tAVX512_VP2INTERSECT);
XBYAK_DEFINE_TYPE(59, tAMX_TILE);
XBYAK_DEFINE_TYPE(60, tAMX_INT8);
XBYAK_DEFINE_TYPE(61, tAMX_BF16);
XBYAK_DEFINE_TYPE(62, tAVX_VNNI);
XBYAK_DEFINE_TYPE(63, tCLFLUSHOPT);
XBYAK_DEFINE_TYPE(64, tCLDEMOTE);
XBYAK_DEFINE_TYPE(65, tMOVDIRI);
XBYAK_DEFINE_TYPE(66, tMOVDIR64B);
XBYAK_DEFINE_TYPE(67, tCLZERO); // AMD Zen
XBYAK_DEFINE_TYPE(68, tAMX_FP16);
XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
XBYAK_DEFINE_TYPE(75, tSERIALIZE);
XBYAK_DEFINE_TYPE(76, tUINTR);
XBYAK_DEFINE_TYPE(77, tXSAVE);
XBYAK_DEFINE_TYPE(78, tSHA512);
XBYAK_DEFINE_TYPE(79, tSM3);
XBYAK_DEFINE_TYPE(80, tSM4);
XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
XBYAK_DEFINE_TYPE(82, tAPX_F);
XBYAK_DEFINE_TYPE(83, tAVX10);
XBYAK_DEFINE_TYPE(84, tAESKLE);
XBYAK_DEFINE_TYPE(85, tWIDE_KL);
XBYAK_DEFINE_TYPE(86, tKEYLOCKER);
XBYAK_DEFINE_TYPE(87, tKEYLOCKER_WIDE);
XBYAK_DEFINE_TYPE(88, tSSE4a);
XBYAK_DEFINE_TYPE(89, tCLWB);
XBYAK_DEFINE_TYPE(90, tTSXLDTRK);
XBYAK_DEFINE_TYPE(91, tAMX_TRANSPOSE);
XBYAK_DEFINE_TYPE(92, tAMX_TF32);
XBYAK_DEFINE_TYPE(93, tAMX_AVX512);
XBYAK_DEFINE_TYPE(94, tAMX_MOVRS);
XBYAK_DEFINE_TYPE(95, tAMX_FP8);
XBYAK_DEFINE_TYPE(96, tMOVRS);
XBYAK_DEFINE_TYPE(97, tHYBRID);
XBYAK_DEFINE_TYPE(98, tAMX_COMPLEX);
#undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE
Cpu()
: type_()
, numCores_()
, dataCacheSize_()
, coresSharingDataCache_()
, dataCacheLevels_(0)
, avx10version_(0)
{
uint32_t data[4] = {};
const uint32_t& eax = data[0];
const uint32_t& ebx = data[1];
const uint32_t& ecx = data[2];
const uint32_t& edx = data[3];
getCpuid(0, data);
const uint32_t maxNum = eax;
if (isEqualStr(ebx, ecx, edx, "AuthenticAMD")) {
type_ |= tAMD;
getCpuid(0x80000001, data);
if (edx & (1U << 31)) {
type_ |= t3DN;
// 3DNow! implies support for PREFETCHW on AMD
type_ |= tPREFETCHW;
}
if (edx & (1U << 29)) {
// Long mode implies support for PREFETCHW on AMD
type_ |= tPREFETCHW;
}
} else if (isEqualStr(ebx, ecx, edx, "GenuineIntel")) {
type_ |= tINTEL;
}
// Extended flags information
getCpuid(0x80000000, data);
const uint32_t maxExtendedNum = eax;
if (maxExtendedNum >= 0x80000001) {
getCpuid(0x80000001, data);
if (ecx & (1U << 5)) type_ |= tLZCNT;
if (ecx & (1U << 6)) type_ |= tSSE4a;
if (ecx & (1U << 8)) type_ |= tPREFETCHW;
if (edx & (1U << 15)) type_ |= tCMOV;
if (edx & (1U << 22)) type_ |= tMMX2;
if (edx & (1U << 27)) type_ |= tRDTSCP;
if (edx & (1U << 30)) type_ |= tE3DN;
if (edx & (1U << 31)) type_ |= t3DN;
}
if (maxExtendedNum >= 0x80000008) {
getCpuid(0x80000008, data);
if (ebx & (1U << 0)) type_ |= tCLZERO;
}
getCpuid(1, data);
if (ecx & (1U << 0)) type_ |= tSSE3;
if (ecx & (1U << 1)) type_ |= tPCLMULQDQ;
if (ecx & (1U << 9)) type_ |= tSSSE3;
if (ecx & (1U << 19)) type_ |= tSSE41;
if (ecx & (1U << 20)) type_ |= tSSE42;
if (ecx & (1U << 22)) type_ |= tMOVBE;
if (ecx & (1U << 23)) type_ |= tPOPCNT;
if (ecx & (1U << 25)) type_ |= tAESNI;
if (ecx & (1U << 26)) type_ |= tXSAVE;
if (ecx & (1U << 27)) type_ |= tOSXSAVE;
if (ecx & (1U << 29)) type_ |= tF16C;
if (ecx & (1U << 30)) type_ |= tRDRAND;
if (edx & (1U << 15)) type_ |= tCMOV;
if (edx & (1U << 23)) type_ |= tMMX;
if (edx & (1U << 25)) type_ |= tMMX2 | tSSE;
if (edx & (1U << 26)) type_ |= tSSE2;
if (type_ & tOSXSAVE) {
// check XFEATURE_ENABLED_MASK[2:1] = '11b'
uint64_t bv = getXfeature();
if ((bv & 6) == 6) {
if (ecx & (1U << 12)) type_ |= tFMA;
if (ecx & (1U << 28)) type_ |= tAVX;
// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__)
if (((bv >> 5) & 7) == 7)
#endif
{
getCpuidEx(7, 0, data);
if (ebx & (1U << 16)) type_ |= tAVX512F;
if (type_ & tAVX512F) {
if (ebx & (1U << 17)) type_ |= tAVX512DQ;
if (ebx & (1U << 21)) type_ |= tAVX512_IFMA;
if (ebx & (1U << 28)) type_ |= tAVX512CD;
if (ebx & (1U << 30)) type_ |= tAVX512BW;
if (ebx & (1U << 31)) type_ |= tAVX512VL;
if (ecx & (1U << 1)) type_ |= tAVX512_VBMI;
if (ecx & (1U << 6)) type_ |= tAVX512_VBMI2;
if (ecx & (1U << 11)) type_ |= tAVX512_VNNI;
if (ecx & (1U << 12)) type_ |= tAVX512_BITALG;
if (ecx & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
if (edx & (1U << 8)) type_ |= tAVX512_VP2INTERSECT;
if ((type_ & tAVX512BW) && (edx & (1U << 23))) type_ |= tAVX512_FP16;
}
}
}
}
if (maxNum >= 7) {
getCpuidEx(7, 0, data);
const uint32_t maxNumSubLeaves = eax;
if (type_ & tAVX && (ebx & (1U << 5))) type_ |= tAVX2;
if (ebx & (1U << 3)) type_ |= tBMI1;
if (ebx & (1U << 4)) type_ |= tHLE;
if (ebx & (1U << 8)) type_ |= tBMI2;
if (ebx & (1U << 9)) type_ |= tENHANCED_REP;
if (ebx & (1U << 11)) type_ |= tRTM;
if (ebx & (1U << 14)) type_ |= tMPX;
if (ebx & (1U << 18)) type_ |= tRDSEED;
if (ebx & (1U << 19)) type_ |= tADX;
if (ebx & (1U << 20)) type_ |= tSMAP;
if (ebx & (1U << 23)) type_ |= tCLFLUSHOPT;
if (ebx & (1U << 24)) type_ |= tCLWB;
if (ebx & (1U << 29)) type_ |= tSHA;
if (ecx & (1U << 5)) type_ |= tWAITPKG;
if (ecx & (1U << 8)) type_ |= tGFNI;
if (ecx & (1U << 9)) type_ |= tVAES;
if (ecx & (1U << 10)) type_ |= tVPCLMULQDQ;
if (ecx & (1U << 23)) type_ |= tKEYLOCKER;
if (ecx & (1U << 25)) type_ |= tCLDEMOTE;
if (ecx & (1U << 27)) type_ |= tMOVDIRI;
if (ecx & (1U << 28)) type_ |= tMOVDIR64B;
if (edx & (1U << 5)) type_ |= tUINTR;
if (edx & (1U << 14)) type_ |= tSERIALIZE;
if (edx & (1U << 15)) type_ |= tHYBRID;
if (edx & (1U << 16)) type_ |= tTSXLDTRK;
if (edx & (1U << 22)) type_ |= tAMX_BF16;
if (edx & (1U << 24)) type_ |= tAMX_TILE;
if (edx & (1U << 25)) type_ |= tAMX_INT8;
if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data);
if (eax & (1U << 0)) type_ |= tSHA512;
if (eax & (1U << 1)) type_ |= tSM3;
if (eax & (1U << 2)) type_ |= tSM4;
if (eax & (1U << 3)) type_ |= tRAO_INT;
if (eax & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) {
if (eax & (1U << 5)) type_ |= tAVX512_BF16;
}
if (eax & (1U << 7)) type_ |= tCMPCCXADD;
if (eax & (1U << 21)) type_ |= tAMX_FP16;
if (eax & (1U << 23)) type_ |= tAVX_IFMA;
if (eax & (1U << 31)) type_ |= tMOVRS;
if (edx & (1U << 4)) type_ |= tAVX_VNNI_INT8;
if (edx & (1U << 5)) type_ |= tAVX_NE_CONVERT;
if (edx & (1U << 8)) type_ |= tAMX_COMPLEX;
if (edx & (1U << 10)) type_ |= tAVX_VNNI_INT16;
if (edx & (1U << 14)) type_ |= tPREFETCHITI;
if (edx & (1U << 19)) type_ |= tAVX10;
if (edx & (1U << 21)) type_ |= tAPX_F;
getCpuidEx(0x1e, 1, data);
if (eax & (1U << 4)) type_ |= tAMX_FP8;
if (eax & (1U << 5)) type_ |= tAMX_TRANSPOSE;
if (eax & (1U << 6)) type_ |= tAMX_TF32;
if (eax & (1U << 7)) type_ |= tAMX_AVX512;
if (eax & (1U << 8)) type_ |= tAMX_MOVRS;
}
}
if (maxNum >= 0x19) {
getCpuidEx(0x19, 0, data);
if (ebx & (1U << 0)) type_ |= tAESKLE;
if (ebx & (1U << 2)) type_ |= tWIDE_KL;
if (type_ & (tKEYLOCKER|tAESKLE|tWIDE_KL)) type_ |= tKEYLOCKER_WIDE;
}
if (has(tAVX10) && maxNum >= 0x24) {
getCpuidEx(0x24, 0, data);
avx10version_ = ebx & mask(7);
}
setFamily();
setNumCores();
setCacheHierarchy();
}
// Print the family/model/stepping fields to stdout.
// Does nothing when XBYAK_ONLY_CLASS_CPU is defined.
void putFamily() const
{
#ifdef XBYAK_ONLY_CLASS_CPU
	// <stdio.h> is not included in this configuration
#else
	printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
		family, model, stepping, extFamily, extModel);
	printf("display:family=%X, model=%X\n", displayFamily, displayModel);
#endif
}
bool has(const Type& type) const
{
return (type & type_) == type;
}
int getAVX10version() const { return avx10version_; }
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#ifndef XBYAK_ONLY_CLASS_CPU
#if XBYAK_CPU_CACHE == 1
// Kind of core a logical CPU belongs to.
enum CoreType {
	Unknown,
	Performance, // P-core (Intel)
	Efficient, // E-core (Intel)
	Standard // Non-hybrid
};
// Printable name of a CoreType value; anything else yields "Unknown".
inline const char *getCoreTypeStr(int coreType)
{
	if (coreType == Performance) return "P-core";
	if (coreType == Efficient) return "E-core";
	if (coreType == Standard) return "Standard";
	return "Unknown";
}
// Cache kinds; the values below CACHE_TYPE_NUM are also used as array indices.
enum CacheType {
	L1i,
	L1d,
	L2,
	L3,
	CACHE_UNKNOWN,
	CACHE_TYPE_NUM = CACHE_UNKNOWN
};
// Printable name of a CacheType value; anything outside L1i..L3 yields "Unknown".
inline const char* getCacheTypeStr(int type)
{
	// indexed by CacheType
	static const char *const names[CACHE_TYPE_NUM] = { "L1i", "L1d", "L2", "L3" };
	if (type < 0 || type >= CACHE_TYPE_NUM) return "Unknown";
	return names[type];
}
namespace impl {
// Append the decimal representation of v to s.
inline void appendStr(std::string& s, uint32_t v)
{
#if __cplusplus < 201103L
	// std::to_string is not available before C++11
	char digits[16];
	snprintf(digits, sizeof(digits), "%u", v);
	s.append(digits);
#else
	s.append(std::to_string(v));
#endif
}
// Parse an unsigned decimal number starting at p.
// *endp receives the position after the number.
// Returns false if no digits were consumed or the value does not fit in 32 bits.
inline bool parseU32(uint32_t *v, const char *p, char **endp)
{
	const unsigned long raw = strtoul(p, endp, 10);
	if (*endp == p) return false;
	// strtoul() accepts a sign and, where unsigned long is 64-bit, values above 2^32 - 1;
	// a plain cast would silently wrap such input into a valid-looking index
	if (raw > 0xFFFFFFFFul) return false;
	*v = uint32_t(raw);
	return true;
}
// str = "(int|range)[,(int|range)]*"
// range = int-int
// e.g. "1,3,5", "0-3,5-7", ""
// Each item is passed to x.append(v) or x.appendRange(first, last).
// Returns false on a syntax error, on a number that does not fit in uint32_t
// or when x rejects an item; x keeps the items accepted before the failure.
template<class T>
bool setStr(T& x, const char *str)
{
	const char *p = str;
	while (*p) {
		if (p != str) {
			if (*p != ',') return false;
			p++;
		}
		char *endp;
		uint32_t v;
		if (!parseU32(&v, p, &endp)) return false;
		if (*endp == '-') {
			const char *rangeStart = endp + 1;
			uint32_t next;
			if (!parseU32(&next, rangeStart, &endp)) return false;
			if (!x.appendRange(v, next)) return false;
		} else {
			if (!x.append(v)) return false;
		}
		if (*endp == '\0') return true;
		p = endp;
	}
	return true;
}
} // impl
#ifndef XBYAK_CPUMASK_N
#define XBYAK_CPUMASK_N 6
#endif
#ifndef XBYAK_CPUMASK_BITN
#define XBYAK_CPUMASK_BITN 10 // max number of logical cpu = 1024
#endif
#if XBYAK_CPUMASK_COMPACT == 1
/*
	Set of logical CPU indices packed into 64 bits.
	a_ is treated as an array of N elements, each being bitN bits
	a_ = 1<<bitN and n_ = 0 and range_ = 0 means empty set
	n_ is the index of the last used element of a_[]
	When range_ is false (discrete values):
		a_[0..n_] hold the values in strictly increasing order
	When range_ is true (intervals):
		a_[] holds pairs; for the i-th pair
		v = a_[i*2] is the start of the interval
		n = a_[i*2+1] is the interval length - 1
		Represents the interval [v, v+n]
	Max cpu index = 2**bitN - 1
	Max number of discrete values = N
	Max number of intervals = N/2
	Comparison operators compare the packed representation, not the mathematical set.
*/
class CpuMask {
	static const uint32_t N = XBYAK_CPUMASK_N;
	static const uint32_t bitN = XBYAK_CPUMASK_BITN;
	static const uint64_t mask = (uint64_t(1) << bitN) - 1;
	// value of a_ marking the empty set; built in 64 bits so that it does not depend on the width of int
	static const uint64_t emptyMark = uint64_t(1) << bitN;
	uint64_t a_:N*bitN;
	uint64_t n_:3;
	uint64_t range_:1;
	// Set a_[idx] = v
	void set_a(size_t idx, uint32_t v)
	{
		assert(idx < N);
		assert(v <= mask);
		a_ &= ~(mask << (idx*bitN));
		a_ |= (v & mask) << (idx*bitN);
	}
	// Get a_[idx]
	uint32_t get_a(size_t idx) const
	{
		assert(idx < N);
		return (a_ >> (idx*bitN)) & mask;
	}
#ifndef NDEBUG
	// Return true if the idx-th value exists
	bool hasNext(uint32_t idx) const
	{
		if (empty()) return false;
		if (!range_) return idx <= n_;
		uint32_t n = 0;
		for (uint32_t i = 1; i <= n_; i += 2) {
			n += get_a(i) + 1;
			if (idx < n) return true;
		}
		return false;
	}
#endif
public:
	CpuMask() { clear(); }
	// Forward iterator over the stored values; it keeps a reference to the mask it was created from.
	class ConstIterator {
		const CpuMask& parent_;
		uint32_t idx_;
		uint32_t size_;
		friend class CpuMask;
	public:
		ConstIterator(const CpuMask& parent)
			: parent_(parent), idx_(0), size_(uint32_t(parent.size())) {}
		uint32_t operator*() const { return parent_.get(idx_); }
		ConstIterator& operator++() { idx_++; return *this; }
		// only the position is compared
		bool operator==(const ConstIterator& rhs) const { return idx_ == rhs.idx_; }
		bool operator!=(const ConstIterator& rhs) const { return !operator==(rhs); }
	};
	ConstIterator begin() const { return ConstIterator(*this); }
	ConstIterator end() const {
		ConstIterator it(*this);
		it.idx_ = uint32_t(size());
		return it;
	}
	typedef ConstIterator iterator;
	typedef ConstIterator const_iterator;
	void clear() { a_ = emptyMark; n_ = 0; range_ = 0; }
	bool empty() const
	{
		return a_ == emptyMark && n_ == 0 && range_ == 0;
	}
	// packed representation: a_ | n_ | range_ from low to high bits
	uint64_t to_u64() const { return a_ | (uint64_t(n_) << (N * bitN)) | (uint64_t(range_) << (N * bitN + 3)); }
	bool operator<(const CpuMask& rhs) const { return to_u64() < rhs.to_u64(); }
	bool operator>(const CpuMask& rhs) const { return to_u64() > rhs.to_u64(); }
	bool operator>=(const CpuMask& rhs) const { return !operator<(rhs); }
	bool operator<=(const CpuMask& rhs) const { return !operator>(rhs); }
	bool operator==(const CpuMask& rhs) const { return to_u64() == rhs.to_u64(); }
	bool operator!=(const CpuMask& rhs) const { return !operator==(rhs); }
	// Add element v
	// v should be monotonically increasing
	// On failure (v too large, not increasing, or no room left) the error is
	// reported through XBYAK_THROW_RET(ERR_INVALID_CPUMASK_INDEX, false).
	bool append(uint32_t v)
	{
		uint32_t prev = 0, n = 0;
		if (v > mask) goto ERR;
		// When adding for the first time, treat as discrete value
		if (empty()) {
			a_ = v;
			n_ = 0;
			return true;
		}
		if (!range_) {
			prev = get_a(n_);
			if (v <= prev) goto ERR;
			// If there's one discrete value and it forms an interval with the new value, switch to interval mode
			if (n_ == 0 && prev + 1 == v) {
				set_a(1, 1);
				range_ = 1;
				n_ = 1;
				return true;
			}
			if (n_ >= N - 1) goto ERR;
			// Add discrete value
			n_++;
			set_a(n_, v);
			return true;
		}
		// If the value to add is 1 greater than the end of the current interval
		n = get_a(n_);
		prev = get_a(n_ - 1) + n;
		if (prev >= v) goto ERR;
		if (prev + 1 == v) {
			// Increase the interval length by one
			set_a(n_, n + 1);
			return true;
		} else {
			if (n_ >= N - 1) goto ERR;
			// If not continuous with the previous interval
			// Add a new interval [v]; its length slot a_[n_ + 2] is still 0
			set_a(n_ + 1, v);
			n_ += 2;
			return true;
		}
	ERR:
		XBYAK_THROW_RET(ERR_INVALID_CPUMASK_INDEX, false)
	}
	// add range [a, b] which means a, a+1, ..., b
	// a must be greater than every value already stored (same rule as append()).
	// A range that directly follows the last interval extends that interval.
	// Returns false, leaving the mask unchanged, if the range is invalid, is not
	// increasing, does not fit, or several discrete values are already stored.
	bool appendRange(uint32_t a, uint32_t b)
	{
		if (a > b || b > mask) return false;
		if (empty()) {
			a_ = 0; // drop the empty-set marker
			set_a(0, a);
			set_a(1, b - a);
			range_ = 1;
			n_ = 1;
			return true;
		}
		uint32_t last = 1; // index of the length slot of the last interval
		if (range_) {
			last = uint32_t(n_);
		} else if (n_ != 0) {
			// discrete mode with several values cannot hold an interval as well
			return false;
		}
		// A single discrete value v is handled as the interval [v, v]:
		// a_[1] is 0 in that state because append() assigns the whole of a_.
		const uint32_t lastLen = get_a(last);
		const uint32_t prevEnd = get_a(last - 1) + lastLen;
		if (a <= prevEnd) return false;
		if (a == prevEnd + 1) {
			// contiguous with the last interval: just make it longer
			set_a(last, lastLen + (b - prevEnd));
		} else {
			if (last + 2 >= N) return false; // no room for another pair
			set_a(last + 1, a);
			set_a(last + 2, b - a);
			last += 2;
		}
		range_ = 1;
		n_ = last;
		return true;
	}
	// str = "(int|range)[,(int|range)]*"
	// range = int-int
	bool setStr(const char *str)
	{
		return impl::setStr(*this, str);
	}
	bool setStr(const std::string& str) { return setStr(str.c_str()); }
	// Inverse of setStr(): "" for the empty set, otherwise values / "first-last" items joined by ','
	std::string getStr() const
	{
		std::string s;
		if (empty()) return s;
		if (!range_) {
			for (uint32_t i = 0; i <= n_; i++) {
				if (!s.empty()) s += ",";
				impl::appendStr(s, get_a(i));
			}
			return s;
		}
		for (uint32_t i = 0; i <= n_; i += 2) {
			uint32_t v = get_a(i);
			uint32_t len = get_a(i + 1);
			if (!s.empty()) s += ",";
			impl::appendStr(s, v);
			if (len > 0) {
				s += "-";
				impl::appendStr(s, v + len);
			}
		}
		return s;
	}
	// Number of values in the set
	size_t size() const
	{
		if (empty()) return 0;
		if (!range_) return n_ + 1;
		size_t n = 0;
		for (uint32_t i = 1; i <= n_; i += 2) {
			n += get_a(i) + 1;
		}
		return n;
	}
	// idx-th value (0-based, in increasing order); idx must be less than size()
	uint32_t get(uint32_t idx) const
	{
		assert(hasNext(idx));
		if (!range_) return get_a(idx);
		uint32_t n = 0;
		for (uint32_t i = 1; i <= n_; i += 2) {
			uint32_t range = get_a(i) + 1;
			if (idx < n + range) {
				return get_a(i - 1) + (idx - n);
			}
			n += range;
		}
		// only reached when idx is out of range
		return 0;
	}
	// Print the raw fields (for debugging)
	void dump() const
	{
		printf("a_:");
		for (int i = int(N) - 1; i >= 0; i--) {
			printf("%u ", uint32_t((a_ >> (i * bitN)) & mask));
		}
		printf("\n");
		printf("n_: %u\n", (uint32_t)n_);
		printf("range_: %u\n", (uint32_t)range_);
	}
	// Print "label: " (if given) followed by getStr() and a newline
	void put(const char *label = NULL) const
	{
		if (label) printf("%s: ", label);
		printf("%s\n", getStr().c_str());
	}
};
#else
// Set of logical CPU indices backed by std::set (used when XBYAK_CPUMASK_COMPACT is not 1).
// Valid indices are 0 .. (1 << XBYAK_CPUMASK_BITN) - 1.
class CpuMask {
	typedef std::set<uint32_t> IntSet;
	IntSet indices_;
public:
	CpuMask() : indices_() {}
	typedef IntSet::const_iterator const_iterator;
	typedef const_iterator iterator;
	const_iterator begin() const { return indices_.begin(); }
	const_iterator end() const { return indices_.end(); }
	void clear() { indices_.clear(); }
	bool empty() const { return indices_.empty(); }
	bool operator<(const CpuMask& rhs) const { return indices_ < rhs.indices_; }
	bool operator>(const CpuMask& rhs) const { return indices_ > rhs.indices_; }
	bool operator>=(const CpuMask& rhs) const { return !operator<(rhs); }
	bool operator<=(const CpuMask& rhs) const { return !operator>(rhs); }
	bool operator==(const CpuMask& rhs) const { return indices_ == rhs.indices_; }
	bool operator!=(const CpuMask& rhs) const { return !operator==(rhs); }
	// idx should be monotonically increasing
	// Returns false if idx is too large or not greater than the current maximum.
	bool append(uint32_t idx)
	{
		if (idx >= (1u << XBYAK_CPUMASK_BITN)) return false;
		if (!indices_.empty() && *indices_.rbegin() >= idx) return false;
		indices_.insert(idx);
		return true;
	}
	// add range [a, b] which means a, a+1, ..., b
	// The whole range is validated first, so a rejected call leaves the set untouched.
	bool appendRange(uint32_t a, uint32_t b)
	{
		if (a > b) return false;
		if (b >= (1u << XBYAK_CPUMASK_BITN)) return false;
		if (!indices_.empty() && *indices_.rbegin() >= a) return false;
		for (uint32_t v = a; v <= b; v++) {
			indices_.insert(v);
		}
		return true;
	}
	// str = "(int|range)[,(int|range)]*", range = int-int
	bool setStr(const char *str)
	{
		return impl::setStr(*this, str);
	}
	bool setStr(const std::string& str) { return setStr(str.c_str()); }
	// Inverse of setStr(): runs of consecutive values are written as "first-last"
	std::string getStr() const
	{
		std::string s;
		bool inRange = false;
		// start value for which prev + 1 can never equal a valid index
		uint32_t prev = 0x80000000;
		for (const_iterator i = indices_.begin(); i != indices_.end(); ++i) {
			uint32_t v = *i;
			if (inRange) {
				if (prev + 1 != v) {
					// the run ended at prev
					impl::appendStr(s, prev);
					inRange = false;
					s += ',';
					impl::appendStr(s, v);
				}
			} else {
				if (prev + 1 == v) {
					// start range
					s += '-';
					inRange = true;
				} else {
					if (!s.empty()) s += ',';
					impl::appendStr(s, v);
				}
			}
			prev = v;
		}
		if (inRange) {
			impl::appendStr(s, prev);
		}
		return s;
	}
	size_t size() const { return indices_.size(); }
	// idx-th value (0-based, in increasing order); idx must be less than size()
	uint32_t get(uint32_t idx) const
	{
		assert(idx < size());
		const_iterator it = indices_.begin();
		std::advance(it, idx);
		return *it;
	}
	// Print "label: " (if given) followed by getStr() and a newline
	void put(const char *label = NULL) const
	{
		if (label) printf("%s: ", label);
		printf("%s\n", getStr().c_str());
	}
};
#endif
// Description of one cache as seen from one logical CPU.
class CpuCache {
public:
    CpuCache()
        : size(0)
        , associativity(0)
    {
    }
    uint32_t size;             // cache size in bytes
    uint32_t associativity;    // number of ways of associativity
    CpuMask sharedCpuIndices;  // logical CPU indices sharing this cache

    // Number of logical CPUs sharing this cache
    size_t getSharedCpuNum() const { return sharedCpuIndices.size(); }
    // true when more than one logical CPU uses this cache
    bool isShared() const { return getSharedCpuNum() > 1; }
    // Print "<label>: " (if given), the size in KiB, the associativity and the sharing mask
    void put(const char *label = NULL) const
    {
        if (label != NULL) {
            printf("%s: ", label);
        }
        const uint32_t kib = size / 1024;
        printf("%u KiB, assoc. %u, shared ", kib, associativity);
        sharedCpuIndices.put();
    }
};
// Per-logical-CPU record: owning physical core, core type and one CpuCache per cache type.
struct LogicalCpu {
    LogicalCpu() : coreId(0), coreType(Unknown), cache() {}

    uint32_t coreId;    // index of physical core
    CoreType coreType;  // for hybrid systems
    CpuCache cache[CACHE_TYPE_NUM];

    // Siblings are taken to be the logical CPUs sharing this CPU's L1i cache
    const CpuMask& getSiblings() const
    {
        const CpuCache& l1i = cache[L1i];
        return l1i.sharedCpuIndices;
    }
    // Print the core id/type line followed by one line per cache type
    void put(const char *label = NULL) const
    {
        if (label != NULL) {
            printf("%s: ", label);
        }
        printf("coreId %u, type %s\n", coreId, getCoreTypeStr(coreType));
        for (int type = 0; type < CACHE_TYPE_NUM; ++type) {
            const char *typeName = getCacheTypeStr(type);
            cache[type].put(typeName);
        }
    }
};
// Snapshot of the machine's CPU topology, filled once at construction by
// impl::initCpuTopology(). Accessors do not range-check cpuIdx.
class CpuTopology {
public:
    // Signals ERR_CANT_INIT_CPUTOPOLOGY through XBYAK_THROW when detection fails
    explicit CpuTopology(const Cpu& cpu)
        : logicalCpus_()
        , physicalCoreNum_(0)
        , lineSize_(0)
        , isHybrid_(cpu.has(cpu.tHYBRID))
    {
        const bool ok = impl::initCpuTopology(*this);
        if (!ok) {
            XBYAK_THROW(ERR_CANT_INIT_CPUTOPOLOGY);
        }
    }
    // Whether this is a hybrid system
    bool isHybrid() const { return isHybrid_; }
    // Cache line size in bytes
    uint32_t getLineSize() const { return lineSize_; }
    // Number of physical cores
    size_t getPhysicalCoreNum() const { return physicalCoreNum_; }
    // Number of logical CPUs
    size_t getLogicalCpuNum() const { return logicalCpus_.size(); }
    // Information about one logical CPU
    const LogicalCpu& getLogicalCpu(size_t cpuIdx) const { return logicalCpus_[cpuIdx]; }
    // Cache of the given type as seen from one logical CPU
    const CpuCache& getCache(size_t cpuIdx, CacheType type) const
    {
        const LogicalCpu& cpu = logicalCpus_[cpuIdx];
        return cpu.cache[type];
    }
private:
    friend bool impl::initCpuTopology(CpuTopology&);
    std::vector<LogicalCpu> logicalCpus_;
    size_t physicalCoreNum_;
    uint32_t lineSize_;
    bool isHybrid_;
};
namespace impl {
// Count the bits set in mask, using a compiler intrinsic when one is available.
inline uint32_t popcnt(uint64_t mask)
{
#if defined(_M_X64) || defined(_M_AMD64)
    return (int)__popcnt64(mask);
#elif defined(__GNUC__) || defined(__clang__)
    const int bits = __builtin_popcountll(mask);
    return bits;
#else
    // portable fallback: clear the lowest set bit until none remain
    uint32_t bits = 0;
    for (; mask != 0; mask &= mask - 1) {
        bits++;
    }
    return bits;
#endif
}
// Fallback core-type detection through CPUID leaf 0x1A (sub-leaf 0):
// bits 31..24 of EAX are 0x40 for a P-core and 0x20 for an E-core.
// Any other value is reported as Standard.
inline CoreType getCoreType()
{
    uint32_t regs[4] = {};
    Cpu::getCpuidEx(0x1A, 0, regs);
    switch ((regs[0] >> 24) & 0xFF) {
    case 0x40: return Performance; // P-core
    case 0x20: return Efficient; // E-core
    default: return Standard;
    }
}
#ifdef _WIN32
typedef std::vector<uint32_t> U32Vec;
#if (defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x06010000) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0601)
#define XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY 1
#else
#define XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY 0
#endif
#if (defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x0A000000) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0A00)
#define XBYAK_WINSDK_HAS_EFFICIENCY_CLASS 1
#else
#define XBYAK_WINSDK_HAS_EFFICIENCY_CLASS 0
#endif
// GroupMasks[] / GroupCount on CACHE_RELATIONSHIP added in Win10 20H1 (SDK 10.0.19041, NTDDI_WIN10_VB)
// NOTE: _WIN32_WINNT has no sub-version granularity for Win10, so only
// NTDDI_VERSION can distinguish 20H1 (0x0A00000C) from earlier Win10 builds.
// If NTDDI_VERSION is not set, this macro will be 0 (safe/conservative fallback).
#if defined(NTDDI_VERSION) && NTDDI_VERSION >= 0x0A00000C
#define XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS 1
#else
#define XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS 0
#endif
#if XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY
typedef SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX ProcInfo;
// Temporarily move the current thread to `affinity`, query the core type
// there via CPUID, then restore the previous group affinity.
// Returns Standard when the affinity cannot be changed.
inline CoreType getCoreTypeForAffinity(const GROUP_AFFINITY& affinity)
{
    GROUP_AFFINITY saved = {};
    const bool moved = SetThreadGroupAffinity(GetCurrentThread(), &affinity, &saved) != 0;
    if (moved) {
        const CoreType detected = impl::getCoreType();
        SetThreadGroupAffinity(GetCurrentThread(), &saved, NULL);
        return detected;
    }
    return Standard;
}
// Fill v[g] with the number of active logical CPUs in groups 0..g-1
// (i.e. the global index of the first CPU of processor group g).
// return total logical cpus if successful, 0 if failed
inline uint32_t getGroupAcc(U32Vec& v)
{
    // first call only reports the required buffer size
    DWORD bytes = 0;
    GetLogicalProcessorInformationEx(RelationGroup, NULL, &bytes);
    std::vector<char> storage(bytes);
    ProcInfo *info = reinterpret_cast<ProcInfo*>(storage.data());
    if (!GetLogicalProcessorInformationEx(RelationGroup, info, &bytes)) {
        return 0;
    }
    const GROUP_RELATIONSHIP& groups = info->Group;
    const uint32_t groupNum = groups.ActiveGroupCount;
    if (groupNum == 0) return 0;
    v.resize(groupNum);
    uint32_t total = 0;
    for (uint32_t g = 0; g < groupNum; g++) {
        v[g] = total;
        total += groups.GroupInfo[g].ActiveProcessorCount;
    }
    return total;
}
// return number of physical cores if successful, 0 if failed
static inline uint32_t getCores(std::vector<LogicalCpu>& cpus, bool isHybrid, const U32Vec& groupAcc) {
DWORD len = 0;
GetLogicalProcessorInformationEx(RelationProcessorCore, NULL, &len);
std::vector<char> buf(len);
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<ProcInfo*>(buf.data()), &len)) return 0;
// get core indices
const char *p = buf.data();
const char *end = p + len;
uint32_t coreIdx = 0;
while (p < end) {
const auto& entry = *reinterpret_cast<const ProcInfo*>(p);
if (entry.Relationship == RelationProcessorCore) {
const PROCESSOR_RELATIONSHIP& core = entry.Processor;
LogicalCpu cpu;
cpu.coreId = coreIdx++;
if (!isHybrid) {
cpu.coreType = Standard;
} else {
#if XBYAK_WINSDK_HAS_EFFICIENCY_CLASS
cpu.coreType = core.EfficiencyClass > 0 ? Performance : Efficient;
#else
cpu.coreType = getCoreTypeForAffinity(core.GroupMask[0]);
#endif
}
const GROUP_AFFINITY* masks = core.GroupMask;
for (WORD i = 0; i < core.GroupCount; i++) {
const WORD group = masks[i].Group;
const KAFFINITY m = masks[i].Mask;
const uint32_t base = groupAcc[group];
for (uint32_t b = 0; b < sizeof(KAFFINITY) * 8; b++) {
if (m & (KAFFINITY(1) << b)) {
const uint32_t idx = base + b;
if (idx >= cpus.size()) return 0;
cpus[idx] = cpu;
}
}
}
}
p += entry.Size;
}
return coreIdx;
}
// Translate the group affinity mask(s) of a cache entry into global logical
// CPU indices and append them to mask.
// groupAcc[g] must be the global index of the first CPU of processor group g.
// Returns false if a group number is out of range or mask.append() rejects an index.
inline bool convertMask(CpuMask& mask, const U32Vec& groupAcc, const CACHE_RELATIONSHIP& cache)
{
#if XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS
    const WORD count = cache.GroupCount;
#else
    // older SDKs expose a single GroupMask only
    const WORD count = 1;
#endif
    for (WORD i = 0; i < count; i++) {
#if XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS
        const GROUP_AFFINITY& cg = cache.GroupMasks[i];
#else
        const GROUP_AFFINITY& cg = cache.GroupMask;
#endif
        // the group number comes from the OS; never index groupAcc out of range
        if (cg.Group >= groupAcc.size()) return false;
        const KAFFINITY m = cg.Mask;
        const uint32_t base = groupAcc[cg.Group];
        for (uint32_t b = 0; b < sizeof(KAFFINITY) * 8; b++) {
            if (m & (KAFFINITY(1) << b)) {
                if (!mask.append(base + b)) return false;
            }
        }
    }
    return true;
}
// Windows implementation: fill cpuTopo from GetLogicalProcessorInformationEx
// (processor groups, cores, then caches). Returns false on any failure.
inline bool initCpuTopology(CpuTopology& cpuTopo)
{
    U32Vec groupAcc;
    const uint32_t logicalCpuNum = getGroupAcc(groupAcc);
    if (logicalCpuNum == 0 || logicalCpuNum >= (1u << XBYAK_CPUMASK_BITN)) return false;

    std::vector<LogicalCpu>& cpus = cpuTopo.logicalCpus_;
    cpus.resize(logicalCpuNum);
    cpuTopo.physicalCoreNum_ = getCores(cpus, cpuTopo.isHybrid(), groupAcc);
    if (cpuTopo.physicalCoreNum_ == 0) return false;

    // first call only reports the required buffer size
    DWORD bytes = 0;
    GetLogicalProcessorInformationEx(RelationCache, NULL, &bytes);
    std::vector<char> storage(bytes);
    if (!GetLogicalProcessorInformationEx(RelationCache, reinterpret_cast<ProcInfo*>(storage.data()), &bytes)) return false;

    const char *cur = storage.data();
    const char *const last = cur + bytes;
    while (cur < last) {
        const ProcInfo& entry = *reinterpret_cast<const ProcInfo*>(cur);
        cur += entry.Size; // entries are variable-sized
        if (entry.Relationship != RelationCache) continue;

        const CACHE_RELATIONSHIP& cache = entry.Cache;
        uint32_t type = CACHE_UNKNOWN;
        switch (cache.Level) {
        case 1:
            if (cache.Type == CacheInstruction) {
                type = L1i;
            } else if (cache.Type == CacheData) {
                type = L1d;
            }
            break;
        case 2: type = L2; break;
        case 3: type = L3; break;
        default: break;
        }
        if (type == CACHE_UNKNOWN) continue;

        CpuMask mask;
        if (!convertMask(mask, groupAcc, cache)) return false;
        // copy the cache description to every logical CPU sharing it
        for (const auto& cpuIdx : mask) {
            if (cpuIdx >= cpus.size()) return false;
            CpuCache& dst = cpus[cpuIdx].cache[type];
            dst.size = cache.CacheSize;
            // line size is taken from the first cache that reaches this point
            if (cpuTopo.lineSize_ == 0) cpuTopo.lineSize_ = cache.LineSize;
            dst.associativity = cache.Associativity;
            dst.sharedCpuIndices = mask;
        }
    }
    return true;
}
#else
// Windows SDK without GROUP_AFFINITY relationship support: detection is unavailable.
inline bool initCpuTopology(CpuTopology& cpuTopo)
{
    static_cast<void>(cpuTopo); // unused
    return false;
}
#endif
// unset WinSDK version macros to avoid Macro pollution
#undef XBYAK_WINSDK_HAS_RELATIONSHIP_GROUP_AFFINITY
#undef XBYAK_WINSDK_HAS_EFFICIENCY_CLASS
#undef XBYAK_WINSDK_HAS_CACHE_RELATIONSHIP_GROUPMASKS
#elif defined(__linux__) // Linux
// Minimal owner of a FILE* opened for reading: the file is closed in the destructor.
// f is NULL when fopen() failed; callers must check it before use.
// Copying is disabled because a copy would close the same FILE* twice.
struct WrapFILE {
    FILE *f;
    explicit WrapFILE(const char *name)
        : f(fopen(name, "r"))
    {
    }
    ~WrapFILE() { if (f) fclose(f); }
private:
    // declared but not defined: non-copyable
    WrapFILE(const WrapFILE&);
    void operator=(const WrapFILE&);
};
// Read one unsigned decimal integer from the start of the file at path.
// Returns 0 when the file cannot be opened or does not start with a number.
inline uint32_t readIntFromFile(const char* path) {
    WrapFILE wf(path);
    uint32_t val = 0;
    if (wf.f && fscanf(wf.f, "%u", &val) == 1) {
        return val;
    }
    return 0;
}
// Read the first line of the file at path (at most 1023 characters), strip a
// trailing newline and hand it to setStr() to fill mask.
// Returns false when the file cannot be read or setStr() rejects the text.
inline bool parseCpuList(CpuMask& mask, const char* path) {
    WrapFILE wf(path);
    char line[1024];
    if (!wf.f || !fgets(line, sizeof(line), wf.f)) return false;
    const size_t len = strlen(line);
    if (len > 0 && line[len - 1] == '\n') {
        line[len - 1] = '\0';
    }
    return setStr(mask, line);
}
// Pin the calling thread to logical CPU `cpu` and query the core type there.
// The affinity is NOT restored here; the caller is expected to do that.
// Returns Standard when the affinity cannot be set.
inline CoreType setAffinityAndGetCoreType(uint32_t cpu)
{
    cpu_set_t only;
    CPU_ZERO(&only);
    CPU_SET(cpu, &only);
    const int rc = sched_setaffinity(0, sizeof(cpu_set_t), &only);
    return rc != 0 ? Standard : impl::getCoreType();
}
inline bool initCpuTopology(CpuTopology& cpuTopo)
{
const uint32_t logicalCpuNum = sysconf(_SC_NPROCESSORS_ONLN);
if (logicalCpuNum == 0) return false;
if (logicalCpuNum >= (1u << XBYAK_CPUMASK_BITN)) return false;
cpuTopo.logicalCpus_.resize(logicalCpuNum);
uint32_t maxPhisicalIdx = 0;
for (uint32_t cpuIdx = 0; cpuIdx < logicalCpuNum; cpuIdx++) {
char path[256];
LogicalCpu& logCpu = cpuTopo.logicalCpus_[cpuIdx];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/topology/core_id", cpuIdx);
logCpu.coreId = readIntFromFile(path);
maxPhisicalIdx = (std::max)(maxPhisicalIdx, logCpu.coreId);
logCpu.coreType = Standard;
for (uint32_t cacheIdx = 0; cacheIdx < CACHE_TYPE_NUM; cacheIdx++) {
CacheType cacheType = CACHE_UNKNOWN;
// Map cache index to cache type
{
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/cache/index%u/type", cpuIdx, cacheIdx);
char typeStr[32];
WrapFILE wf(path);
if (wf.f && fgets(typeStr, sizeof(typeStr), wf.f)) {
if (strncmp(typeStr, "Instruction", 11) == 0) {
cacheType = L1i;
} else if (strncmp(typeStr, "Data", 4) == 0) {
// Determine level
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/cache/index%u/level", cpuIdx, cacheIdx);
switch (readIntFromFile(path)) {
case 1: cacheType = L1d; break;
case 2: cacheType = L2; break;
case 3: cacheType = L3; break;
default: break;;
}
} else if (strncmp(typeStr, "Unified", 7) == 0) {
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/cache/index%u/level", cpuIdx, cacheIdx);
switch (readIntFromFile(path)) {
case 2: cacheType = L2; break;
case 3: cacheType = L3; break;
default: break;;
}
}
}
}
if (cacheType == CACHE_UNKNOWN) continue;
CpuCache& cache = logCpu.cache[cacheType];
// Read cache size
{
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/cache/index%u/size", cpuIdx, cacheIdx);
char sizeStr[32];
WrapFILE wf(path);
if (wf.f && fgets(sizeStr, sizeof(sizeStr), wf.f)) {
char *endp;
uint32_t size = (uint32_t)strtoul(sizeStr, &endp, 10);
switch (*endp) {
case '\0': case '\n': cache.size = size; break;
case 'K': case 'k': cache.size = size * 1024; break;
case 'M': case 'm': cache.size = size * 1024 * 1024; break;
default: break;
}
}
}
// Read ways of associativity
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/cache/index%u/ways_of_associativity", cpuIdx, cacheIdx);
cache.associativity = readIntFromFile(path);
// Read shared CPU list
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/cache/index%u/shared_cpu_list", cpuIdx, cacheIdx);
parseCpuList(cache.sharedCpuIndices, path);
}
}
// Assign core types for hybrid architectures
const bool isHybrid = cpuTopo.isHybrid();
if (isHybrid) {
// For hybrid systems, try toread P-core and E-core lists from sysfs first
CpuMask pCoreMask;
const bool hasPCoreSysfs = parseCpuList(pCoreMask, "/sys/devices/cpu_core/cpus");
if (hasPCoreSysfs) {
// Set Performance core types
for (CpuMask::const_iterator it = pCoreMask.begin(); it != pCoreMask.end(); ++it) {
uint32_t cpuIdx = *it;
if (cpuIdx < logicalCpuNum) {
cpuTopo.logicalCpus_[cpuIdx].coreType = Performance;
}
}
}
CpuMask eCoreMask;
const bool hasECoreSysfs = parseCpuList(eCoreMask, "/sys/devices/cpu_atom/cpus");
if (hasECoreSysfs) {
// Set Efficient core types
for (CpuMask::const_iterator it = eCoreMask.begin(); it != eCoreMask.end(); ++it) {
uint32_t cpuIdx = *it;
if (cpuIdx < logicalCpuNum) {
cpuTopo.logicalCpus_[cpuIdx].coreType = Efficient;
}
}
}
// Fallback: if either sysfs paths are unavailable, detect both core type per-CPU
if (!hasPCoreSysfs || !hasECoreSysfs) {
cpu_set_t originalMask;
CPU_ZERO(&originalMask);
if (sched_getaffinity(0, sizeof(cpu_set_t), &originalMask) == 0) {
for (uint32_t cpu = 0; cpu < logicalCpuNum; cpu++) {
cpuTopo.logicalCpus_[cpu].coreType = impl::setAffinityAndGetCoreType(cpu);
}
sched_setaffinity(0, sizeof(cpu_set_t), &originalMask);
}
}
}
// Read coherency line size
cpuTopo.lineSize_ = readIntFromFile("/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size");
cpuTopo.physicalCoreNum_ = maxPhisicalIdx + 1;
return true;
}
#else // Other OS (e.g., macOS)
// Neither Windows nor Linux: CPU topology detection not yet implemented.
inline bool initCpuTopology(CpuTopology& cpuTopo)
{
    static_cast<void>(cpuTopo); // unused
    return false;
}
#endif // _WIN32 / __linux__ / other OS
} // namespace impl
#endif // XBYAK_CPU_CACHE
// Accumulating cycle counter: wrap code in begin()/end() pairs, then read
// the summed rdtsc delta with getClock() and the pair count with getCount().
class Clock {
public:
    // Raw time-stamp counter; always 0 when XBYAK_INTEL_CPU_SPECIFIC is not defined.
    static inline uint64_t getRdtsc()
    {
#ifdef XBYAK_INTEL_CPU_SPECIFIC
    #ifdef _MSC_VER
        return __rdtsc();
    #else
        uint32_t eax, edx;
        __asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
        const uint64_t high = (uint64_t)edx << 32;
        return high | eax;
    #endif
#else
        // TODO: Need another impl of Clock or rdtsc-equivalent for non-x86 cpu
        return 0;
#endif
    }
    Clock() : clock_(0), count_(0) {}
    // begin() subtracts the start stamp; end() adds the stop stamp, so the
    // unsigned sum ends up holding the elapsed total
    void begin() { clock_ -= getRdtsc(); }
    void end()
    {
        clock_ += getRdtsc();
        ++count_;
    }
    uint64_t getClock() const { return clock_; }
    int getCount() const { return count_; }
    void clear()
    {
        clock_ = 0;
        count_ = 0;
    }
private:
    uint64_t clock_;
    int count_;
};
#ifdef XBYAK64
class Pack {
static const size_t maxTblNum = 15;
Xbyak::Reg64 tbl_[maxTblNum];
size_t n_;
public:
Pack() : tbl_(), n_(0) {}
Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); }
Pack(const Pack& rhs)
: n_(rhs.n_)
{
for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
}
Pack& operator=(const Pack& rhs)
{
n_ = rhs.n_;
for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
return *this;
}
Pack(const Xbyak::Reg64& t0)
{ n_ = 1; tbl_[0] = t0; }
Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 2; tbl_[0] = t0; tbl_[1] = t1; }
Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 3; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; }
Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 4; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; }
Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 5; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; }
Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 6; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; }
Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 7; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; }
Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 8; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; }
Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 9; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; }
Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 10; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; }
Pack(const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 11; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; }
Pack(const Xbyak::Reg64& tb, const Xbyak::Reg64& ta, const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
{ n_ = 12; tbl_[0] = t0; tbl_[1] = t1; tbl_[2] = t2; tbl_[3] = t3; tbl_[4] = t4; tbl_[5] = t5; tbl_[6] = t6; tbl_[7] = t7; tbl_[8] = t8; tbl_[9] = t9; tbl_[10] = ta; tbl_[11] = tb; }
Pack& append(const Xbyak::Reg64& t)
{
if (n_ == maxTblNum) {
fprintf(stderr, "ERR Pack::can't append\n");
XBYAK_THROW_RET(ERR_BAD_PARAMETER, *this)
}
tbl_[n_++] = t;
return *this;
}
void init(const Xbyak::Reg64 *tbl, size_t n)
{
if (n > maxTblNum) {
fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
XBYAK_THROW(ERR_BAD_PARAMETER)
}
n_ = n;
for (size_t i = 0; i < n; i++) {
tbl_[i] = tbl[i];
}
}
const Xbyak::Reg64& operator[](size_t n) const
{
if (n >= n_) {
fprintf(stderr, "ERR Pack bad n=%d(%d)\n", (int)n, (int)n_);
XBYAK_THROW_RET(ERR_BAD_PARAMETER, rax)
}
return tbl_[n];
}
size_t size() const { return n_; }
/*
get tbl[pos, pos + num)
*/
Pack sub(size_t pos, size_t num = size_t(-1)) const
{
if (num == size_t(-1)) num = n_ - pos;
if (pos + num > n_) {
fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
XBYAK_THROW_RET(ERR_BAD_PARAMETER, Pack())
}
Pack pack;
pack.n_ = num;
for (size_t i = 0; i < num; i++) {
pack.tbl_[i] = tbl_[pos + i];
}
return pack;
}
void put() const
{
for (size_t i = 0; i < n_; i++) {
printf("%s ", tbl_[i].toString());
}
printf("\n");
}
};
// Flags that can be OR-ed into the tNum argument of StackFrame to reserve a
// specific register for explicit use by the generated code.
// start from a bit position larger than the number of GPRs
const int UseRBP = 1 << 5;
const int UseRCX = 1 << 6;
const int UseRDX = 1 << 7;
const int UseRSI = 1 << 8;
const int UseRDI = 1 << 9;
// UseRBP plus an extra bit: StackFrame additionally emits `mov rbp, rsp`
// right after pushing rbp when both bits are set.
const int UseRBPAsFramePointer = UseRBP | (1 << 10);
// Emits a function prologue on construction (and, by default, the matching
// epilogue on destruction) and hands out registers for parameters (p) and
// temporaries (t) according to the Windows or System V x64 register tables below.
class StackFrame {
// number of leading entries of getRegEntryTbl() that are never pushed by this class
#ifdef XBYAK64_WIN
static const int noSaveNum = 6;
#else
static const int noSaveNum = 8;
#endif
static const int maxPnum = 4;
static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
static const int calleeSaveNum = maxRegNum - noSaveNum;
static const int UseMASK = UseRCX|UseRDX|UseRSI|UseRDI|UseRBP;
Xbyak::CodeGenerator *code_;
Xbyak::Reg64 pTbl_[maxPnum];
Xbyak::Reg64 tTbl_[maxRegNum];
Pack p_;
Pack t_;
int pNum_;
int tNum_; // number of temporaries with all Use* flag bits removed
int useRegs_; // Use* flag bits taken from tNum (UseMASK only)
int saveNum_; // number of valid entries in saveRegs_
int saveRegs_[calleeSaveNum]; // registers pushed by the prologue, in push order
int P_; // bytes subtracted from rsp for the local area (0 if none)
bool makeEpilog_;
// non-copyable (declared, not defined)
StackFrame(const StackFrame&);
void operator=(const StackFrame&);
public:
const Pack& p;
const Pack& t;
/*
make stack frame
@param sf [in] this
@param pNum [in] number of function parameters(0 <= pNum <= 4)
@param tNum [in] number of temporary registers(0 <= tNum, can be OR-ed with Use{RCX,RDX,RSI,RDI,RBP}, e.g., 3|UseRCX)
@param stackSizeByte [in] local stack size
@param makeEpilog [in] automatically call close() if true
pNum + tNum + #Use must be <= 14
you can use
rax
p[0], ..., p[pNum-1] as function parameters
t[0], ..., t[tNum-1] as temporary registers
{rcx,rdx,rsi,rdi,rbp} are explicitly available by specifying Use{RCX,RDX,RSI,RDI,RBP} in tNum
rsp[0..stackSizeByte-1] if stackSizeByte > 0
*/
StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
: code_(code)
, pNum_(pNum)
, tNum_(tNum & ~(UseMASK|UseRBPAsFramePointer))
, useRegs_(tNum & UseMASK) // drop UseRBPAsFramePointer bit
, saveNum_(0)
, P_(0)
, makeEpilog_(makeEpilog)
, p(p_)
, t(t_)
{
if (pNum < 0 || pNum > 4) XBYAK_THROW(ERR_BAD_PNUM)
if (tNum < 0) XBYAK_THROW(ERR_BAD_TNUM)
const int *const fullTbl = getRegEntryTbl();
const int *const calleeTbl = fullTbl + noSaveNum;
// count the Use* registers, split by whether they lie in the no-save part of the table
int callerUseNum = 0;
int calleeUseNum = 0;
for (int i = 0; i < maxRegNum; i++) {
if (useRegs_ & useFlagOf(fullTbl[i])) {
if (i < noSaveNum) {
callerUseNum++;
} else {
calleeUseNum++;
}
}
}
const int useNum = callerUseNum + calleeUseNum;
if (pNum + tNum_ + useNum > maxRegNum) XBYAK_THROW(ERR_BAD_TNUM)
// number of registers needed beyond the no-save ones
const int baseSaveNum = local::max_(0, pNum + tNum_ + useNum - noSaveNum);
bool pushedRbp = false;
// rbp is pushed first so that `mov rbp, rsp` can follow immediately when requested
if (useRegs_ & UseRBP) {
code->push(rbp);
saveRegs_[saveNum_++] = Operand::RBP;
pushedRbp = true;
if ((tNum & UseRBPAsFramePointer) == UseRBPAsFramePointer) code->mov(rbp, rsp);
}
// push the remaining registers of the saved part of the table that will be handed out or are Use*-requested
for (int i = 0; i < calleeSaveNum; i++) {
int r = calleeTbl[i];
if (i < baseSaveNum || isUseReg(r)) {
if (pushedRbp && r == Operand::RBP) continue;
saveRegs_[saveNum_++] = r;
code->push(Reg64(r));
}
}
// local area size in 8-byte units, rounded up
P_ = (stackSizeByte + 7) / 8;
// (rsp % 16) == 8, then increment P_ for 16 byte alignment
if (P_ > 0 && (P_ & 1) == (saveNum_ & 1)) P_++;
P_ *= 8;
if (P_ > 0) code->sub(rsp, P_);
// hand out registers in table order; getRegIdx() advances pos
int pos = 0;
for (int i = 0; i < pNum; i++) {
pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
}
for (int i = 0; i < tNum_; i++) {
tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
}
// replace reserved reg with backup reg if needed
// (copy an incoming parameter out of a Use*-requested register into its alternative)
for (size_t i = 0; i < maxPnum; i++) {
const RegSlot& rp = getRegSlotTbl()[i];
if (isUseReg(rp.target) && rp.pos < pNum && rp.alt >= 0) {
code->mov(Xbyak::Reg64(rp.alt), Xbyak::Reg64(rp.target));
}
}
p_.init(pTbl_, pNum);
t_.init(tTbl_, tNum_);
}
/*
make epilog manually
@param callRet [in] call ret() if true
*/
void close(bool callRet = true)
{
// undo the prologue: release the local area, then pop in reverse push order
if (P_ > 0) code_->add(code_->rsp, P_);
for (int i = saveNum_ - 1; i >= 0; i--) {
code_->pop(Reg64(saveRegs_[i]));
}
if (callRet) code_->ret();
}
// emits the epilogue (with ret) unless makeEpilog was false
~StackFrame()
{
if (!makeEpilog_) return;
close();
}
private:
// Use* flag corresponding to register index r, 0 if r has no such flag
static int useFlagOf(int r)
{
switch (r) {
case Operand::RCX: return UseRCX;
case Operand::RDX: return UseRDX;
case Operand::RSI: return UseRSI;
case Operand::RDI: return UseRDI;
case Operand::RBP: return UseRBP;
default: return 0;
}
}
// true if the caller requested register r through a Use* flag
bool isUseReg(int r) const { return (useRegs_ & useFlagOf(r)) != 0; }
// Register allocation for the first 4 function parameters
struct RegSlot {
int target;
int pos; // position of target in getRegEntryTbl()
int alt; // alternative if target is used for parameter. -1 means no alternative.
};
const RegSlot *getRegSlotTbl() const
{
// Win: p[] = rcx(r10), rdx(r11), r8, r9:
// Linux: p[] = rdi(r8), rsi(r9), rdx(r11), rcx(r10)
// reg(alt) means a reserved reg if Use<reg> is used.
static const RegSlot tbl[maxPnum] = {
#ifdef XBYAK64_WIN
{ Operand::RCX, 0, Operand::R10 },
{ Operand::RDX, 1, Operand::R11 },
{ Operand::RDI, 6, -1 },
{ Operand::RSI, 7, -1 },
#else
{ Operand::RCX, 3, Operand::R10 },
{ Operand::RDX, 2, Operand::R11 },
{ Operand::RDI, 0, Operand::R8 },
{ Operand::RSI, 1, Operand::R9 },
#endif
};
return tbl;
}
// Order in which registers are handed out; the first noSaveNum entries are not pushed,
// the remaining calleeSaveNum entries are pushed by the prologue when used.
const int *getRegEntryTbl() const
{
static const int tbl[maxRegNum] = {
#ifdef XBYAK64_WIN
Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
#else
Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
#endif
Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
};
return &tbl[0];
}
// get an available register index from tbl, skipping reserved registers
// pos is the next table position to examine and is advanced past every entry consumed
int getRegIdx(int& pos) const
{
const int *tbl = getRegEntryTbl();
const RegSlot *slotTbl = getRegSlotTbl();
for (;;) {
NEXT:;
assert(pos < maxRegNum);
int r = tbl[pos++];
// if r is a Use*** target with alt, return alt as backup
// otherwise skip Use*** targets, their alts, and UseRBP's rbp
for (size_t i = 0; i < maxPnum; i++) {
const RegSlot& slot = slotTbl[i];
if (!isUseReg(slot.target)) continue;
if (r == slot.alt) goto NEXT;
if (r == slot.target) {
if (slot.alt >= 0) return slot.alt;
goto NEXT;
}
}
if (!isUseReg(r)) return r;
}
}
};
#endif
// Registers JIT-generated functions with an external profiler.
// Perf mode appends entries to /tmp/perf-<pid>.map (needs XBYAK_USE_PERF);
// VTune mode notifies the VTune JIT API (needs XBYAK_USE_VTUNE).
// Without the corresponding macro, init() leaves the profiler disabled.
class Profiler {
    int mode_;
    const char *suffix_;     // appended to every function name; not copied
    const void *startAddr_;  // start of the next function for the two-argument set()
#ifdef XBYAK_USE_PERF
    FILE *fp_;
#endif
public:
    enum {
        None = 0,
        Perf = 1,
        VTune = 2
    };
    Profiler()
        : mode_(None)
        , suffix_("")
        , startAddr_(0)
#ifdef XBYAK_USE_PERF
        , fp_(0)
#endif
    {
    }
    ~Profiler() { close(); }
    // append suffix to funcName
    void setNameSuffix(const char *suffix) { suffix_ = suffix; }
    void setStartAddr(const void *startAddr) { startAddr_ = startAddr; }
    // Select the profiling mode; the mode stays None when the requested
    // backend is not compiled in or cannot be activated.
    void init(int mode)
    {
        mode_ = None;
        if (mode == Perf) {
#ifdef XBYAK_USE_PERF
            close();
            {
                const int pid = getpid();
                char name[128];
                snprintf(name, sizeof(name), "/tmp/perf-%d.map", pid);
                fp_ = fopen(name, "a+");
                if (fp_ == 0) {
                    fprintf(stderr, "can't open %s\n", name);
                    return;
                }
            }
            mode_ = Perf;
#endif
            return;
        }
        if (mode == VTune) {
#ifdef XBYAK_USE_VTUNE
            dlopen("dummy", RTLD_LAZY); // force to load dlopen to enable jit profiling
            if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON) {
                fprintf(stderr, "VTune profiling is not active\n");
                return;
            }
            mode_ = VTune;
#endif
            return;
        }
        // None or any unknown value: stay disabled
    }
    // Close the perf map file if one is open
    void close()
    {
#ifdef XBYAK_USE_PERF
        if (fp_ != 0) {
            fclose(fp_);
            fp_ = 0;
        }
#endif
    }
    // Report the function [startAddr, startAddr + funcSize) under the name funcName + suffix
    void set(const char *funcName, const void *startAddr, size_t funcSize) const
    {
        if (mode_ == None) return;
#if !defined(XBYAK_USE_PERF) && !defined(XBYAK_USE_VTUNE)
        (void)funcName;
        (void)startAddr;
        (void)funcSize;
#endif
#ifdef XBYAK_USE_PERF
        if (mode_ == Perf) {
            if (fp_ == 0) return;
            fprintf(fp_, "%llx %zx %s%s", (long long)startAddr, funcSize, funcName, suffix_);
            /*
                perf does not recognize the function name which is less than 3,
                so append '_' at the end of the name if necessary
            */
            size_t nameLen = strlen(funcName) + strlen(suffix_);
            while (nameLen < 3) {
                fprintf(fp_, "_");
                nameLen++;
            }
            fprintf(fp_, "\n");
            fflush(fp_);
        }
#endif
#ifdef XBYAK_USE_VTUNE
        if (mode_ != VTune) return;
        char className[] = "";
        char fileName[] = "";
        char buf[128];
        snprintf(buf, sizeof(buf), "%s%s", funcName, suffix_);
        iJIT_Method_Load jmethod = {};
        jmethod.method_id = iJIT_GetNewMethodID();
        jmethod.class_file_name = className;
        jmethod.source_file_name = fileName;
        jmethod.method_load_address = const_cast<void*>(startAddr);
        jmethod.method_size = funcSize;
        jmethod.line_number_size = 0;
        jmethod.method_name = buf;
        iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, (void*)&jmethod);
#endif
    }
    /*
        for continuous set
        funcSize = endAddr - <previous set endAddr>
    */
    void set(const char *funcName, const void *endAddr)
    {
        const size_t funcSize = (size_t)endAddr - (size_t)startAddr_;
        set(funcName, startAddr_, funcSize);
        startAddr_ = endAddr;
    }
};
#endif // XBYAK_ONLY_CLASS_CPU
} } // end of util
#if XBYAK_CPUMASK_COMPACT == 1 && __cplusplus >= 201103
namespace std {
// Allow the compact CpuMask to be used as a key of unordered containers:
// the hash is that of its 64-bit representation.
template<>
struct hash<Xbyak::util::CpuMask> {
    size_t operator()(const Xbyak::util::CpuMask& m) const noexcept
    {
        const uint64_t bits = m.to_u64();
        return std::hash<uint64_t>()(bits);
    }
};
} // std
#endif
#endif