Adding queries to get the cpu topology on Intel architectures.
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index e55d66d..37a611f 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -84,6 +84,59 @@
displayModel = model;
}
}
+ static const unsigned int max_number_cache_levels = 10;
+#define value_from_bits(val, base, end) ((val << (sizeof(val)*8-end-1)) >> (sizeof(val)*8-end+base-1))
+ void setCacheHierarchy()
+ {
+ unsigned int cache_type = 42;
+ unsigned int smt_width = 0;
+ unsigned int n_cores;
+ unsigned int data[4];
+
+ if ((type_ & tINTEL) == 0) {
+ fprintf(stderr, "ERR cache hierarchy querying is not supported\n");
+ throw Error(ERR_INTERNAL);
+ }
+
+ // if leaf 11 exists, we use it to get the number of smt cores and cores on socket
+ // If x2APIC is supported, these are the only correct numbers.
+ getCpuidEx(0x0, 0, data);
+ if(data[0] >= 11){
+ getCpuidEx(0xB, 0, data); // CPUID for SMT Level
+ smt_width = (data[1] & 0x7FFF);
+ getCpuidEx(0xB, 1, data); // CPUID for CORE Level
+ n_cores = (data[1] & 0x7FFF);
+ }
+
+ /* Assumptions:
+ * - the first level of data cache is not shared (which is the
+ * case for every existing architecture) and use this to
+ * determine the SMT width for arch not supporting leaf 11
+ * - when leaf 4 reports a number of core less than n_cores
+ * on socket reported by leaf 11, then it is a correct number
+ * of cores not an upperbound */
+#define min_cores(a,b) ((a) < (b)) ? (a) : (b)
+ for (int i = 0; ((cache_type != NO_CACHE) && (data_cache_levels < max_number_cache_levels)); i++) {
+ getCpuidEx(0x4, i, data);
+ cache_type = value_from_bits(data[0], 0, 4);
+ if ((cache_type == DATA_CACHE) || (cache_type == UNIFIED_CACHE)) {
+ int nb_logical_cores = min_cores(value_from_bits(data[0], 14, 25) + 1,
+ n_cores);
+ data_cache_size[data_cache_levels] =
+ (value_from_bits(data[1], 22, 31) + 1)
+ * (value_from_bits(data[1], 12, 21) + 1)
+ * (value_from_bits(data[1], 0, 11) + 1)
+ * (data[2] + 1);
+ if ((cache_type == DATA_CACHE) && (smt_width == 0)) smt_width = nb_logical_cores;
+ assert(smt_width != 0);
+ cores_sharing_data_cache[data_cache_levels] = nb_logical_cores / smt_width;
+ data_cache_levels++;
+ }
+ }
+#undef min_cores
+ }
+#undef value_from_bits
+
public:
int model;
int family;
@@ -92,6 +145,11 @@
int extFamily;
int displayFamily; // family + extFamily
int displayModel; // model + extModel
+
+ unsigned int data_cache_size[max_number_cache_levels];
+ unsigned int cores_sharing_data_cache[max_number_cache_levels];
+ unsigned int data_cache_levels;
+
/*
data[] = { eax, ebx, ecx, edx }
*/
@@ -124,6 +182,11 @@
#endif
}
typedef uint64 Type;
+ static const Type NO_CACHE = 0;
+ static const Type DATA_CACHE = 1;
+ static const Type INSTRUCTION_CACHE = 2;
+ static const Type UNIFIED_CACHE = 3;
+
static const Type NONE = 0;
static const Type tMMX = 1 << 0;
static const Type tMMX2 = 1 << 1;
@@ -190,6 +253,7 @@
Cpu()
: type_(NONE)
+ , data_cache_levels(0)
{
unsigned int data[4];
const unsigned int& EAX = data[0];
@@ -281,6 +345,8 @@
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
}
setFamily();
+ if ((type_ & tINTEL) == tINTEL)
+ setCacheHierarchy();
}
void putFamily() const
{