add functionality to get num of cores using x2APIC ID
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index bcfeb34..38fae73 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -186,7 +186,8 @@
 	ERR_INVALID_ZERO,
 	ERR_INVALID_RIP_IN_AUTO_GROW,
 	ERR_INVALID_MIB_ADDRESS,
-	ERR_INTERNAL
+	ERR_INTERNAL,
+	ERR_x2APIC_NOT_SUPPORTED_CANT_GET_NCORES
 };
 
 class Error : public std::exception {
@@ -248,6 +249,7 @@
 			"invalid rip in AutoGrow",
 			"invalid mib address",
 			"internal error",
+			"cannot determine num of cores because x2APIC not supported"
 		};
 		assert((size_t)err_ < sizeof(errTbl) / sizeof(*errTbl));
 		return errTbl[err_];
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index 0154450..d5d9346 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -50,6 +50,11 @@
 
 namespace Xbyak { namespace util {
 
+typedef enum intel_cpu_topology_level {
+   smt_level = 1,
+   core_level = 2
+}intel_cpu_topology_level_t;
+
 /**
 	CPU detection class
 */
@@ -88,6 +93,39 @@
 	{
 		return (val >> base) & ((1u << (end - base)) - 1);
 	}
+	void setNumCores()
+	{
+		if ((type_ & tINTEL) == 0) return;
+
+		unsigned int data[4];
+
+		 /* CAUTION: These numbers are configuration as shipped by Intel. */
+		getCpuidEx(0x0, 0, data);
+		if (data[0] >= 0xB) {
+			 /*
+				if leaf 11 exists(x2APIC is supported),
+				we use it to get the number of smt cores and cores on socket
+
+				leaf 0xB can be zeroed-out by a hypervisor
+			*/
+			x2APIC_supported = true;
+			for (size_t i = 0; i < maxTopologyLevels; i++) {
+				getCpuidEx(0xB, i, data);
+				intel_cpu_topology_level_t level_type =
+					(intel_cpu_topology_level_t)extractBit(data[2], 8, 15);
+				if (level_type == smt_level || level_type == core_level)
+					n_cores[level_type - 1] = extractBit(data[1], 0, 15);
+			}
+			if (n_cores[smt_level - 1] != 0)
+				n_cores[core_level - 1] /= n_cores[smt_level - 1];
+		} else {
+			/* Failed to deremine num of cores without x2APIC support.
+			   TODO: USE initial APIC ID to determine ncores. */
+			n_cores[smt_level - 1] = 0;
+			n_cores[core_level - 1] = 0;
+		}
+
+	}
 	void setCacheHierarchy()
 	{
 		if ((type_ & tINTEL) == 0) return;
@@ -96,21 +134,12 @@
 //		const unsigned int INSTRUCTION_CACHE = 2;
 		const unsigned int UNIFIED_CACHE = 3;
 		unsigned int smt_width = 0;
-		unsigned int n_cores = 0;
+		unsigned int logical_cores = 0;
 		unsigned int data[4];
 
-		/*
-			if leaf 11 exists, we use it to get the number of smt cores and cores on socket
-			If x2APIC is supported, these are the only correct numbers.
-
-			leaf 0xB can be zeroed-out by a hypervisor
-		*/
-		getCpuidEx(0x0, 0, data);
-		if (data[0] >= 0xB) {
-			getCpuidEx(0xB, 0, data); // CPUID for SMT Level
-			smt_width = data[1] & 0x7FFF;
-			getCpuidEx(0xB, 1, data); // CPUID for CORE Level
-			n_cores = data[1] & 0x7FFF;
+		if (x2APIC_supported) {
+			smt_width = n_cores[0];
+			logical_cores = n_cores[1];
 		}
 
 		/*
@@ -127,24 +156,27 @@
 			unsigned int cacheType = extractBit(data[0], 0, 4);
 			if (cacheType == NO_CACHE) break;
 			if (cacheType == DATA_CACHE || cacheType == UNIFIED_CACHE) {
-				unsigned int nb_logical_cores = extractBit(data[0], 14, 25) + 1;
-				if (n_cores != 0) { // true only if leaf 0xB is supported and valid
-					nb_logical_cores = (std::min)(nb_logical_cores, n_cores);
-				}
-				assert(nb_logical_cores != 0);
+				unsigned int actual_logical_cores = extractBit(data[0], 14, 25) + 1;
+				if (logical_cores != 0) // true only if leaf 0xB is supported and valid
+					actual_logical_cores = (std::min)(actual_logical_cores, logical_cores);
+				assert(actual_logical_cores != 0);
 				data_cache_size[data_cache_levels] =
 					(extractBit(data[1], 22, 31) + 1)
 					* (extractBit(data[1], 12, 21) + 1)
 					* (extractBit(data[1], 0, 11) + 1)
 					* (data[2] + 1);
-				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = nb_logical_cores;
+				if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
 				assert(smt_width != 0);
-				cores_sharing_data_cache[data_cache_levels] = (std::max)(nb_logical_cores / smt_width, 1u);
+				cores_sharing_data_cache[data_cache_levels] = (std::max)(actual_logical_cores / smt_width, 1u);
 				data_cache_levels++;
 			}
 		}
 	}
 
+	//system topology
+	bool x2APIC_supported;
+	static const unsigned int maxTopologyLevels = 2;
+	unsigned int n_cores[maxTopologyLevels];
 public:
 	int model;
 	int family;
@@ -160,6 +192,14 @@
 	unsigned int cores_sharing_data_cache[maxNumberCacheLevels];
 	unsigned int data_cache_levels;
 
+
+	unsigned int getNumCores(intel_cpu_topology_level_t topology_level) {
+		if (topology_level != smt_level
+			&& topology_level != core_level) throw Error(ERR_BAD_PARAMETER);
+		if (!x2APIC_supported) throw Error(ERR_x2APIC_NOT_SUPPORTED_CANT_GET_NCORES);
+		return n_cores[topology_level - 1];
+	}
+
 	unsigned int getDataCacheLevels() const { return data_cache_levels; }
 	unsigned int getCoresSharingDataCache(unsigned int i) const
 	{
@@ -271,6 +311,7 @@
 
 	Cpu()
 		: type_(NONE)
+		, x2APIC_supported(false)
 		, data_cache_levels(0)
 	{
 		unsigned int data[4];
@@ -363,6 +404,7 @@
 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
 		}
 		setFamily();
+		setNumCores();
 		setCacheHierarchy();
 	}
 	void putFamily() const