Fix detection of `GFNI`, `VAES`, and `VPCLMULQDQ`

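Previously these CPUID flags were only tested when the host supported
AVX512, so they were never reported on CPUs that implement them without
AVX512 (e.g. the Alder Lake target emulated by `sde64 -adl` below).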

Before:
```
% sde64 -adl -- ./test_util64
64bit
vendor intel
 mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe avx_vnni waitpkg clflushopt cldemote movdiri movdir64b
popcnt ok
family=6, model=A, stepping=0, extFamily=0, extModel=9
display:family=6, model=9A
cache level=0 data cache size=32768 cores sharing data cache=1
cache level=1 data cache size=1048576 cores sharing data cache=1
cache level=2 data cache size=14417920 cores sharing data cache=1
SmtLevel =2
CoreLevel=1
```

After:
```
% sde64 -adl -- ./test_util64
64bit
vendor intel
 mmx mmx2 cmov sse sse2 sse3 ssse3 sse41 sse42 popcnt aesni rdtscp osxsave(xgetvb) pclmulqdq avx fma avx2 bmi1 bmi2 lzcnt prefetchw enh_rep rdrand adx rdseed smap sha f16c movbe gfni vaes vpclmulqdq avx_vnni waitpkg clflushopt cldemote movdiri movdir64b
popcnt ok
family=6, model=A, stepping=0, extFamily=0, extModel=9
display:family=6, model=9A
cache level=0 data cache size=32768 cores sharing data cache=1
cache level=1 data cache size=1048576 cores sharing data cache=1
cache level=2 data cache size=14417920 cores sharing data cache=1
SmtLevel =2
CoreLevel=1
```
diff --git a/xbyak/xbyak_util.h b/xbyak/xbyak_util.h
index 4b94d37..db8ac00 100644
--- a/xbyak/xbyak_util.h
+++ b/xbyak/xbyak_util.h
@@ -506,9 +506,6 @@
 						if (EBX & (1U << 31)) type_ |= tAVX512VL;
 						if (ECX & (1U << 1)) type_ |= tAVX512_VBMI;
 						if (ECX & (1U << 6)) type_ |= tAVX512_VBMI2;
-						if (ECX & (1U << 8)) type_ |= tGFNI;
-						if (ECX & (1U << 9)) type_ |= tVAES;
-						if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
 						if (ECX & (1U << 11)) type_ |= tAVX512_VNNI;
 						if (ECX & (1U << 12)) type_ |= tAVX512_BITALG;
 						if (ECX & (1U << 14)) type_ |= tAVX512_VPOPCNTDQ;
@@ -537,6 +534,9 @@
 			if (EBX & (1U << 29)) type_ |= tSHA;
 			if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
 			if (ECX & (1U << 5)) type_ |= tWAITPKG;
+			if (ECX & (1U << 8)) type_ |= tGFNI;
+			if (ECX & (1U << 9)) type_ |= tVAES;
+			if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
 			if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
 			if (ECX & (1U << 27)) type_ |= tMOVDIRI;
 			if (ECX & (1U << 28)) type_ |= tMOVDIR64B;