Put multiply_u64__output in little-endian order
diff --git a/internal/cgen/base/fundamental-public.h b/internal/cgen/base/fundamental-public.h
index 744f1cd..c7f9c3b 100644
--- a/internal/cgen/base/fundamental-public.h
+++ b/internal/cgen/base/fundamental-public.h
@@ -438,8 +438,8 @@
 // --------
 
 typedef struct {
-  uint64_t hi;
   uint64_t lo;
+  uint64_t hi;
 } wuffs_base__multiply_u64__output;
 
 // wuffs_base__multiply_u64 returns x*y as a 128-bit value.
@@ -457,8 +457,8 @@
   uint64_t w2 = t >> 32;
   w1 += x0 * y1;
   wuffs_base__multiply_u64__output o;
-  o.hi = (x1 * y1) + w2 + (w1 >> 32);
   o.lo = x * y;
+  o.hi = (x1 * y1) + w2 + (w1 >> 32);
   return o;
 }
 
diff --git a/internal/cgen/data/data.go b/internal/cgen/data/data.go
index b64c4a3..6884997 100644
--- a/internal/cgen/data/data.go
+++ b/internal/cgen/data/data.go
@@ -77,7 +77,7 @@
 	"// --------\n\n// Saturating arithmetic (sat_add, sat_sub) branchless bit-twiddling algorithms\n// are per https://locklessinc.com/articles/sat_arithmetic/\n//\n// It is important that the underlying types are unsigned integers, as signed\n// integer arithmetic overflow is undefined behavior in C.\n\nstatic inline uint8_t  //\nwuffs_base__u8__sat_add(uint8_t x, uint8_t y) {\n  uint8_t res = (uint8_t)(x + y);\n  res |= (uint8_t)(-(res < x));\n  return res;\n}\n\nstatic inline uint8_t  //\nwuffs_base__u8__sat_sub(uint8_t x, uint8_t y) {\n  uint8_t res = (uint8_t)(x - y);\n  res &= (uint8_t)(-(res <= x));\n  return res;\n}\n\nstatic inline uint16_t  //\nwuffs_base__u16__sat_add(uint16_t x, uint16_t y) {\n  uint16_t res = (uint16_t)(x + y);\n  res |= (uint16_t)(-(res < x));\n  return res;\n}\n\nstatic inline uint16_t  //\nwuffs_base__u16__sat_sub(uint16_t x, uint16_t y) {\n  uint16_t res = (uint16_t)(x - y);\n  res &= (uint16_t)(-(res <= x));\n  return res;\n}\n\nstatic inline uint32_t  //\nwuffs_base__u32__sat_add(uint32_t x, uint32_t y) {\n  uint32" +
 	"_t res = (uint32_t)(x + y);\n  res |= (uint32_t)(-(res < x));\n  return res;\n}\n\nstatic inline uint32_t  //\nwuffs_base__u32__sat_sub(uint32_t x, uint32_t y) {\n  uint32_t res = (uint32_t)(x - y);\n  res &= (uint32_t)(-(res <= x));\n  return res;\n}\n\nstatic inline uint64_t  //\nwuffs_base__u64__sat_add(uint64_t x, uint64_t y) {\n  uint64_t res = (uint64_t)(x + y);\n  res |= (uint64_t)(-(res < x));\n  return res;\n}\n\nstatic inline uint64_t  //\nwuffs_base__u64__sat_sub(uint64_t x, uint64_t y) {\n  uint64_t res = (uint64_t)(x - y);\n  res &= (uint64_t)(-(res <= x));\n  return res;\n}\n\n" +
 	"" +
-	"// --------\n\ntypedef struct {\n  uint64_t hi;\n  uint64_t lo;\n} wuffs_base__multiply_u64__output;\n\n// wuffs_base__multiply_u64 returns x*y as a 128-bit value.\n//\n// The maximum inclusive output hi_lo is 0xFFFFFFFFFFFFFFFE_0000000000000001.\nstatic inline wuffs_base__multiply_u64__output  //\nwuffs_base__multiply_u64(uint64_t x, uint64_t y) {\n  uint64_t x0 = x & 0xFFFFFFFF;\n  uint64_t x1 = x >> 32;\n  uint64_t y0 = y & 0xFFFFFFFF;\n  uint64_t y1 = y >> 32;\n  uint64_t w0 = x0 * y0;\n  uint64_t t = (x1 * y0) + (w0 >> 32);\n  uint64_t w1 = t & 0xFFFFFFFF;\n  uint64_t w2 = t >> 32;\n  w1 += x0 * y1;\n  wuffs_base__multiply_u64__output o;\n  o.hi = (x1 * y1) + w2 + (w1 >> 32);\n  o.lo = x * y;\n  return o;\n}\n\n" +
+	"// --------\n\ntypedef struct {\n  uint64_t lo;\n  uint64_t hi;\n} wuffs_base__multiply_u64__output;\n\n// wuffs_base__multiply_u64 returns x*y as a 128-bit value.\n//\n// The maximum inclusive output hi_lo is 0xFFFFFFFFFFFFFFFE_0000000000000001.\nstatic inline wuffs_base__multiply_u64__output  //\nwuffs_base__multiply_u64(uint64_t x, uint64_t y) {\n  uint64_t x0 = x & 0xFFFFFFFF;\n  uint64_t x1 = x >> 32;\n  uint64_t y0 = y & 0xFFFFFFFF;\n  uint64_t y1 = y >> 32;\n  uint64_t w0 = x0 * y0;\n  uint64_t t = (x1 * y0) + (w0 >> 32);\n  uint64_t w1 = t & 0xFFFFFFFF;\n  uint64_t w2 = t >> 32;\n  w1 += x0 * y1;\n  wuffs_base__multiply_u64__output o;\n  o.lo = x * y;\n  o.hi = (x1 * y1) + w2 + (w1 >> 32);\n  return o;\n}\n\n" +
 	"" +
 	"// --------\n\n#if defined(__GNUC__) && (__SIZEOF_LONG__ == 8)\n\nstatic inline uint32_t  //\nwuffs_base__count_leading_zeroes_u64(uint64_t u) {\n  return u ? ((uint32_t)(__builtin_clzl(u))) : 64u;\n}\n\n#else\n\nstatic inline uint32_t  //\nwuffs_base__count_leading_zeroes_u64(uint64_t u) {\n  if (u == 0) {\n    return 64;\n  }\n\n  uint32_t n = 0;\n  if ((u >> 32) == 0) {\n    n |= 32;\n    u <<= 32;\n  }\n  if ((u >> 48) == 0) {\n    n |= 16;\n    u <<= 16;\n  }\n  if ((u >> 56) == 0) {\n    n |= 8;\n    u <<= 8;\n  }\n  if ((u >> 60) == 0) {\n    n |= 4;\n    u <<= 4;\n  }\n  if ((u >> 62) == 0) {\n    n |= 2;\n    u <<= 2;\n  }\n  if ((u >> 63) == 0) {\n    n |= 1;\n    u <<= 1;\n  }\n  return n;\n}\n\n#endif  // defined(__GNUC__) && (__SIZEOF_LONG__ == 8)\n\n" +
 	"" +
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 02cf276..a634200 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -512,8 +512,8 @@
 // --------
 
 typedef struct {
-  uint64_t hi;
   uint64_t lo;
+  uint64_t hi;
 } wuffs_base__multiply_u64__output;
 
 // wuffs_base__multiply_u64 returns x*y as a 128-bit value.
@@ -531,8 +531,8 @@
   uint64_t w2 = t >> 32;
   w1 += x0 * y1;
   wuffs_base__multiply_u64__output o;
-  o.hi = (x1 * y1) + w2 + (w1 >> 32);
   o.lo = x * y;
+  o.hi = (x1 * y1) + w2 + (w1 >> 32);
   return o;
 }