Yet another representation for arrays

This "linear" representation (see ltable.h for details) has worse
locality than cells, but the simpler access code seems to compensate
that.
diff --git a/lobject.h b/lobject.h
index 169512f..a70731f 100644
--- a/lobject.h
+++ b/lobject.h
@@ -773,15 +773,12 @@
 #define setnorealasize(t)	((t)->flags |= BITRAS)
 
 
-typedef struct ArrayCell ArrayCell;
-
-
 typedef struct Table {
   CommonHeader;
   lu_byte flags;  /* 1<<p means tagmethod(p) is not present */
   lu_byte lsizenode;  /* log2 of size of 'node' array */
   unsigned int alimit;  /* "limit" of 'array' array */
-  ArrayCell *array;  /* array part */
+  Value *array;  /* array part */
   Node *node;
   struct Table *metatable;
   GCObject *gclist;
diff --git a/ltable.c b/ltable.c
index ef19a5c..e969ade 100644
--- a/ltable.c
+++ b/ltable.c
@@ -25,6 +25,7 @@
 
 #include <math.h>
 #include <limits.h>
+#include <string.h>
 
 #include "lua.h"
 
@@ -73,7 +74,7 @@
 ** MAXASIZEB is the maximum number of elements in the array part such
 ** that the size of the array fits in 'size_t'.
 */
-#define MAXASIZEB	((MAX_SIZET/sizeof(ArrayCell)) * NM)
+#define MAXASIZEB	(MAX_SIZET/(sizeof(Value) + 1))
 
 
 /*
@@ -553,26 +554,52 @@
 /*
 ** Convert an "abstract size" (number of slots in an array) to
 ** "concrete size" (number of bytes in the array).
-** If the abstract size is not a multiple of NM, the last cell is
-** incomplete, so we don't need to allocate memory for the whole cell.
-** 'extra' computes how many values are not needed in that last cell.
-** It will be zero when 'size' is a multiple of NM, and from there it
-** increases as 'size' decreases, up to (NM - 1).
 */
 static size_t concretesize (unsigned int size) {
-  unsigned int numcells = (size + NM - 1) / NM;   /* (size / NM) rounded up */
-  unsigned int extra = NM - 1 - ((size + NM - 1) % NM);
-  return numcells * sizeof(ArrayCell) - extra * sizeof(Value);
+  return size * sizeof(Value) + size;  /* space for the two arrays */
 }
 
 
-static ArrayCell *resizearray (lua_State *L , Table *t,
+/*
+** Resize the array part of a table. If new size is equal to the old,
+** do nothing. Else, if new size is zero, free the old array. (It must
+** be present, as the sizes are different.) Otherwise, allocate a new
+** array, move the common elements to new proper position, and then
+** frees old array.
+** When array grows, we could reallocate it, but we still would need
+** to move the elements to their new position, so the copy implicit
+** in realloc is a waste.  When array shrinks, it always erases some
+** elements that should still be in the array, so we must reallocate in
+** two steps anyway. It is simpler to always reallocate in two steps.
+*/
+static Value *resizearray (lua_State *L , Table *t,
                                unsigned oldasize,
                                unsigned newasize) {
-  size_t oldasizeb = concretesize(oldasize);
-  size_t newasizeb = concretesize(newasize);
-  void *a = luaM_reallocvector(L, t->array, oldasizeb, newasizeb, lu_byte);
-  return cast(ArrayCell*, a);
+  if (oldasize == newasize)
+    return t->array;  /* nothing to be done */
+  else if (newasize == 0) {  /* erasing array? */
+    Value *op = t->array - oldasize;  /* original array's real address */
+    luaM_freemem(L, op, concretesize(oldasize));  /* free it */
+    return NULL;
+  }
+  else {
+    size_t newasizeb = concretesize(newasize);
+    Value *np = cast(Value *,
+                  luaM_reallocvector(L, NULL, 0, newasizeb, lu_byte));
+    if (np == NULL)  /* allocation error? */
+      return NULL;
+    if (oldasize > 0) {
+      Value *op = t->array - oldasize;  /* real original array */
+      unsigned tomove = (oldasize < newasize) ? oldasize : newasize;
+      lua_assert(tomove > 0);
+      /* move common elements to new position */
+      memcpy(np + newasize - tomove,
+             op + oldasize - tomove,
+             concretesize(tomove));
+      luaM_freemem(L, op, concretesize(oldasize));
+    }
+    return np + newasize;  /* shift pointer to the end of value segment */
+  }
 }
 
 
@@ -699,7 +726,7 @@
                                           unsigned nhsize) {
   Table newt;  /* to keep the new hash part */
   unsigned int oldasize = setlimittosize(t);
-  ArrayCell *newarray;
+  Value *newarray;
   if (newasize > MAXASIZE)
     luaG_runerror(L, "table overflow");
   /* create new hash part with appropriate size into 'newt' */
@@ -777,18 +804,12 @@
 
 
 /*
-** Frees a table. The assert ensures the correctness of 'concretesize',
-** checking its result against the address of the last element in the
-** array part of the table, computed abstractly.
+** Frees a table.
 */
 void luaH_free (lua_State *L, Table *t) {
   unsigned int realsize = luaH_realasize(t);
-  size_t sizeb = concretesize(realsize);
-  lua_assert((sizeb == 0 && realsize == 0) ||
-             cast_charp(t->array) + sizeb - sizeof(Value) ==
-             cast_charp(getArrVal(t, realsize - 1)));
   freehash(L, t);
-  luaM_freemem(L, t->array, sizeb);
+  resizearray(L, t, realsize, 0);
   luaM_free(L, t);
 }
 
diff --git a/ltable.h b/ltable.h
index 4734bd5..6db197b 100644
--- a/ltable.h
+++ b/ltable.h
@@ -71,7 +71,7 @@
 
 /*
 ** 'luaH_get*' operations set 'res', unless the value is absent, and
-** return the tag of the result,
+** return the tag of the result.
 ** The 'luaH_pset*' (pre-set) operations set the given value and return
 ** HOK, unless the original value is absent. In that case, if the key
 ** is really absent, they return HNOTFOUND. Otherwise, if there is a
@@ -86,24 +86,27 @@
 
 
 /*
-** The array part of a table is represented by an array of cells.
-** Each cell is composed of NM tags followed by NM values, so that
-** no space is wasted in padding. The last cell may be incomplete,
-** that is, it may have fewer than NM values.
+** The array part of a table is represented by an inverted array of
+** values followed by an array of tags, to avoid wasting space with
+** padding.  The 'array' pointer points to the junction of the two
+** arrays, so that values are indexed with negative indices and tags
+** with non-negative indices.
+
+                     Values                      Tags
+        --------------------------------------------------------
+         ...  |   Value 1     |   Value 0     |0|1|...
+        --------------------------------------------------------
+                                               ^ t->array
+
+** All accesses to 't->array' should be through the macros 'getArrTag'
+** and 'getArrVal'.
 */
-#define NM      cast_uint(sizeof(Value))
-
-struct ArrayCell {
-  lu_byte tag[NM];
-  Value value[NM];
-};
-
 
 /* Computes the address of the tag for the abstract index 'k' */
-#define getArrTag(t,k)	(&(t)->array[(k)/NM].tag[(k)%NM])
+#define getArrTag(t,k)	(cast(lu_byte*, (t)->array) + (k))
 
 /* Computes the address of the value for the abstract index 'k' */
-#define getArrVal(t,k)	(&(t)->array[(k)/NM].value[(k)%NM])
+#define getArrVal(t,k)	((t)->array - 1 - (k))
 
 
 /*
@@ -117,9 +120,9 @@
 
 
 /*
-** Often, we need to check the tag of a value before moving it. These
-** macros also move TValues to/from arrays, but receive the precomputed
-** tag value or address as an extra argument.
+** Often, we need to check the tag of a value before moving it. The
+** following macros also move TValues to/from arrays, but receive the
+** precomputed tag value or address as an extra argument.
 */
 #define farr2val(h,k,tag,res)  \
   ((res)->tt_ = tag, (res)->value_ = *getArrVal(h,(k)-1u))