Correct handling of 'luaV_execute' invocations

The previous stackless implementations marked all 'luaV_execute'
invocations as fresh. However, re-entering 'luaV_execute' when
resuming a coroutine should not be a fresh invocation. (It worked
because 'unroll' called 'luaV_execute' for each call entry, but
that was slower than letting 'luaV_execute' finish all non-fresh
invocations.)
diff --git a/ldo.c b/ldo.c
index 0a6a716..052c57a 100644
--- a/ldo.c
+++ b/ldo.c
@@ -449,12 +449,13 @@
 
 /*
 ** Prepares the call to a function (C or Lua). For C functions, also do
-** the call.  The function to be called is at '*func'.  The arguments are
-** on the stack, right after the function.  Returns true if the call was
-** made (it was a C function).  When returns true, all the results are
-** on the stack, starting at the original function position.
+** the call. The function to be called is at '*func'.  The arguments
+** are on the stack, right after the function.  Returns the CallInfo
+** to be executed, if it was a Lua function. Otherwise (a C function)
+** returns NULL, with all the results on the stack, starting at the
+** original function position.
 */
-int luaD_precall (lua_State *L, StkId func, int nresults) {
+CallInfo *luaD_precall (lua_State *L, StkId func, int nresults) {
   lua_CFunction f;
  retry:
   switch (ttypetag(s2v(func))) {
@@ -482,7 +483,7 @@
       lua_lock(L);
       api_checknelems(L, n);
       luaD_poscall(L, ci, n);
-      return 1;
+      return NULL;
     }
     case LUA_VLCL: {  /* Lua function */
       CallInfo *ci;
@@ -494,14 +495,13 @@
       L->ci = ci = next_ci(L);
       ci->nresults = nresults;
       ci->u.l.savedpc = p->code;  /* starting point */
-      ci->callstatus = 0;
       ci->top = func + 1 + fsize;
       ci->func = func;
       L->ci = ci;
       for (; narg < nfixparams; narg++)
         setnilvalue(s2v(L->top++));  /* complete missing arguments */
       lua_assert(ci->top <= L->stack_last);
-      return 0;
+      return ci;
     }
     default: {  /* not a function */
       checkstackGCp(L, 1, func);  /* space for metamethod */
@@ -518,11 +518,14 @@
 ** increment number of non-yieldable calls).
 */
 static void docall (lua_State *L, StkId func, int nResults, int inc) {
+  CallInfo *ci;
   L->nCcalls += inc;
-  if (getCcalls(L) >= LUAI_MAXCCALLS)
+  if (unlikely(getCcalls(L) >= LUAI_MAXCCALLS))
     luaE_checkcstack(L);
-  if (!luaD_precall(L, func, nResults))  /* is a Lua function? */
-    luaV_execute(L, L->ci);  /* call it */
+  if ((ci = luaD_precall(L, func, nResults)) != NULL) {  /* Lua function? */
+    ci->callstatus = CIST_FRESH;  /* mark that it is a "fresh" execute */
+    luaV_execute(L, ci);  /* call it */
+  }
   L->nCcalls -= inc;
 }
 
diff --git a/ldo.h b/ldo.h
index 7d03211..4d30d07 100644
--- a/ldo.h
+++ b/ldo.h
@@ -59,7 +59,7 @@
                                         int fTransfer, int nTransfer);
 LUAI_FUNC void luaD_hookcall (lua_State *L, CallInfo *ci);
 LUAI_FUNC void luaD_pretailcall (lua_State *L, CallInfo *ci, StkId func, int n);
-LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nResults);
+LUAI_FUNC CallInfo *luaD_precall (lua_State *L, StkId func, int nResults);
 LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults);
 LUAI_FUNC void luaD_callnoyield (lua_State *L, StkId func, int nResults);
 LUAI_FUNC void luaD_tryfuncTM (lua_State *L, StkId func);
diff --git a/lstate.c b/lstate.c
index bd1b512..13c1ff0 100644
--- a/lstate.c
+++ b/lstate.c
@@ -172,7 +172,7 @@
 
 LUAI_FUNC void luaE_incCstack (lua_State *L) {
   L->nCcalls++;
-  if (getCcalls(L) >= LUAI_MAXCCALLS)
+  if (unlikely(getCcalls(L) >= LUAI_MAXCCALLS))
     luaE_checkcstack(L);
 }
 
diff --git a/lstate.h b/lstate.h
index a05db37..5573898 100644
--- a/lstate.h
+++ b/lstate.h
@@ -183,14 +183,15 @@
 */
 #define CIST_OAH	(1<<0)	/* original value of 'allowhook' */
 #define CIST_C		(1<<1)	/* call is running a C function */
-#define CIST_HOOKED	(1<<2)	/* call is running a debug hook */
-#define CIST_YPCALL	(1<<3)	/* call is a yieldable protected call */
-#define CIST_TAIL	(1<<4)	/* call was tail called */
-#define CIST_HOOKYIELD	(1<<5)	/* last hook called yielded */
-#define CIST_FIN	(1<<6)  /* call is running a finalizer */
-#define CIST_TRAN	(1<<7)	/* 'ci' has transfer information */
+#define CIST_FRESH	(1<<2)  /* call is on a fresh "luaV_execute" frame */
+#define CIST_HOOKED	(1<<3)	/* call is running a debug hook */
+#define CIST_YPCALL	(1<<4)	/* call is a yieldable protected call */
+#define CIST_TAIL	(1<<5)	/* call was tail called */
+#define CIST_HOOKYIELD	(1<<6)	/* last hook called yielded */
+#define CIST_FIN	(1<<7)  /* call is running a finalizer */
+#define CIST_TRAN	(1<<8)	/* 'ci' has transfer information */
 #if defined(LUA_COMPAT_LT_LE)
-#define CIST_LEQ	(1<<8)  /* using __lt for __le */
+#define CIST_LEQ	(1<<9)  /* using __lt for __le */
 #endif
 
 /* active function is a Lua function */
diff --git a/lvm.c b/lvm.c
index eadf66b..51b22d8 100644
--- a/lvm.c
+++ b/lvm.c
@@ -1124,7 +1124,6 @@
 
 
 void luaV_execute (lua_State *L, CallInfo *ci) {
-  CallInfo * const origci = ci;
   LClosure *cl;
   TValue *k;
   StkId base;
@@ -1133,7 +1132,7 @@
 #if LUA_USE_JUMPTABLE
 #include "ljumptab.h"
 #endif
- tailcall:
+ execute:
   trap = L->hookmask;
   cl = clLvalue(s2v(ci->func));
   k = cl->p->k;
@@ -1607,17 +1606,19 @@
         vmbreak;
       }
       vmcase(OP_CALL) {
+        CallInfo *newci;
         int b = GETARG_B(i);
         int nresults = GETARG_C(i) - 1;
         if (b != 0)  /* fixed number of arguments? */
           L->top = ra + b;  /* top signals number of arguments */
         /* else previous instruction set top */
         savepc(L);  /* in case of errors */
-        if (luaD_precall(L, ra, nresults))
+        if ((newci = luaD_precall(L, ra, nresults)) == NULL)
           updatetrap(ci);  /* C call; nothing else to be done */
         else {  /* Lua call: run function in this same invocation */
-          ci = L->ci;
-          goto tailcall;
+          ci = newci;
+          ci->callstatus = 0;  /* call re-uses 'luaV_execute' */
+          goto execute;
         }
         vmbreak;
       }
@@ -1647,13 +1648,13 @@
           luaD_precall(L, ra, LUA_MULTRET);  /* call it */
           updatetrap(ci);
           updatestack(ci);  /* stack may have been relocated */
-          ci->func -= delta;
+          ci->func -= delta;  /* restore 'func' (if vararg) */
           luaD_poscall(L, ci, cast_int(L->top - ra));  /* finish caller */
-          goto ret;
+          goto ret;  /* caller returns after the tail call */
         }
-        ci->func -= delta;
+        ci->func -= delta;  /* restore 'func' (if vararg) */
         luaD_pretailcall(L, ci, ra, b);  /* prepare call frame */
-        goto tailcall;
+        goto execute;  /* execute the callee */
       }
       vmcase(OP_RETURN) {
         int n = GETARG_B(i) - 1;  /* number of results */
@@ -1706,11 +1707,11 @@
           }
         }
        ret:
-        if (ci == origci)
-          return;
+        if (ci->callstatus & CIST_FRESH)
+          return;  /* end this frame */
         else {
           ci = ci->previous;
-          goto tailcall;
+          goto execute;  /* continue running caller in this frame */
         }
       }
       vmcase(OP_FORLOOP) {