issue #109 work on simple bytecode profiler

8 years ago · 36c99300a2
parent 4bb39d00f0
commit 36c99300a2
8 changed files with 162 additions and 22 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -138,7 +138,7 @@ endif()
 set(LUA_CORE_SRCS src/lapi.c src/lcode.c src/lctype.c src/ldebug.c src/ldo.c src/ldump.c
        src/lfunc.c src/lgc.c src/llex.c src/lmem.c src/lobject.c src/lopcodes.c
        src/lparser.c src/lstate.c src/lstring.c src/ltable.c src/ltm.c src/lundump.c
-        src/lvm.c src/lzio.c src/ravijit.cpp src/ltests.c)
+        src/lvm.c src/lzio.c src/ravijit.cpp src/ltests.c src/ravi_profile.c)
 # define the lua lib source files
 set(LUA_LIB_SRCS src/lauxlib.c src/lbaselib.c src/lbitlib.c src/lcorolib.c src/ldblib.c src/liolib.c
        src/lmathlib.c src/loslib.c src/ltablib.c src/lstrlib.c src/loadlib.c src/linit.c src/lutf8lib.c)
--- a/include/ravi_profile.h
+++ b/include/ravi_profile.h
@ -0,0 +1,13 @@
+#ifndef RAVI_PROFILE_H
+#define RAVI_PROFILE_H
+
+/*
+** Simple bytecode profiler interface: a per-opcode accumulator plus the
+** functions that initialise, update and print it.
+*/
+
+#include "lua.h"
+#include "lopcodes.h"
+
+/* One accumulator slot per opcode, indexed by OpCode value. */
+LUAI_DDEC unsigned long long raviV_profiledata[NUM_OPCODES];
+
+/* Prepares the profiler's time source; invoked from lua_newstate(). */
+LUAI_FUNC void raviV_init_profiledata(void);
+/* Adds the time elapsed since the previous call to the slot for 'opcode'
+** when a high-resolution timer is available. */
+LUAI_FUNC void raviV_add_profiledata(OpCode opcode);
+/* Prints one "PerfStat [name] value" line per opcode; invoked from lua_close(). */
+LUAI_FUNC void raviV_print_profiledata(void); 
+
+#endif
--- a/ravi-tests/matmul1.ravi
+++ b/ravi-tests/matmul1.ravi
@ -34,7 +34,7 @@ end
 -- Matrix transpose
 -- This version uses slices
 function matrix.T(a)
-  local t1 = os.clock()
+  --local t1 = os.clock()
  local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
  local m: integer, n: integer = mrows(a), mcols(a);
  local x = mnew(n,m)
@ -51,15 +51,15 @@ function matrix.T(a)
      data[pos] = slice[j] 
    end
  end
-  local t2 = os.clock()
-  print("T: time ", t2-t1)
+  --local t2 = os.clock()
+  --print("T: time ", t2-t1)
  return x;
 end

 -- Matrix transpose
 -- Does not use slices
 function matrix.T2(a)
-  local t1 = os.clock()
+  --local t1 = os.clock()
  local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
  local m: integer, n: integer = mrows(a), mcols(a);
  local x = mnew(n,m)
@ -74,15 +74,15 @@ function matrix.T2(a)
      data[(j-1)*m+i] = adata[ri+j] 
    end
  end
-  local t2 = os.clock()
-  print("T2: time ", t2-t1)
+  --local t2 = os.clock()
+  --print("T2: time ", t2-t1)
  return x;
 end

 -- Matrix multiply
 -- Uses slices
 function matrix.mul(a, b)
-  local t1 = os.clock()
+  --local t1 = os.clock()
  local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
  local m: integer, n: integer, p: integer = mrows(a), mcols(a), mcols(b);
  assert(n == p)
@ -98,8 +98,8 @@ function matrix.mul(a, b)
      xi[j] = sum;
    end
  end
-  local t2 = os.clock()
-  print("mul: time ", t2-t1)
+  --local t2 = os.clock()
+  --print("mul: time ", t2-t1)
  return x;
 end

@ -107,7 +107,7 @@ end
 -- this version avoids using slices - we operate on the 
 -- one dimensional array
 function matrix.mul2(a, b)
-  local t1 = os.clock()
+  --local t1 = os.clock()
  local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T2
  local m: integer, n: integer, p: integer = mrows(a), mcols(a), mcols(b);
  assert(n == p)
@ -131,14 +131,14 @@ function matrix.mul2(a, b)
      xdata[xi+j] = sum;
    end
  end
-  local t2 = os.clock()
-  print("mul2: time ", t2-t1)
+  --local t2 = os.clock()
+  --print("mul2: time ", t2-t1)
  return x;
 end

 -- Generate the matrix - uses slices
 function matrix.gen(n: integer)
-  local t1 = os.clock()
+  --local t1 = os.clock()
  local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
  local a = mnew(n, n)
  local tmp: number = 1.0 / n / n;
@ -148,14 +148,14 @@ function matrix.gen(n: integer)
      row[j] = tmp * (i - j) * (i + j - 2) 
    end
  end
-  local t2 = os.clock()
-  print("gen: time ", t2-t1)
+  --local t2 = os.clock()
+  --print("gen: time ", t2-t1)
  return a;
 end

 -- Generate the matrix - this version does not use slices
 function matrix.gen2(n: integer)
-  local t1 = os.clock()
+  --local t1 = os.clock()
  local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
  local a = mnew(n, n)
  local data: number[] = mdata(a)
@ -168,8 +168,8 @@ function matrix.gen2(n: integer)
      data[ri+j] = tmp * (i - j) * (i + j - 2) 
    end
  end
-  local t2 = os.clock()
-  print("gen2: time ", t2-t1)
+  --local t2 = os.clock()
+  --print("gen2: time ", t2-t1)
  return a;
 end

@ -193,7 +193,7 @@ end
 local n = arg[1] or 1000;
 n = math.floor(n/2) * 2;
 local t1 = os.clock()
-local a = matrix.mul2(matrix.gen(n), matrix.gen(n));
+local a = matrix.mul2(matrix.gen2(n), matrix.gen2(n));
 local t2 = os.clock()
 print("total time taken ", t2-t1)

--- a/ravi-tests/matmul1_ravi.lua
+++ b/ravi-tests/matmul1_ravi.lua
@ -0,0 +1,62 @@
+-- Written by Attractive Chaos; distributed under the MIT license
+
+matrix = {}
+
+-- Returns the transpose of matrix 'a' (a table of rows) as a new table of
+-- number[] rows, so that result[i][j] == a[j][i].
+function matrix.T(a: table)
+	-- m = number of rows of 'a', n = number of columns of 'a'
+	local m: integer, n: integer, x: table = #a, #a[1], {};
+	for i = 1, n do
+		-- each transposed row holds one value per source row, i.e. m entries;
+		-- sizing it with n is only correct when 'a' is square
+		local xi: number[] = table.numarray(m, 0.0)
+		x[i] = xi
+		for j = 1, m do xi[j] = @number (a[j][i]) end
+	end
+	return x;
+end
+
+-- Returns the matrix product a * b as a new table of number[] rows.
+-- Asserts that the column count of 'a' (#a[1]) equals the row count of 'b' (#b).
+function matrix.mul(a: table, b: table)
+	assert(#a[1] == #b);
+	-- m = rows of a, n = columns of a, p = columns of b, x = result
+	local m: integer, n: integer, p: integer, x: table = #a, #a[1], #b[1], {};
+	local c: table = matrix.T(b); -- transpose for efficiency
+	for i = 1, m do
+		-- result row i: p entries, initialised to 0.0
+		local xi: number[] = table.numarray(p, 0.0)
+		x[i] = xi
+		for j = 1, p do
+			-- ai = row i of 'a', cj = row j of the transposed 'b' (column j of 'b')
+			local sum: number, ai: number[], cj: number[] = 0.0, @number[](a[i]), @number[](c[j]);
+			-- for luajit, caching c[j] or not makes no difference; lua is not so clever
+			-- dot product of ai and cj over the shared dimension n
+			for k = 1, n do sum = sum + ai[k] * cj[k] end
+			xi[j] = sum;
+		end
+	end
+	return x;
+end
+
+-- Builds an n x n matrix (a table of number[] rows) whose entry (i, j) is
+-- (i - j) * (i + j - 2) / (n * n).
+function matgen(n: integer)
+	-- tmp is the common scale factor 1 / n^2
+	local a: table, tmp: number = {}, 1. / n / n;
+	for i = 1, n do
+		-- row i: n entries, initialised to 0.0 before being filled
+		local ai: number[] = table.numarray(n, 0.0)
+		a[i] = ai
+		for j = 1, n do
+			ai[j] = tmp * (i - j) * (i + j - 2) 
+		end
+	end
+	return a;
+end
+
+--ravi.dumplua(matgen)
+
+-- When the 'ravi' global exists and ravi.jit() returns a true value, pass
+-- the three benchmark functions to ravi.compile before timing starts.
+if ravi and ravi.jit() then
+	ravi.compile(matrix.T)
+	ravi.compile(matrix.mul)
+	ravi.compile(matgen)
+end
+
+-- Matrix dimension: first command-line argument, defaulting to 1000,
+-- rounded down to an even number.
+local n = arg[1] or 1000;
+n = math.floor(n/2) * 2;
+if jit then
+  -- luajit warmup
+  matrix.mul(matgen(n), matgen(n))
+end
+-- Time one generate-and-multiply run using os.clock().
+local t1 = os.clock()
+local a = matrix.mul(matgen(n), matgen(n))
+local t2 = os.clock()
+print("time taken ", t2-t1)
+-- Print one element of the result, at row and column n/2 + 1.
+print(a[n/2+1][n//2+1]);
--- a/src/lstate.c
+++ b/src/lstate.c
@ -29,6 +29,7 @@
 #include "ltm.h"

 #include "ravijit.h"
+#include "ravi_profile.h"

 #if !defined(LUAI_GCPAUSE)
 #define LUAI_GCPAUSE	200  /* 200% */
@ -318,6 +319,7 @@ LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
  int i;
  lua_State *L;
  global_State *g;
+
  LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG)));
  if (l == NULL) return NULL;
  L = &l->l.l;
@ -362,6 +364,7 @@ LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
    close_state(L);
    L = NULL;
  }
+  raviV_init_profiledata();
  return L;
 }

@ -370,6 +373,7 @@ LUA_API void lua_close (lua_State *L) {
  L = G(L)->mainthread;  /* only the main thread can be closed */
  lua_lock(L);
  close_state(L);
+  raviV_print_profiledata();
 }


--- a/src/lua.c
+++ b/src/lua.c
@ -19,7 +19,6 @@
 #include "lauxlib.h"
 #include "lualib.h"

-
 #if !defined(LUA_PROMPT)
 #define LUA_PROMPT		"> "
 #define LUA_PROMPT2		">> "
--- a/src/lvm.c
+++ b/src/lvm.c
@ -34,7 +34,7 @@
 #include "ltable.h"
 #include "ltm.h"
 #include "lvm.h"
-
+#include "ravi_profile.h"

 /* limit for table tag-method chains (to avoid loops) */
 #define MAXTAGLOOP	2000
@ -952,6 +952,7 @@ int luaV_execute (lua_State *L) {
  k = cl->p->k;  /* local reference to function's constant table */
  base = ci->u.l.base;  /* local copy of function's base */
  /* main loop of interpreter */
+  OpCode prevop = -1;
  for (;;) {
    Instruction i = *(ci->u.l.savedpc++);
    StkId ra;
@ -959,6 +960,8 @@ int luaV_execute (lua_State *L) {
      Protect(luaG_traceexec(L));
    /* WARNING: several calls may realloc the stack and invalidate 'ra' */
    OpCode op = GET_OPCODE(i);
+    if (prevop != -1) raviV_add_profiledata(prevop);
+    prevop = op;
 #if 0
    RAVI_DEBUG_STACK(
        ravi_debug_trace(L, op, (ci->u.l.savedpc - cl->p->code) - 1));
@ -1356,6 +1359,7 @@ int luaV_execute (lua_State *L) {
             in JIT mode (see how b is handled in OP_CALL JIT implementation)
             or via luaD_precall() if a JITed function is invoked (see
             ldo.c for how luaD_precall() handles this */
+          raviV_add_profiledata(op);
          return b; /* external invocation: return */
        }
        else {  /* invocation via reentry: continue execution */
@ -1363,6 +1367,7 @@ int luaV_execute (lua_State *L) {
          if (b) L->top = ci->top;
          lua_assert(isLua(ci));
          lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL);
+          raviV_add_profiledata(op);
          goto newframe;  /* restart luaV_execute over new Lua function */
        }
      }
--- a/src/ravi_profile.c
+++ b/src/ravi_profile.c
@ -0,0 +1,57 @@
+/*
+** Simple bytecode profiler.
+**
+** The interpreter loop calls raviV_add_profiledata() for each opcode it
+** executes; the time elapsed since the previous call is accumulated in
+** raviV_profiledata[], one slot per opcode. Only Windows has a timer
+** backend at present; on every other platform the functions are no-ops
+** so that the library still links.
+*/
+#include "ravi_profile.h"
+#include <stdint.h>
+#include <stdio.h>  /* printf */
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+/* Accumulated time per opcode, indexed by OpCode value. Defined on every
+ * platform because the header declares it unconditionally. */
+unsigned long long raviV_profiledata[NUM_OPCODES];
+
+#ifdef _WIN32
+
+/* The number of nanoseconds in one second. */
+#define UV__NANOSEC 1000000000
+
+/* Interval (in seconds) of the high-resolution clock. */
+static double hrtime_interval_ = 0;
+/* Timestamp (in nanoseconds) of the previous sample; 0 means "no sample yet". */
+static uint64_t prev_time = 0;
+
+/* Queries the performance-counter frequency and caches its reciprocal.
+ * Leaves hrtime_interval_ at 0 when the counter is not supported. */
+void raviV_init_profiledata(void) {
+  LARGE_INTEGER perf_frequency;
+  /* Retrieve high-resolution timer frequency
+   * and precompute its reciprocal.
+   */
+  if (QueryPerformanceFrequency(&perf_frequency)) {
+    hrtime_interval_ = 1.0 / perf_frequency.QuadPart;
+  } else {
+    hrtime_interval_= 0;
+  }
+}
+
+/* Adds the nanoseconds elapsed since the previous call to the slot for
+ * 'opcode'. The very first call only records a timestamp and adds 0. */
+void raviV_add_profiledata(OpCode opcode) {
+  LARGE_INTEGER counter;
+
+  /* If the performance interval is zero, there's no support. */
+  if (hrtime_interval_ == 0) {
+    return;
+  }
+
+  if (!QueryPerformanceCounter(&counter)) {
+    return;
+  }
+
+  /* Because we have no guarantee about the order of magnitude of the
+   * performance counter interval, integer math could cause this computation
+   * to overflow. Therefore we resort to floating point math.
+   */
+  uint64_t this_time = (uint64_t) ((double) counter.QuadPart * hrtime_interval_ * UV__NANOSEC);
+  raviV_profiledata[opcode] += (this_time - (prev_time == 0 ? this_time : prev_time));
+  prev_time = this_time;
+}
+
+/* Prints one "PerfStat [opcode-name] nanoseconds" line per opcode. */
+void raviV_print_profiledata(void) {
+    for (int i = 0; i < NUM_OPCODES; i++) {
+        printf("PerfStat [%s] %llu\n", luaP_opnames[i], raviV_profiledata[i]);
+    }
+}
+
+#else /* !_WIN32 */
+
+/* No timer backend on this platform yet: keep the entry points defined so
+ * that lstate.c and lvm.c link, but collect and print nothing. */
+
+void raviV_init_profiledata(void) {
+}
+
+void raviV_add_profiledata(OpCode opcode) {
+  (void)opcode;  /* unused */
+}
+
+void raviV_print_profiledata(void) {
+}
+
+#endif /* _WIN32 */