issue #109 work on simple bytecode profiler

gccjit-ravi534
Dibyendu Majumdar 8 years ago
parent 4bb39d00f0
commit 36c99300a2

@ -138,7 +138,7 @@ endif()
set(LUA_CORE_SRCS src/lapi.c src/lcode.c src/lctype.c src/ldebug.c src/ldo.c src/ldump.c
src/lfunc.c src/lgc.c src/llex.c src/lmem.c src/lobject.c src/lopcodes.c
src/lparser.c src/lstate.c src/lstring.c src/ltable.c src/ltm.c src/lundump.c
src/lvm.c src/lzio.c src/ravijit.cpp src/ltests.c)
src/lvm.c src/lzio.c src/ravijit.cpp src/ltests.c src/ravi_profile.c)
# define the lua lib source files
set(LUA_LIB_SRCS src/lauxlib.c src/lbaselib.c src/lbitlib.c src/lcorolib.c src/ldblib.c src/liolib.c
src/lmathlib.c src/loslib.c src/ltablib.c src/lstrlib.c src/loadlib.c src/linit.c src/lutf8lib.c)

@ -0,0 +1,13 @@
/*
 * Interface of the simple bytecode profiler: a table of per-opcode
 * accumulators and the functions that initialise, update and print it.
 */
#ifndef RAVI_PROFILE_H
#define RAVI_PROFILE_H
#include "lua.h"
#include "lopcodes.h"
/* One accumulator per opcode; sized by NUM_OPCODES. */
LUAI_DDEC unsigned long long raviV_profiledata[NUM_OPCODES];
/* Sets up the profiler before any data is recorded. */
LUAI_FUNC void raviV_init_profiledata(void);
/* Records profiling data against the given opcode. */
LUAI_FUNC void raviV_add_profiledata(OpCode opcode);
/* Prints the collected per-opcode data. */
LUAI_FUNC void raviV_print_profiledata(void);
#endif

@ -34,7 +34,7 @@ end
-- Matrix transpose
-- This version uses slices
function matrix.T(a)
local t1 = os.clock()
--local t1 = os.clock()
local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
local m: integer, n: integer = mrows(a), mcols(a);
local x = mnew(n,m)
@ -51,15 +51,15 @@ function matrix.T(a)
data[pos] = slice[j]
end
end
local t2 = os.clock()
print("T: time ", t2-t1)
--local t2 = os.clock()
--print("T: time ", t2-t1)
return x;
end
-- Matrix transpose
-- Does not use slices
function matrix.T2(a)
local t1 = os.clock()
--local t1 = os.clock()
local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
local m: integer, n: integer = mrows(a), mcols(a);
local x = mnew(n,m)
@ -74,15 +74,15 @@ function matrix.T2(a)
data[(j-1)*m+i] = adata[ri+j]
end
end
local t2 = os.clock()
print("T2: time ", t2-t1)
--local t2 = os.clock()
--print("T2: time ", t2-t1)
return x;
end
-- Matrix multiply
-- Uses slices
function matrix.mul(a, b)
local t1 = os.clock()
--local t1 = os.clock()
local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
local m: integer, n: integer, p: integer = mrows(a), mcols(a), mcols(b);
assert(n == p)
@ -98,8 +98,8 @@ function matrix.mul(a, b)
xi[j] = sum;
end
end
local t2 = os.clock()
print("mul: time ", t2-t1)
--local t2 = os.clock()
--print("mul: time ", t2-t1)
return x;
end
@ -107,7 +107,7 @@ end
-- this version avoids using slices - we operate on the
-- one dimensional array
function matrix.mul2(a, b)
local t1 = os.clock()
--local t1 = os.clock()
local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T2
local m: integer, n: integer, p: integer = mrows(a), mcols(a), mcols(b);
assert(n == p)
@ -131,14 +131,14 @@ function matrix.mul2(a, b)
xdata[xi+j] = sum;
end
end
local t2 = os.clock()
print("mul2: time ", t2-t1)
--local t2 = os.clock()
--print("mul2: time ", t2-t1)
return x;
end
-- Generate the matrix - uses slices
function matrix.gen(n: integer)
local t1 = os.clock()
--local t1 = os.clock()
local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
local a = mnew(n, n)
local tmp: number = 1.0 / n / n;
@ -148,14 +148,14 @@ function matrix.gen(n: integer)
row[j] = tmp * (i - j) * (i + j - 2)
end
end
local t2 = os.clock()
print("gen: time ", t2-t1)
--local t2 = os.clock()
--print("gen: time ", t2-t1)
return a;
end
-- Generate the matrix - this version does not use slices
function matrix.gen2(n: integer)
local t1 = os.clock()
--local t1 = os.clock()
local mrows, mcols, mnew, mdata, mrow, mtran = matrix.rows, matrix.cols, matrix.new, matrix.getdata, matrix.getrow, matrix.T
local a = mnew(n, n)
local data: number[] = mdata(a)
@ -168,8 +168,8 @@ function matrix.gen2(n: integer)
data[ri+j] = tmp * (i - j) * (i + j - 2)
end
end
local t2 = os.clock()
print("gen2: time ", t2-t1)
--local t2 = os.clock()
--print("gen2: time ", t2-t1)
return a;
end
@ -193,7 +193,7 @@ end
local n = arg[1] or 1000;
n = math.floor(n/2) * 2;
local t1 = os.clock()
local a = matrix.mul2(matrix.gen(n), matrix.gen(n));
local a = matrix.mul2(matrix.gen2(n), matrix.gen2(n));
local t2 = os.clock()
print("total time taken ", t2-t1)

@ -0,0 +1,62 @@
-- Written by Attractive Chaos; distributed under the MIT license
-- Minimal matrix helpers for the benchmark. A matrix is a Lua table of rows;
-- matrix.mul requires every row to be a Ravi number[] array.
matrix = {}

-- Returns the transpose of `a`.
-- `a` has m rows of n columns, so the result has n rows of m columns.
function matrix.T(a: table)
  local m: integer, n: integer, x: table = #a, #a[1], {};
  for i = 1, n do
    -- Row i of the transpose holds one entry per ROW of `a`, i.e. m entries
    -- (sizing it with n is only correct for square matrices).
    local xi: number[] = table.numarray(m, 0.0)
    x[i] = xi
    for j = 1, m do xi[j] = @number (a[j][i]) end
  end
  return x;
end

-- Returns the matrix product a * b.
-- `a` is m x n and `b` is n x p; raises an assertion error when the inner
-- dimensions do not match. The result is an m x p table of number[] rows.
function matrix.mul(a: table, b: table)
  assert(#a[1] == #b);
  local m: integer, n: integer, p: integer, x: table = #a, #a[1], #b[1], {};
  local c: table = matrix.T(b); -- transpose for efficiency
  for i = 1, m do
    local xi: number[] = table.numarray(p, 0.0)
    x[i] = xi
    for j = 1, p do
      -- ai is row i of `a`, cj is column j of `b` (row j of the transpose)
      local sum: number, ai: number[], cj: number[] = 0.0, @number[](a[i]), @number[](c[j]);
      -- for luajit, caching c[j] or not makes no difference; lua is not so clever
      for k = 1, n do sum = sum + ai[k] * cj[k] end
      xi[j] = sum;
    end
  end
  return x;
end
-- Builds the n x n benchmark input matrix as a table of number[] rows.
-- Entry (i, j) is (1/n/n) * (i - j) * (i + j - 2).
function matgen(n: integer)
  local scale: number = 1.0 / n / n
  local rows: table = {}
  for i = 1, n do
    local row: number[] = table.numarray(n, 0.0)
    for j = 1, n do
      row[j] = scale * (i - j) * (i + j - 2)
    end
    rows[i] = row
  end
  return rows
end
--ravi.dumplua(matgen)
-- Under Ravi, when ravi.jit() is truthy, compile the benchmark functions
-- before timing starts.
if ravi and ravi.jit() then
  ravi.compile(matrix.T)
  ravi.compile(matrix.mul)
  ravi.compile(matgen)
end

-- Matrix dimension: first command-line argument, default 1000. The argument
-- is converted explicitly instead of relying on string coercion, and a
-- missing `arg` table (e.g. when the script is run embedded) is tolerated.
local n = tonumber(arg and arg[1]) or 1000
-- round down to an even number
n = math.floor(n/2) * 2;

if jit then
  -- luajit warmup
  matrix.mul(matgen(n), matgen(n))
end

local t1 = os.clock()
local a = matrix.mul(matgen(n), matgen(n))
local t2 = os.clock()
print("time taken ", t2-t1)
-- print one element of the result; both indices use integer division so
-- the row and column subscripts are the same integer value
print(a[n//2+1][n//2+1]);

@ -29,6 +29,7 @@
#include "ltm.h"
#include "ravijit.h"
#include "ravi_profile.h"
#if !defined(LUAI_GCPAUSE)
#define LUAI_GCPAUSE 200 /* 200% */
@ -318,6 +319,7 @@ LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
int i;
lua_State *L;
global_State *g;
LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG)));
if (l == NULL) return NULL;
L = &l->l.l;
@ -362,6 +364,7 @@ LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
close_state(L);
L = NULL;
}
raviV_init_profiledata();
return L;
}
@ -370,6 +373,7 @@ LUA_API void lua_close (lua_State *L) {
L = G(L)->mainthread; /* only the main thread can be closed */
lua_lock(L);
close_state(L);
raviV_print_profiledata();
}

@ -19,7 +19,6 @@
#include "lauxlib.h"
#include "lualib.h"
#if !defined(LUA_PROMPT)
#define LUA_PROMPT "> "
#define LUA_PROMPT2 ">> "

@ -34,7 +34,7 @@
#include "ltable.h"
#include "ltm.h"
#include "lvm.h"
#include "ravi_profile.h"
/* limit for table tag-method chains (to avoid loops) */
#define MAXTAGLOOP 2000
@ -952,6 +952,7 @@ int luaV_execute (lua_State *L) {
k = cl->p->k; /* local reference to function's constant table */
base = ci->u.l.base; /* local copy of function's base */
/* main loop of interpreter */
OpCode prevop = -1;
for (;;) {
Instruction i = *(ci->u.l.savedpc++);
StkId ra;
@ -959,6 +960,8 @@ int luaV_execute (lua_State *L) {
Protect(luaG_traceexec(L));
/* WARNING: several calls may realloc the stack and invalidate 'ra' */
OpCode op = GET_OPCODE(i);
if (prevop != -1) raviV_add_profiledata(prevop);
prevop = op;
#if 0
RAVI_DEBUG_STACK(
ravi_debug_trace(L, op, (ci->u.l.savedpc - cl->p->code) - 1));
@ -1356,6 +1359,7 @@ int luaV_execute (lua_State *L) {
in JIT mode (see how b is handled in OP_CALL JIT implementation)
or via luaD_precall() if a JITed function is invoked (see
ldo.c for how luaD_precall() handles this */
raviV_add_profiledata(op);
return b; /* external invocation: return */
}
else { /* invocation via reentry: continue execution */
@ -1363,6 +1367,7 @@ int luaV_execute (lua_State *L) {
if (b) L->top = ci->top;
lua_assert(isLua(ci));
lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL);
raviV_add_profiledata(op);
goto newframe; /* restart luaV_execute over new Lua function */
}
}

@ -0,0 +1,57 @@
/*
 * Simple bytecode profiler: attributes elapsed wall-clock time to opcodes.
 *
 * Timing is only implemented on Windows (QueryPerformanceCounter). On other
 * platforms the functions are defined as no-ops so that the unconditional
 * calls made by the interpreter still link.
 */
#include "ravi_profile.h"

#include <stdint.h>
#include <stdio.h>

/* Time attributed to each opcode, in nanoseconds, indexed by OpCode. */
unsigned long long raviV_profiledata[NUM_OPCODES];

#ifdef _WIN32

#include <windows.h>

/* The number of nanoseconds in one second. */
#define UV__NANOSEC 1000000000

/* Interval (in seconds) of the high-resolution clock; 0 means unsupported. */
static double hrtime_interval_ = 0;

/* Timestamp (in ns) of the previous sample; 0 until a sample has been taken. */
static uint64_t prev_time = 0;

/* Queries the high-resolution timer frequency and caches its reciprocal. */
void raviV_init_profiledata(void) {
  LARGE_INTEGER perf_frequency;

  /* Retrieve high-resolution timer frequency
   * and precompute its reciprocal.
   */
  if (QueryPerformanceFrequency(&perf_frequency)) {
    hrtime_interval_ = 1.0 / perf_frequency.QuadPart;
  } else {
    hrtime_interval_ = 0;
  }
}

/*
 * Adds the time elapsed since the previous call to the accumulator of
 * 'opcode'. The first call only records the starting timestamp. Does nothing
 * when the performance counter is unavailable.
 */
void raviV_add_profiledata(OpCode opcode) {
  LARGE_INTEGER counter;
  uint64_t this_time;

  /* If the performance interval is zero, there's no support. */
  if (hrtime_interval_ == 0) {
    return;
  }

  if (!QueryPerformanceCounter(&counter)) {
    return;
  }

  /* Because we have no guarantee about the order of magnitude of the
   * performance counter interval, integer math could cause this computation
   * to overflow. Therefore we resort to floating point math.
   */
  this_time = (uint64_t) ((double) counter.QuadPart * hrtime_interval_ * UV__NANOSEC);
  raviV_profiledata[opcode] += (this_time - (prev_time == 0 ? this_time : prev_time));
  prev_time = this_time;
}

/* Prints one "PerfStat [name] nanoseconds" line per opcode to stdout. */
void raviV_print_profiledata(void) {
  int i;
  for (i = 0; i < NUM_OPCODES; i++) {
    printf("PerfStat [%s] %llu\n", luaP_opnames[i], raviV_profiledata[i]);
  }
}

#else /* !_WIN32 */

/* No timing source is implemented on this platform: nothing to set up. */
void raviV_init_profiledata(void) {
}

/* No timing source is implemented on this platform: nothing is recorded. */
void raviV_add_profiledata(OpCode opcode) {
  (void) opcode;
}

/* Nothing was recorded, so nothing is printed. */
void raviV_print_profiledata(void) {
}

#endif
Loading…
Cancel
Save