issue #40: generate inline code for comparison ops when types are known

9 years ago · d918977470
parent bdbabd4781
commit d918977470
4 changed files with 97 additions and 34 deletions
--- a/include/ravi_llvmcodegen.h
+++ b/include/ravi_llvmcodegen.h
@ -931,7 +931,7 @@ public:
  // j must be the jump target (offset of the code to which we need to jump to)
  // jA must be the A operand of the jump instruction
  void emit_EQ(RaviFunctionDef *def, int A, int B, int C, int j, int jA,
-               llvm::Constant *callee);
+               llvm::Constant *callee, OpCode opCode);

  // OP_TEST is followed by a OP_JMP instruction - both are handled
  // together
--- a/readthedocs/ravi-benchmarks.rst
+++ b/readthedocs/ravi-benchmarks.rst
@ -1,6 +1,5 @@
 Ravi Performance Benchmarks
 ===========================
-
 Ravi's reason for existence is to achieve greater performance than standard Lua 5.3. Hence performance benchmarks are of interest.

 The programs used in the performance testing can be found at `Ravi Tests <https://github.com/dibyendumajumdar/ravi/tree/master/ravi-tests>`_ folder.
@ -14,9 +13,9 @@ The programs used in the performance testing can be found at `Ravi Tests <https:
 +---------------+---------+------------+------------+
 |fornum_test3   | 53.932  | 4.598      | 7.778      |
 +---------------+---------+------------+------------+
-|mandel(4000)   | 21.247  | 2.936      | 1.633      |
+|mandel(4000)   | 21.247  | 1.582      | 1.633      |
 +---------------+---------+------------+------------+
-|fannkuchen(11) | 63.446  | 8.317      | 4.751      |
+|fannkuchen(11) | 63.446  | 4.55       | 4.751      |
 +---------------+---------+------------+------------+
 |matmul(1000)   | 34.604  | 2.942      | 0.968      |
 +---------------+---------+------------+------------+
@ -24,11 +23,11 @@ The programs used in the performance testing can be found at `Ravi Tests <https:
 Following points are worth bearing in mind when looking at above benchmarks.

 1. Luajit uses an optimized representation of double values. In Lua 5.3 and
-   in Ravi, a value is 16 bytes - and floating point operations require two loads
-   / two stores. Luajit has a performance advantage when it comes to floating 
-   point operations due to this.
+   in Ravi, a value is 16 bytes - and floating point operations require two
+   loads / two stores. 

-2. More work is needed to optimize numeric operations in Ravi.
+2. More work is needed to optimize numeric operations in Ravi (such as
+   using the C stack for temporaries).

 3. Luajit compilation approach ensures that it can use information about 
   the actual execution path taken by the code at runtime whereas Ravi
@ -48,33 +47,48 @@ There are a number of improvements possible. Below are some of my thoughts.

 Allocating variables on C stack
 -------------------------------
-Certain local and temporary variables that hold numeric values could be allocated on the C stack avoiding the overhead of accessing the Lua stack. This requires implementing escape analysis to determine which variables are safe to be allocated on the C stack.
+Certain local and temporary variables that hold numeric values could be
+allocated on the C stack avoiding the overhead of accessing the Lua stack.
+This requires implementing escape analysis to determine which variables are
+safe to be allocated on the C stack.

 Optimizing Fornum loops
 -----------------------
 The Lua fornum loops create an `extra "external" variable <http://www.lua.org/manual/5.3/manual.html#3.3.5>`_ that has the name given by the user. 
-However an internal variable is actually used as the loop index. The external variable is updated at every iteration - this entails several IR 
-instructions. The obvious optimization is to eliminate this variable by making the loop index available as a readonly value. If for backward 
-compatiblity it is necessary to allow updates to the external variable then a compromise would be analyse the Lua program and only create the
-external variable if necessary.
-
-The Fornum loop needs to handle four different scenarios, resulting from the type of the index variable and whether the loop increments or decrements. 
-The generated code is not very efficient due to branching. The common case of integer index with constant step can be specialized for greater
-performance. I have implemented the case when index is an integer and the step size is a positive constant. This seems to be the most common case.
+However an internal variable is actually used as the loop index. The external
+variable is updated at every iteration. An obvious optimization is to eliminate
+this variable by making the loop index available as a readonly value.
+If for backward compatibility it is necessary to allow updates to the external
+variable then a compromise would be to analyse the Lua program and only create
+the external variable if necessary.
+
+The Fornum loop needs to handle four different scenarios, resulting from
+the type of the index variable and whether the loop increments or decrements. 
+The common case of integer index with constant step can be specialized
+for greater performance. I have implemented the case when index is an integer
+and the step size is a positive constant. This seems to be the most common case.

 The Value Storage
 -----------------
-In Lua the type of the value and the data associated with a value are stored in separate fields. Luajit however overlays the storage by utilizing
-the `technique known as NaN tagging <http://lua-users.org/lists/lua-l/2009-11/msg00089.html>`_. The Luajit model is not suited for Lua 5.3 as in this version 64-int integers are natively supported by Lua. 
+In Lua the type of the value and the data associated with a value are stored
+in separate fields. Luajit however overlays the storage by utilizing
+the `technique known as NaN tagging <http://lua-users.org/lists/lua-l/2009-11/msg00089.html>`_. The Luajit model is not suited for Lua 5.3 as in this version
+64-bit integers are natively supported by Lua.
+
+There is however still a possibility that NaN tagging can be used.
+The following scheme should work.

-There is however still a possibility that NaN tagging can be used to improve performance of values that hold doubles. The following scheme should work.
+.. note::
+   I have tested the following approach and found that it does not help
+   performance.

-Let the first 8 bytes hold a double value. And let the other values be held in the second 8 bytes.
-Then the NaN tagging technique can be used to overlay the type information with the double part.
-This would allow operations involving doubles to be faster as an extra step to set the type can be avoided. This would mean greater
-performance in floating point operations which are important in many domains.
+Let the first 8 bytes hold a double value. And let the other values be
+held in the second 8 bytes. Then the NaN tagging technique can be used to
+overlay the type information with the double part. This would allow operations
+involving doubles to be faster as they avoid the extra step of setting the type.
+However other types including integers will be penalised.

-Above scheme has the additional advantage that it can be extended to support complex numbers.
+The above scheme can be extended to support complex numbers.

 * First 8 bytes could be a double representing the real part.
 * Second 8 bytes could be a double representing the imaginary part.
@ -83,6 +97,4 @@ If a value is a not a complex number then the real part will either be
 NaN, or if the real part is a double then the imaginary part will be a
 NaN.

-The problem of course is that NaN tagging may not be viable in mainstream Lua as it is probably a non-portable technique. It could also 
-introduce incompatibility between Lua and Ravi especially if Ravi supported complex numbers.

--- a/src/ravi_llvmcodegen.cpp
+++ b/src/ravi_llvmcodegen.cpp
@ -1215,6 +1215,7 @@ void RaviCodeGenerator::compile(lua_State *L, Proto *p, bool doDump,
    case OP_EQ: {
      int B = GETARG_B(i);
      int C = GETARG_C(i);
+      OpCode compOperator = op;
      llvm::Constant *comparison_function =
          ((op == OP_EQ || op == OP_RAVI_EQ_II || op == OP_RAVI_EQ_FF)
               ? def->luaV_equalobjF
@ -1228,7 +1229,7 @@ void RaviCodeGenerator::compile(lua_State *L, Proto *p, bool doDump,
      int sbx = GETARG_sBx(i);
      // j below is the jump target
      int j = sbx + pc + 1;
-      emit_EQ(def, A, B, C, j, GETARG_A(i), comparison_function);
+      emit_EQ(def, A, B, C, j, GETARG_A(i), comparison_function, compOperator);
    } break;
    case OP_TFORCALL: {
      int B = GETARG_B(i);
--- a/src/ravi_llvmcomp.cpp
+++ b/src/ravi_llvmcomp.cpp
@ -27,7 +27,7 @@ namespace ravi {
 // Although the name is EQ this actually
 // implements EQ, LE and LT - by using the supplied lua function to call.
 void RaviCodeGenerator::emit_EQ(RaviFunctionDef *def, int A, int B, int C,
-                                int j, int jA, llvm::Constant *callee) {
+                                int j, int jA, llvm::Constant *callee, OpCode opCode) {
  //  case OP_EQ: {
  //    TValue *rb = RKB(i);
  //    TValue *rc = RKC(i);
@ -43,16 +43,66 @@ void RaviCodeGenerator::emit_EQ(RaviFunctionDef *def, int A, int B, int C,
  emit_load_base(def);

  // Get pointer to register B
-  llvm::Value *lhs_ptr = emit_gep_register_or_constant(def, B);
+  llvm::Value *regB = emit_gep_register_or_constant(def, B);
  // Get pointer to register C
-  llvm::Value *rhs_ptr = emit_gep_register_or_constant(def, C);
+  llvm::Value *regC = emit_gep_register_or_constant(def, C);
+
+  llvm::Value *result = NULL;
+  switch (opCode) {
+
+  case OP_RAVI_LT_II:
+  case OP_RAVI_LE_II:
+  case OP_RAVI_EQ_II: {
+    llvm::Instruction *p1 = emit_load_reg_i(def, regB);
+    llvm::Instruction *p2 = emit_load_reg_i(def, regC);
+
+    switch (opCode) {
+    case OP_RAVI_EQ_II:
+      result = def->builder->CreateICmpEQ(p1, p2, "EQ_II_result");
+      break;
+    case OP_RAVI_LT_II:
+      result = def->builder->CreateICmpSLT(p1, p2, "LT_II_result");
+      break;
+    case OP_RAVI_LE_II:
+      result = def->builder->CreateICmpSLE(p1, p2, "LE_II_result");
+      break;
+    }
+    result = def->builder->CreateZExt(result, def->types->C_intT, "II_result_int");
+
+  } break;
+
+  case OP_RAVI_LT_FF:
+  case OP_RAVI_LE_FF:
+  case OP_RAVI_EQ_FF: {
+    // Operand types are known for the *_FF opcodes, so compare the values
+    // loaded from registers B and C inline instead of calling `callee`.
+    // NOTE(review): emit_load_reg_n presumably loads the lua_Number (double)
+    // payload of the register - confirm against its definition.
+    llvm::Instruction *p1 = emit_load_reg_n(def, regB);
+    llvm::Instruction *p2 = emit_load_reg_n(def, regC);
+
+    // All three comparisons must use *ordered* predicates: an ordered
+    // compare is false when either operand is NaN, which matches C's
+    // ==, < and <= on doubles. The unordered forms (ULT/ULE) return true
+    // for NaN operands and would make e.g. `nan < 1.0` succeed.
+    switch (opCode) {
+    case OP_RAVI_EQ_FF:
+      result = def->builder->CreateFCmpOEQ(p1, p2, "EQ_FF_result");
+      break;
+    case OP_RAVI_LT_FF:
+      result = def->builder->CreateFCmpOLT(p1, p2, "LT_FF_result");
+      break;
+    case OP_RAVI_LE_FF:
+      result = def->builder->CreateFCmpOLE(p1, p2, "LE_FF_result");
+      break;
+    }
+    // Widen the i1 comparison result to a C int so it can be compared with
+    // operand A in the same way as the value returned by the function call.
+    result = def->builder->CreateZExt(result, def->types->C_intT, "FF_result_int");
+
+  } break;
+
+  default:
+    // Call luaV_equalobj with register B and C
+    result =
+      CreateCall3(def->builder, callee, def->L, regB, regC);
+  }

-  // Call luaV_equalobj with register B and C
-  llvm::Value *result =
-      CreateCall3(def->builder, callee, def->L, lhs_ptr, rhs_ptr);
  // Test if result is equal to operand A
  llvm::Value *result_eq_A = def->builder->CreateICmpEQ(
      result, llvm::ConstantInt::get(def->types->C_intT, A));
+  
+  
  // If result == A then we need to execute the next statement which is a jump
  llvm::BasicBlock *then_block =
      llvm::BasicBlock::Create(def->jitState->context(), "if.then", def->f);