You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ravi/ravicomp/src/lexer.c

1013 lines
29 KiB

/******************************************************************************
* Copyright (C) 2020-2021 Dibyendu Majumdar
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
/**
* The lexer is basically a hacked version of Lua 5.3 lexer.
* Copyright (C) 1994-2019 Lua.org, PUC-Rio.
*/
#include "fnv_hash.h"
#include "parser.h"
#include <limits.h>
#include <locale.h>
#include <math.h>
#include <stdio.h>
enum { EOZ = -1 }; /* end of stream */
/*
** Casting helpers.
** The value argument is parenthesised so that an expression argument is
** converted as a whole. With the unparenthesised form '((t)v)', a use such
** as cast_int(LUA_MAXINTEGER % 10) expands to '(int)LUA_MAXINTEGER % 10',
** which converts the left operand first and only then applies '%'.
*/
#define cast(t, v) ((t)(v))
#define cast_int(v) cast(int, v)
#define cast_uchar(c) cast(unsigned char, c)
#define cast_num(n) cast(lua_Number, n)
/* converts an unsigned integer value to lua_Integer */
#define l_castU2S(i) ((lua_Integer)(i))
/*
** Returns the next input byte (as an unsigned char value) and advances the
** read position, or EOZ when no bytes remain.
** The remaining-byte counter is decremented only when a byte is actually
** consumed, so it stays at zero once the input is exhausted. The earlier
** 'z->n-- > 0' form decremented on every call; if 'n' is an unsigned type
** (it is initialised from a size_t length, but its declaration is not in
** this file) the counter would wrap after the first EOZ and a later call
** would read past the end of the buffer.
*/
static inline int zgetc(LexerState *z)
{
    if (z->n > 0) {
        z->n--;
        return cast_uchar(*z->p++);
    }
    return EOZ;
}
/* Reads the next input character into ls->current. */
static inline void next(LexerState *ls) { ls->current = zgetc(ls); }
/* True when the current character is '\n' or '\r'. */
static inline bool currIsNewline(LexerState *ls) { return ls->current == '\n' || ls->current == '\r'; }
/* First character of the decimal-point string reported by localeconv(). */
static inline char lua_getlocaledecpoint(void) { return localeconv()->decimal_point[0]; }
/* Number of elements in a true array (not a pointer), as an int. */
#define ARRAY_SIZE(array) ((int)(sizeof(array) / sizeof((array)[0])))
/*
** Printable text of every multi-character token. raviX_token2str indexes
** this table with (token - FIRST_RESERVED).
** Note: Following array was generated using utils/tokenstr.h
*/
static const char *const luaX_tokens[] = {
    /* reserved words */
    "and", "break", "do", "else", "elseif", "end", "false", "for",
    "function", "goto", "if", "in", "local", "defer", "nil", "not",
    "or", "repeat", "return", "then", "true", "until", "while",
    /* multi-character operators */
    "//", "..", "...", "==", ">=", "<=", "~=", "<<", ">>", "::",
    /* Ravi cast operators */
    "@integer", "@number", "@integer[]", "@number[]", "@table", "@string", "@closure",
    /* other terminal symbols */
    "<eof>", "<number>", "<integer>", "<name>", "<string>",
};
/*
** Returns the 'reserved' attribute of a string object. raviX_create_string
** sets it to the index of the matching luaX_tokens entry for a Lua/Ravi
** keyword, and to -1 otherwise.
*/
static inline int is_reserved(const StringObject *s)
{
    return s->reserved;
}
/* Bit positions of the character properties stored in luai_ctype_. */
enum { ALPHABIT = 0, DIGITBIT = 1, PRINTBIT = 2, SPACEBIT = 3, XDIGITBIT = 4 };
/* Single-bit mask for property bit position B. */
#define MASK(B) (1 << (B))
/*
** Character property table. Entry 0 belongs to EOZ (-1); the entry for byte
** value c is at index c + 1 (see testprop). The comment at the end of each
** row gives the range of byte values that row covers.
*/
static const lu_byte luai_ctype_[UCHAR_MAX + 2] = {
0x00, /* EOZ */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x17 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 0x18-0x27 */
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 0x28-0x37 */
0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 0x38-0x47 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 0x48-0x57 */
0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05, 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 0x58-0x67 */
0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 0x68-0x77 */
0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x87 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x97 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0xa7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xb7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xc7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xd7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xe7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xf7 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
};
/*
** Tests property mask 'p' for character 'c'.
** add 1 to char to allow index -1 (EOZ)
*/
#define testprop(c, p) (luai_ctype_[(c) + 1] & (p))
/*
** Character classification predicates backed by the luai_ctype_ table.
** Each takes a character code (or EOZ) and reports whether the requested
** property bit is set.
** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_'
*/
static inline bool lislalpha(int c)
{
    return testprop(c, MASK(ALPHABIT)) != 0;
}
static inline bool lislalnum(int c)
{
    const int wanted = MASK(ALPHABIT) | MASK(DIGITBIT);
    return testprop(c, wanted) != 0;
}
static inline bool lisdigit(int c)
{
    return testprop(c, MASK(DIGITBIT)) != 0;
}
static inline bool lisspace(int c)
{
    return testprop(c, MASK(SPACEBIT)) != 0;
}
// static inline bool lisprint(int c) { return testprop(c, MASK(PRINTBIT)); }
static inline bool lisxdigit(int c)
{
    return testprop(c, MASK(XDIGITBIT)) != 0;
}
/*
** Converts an alphabetic character to lower case by setting the bit in
** which 'A' and 'a' differ.
** this 'ltolower' only works for alphabetic characters
*/
static inline int ltolower(int c)
{
    const int case_bit = 'A' ^ 'a';
    return c | case_bit;
}
/*
Creates (or finds) an interned string object. String objects are interned in
the compiler state's hash set, so equal byte sequences share one object.

  compiler_state  owner of the string set and of the allocators used here
  input           bytes of the string; may contain embedded 0 characters
  len             number of bytes in 'input'

Returns the interned object. Its 'hash' attribute holds the hash of the bytes.
Its 'reserved' attribute is the index in luaX_tokens of the keyword (or '@'
cast operator) the string is equal to, or -1 when it matches none.

Keywords are plain C strings, so a candidate can only be a keyword when its
explicit length equals the keyword's length; comparing with strcmp alone would
also flag a longer string whose bytes up to an embedded 0 spell a keyword.
*/
const StringObject *raviX_create_string(CompilerState *compiler_state, const char *input, uint32_t len)
{
    StringObject temp = {.len = len, .reserved = -1, .hash = fnv1_hash_data(input, len), .str = input};
    SetEntry *entry = raviX_set_search_pre_hashed(compiler_state->strings, temp.hash, &temp);
    if (entry != NULL)
        return (StringObject *)entry->key; /* found the string */

    StringObject *new_string =
        (StringObject *)raviX_allocator_allocate(&compiler_state->string_object_allocator, 0);
    char *s = (char *)raviX_allocator_allocate(&compiler_state->string_allocator, len + 1); /* allow for 0 terminator */
    memcpy(s, input, len);
    s[len] = 0; /* 0 terminate string, however string may contain embedded 0 characters */
    new_string->str = s;
    new_string->len = len;
    new_string->hash = temp.hash;
    new_string->reserved = -1;
    /* Check if this is a keyword, linear search is okay as we do this only when we first
     * encounter a keyword
     */
    for (int i = 0; i < ARRAY_SIZE(luaX_tokens); i++) {
        const char *kw = luaX_tokens[i];
        /* only entries that start with a letter, '_' or '@' are considered */
        if (!(lislalpha(kw[0]) || kw[0] == '@'))
            continue;
        if (strlen(kw) == len && memcmp(kw, s, len) == 0) {
            new_string->reserved = i; /* save index of the keyword */
            break;
        }
    }
    raviX_set_add_pre_hashed(compiler_state->strings, temp.hash, new_string);
    return new_string;
}
#define lua_str2number(s, p) ((lua_Number)strtod((s), (p)))
static void save(LexerState *ls, int c);
/*
** Appends a printable form of 'token' to the text buffer 'mb'.
** Tokens below FIRST_RESERVED are single-byte symbols and are written as
** the quoted character. Other tokens are looked up in luaX_tokens: those
** below TOK_EOS (fixed format - reserved keywords) are written quoted, the
** rest (names, strings, and numerals - note that @<usertype> is covered
** here) are written as-is.
*/
void raviX_token2str(int token, TextBuffer *mb)
{
    if (token >= FIRST_RESERVED) {
        const char *text = luaX_tokens[token - FIRST_RESERVED];
        if (token >= TOK_EOS)
            raviX_buffer_add_string(mb, text);
        else
            raviX_buffer_add_fstring(mb, "'%s'", text);
        return;
    }
    /* single-byte symbols */
    assert(token == cast_uchar(token));
    raviX_buffer_add_fstring(mb, "'%c'", token);
}
/*
** Appends a description of 'token' to the container's error message.
** For names, strings and numerals the text currently held in the token
** buffer is used (after 0-terminating it); any other token is rendered by
** raviX_token2str.
*/
static void txtToken(LexerState *ls, int token)
{
    const int has_text =
        (token == TOK_NAME || token == TOK_STRING || token == TOK_FLT || token == TOK_INT);
    if (has_text) {
        save(ls, '\0');
        raviX_buffer_add_fstring(&ls->container->error_message, "'%s'", raviX_buffer_data(ls->buff));
    } else {
        raviX_token2str(token, &ls->container->error_message);
    }
}
/*
** Records a lexical error and transfers control to the container's jump
** buffer; this function does not return to its caller.
** The message has the form "<source>(<line>): <msg>", followed by
** " near <token text>" when 'token' is non-zero.
*/
static void lexerror(LexerState *ls, const char *msg, int token)
{
    TextBuffer *err = &ls->container->error_message;
    raviX_buffer_add_fstring(err, "%s(%d): %s", ls->source, ls->linenumber, msg);
    if (token != 0) {
        raviX_buffer_add_string(err, " near ");
        txtToken(ls, token);
    }
    longjmp(ls->container->env, 1);
}
/*
** Raises a lexical error with message 'msg', reported near the current
** token (ls->t.token). Control leaves through lexerror.
*/
void raviX_syntaxerror(LexerState *ls, const char *msg)
{
    const int current_token = ls->t.token;
    lexerror(ls, msg, current_token);
}
/*
** Appends character 'c' to the token buffer, growing the buffer first when
** it is full: an empty buffer grows to 32, otherwise the size is doubled.
** Raises "lexical element too long" when the buffer size has already
** reached INT_MAX / 2.
*/
static void save(LexerState *ls, int c)
{
    TextBuffer *buf = ls->buff;
    const int is_full = raviX_buffer_len(buf) + 1 > raviX_buffer_size(buf);
    if (is_full) {
        if (raviX_buffer_size(buf) >= INT_MAX / 2)
            lexerror(ls, "lexical element too long", 0);
        size_t capacity = raviX_buffer_size(buf);
        size_t grown = (capacity == 0) ? 32 : capacity * 2;
        raviX_buffer_resize(buf, grown);
    }
    raviX_buffer_addc(buf, c);
}
/* Stores the current character in the token buffer, then reads the next one. */
static inline void save_and_next(LexerState *ls)
{
    const int ch = ls->current;
    save(ls, ch);
    next(ls);
}
/*
** Interns the 'l' bytes starting at 'str' in the lexer's container and
** returns the resulting string object.
*/
static const StringObject *luaX_newstring(LexerState *ls, const char *str, uint32_t l)
{
    CompilerState *owner = ls->container;
    return raviX_create_string(owner, str, l);
}
/*
** Consumes one newline sequence (any of \n, \r, \n\r, or \r\n) and bumps
** the line counter. Must be called with a newline as the current character.
** Raises "chunk has too many lines" when the counter reaches INT_MAX.
*/
static void inclinenumber(LexerState *ls)
{
    const int first = ls->current;
    assert(currIsNewline(ls));
    next(ls); /* skip '\n' or '\r' */
    /* a second, different newline character belongs to the same line break */
    if (ls->current != first && currIsNewline(ls))
        next(ls);
    ls->linenumber++;
    if (ls->linenumber >= INT_MAX)
        lexerror(ls, "chunk has too many lines", 0);
}
/*
** Allocates and initialises a lexer over the 'buflen' bytes at 'buf'.
**   container  compiler state that owns interned strings, the token buffer
**              and the error message
**   source     name used as the prefix of error messages
** The first input character is read into 'current', line counters start at
** 1, and no look-ahead token is pending. The LUA_ENV name and the first
** NUM_RESERVED entries of luaX_tokens are interned up front.
** The returned object is released with raviX_destroy_lexer.
*/
LexerState *raviX_init_lexer(CompilerState *container, const char *buf, size_t buflen,
                             const char *source)
{
    LexerState *ls = (LexerState *)raviX_calloc(1, sizeof(LexerState));

    /* owner, diagnostics and scratch buffer */
    ls->container = container;
    ls->source = source;
    ls->buff = &container->buff;

    /* input window; must be set up before the first zgetc */
    ls->buf = buf;
    ls->bufsize = buflen;
    ls->n = ls->bufsize;
    ls->p = ls->buf;
    ls->current = zgetc(ls);

    /* token state */
    ls->t.token = 0;
    ls->lookahead.token = TOK_EOS; /* no look-ahead token */
    ls->linenumber = 1;
    ls->lastline = 1;

    ls->envn = raviX_create_string(ls->container, LUA_ENV, (uint32_t)strlen(LUA_ENV))->str; /* get env name */

    int i = 0;
    while (i < NUM_RESERVED) {
        const char *word = luaX_tokens[i];
        raviX_create_string(ls->container, word, (uint32_t)strlen(word));
        i++;
    }
    return ls;
}
/* Releases a lexer created by raviX_init_lexer; a NULL argument is ignored. */
void raviX_destroy_lexer(LexerState *ls)
{
    if (ls != NULL)
        raviX_free(ls);
}
/*
** Returns the lexer state viewed through the read-only LexerInfo type.
** The pointer is the same address as 'ls', reinterpreted by a cast.
*/
const LexerInfo *raviX_get_lexer_info(LexerState *ls)
{
    const LexerInfo *info = (LexerInfo *)ls;
    return info;
}
/*
** =======================================================
** LEXICAL ANALYZER
** =======================================================
*/
/*
** If the current character equals 'c', skips it (without saving) and
** returns 1; otherwise leaves the input untouched and returns 0.
*/
static int check_next1(LexerState *ls, int c)
{
    if (ls->current != c)
        return 0;
    next(ls);
    return 1;
}
/*
** If the current character equals 'c', saves it in the token buffer,
** advances and returns 1; otherwise returns 0 without consuming input.
*/
static int check_save_next1(LexerState *ls, int c)
{
    if (ls->current != c)
        return 0;
    save_and_next(ls);
    return 1;
}
/*
** Checks whether the current character is one of the two characters in
** 'set' (a string of exactly two characters). On a match the character is
** saved, the input advances and 1 is returned; otherwise 0 is returned.
*/
static int check_next2(LexerState *ls, const char *set)
{
    assert(set[2] == '\0');
    const int matches = (ls->current == set[0] || ls->current == set[1]);
    if (!matches)
        return 0;
    save_and_next(ls);
    return 1;
}
/*
** Numeric value of hexadecimal digit 'c'. The result is only meaningful
** when 'c' is a hexadecimal digit; callers check that beforehand.
*/
static int luaO_hexavalue(int c)
{
    return lisdigit(c) ? (c - '0') : ((ltolower(c) - 'a') + 10);
}
/*
** Consumes an optional sign at *s. Advances *s past a leading '-' or '+'
** and returns 1 only when the sign was '-'; returns 0 otherwise.
*/
static int isneg(const char **s)
{
    const char sign = **s;
    if (sign == '-' || sign == '+')
        (*s)++;
    return sign == '-';
}
/*
** {==================================================================
** Lua's implementation for 'lua_strx2number'
** ===================================================================
*/
#if !defined(lua_strx2number)
/* maximum number of significant digits to read (to avoid overflows
even with single floats) */
#define MAXSIGDIG 30
/*
** convert an hexadecimal numeric string to a number, following
** C99 specification for 'strtod'
**
** 's' is the text to convert; '*endptr' receives the position just past
** the recognised numeral, or 's' itself when nothing valid was found (in
** which case 0.0 is returned).
**
** The explicit binary exponent is accumulated with saturation: once it
** exceeds a bound well beyond any representable exponent, further digits
** are consumed but no longer added, so an over-long exponent such as
** "0x1p99999999999" cannot overflow a signed int. For any realistic
** numeral the saturated exponent still makes ldexp overflow to infinity
** or underflow to zero.
*/
static lua_Number lua_strx2number(const char *s, char **endptr)
{
    /* largest accumulator value to which one more decimal digit may be appended */
    enum { EXP_ACC_LIMIT = (INT_MAX / 4 - 9) / 10 };
    int dot = lua_getlocaledecpoint();
    lua_Number r = 0.0;  /* result (accumulator) */
    int sigdig = 0;      /* number of significant digits */
    int nosigdig = 0;    /* number of non-significant digits */
    int e = 0;           /* exponent correction */
    int neg;             /* 1 if number is negative */
    int hasdot = 0;      /* true after seen a dot */
    *endptr = (char *)s; /* nothing is valid yet */
    while (lisspace(cast_uchar(*s)))
        s++;         /* skip initial spaces */
    neg = isneg(&s); /* check signal */
    if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */
        return 0.0; /* invalid format (no '0x') */
    for (s += 2;; s++) { /* skip '0x' and read numeral */
        if (*s == dot) {
            if (hasdot)
                break; /* second dot? stop loop */
            else
                hasdot = 1;
        } else if (lisxdigit(cast_uchar(*s))) {
            if (sigdig == 0 && *s == '0') /* non-significant digit (zero)? */
                nosigdig++;
            else if (++sigdig <= MAXSIGDIG) /* can read it without overflow? */
                r = (r * cast_num(16.0)) + luaO_hexavalue(*s);
            else
                e++; /* too many digits; ignore, but still count for exponent */
            if (hasdot)
                e--; /* decimal digit? correct exponent */
        } else
            break; /* neither a dot nor a digit */
    }
    if (nosigdig + sigdig == 0) /* no digits? */
        return 0.0;         /* invalid format */
    *endptr = (char *)s;        /* valid up to here */
    e *= 4;                     /* each digit multiplies/divides value by 2^4 */
    if (*s == 'p' || *s == 'P') { /* exponent part? */
        int exp1 = 0;     /* exponent value */
        int neg1;         /* exponent signal */
        s++;              /* skip 'p' */
        neg1 = isneg(&s); /* signal */
        if (!lisdigit(cast_uchar(*s)))
            return 0.0; /* invalid; must have at least one digit */
        while (lisdigit(cast_uchar(*s))) { /* read exponent */
            int digit = *(s++) - '0';
            if (exp1 <= EXP_ACC_LIMIT) /* still room? otherwise saturate */
                exp1 = exp1 * 10 + digit;
        }
        if (neg1)
            exp1 = -exp1;
        e += exp1;
        *endptr = (char *)s; /* valid up to here */
    }
    if (neg)
        r = -r;
    return (lua_Number)ldexp(r, e);
}
#endif
/* }====================================================== */
/* maximum length of a numeral */
#if !defined(L_MAXLENNUM)
#define L_MAXLENNUM 200
#endif
/*
** Converts 's' to a float using the current locale's radix mark.
** 'mode' is 'x' for a hexadecimal numeral (handled by lua_strx2number);
** any other value selects lua_str2number. The value is stored in '*result'.
** Returns the address of the terminating '\0' when the whole string (apart
** from trailing spaces) was consumed, or NULL otherwise.
*/
static const char *l_str2dloc(const char *s, lua_Number *result, int mode)
{
    char *stop;
    if (mode == 'x')
        *result = lua_strx2number(s, &stop);
    else
        *result = lua_str2number(s, &stop);
    if (stop == s)
        return NULL; /* nothing recognized? */
    while (lisspace(cast_uchar(*stop)))
        stop++; /* skip trailing spaces */
    if (*stop != '\0')
        return NULL; /* trailing characters */
    return stop;
}
/*
** Convert string 's' to a Lua number (put in 'result'). Return NULL
** on fail or the address of the ending '\0' on success.
** The first of the characters ".xXnN" found in the string selects the mode:
** - 'x'/'X' means an hexadecimal numeral
** - 'n'/'N' means 'inf' or 'nan' (which should be rejected)
** - '.' just optimizes the search for the common case (nothing special)
** This function accepts both the current locale or a dot as the radix
** mark. If the conversion fails, it may mean number has a dot but
** locale accepts something else. In that case, the code copies 's'
** to a buffer (because 's' is read-only), changes the dot to the
** current locale radix mark, and tries to convert again.
*/
static const char *l_str2d(const char *s, lua_Number *result)
{
    const char *special = strpbrk(s, ".xXnN");
    int mode = 0;
    if (special != NULL)
        mode = ltolower(cast_uchar(*special));
    if (mode == 'n') /* reject 'inf' and 'nan' */
        return NULL;

    const char *end = l_str2dloc(s, result, mode); /* try to convert */
    if (end != NULL)
        return end;

    /* failed? may be a different locale */
    const char *pdot = strchr(s, '.');
    if (strlen(s) > L_MAXLENNUM || pdot == NULL)
        return NULL; /* string too long or no dot; fail */
    char buff[L_MAXLENNUM + 1];
    strcpy(buff, s);                          /* copy string to buffer */
    buff[pdot - s] = lua_getlocaledecpoint(); /* correct decimal point */
    end = l_str2dloc(buff, result, mode);     /* try again */
    if (end == NULL)
        return NULL;
    return s + (end - buff); /* make relative to 's' */
}
/*
** Limits used to detect overflow while reading a decimal integer: the
** largest value that may still be multiplied by 10, and the last digit of
** LUA_MAXINTEGER. Both are written with the whole expression inside the
** cast; '(int)LUA_MAXINTEGER % 10' would convert LUA_MAXINTEGER to int
** before taking the remainder.
*/
#define MAXBY10 ((lua_Unsigned)(LUA_MAXINTEGER / 10))
#define MAXLASTD ((int)(LUA_MAXINTEGER % 10))
/*
** Converts 's' to an integer (put in 'result'). Accepts optional
** surrounding spaces, an optional sign, and either a decimal numeral or a
** hexadecimal numeral prefixed by '0x'/'0X'. Hexadecimal values wrap
** around; a decimal value that does not fit is rejected so that the caller
** can read it as a float instead.
** Returns the address of the ending '\0' on success, or NULL when the
** string is not a valid integer numeral.
*/
static const char *l_str2int(const char *s, lua_Integer *result)
{
    lua_Unsigned a = 0;
    int empty = 1;
    int neg;
    while (lisspace(cast_uchar(*s)))
        s++; /* skip initial spaces */
    neg = isneg(&s);
    if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) { /* hex? */
        s += 2;                                    /* skip '0x' */
        for (; lisxdigit(cast_uchar(*s)); s++) {
            a = a * 16 + luaO_hexavalue(*s);
            empty = 0;
        }
    } else { /* decimal */
        for (; lisdigit(cast_uchar(*s)); s++) {
            int d = *s - '0';
            /* a negative numeral may use one more unit than a positive one */
            if (a >= MAXBY10 && (a > MAXBY10 || d > MAXLASTD + neg)) /* overflow? */
                return NULL; /* do not accept it (as integer) */
            a = a * 10 + d;
            empty = 0;
        }
    }
    while (lisspace(cast_uchar(*s)))
        s++; /* skip trailing spaces */
    if (empty || *s != '\0')
        return NULL; /* something wrong in the numeral */
    else {
        *result = l_castU2S((neg) ? 0u - a : a);
        return s;
    }
}
/*
** Result of converting a numeral: 'type' is 1 when 'i' holds an integer
** and 2 when 'n' holds a float.
*/
struct konst {
    uint8_t type;
    union {
        lua_Integer i;
        lua_Number n;
    };
};
/*
** Converts the numeral in 's', trying an integer first and a float second,
** and stores the value in 'o'. Returns 0 when the conversion fails,
** otherwise the string size (length plus one for the ending '\0').
*/
static size_t luaO_str2num(const char *s, struct konst *o)
{
    lua_Integer ival;
    lua_Number fval;
    const char *end = l_str2int(s, &ival); /* try as an integer */
    if (end != NULL) {
        o->i = ival;
        o->type = 1;
    } else {
        end = l_str2d(s, &fval); /* else try as a float */
        if (end == NULL)
            return 0; /* conversion failed */
        o->n = fval;
        o->type = 2;
    }
    return (end - s) + 1; /* success; return string size */
}
/* LUA_NUMBER */
/*
** Reads a numeral starting at the current (digit) character and stores its
** value in 'seminfo': returns TOK_INT with seminfo->i set, or TOK_FLT with
** seminfo->r set. Raises "malformed number" when the collected text is not
** a valid numeral.
** this function is quite liberal in what it accepts, as 'luaO_str2num'
** will reject ill-formed numerals.
*/
static int read_numeral(LexerState *ls, SemInfo *seminfo)
{
    struct konst value;
    const char *expo = "Ee";
    const int first = ls->current;
    assert(lisdigit(ls->current));
    save_and_next(ls);
    if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */
        expo = "Pp";
    for (;;) {
        if (check_next2(ls, expo))     /* exponent part? */
            check_next2(ls, "-+"); /* optional exponent sign */
        if (!lisxdigit(ls->current) && ls->current != '.')
            break;
        save_and_next(ls);
    }
    save(ls, '\0');
    if (luaO_str2num(raviX_buffer_data(ls->buff), &value) == 0) /* format error? */
        lexerror(ls, "malformed number", TOK_FLT);
    if (value.type == 1) {
        seminfo->i = value.i;
        return TOK_INT;
    }
    assert(value.type == 2);
    seminfo->r = value.n;
    return TOK_FLT;
}
/*
** skip a sequence '[=*[' or ']=*]'; if sequence is well formed, return
** its number of '='s; otherwise, return a negative number (-1 iff there
** are no '='s after initial bracket)
** The skipped characters are saved in the token buffer. The closing
** bracket itself is only examined, not consumed.
*/
static int skip_sep(LexerState *ls)
{
    const int bracket = ls->current;
    int level = 0;
    assert(bracket == '[' || bracket == ']');
    save_and_next(ls);
    for (; ls->current == '='; level++)
        save_and_next(ls);
    if (ls->current == bracket)
        return level;
    return (-level) - 1;
}
/*
** Reads the body of a long string or long comment whose opening delimiter
** (with 'sep' '=' characters) has been scanned up to, but not including,
** its second '['.
**   seminfo  where to store the interned string; NULL when a long comment
**            is being skipped, in which case the text is not kept
**   sep      number of '=' characters in the delimiter
** A newline immediately after the opening delimiter is skipped, and every
** newline sequence inside the text is stored as a single '\n'.
** Reaching the end of input raises
** "unfinished long string|comment (starting at line N)".
*/
static void read_long_string(LexerState *ls, SemInfo *seminfo, int sep)
{
    int line = ls->linenumber; /* initial line (for error message) */
    save_and_next(ls);         /* skip 2nd '[' */
    if (currIsNewline(ls))     /* string starts with a newline? */
        inclinenumber(ls); /* skip it */
    for (;;) {
        switch (ls->current) {
        case EOZ: { /* error */
            const char *what = (seminfo ? "string" : "comment");
            /* lexerror formats the message into the error buffer before it
               jumps, so a stack buffer is sufficient here */
            char msg[80];
            snprintf(msg, sizeof msg, "unfinished long %s (starting at line %d)", what, line);
            lexerror(ls, msg, TOK_EOS);
            break; /* to avoid warnings */
        }
        case ']': {
            if (skip_sep(ls) == sep) {
                save_and_next(ls); /* skip 2nd ']' */
                goto endloop;
            }
            break;
        }
        case '\n':
        case '\r': {
            save(ls, '\n');
            inclinenumber(ls);
            if (!seminfo)
                raviX_buffer_reset(ls->buff); /* avoid wasting space */
            break;
        }
        default: {
            if (seminfo)
                save_and_next(ls);
            else
                next(ls);
        }
        }
    }
endloop:
    /* the buffer holds both delimiters, each 2 + sep characters long */
    if (seminfo)
        seminfo->ts = luaX_newstring(ls, raviX_buffer_data(ls->buff) + (2 + sep),
                                     (uint32_t)(raviX_buffer_len(ls->buff) - 2 * (2 + sep)));
}
/*
** Escape-sequence check: when condition 'c' is false, raises error 'msg'
** near the string being read. The current character (unless it is EOZ) is
** first added to the buffer so that it appears in the error message.
*/
static void esccheck(LexerState *ls, int c, const char *msg)
{
    if (c)
        return;
    if (ls->current != EOZ)
        save_and_next(ls); /* add current to buffer for error message */
    lexerror(ls, msg, TOK_STRING);
}
/*
** Saves the current character, advances, and returns the value of the new
** current character, which must be a hexadecimal digit (otherwise raises
** "hexadecimal digit expected"). The digit itself is not consumed.
*/
static int gethexa(LexerState *ls)
{
    save_and_next(ls);
    const int is_hex = lisxdigit(ls->current);
    esccheck(ls, is_hex, "hexadecimal digit expected");
    return luaO_hexavalue(ls->current);
}
/*
** Reads the two hexadecimal digits of a '\xXX' escape and returns the byte
** value they denote. The two characters saved while reading are removed
** from the buffer again.
*/
static int readhexaesc(LexerState *ls)
{
    const int high = gethexa(ls);
    const int low = gethexa(ls);
    raviX_buffer_remove(ls->buff, 2); /* remove saved chars from buffer */
    return (high << 4) + low;
}
// static unsigned long readutf8esc (LexerState *ls) {
// unsigned long r;
// int i = 4; /* chars to be removed: '\', 'u', '{', and first digit */
// save_and_next(ls); /* skip 'u' */
// esccheck(ls, ls->current == '{', "missing '{'");
// r = gethexa(ls); /* must have at least one digit */
// while ((save_and_next(ls), lisxdigit(ls->current))) {
// i++;
// r = (r << 4) + luaO_hexavalue(ls->current);
// esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
// }
// esccheck(ls, ls->current == '}', "missing '}'");
// next(ls); /* skip '}' */
// raviX_buffer_remove(ls->buff, i); /* remove saved chars from buffer */
// return r;
//}
//
//
// static void utf8esc (LexerState *ls) {
// char buff[UTF8BUFFSZ];
// int n = luaO_utf8esc(buff, readutf8esc(ls));
// for (; n > 0; n--) /* add 'buff' to string */
// save(ls, buff[UTF8BUFFSZ - n]);
//}
/*
** Reads a decimal escape '\ddd' of up to three digits and returns its
** value. Raises "decimal escape too large" when the value exceeds
** UCHAR_MAX. The digits saved while reading are removed from the buffer.
*/
static int readdecesc(LexerState *ls)
{
    int ndigits = 0;
    int value = 0; /* result accumulator */
    while (ndigits < 3 && lisdigit(ls->current)) { /* read up to 3 digits */
        value = 10 * value + ls->current - '0';
        save_and_next(ls);
        ndigits++;
    }
    esccheck(ls, value <= UCHAR_MAX, "decimal escape too large");
    raviX_buffer_remove(ls->buff, ndigits); /* remove read digits from buffer */
    return value;
}
/*
** Reads a short string literal delimited by 'del', which is the current
** character on entry. The delimiters and, temporarily, the raw text of
** each escape sequence are kept in the token buffer so that error messages
** can show them; every escape is then replaced in the buffer by the
** character it denotes. On success the interned string, without its
** delimiters, is stored in seminfo->ts. An unterminated string (end of
** input or a raw newline) and invalid escapes are reported via lexerror.
*/
static void read_string(LexerState *ls, int del, SemInfo *seminfo)
{
save_and_next(ls); /* keep delimiter (for error messages) */
while (ls->current != del) {
switch (ls->current) {
case EOZ:
lexerror(ls, "unfinished string", TOK_EOS);
break; /* to avoid warnings */
case '\n':
case '\r':
lexerror(ls, "unfinished string", TOK_STRING);
break; /* to avoid warnings */
case '\\': { /* escape sequences */
int c; /* final character to be saved */
save_and_next(ls); /* keep '\\' for error messages */
switch (ls->current) {
case 'a':
c = '\a';
goto read_save;
case 'b':
c = '\b';
goto read_save;
case 'f':
c = '\f';
goto read_save;
case 'n':
c = '\n';
goto read_save;
case 'r':
c = '\r';
goto read_save;
case 't':
c = '\t';
goto read_save;
case 'v':
c = '\v';
goto read_save;
case 'x':
c = readhexaesc(ls);
goto read_save;
// TODO - FIXME
// case 'u': utf8esc(ls); goto no_save;
/* backslash followed by a line break: stored as a single '\n' */
case '\n':
case '\r':
inclinenumber(ls);
c = '\n';
goto only_save;
case '\\':
case '\"':
case '\'':
c = ls->current;
goto read_save;
case EOZ:
goto no_save; /* will raise an error next loop */
case 'z': { /* zap following span of spaces */
raviX_buffer_remove(ls->buff, 1); /* remove '\\' */
next(ls); /* skip the 'z' */
while (lisspace(ls->current)) {
if (currIsNewline(ls))
inclinenumber(ls);
else
next(ls);
}
goto no_save;
}
default: {
esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
c = readdecesc(ls); /* digital escape '\ddd' */
goto only_save;
}
}
/* read_save: the escape's last character is still current; skip it */
read_save:
next(ls);
/* go through */
/* only_save: replace the saved backslash by the decoded character */
only_save:
raviX_buffer_remove(ls->buff, 1); /* remove '\\' */
save(ls, c);
/* go through */
/* no_save: nothing (more) to store for this escape */
no_save:
break;
}
default:
save_and_next(ls);
}
}
save_and_next(ls); /* skip delimiter */
/* the buffer holds both delimiters; exclude one character at each end */
seminfo->ts = luaX_newstring(ls, raviX_buffer_data(ls->buff) + 1, (uint32_t)(raviX_buffer_len(ls->buff) - 2));
}
/*
** RAVI extension: generate a token for the cast operators -
** @number, @number[], @integer, @integer[], @table, @string, @closure
**
** The token buffer holds the text collected after (and including) '@'.
** When that text is exactly one of the built-in operators the matching
** TOK_TO_* token is returned. Otherwise '@' is returned and the text after
** the '@' is interned into seminfo->ts as a user type name.
**
** The comparison requires equal length and equal bytes. A bounded strncmp
** using the buffer length alone would accept any prefix of an operator
** name (for example "@int" or a lone "@") as that operator.
*/
static int casttoken(LexerState *ls, SemInfo *seminfo)
{
    const struct {
        const char *text;
        int token;
    } casts[] = {
        {"@integer", TOK_TO_INTEGER},    {"@integer[]", TOK_TO_INTARRAY}, {"@number", TOK_TO_NUMBER},
        {"@number[]", TOK_TO_NUMARRAY}, {"@table", TOK_TO_TABLE},        {"@string", TOK_TO_STRING},
        {"@closure", TOK_TO_CLOSURE},
    };
    const size_t ncasts = sizeof(casts) / sizeof(casts[0]);
    size_t n = raviX_buffer_len(ls->buff);
    const char *s = raviX_buffer_data(ls->buff);
    int tok = '@';
    size_t i;
    for (i = 0; i < ncasts; i++) {
        if (strlen(casts[i].text) == n && memcmp(s, casts[i].text, n) == 0) {
            tok = casts[i].token;
            break;
        }
    }
    if (i == ncasts) /* not a built-in cast operator: user type name */
        seminfo->ts = luaX_newstring(ls, s + 1, (uint32_t)(n - 1)); /* omit @ */
    raviX_buffer_remove(ls->buff, (int)n); /* rewind but buffer still holds the saved characters */
    return tok;
}
/*
** Scans and returns the next token from the input. The token buffer is
** reset first; white space, line breaks and comments are skipped.
** Single-character tokens are returned as their character code, other
** tokens as TOK_* values, and TOK_EOS at the end of the input.
** For strings, names, numerals and user-type casts the semantic value is
** stored in 'seminfo'. Errors are reported through lexerror.
*/
static int llex(LexerState *ls, SemInfo *seminfo)
{
raviX_buffer_reset(ls->buff);
for (;;) {
switch (ls->current) {
case '\n':
case '\r': { /* line breaks */
inclinenumber(ls);
break;
}
case ' ':
case '\f':
case '\t':
case '\v': { /* spaces */
next(ls);
break;
}
case '-': { /* '-' or '--' (comment) */
next(ls);
if (ls->current != '-')
return '-';
/* else is a comment */
next(ls);
if (ls->current == '[') { /* long comment? */
int sep = skip_sep(ls);
raviX_buffer_reset(ls->buff); /* 'skip_sep' may dirty the buffer */
if (sep >= 0) {
read_long_string(ls, NULL, sep); /* skip long comment */
raviX_buffer_reset(ls->buff); /* previous call may dirty the buff. */
break;
}
}
/* else short comment */
while (!currIsNewline(ls) && ls->current != EOZ)
next(ls); /* skip until end of line (or end of file) */
break;
}
case '[': { /* long string or simply '[' */
int sep = skip_sep(ls);
if (sep >= 0) {
read_long_string(ls, seminfo, sep);
return TOK_STRING;
} else if (sep != -1) /* '[=...' missing second bracket */
lexerror(ls, "invalid long string delimiter", TOK_STRING);
return '[';
}
case '=': {
next(ls);
if (check_next1(ls, '='))
return TOK_EQ;
else
return '=';
}
case '<': {
next(ls);
if (check_next1(ls, '='))
return TOK_LE;
else if (check_next1(ls, '<'))
return TOK_SHL;
else
return '<';
}
case '>': {
next(ls);
if (check_next1(ls, '='))
return TOK_GE;
else if (check_next1(ls, '>'))
return TOK_SHR;
else
return '>';
}
case '/': {
next(ls);
if (check_next1(ls, '/'))
return TOK_IDIV;
else
return '/';
}
case '~': {
next(ls);
if (check_next1(ls, '='))
return TOK_NE;
else
return '~';
}
case ':': {
next(ls);
if (check_next1(ls, ':'))
return TOK_DBCOLON;
else
return ':';
}
case '"':
case '\'': { /* short literal strings */
read_string(ls, ls->current, seminfo);
return TOK_STRING;
}
case '.': { /* '.', '..', '...', or number */
/* the '.' is saved because it becomes part of the numeral when a digit follows */
save_and_next(ls);
if (check_next1(ls, '.')) {
if (check_next1(ls, '.'))
return TOK_DOTS; /* '...' */
else
return TOK_CONCAT; /* '..' */
} else if (!lisdigit(ls->current))
return '.';
else
return read_numeral(ls, seminfo);
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9': {
return read_numeral(ls, seminfo);
}
case EOZ: {
return TOK_EOS;
}
case '@': {
/* RAVI change: @ introduces a type assertion operator */
/* collect '@', the following name characters and an optional '[' and ']' */
save_and_next(ls);
while (lislalnum(ls->current)) {
save_and_next(ls);
}
check_save_next1(ls, '[');
check_save_next1(ls, ']');
return casttoken(ls, seminfo);
}
default: {
if (lislalpha(ls->current)) { /* identifier or reserved word? */
const StringObject *ts;
do {
save_and_next(ls);
} while (lislalnum(ls->current));
ts = raviX_create_string(ls->container, raviX_buffer_data(ls->buff),
(int32_t)raviX_buffer_len(ls->buff));
seminfo->ts = ts;
/* 'reserved' is the keyword's index in luaX_tokens, or -1 for a plain name */
int tok = is_reserved(ts);
if (tok != -1) /* reserved word? */
return tok + FIRST_RESERVED;
else {
return TOK_NAME;
}
} else { /* single-char tokens (+ - / ...) */
int c = ls->current;
next(ls);
return c;
}
}
}
}
}
/*
** Advances to the next token, leaving it in ls->t. The line of the token
** being left is remembered in ls->lastline. A pending look-ahead token is
** used (and cleared) when there is one; otherwise a new token is scanned.
*/
void raviX_next(LexerState *ls)
{
    ls->lastline = ls->linenumber;
    if (ls->lookahead.token == TOK_EOS) { /* no look-ahead token? */
        ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
        return;
    }
    ls->t = ls->lookahead;         /* use this one */
    ls->lookahead.token = TOK_EOS; /* and discharge it */
}
/*
** Scans one token ahead without consuming the current one, stores it in
** ls->lookahead and returns its token code. No look-ahead token may be
** pending when this is called.
*/
int raviX_lookahead(LexerState *ls)
{
    assert(ls->lookahead.token == TOK_EOS);
    SemInfo *info = &ls->lookahead.seminfo;
    ls->lookahead.token = llex(ls, info);
    return ls->lookahead.token;
}
/* Returns the text accumulated in the container's error message buffer. */
const char *raviX_get_last_error(CompilerState *container)
{
    const char *text = container->error_message.buf;
    return text;
}