issue #195 lexer / utf8 improvements from Lua 5.4

lua54-misc
Dibyendu Majumdar 4 years ago
parent d4992e8d08
commit f547edd330

@ -1,5 +1,5 @@
/* /*
** $Id: llex.c,v 2.96.1.1 2017/04/19 17:20:42 roberto Exp $ ** $Id: llex.c $
** Lexical Analyzer ** Lexical Analyzer
** See Copyright Notice in lua.h ** See Copyright Notice in lua.h
*/ */
@ -29,7 +29,7 @@
#define next(ls) (ls->current = zgetc(ls->z)) #define next(ls) (ls->current = zgetc(ls->z))
@ -88,7 +88,10 @@ void luaX_init (lua_State *L) {
const char *luaX_token2str (LexState *ls, int token) { const char *luaX_token2str (LexState *ls, int token) {
if (token < FIRST_RESERVED) { /* single-byte symbols? */ if (token < FIRST_RESERVED) { /* single-byte symbols? */
lua_assert(token == cast_uchar(token)); lua_assert(token == cast_uchar(token));
return luaO_pushfstring(ls->L, "'%c'", token); if (lisprint(token))
return luaO_pushfstring(ls->L, "'%c'", token);
else /* control character */
return luaO_pushfstring(ls->L, "'<\\%d>'", token);
} }
else { else {
const char *s = luaX_tokens[token - FIRST_RESERVED]; const char *s = luaX_tokens[token - FIRST_RESERVED];
@ -224,8 +227,16 @@ static int check_next2 (LexState *ls, const char *set) {
/* LUA_NUMBER */ /* LUA_NUMBER */
/* /*
** this function is quite liberal in what it accepts, as 'luaO_str2num' ** This function is quite liberal in what it accepts, as 'luaO_str2num'
** will reject ill-formed numerals. ** will reject ill-formed numerals. Roughly, it accepts the following
** pattern:
**
** %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
**
** The only tricky part is to accept [+-] only after a valid exponent
** mark, to avoid reading '3-4' or '0xe+1' as a single number.
**
** The caller might have already read an initial dot.
*/ */
static int read_numeral (LexState *ls, SemInfo *seminfo) { static int read_numeral (LexState *ls, SemInfo *seminfo) {
TValue obj; TValue obj;
@ -236,14 +247,14 @@ static int read_numeral (LexState *ls, SemInfo *seminfo) {
if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */ if (first == '0' && check_next2(ls, "xX")) /* hexadecimal? */
expo = "Pp"; expo = "Pp";
for (;;) { for (;;) {
if (check_next2(ls, expo)) /* exponent part? */ if (check_next2(ls, expo)) /* exponent mark? */
check_next2(ls, "-+"); /* optional exponent sign */ check_next2(ls, "-+"); /* optional exponent sign */
if (lisxdigit(ls->current)) else if (lisxdigit(ls->current) || ls->current == '.') /* '%x|%.' */
save_and_next(ls);
else if (ls->current == '.')
save_and_next(ls); save_and_next(ls);
else break; else break;
} }
if (lislalpha(ls->current)) /* is numeral touching a letter? */
save_and_next(ls); /* force an error */
save(ls, '\0'); save(ls, '\0');
if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */ if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0) /* format error? */
lexerror(ls, "malformed number", TK_FLT); lexerror(ls, "malformed number", TK_FLT);
@ -260,12 +271,12 @@ static int read_numeral (LexState *ls, SemInfo *seminfo) {
/* /*
** skip a sequence '[=*[' or ']=*]'; if sequence is well formed, return ** reads a sequence '[=*[' or ']=*]', leaving the last bracket.
** its number of '='s; otherwise, return a negative number (-1 iff there ** If sequence is well formed, return its number of '='s + 2; otherwise,
are no '='s after initial bracket) ** return 1 if there are no '='s or 0 otherwise (an unfinished '[==...').
*/ */
static int skip_sep (LexState *ls) { static size_t skip_sep (LexState *ls) {
int count = 0; size_t count = 0;
int s = ls->current; int s = ls->current;
lua_assert(s == '[' || s == ']'); lua_assert(s == '[' || s == ']');
save_and_next(ls); save_and_next(ls);
@ -273,11 +284,13 @@ static int skip_sep (LexState *ls) {
save_and_next(ls); save_and_next(ls);
count++; count++;
} }
return (ls->current == s) ? count : (-count) - 1; return (ls->current == s) ? count + 2
: (count == 0) ? 1
: 0;
} }
static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
int line = ls->linenumber; /* initial line (for error message) */ int line = ls->linenumber; /* initial line (for error message) */
save_and_next(ls); /* skip 2nd '[' */ save_and_next(ls); /* skip 2nd '[' */
if (currIsNewline(ls)) /* string starts with a newline? */ if (currIsNewline(ls)) /* string starts with a newline? */
@ -311,8 +324,8 @@ static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
} }
} endloop: } endloop:
if (seminfo) if (seminfo)
seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep), seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
luaZ_bufflen(ls->buff) - 2*(2 + sep)); luaZ_bufflen(ls->buff) - 2 * sep);
} }
@ -348,8 +361,8 @@ static unsigned long readutf8esc (LexState *ls) {
r = gethexa(ls); /* must have at least one digit */ r = gethexa(ls); /* must have at least one digit */
while ((save_and_next(ls), lisxdigit(ls->current))) { while ((save_and_next(ls), lisxdigit(ls->current))) {
i++; i++;
esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
r = (r << 4) + luaO_hexavalue(ls->current); r = (r << 4) + luaO_hexavalue(ls->current);
esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
} }
esccheck(ls, ls->current == '}', "missing '}'"); esccheck(ls, ls->current == '}', "missing '}'");
next(ls); /* skip '}' */ next(ls); /* skip '}' */
@ -494,9 +507,9 @@ static int llex (LexState *ls, SemInfo *seminfo) {
/* else is a comment */ /* else is a comment */
next(ls); next(ls);
if (ls->current == '[') { /* long comment? */ if (ls->current == '[') { /* long comment? */
int sep = skip_sep(ls); size_t sep = skip_sep(ls);
luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */ luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */
if (sep >= 0) { if (sep >= 2) {
read_long_string(ls, NULL, sep); /* skip long comment */ read_long_string(ls, NULL, sep); /* skip long comment */
luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */
break; break;
@ -508,12 +521,12 @@ static int llex (LexState *ls, SemInfo *seminfo) {
break; break;
} }
case '[': { /* long string or simply '[' */ case '[': { /* long string or simply '[' */
int sep = skip_sep(ls); size_t sep = skip_sep(ls);
if (sep >= 0) { if (sep >= 2) {
read_long_string(ls, seminfo, sep); read_long_string(ls, seminfo, sep);
return TK_STRING; return TK_STRING;
} }
else if (sep != -1) /* '[=...' missing second bracket */ else if (sep == 0) /* '[=...' missing second bracket? */
lexerror(ls, "invalid long string delimiter", TK_STRING); lexerror(ls, "invalid long string delimiter", TK_STRING);
return '['; return '[';
} }

@ -1,5 +1,5 @@
/* /*
** $Id: lobject.c,v 2.113.1.1 2017/04/19 17:29:57 roberto Exp $ ** $Id: lobject.c $
** Some generic functions over Lua objects ** Some generic functions over Lua objects
** See Copyright Notice in lua.h ** See Copyright Notice in lua.h
*/ */
@ -345,7 +345,7 @@ size_t luaO_str2num (const char *s, TValue *o) {
int luaO_utf8esc (char *buff, unsigned long x) { int luaO_utf8esc (char *buff, unsigned long x) {
int n = 1; /* number of bytes put in buffer (backwards) */ int n = 1; /* number of bytes put in buffer (backwards) */
lua_assert(x <= 0x10FFFF); lua_assert(x <= 0x7FFFFFFFu);
if (x < 0x80) /* ascii? */ if (x < 0x80) /* ascii? */
buff[UTF8BUFFSZ - 1] = cast(char, x); buff[UTF8BUFFSZ - 1] = cast(char, x);
else { /* need continuation bytes */ else { /* need continuation bytes */
@ -442,7 +442,7 @@ const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
} }
case 'U': { /* an 'int' as a UTF-8 sequence */ case 'U': { /* an 'int' as a UTF-8 sequence */
char buff[UTF8BUFFSZ]; char buff[UTF8BUFFSZ];
int l = luaO_utf8esc(buff, cast(long, va_arg(argp, long))); int l = luaO_utf8esc(buff, va_arg(argp, long));
pushstr(L, buff + UTF8BUFFSZ - l, l); pushstr(L, buff + UTF8BUFFSZ - l, l);
break; break;
} }

@ -20,7 +20,20 @@
#include "lauxlib.h" #include "lauxlib.h"
#include "lualib.h" #include "lualib.h"
#define MAXUNICODE 0x10FFFF
#define MAXUNICODE 0x10FFFFu
#define MAXUTF 0x7FFFFFFFu
/*
** Integer type for decoded UTF-8 values; MAXUTF needs 31 bits.
** The test '(UINT_MAX >> 30) >= 1' holds exactly when 'unsigned int'
** has at least 31 value bits, in which case it is wide enough to hold
** MAXUTF. Otherwise 'unsigned long' is used instead (the C standard
** guarantees it at least 32 bits).
*/
#if (UINT_MAX >> 30) >= 1
typedef unsigned int utfint;
#else
typedef unsigned long utfint;
#endif
#define iscont(p) ((*(p) & 0xC0) == 0x80) #define iscont(p) ((*(p) & 0xC0) == 0x80)
@ -35,53 +48,62 @@ static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
/* /*
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. ** Decode one UTF-8 sequence, returning NULL if byte sequence is
** invalid. The array 'limits' stores the minimum value for each
** sequence length, to check for overlong representations. Its first
** entry forces an error for non-ascii bytes with no continuation
** bytes (count == 0).
*/ */
static const char *utf8_decode (const char *o, int *val) { static const char *utf8_decode (const char *s, utfint *val, int strict) {
static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF}; static const utfint limits[] =
const unsigned char *s = (const unsigned char *)o; {~(utfint)0, 0x80, 0x800, 0x10000u, 0x200000u, 0x4000000u};
unsigned int c = s[0]; unsigned int c = (unsigned char)s[0];
unsigned int res = 0; /* final result */ utfint res = 0; /* final result */
if (c < 0x80) /* ascii? */ if (c < 0x80) /* ascii? */
res = c; res = c;
else { else {
int count = 0; /* to count number of continuation bytes */ int count = 0; /* to count number of continuation bytes */
while (c & 0x40) { /* still have continuation bytes? */ for (; c & 0x40; c <<= 1) { /* while it needs continuation bytes... */
int cc = s[++count]; /* read next byte */ unsigned int cc = (unsigned char)s[++count]; /* read next byte */
if ((cc & 0xC0) != 0x80) /* not a continuation byte? */ if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
return NULL; /* invalid byte sequence */ return NULL; /* invalid byte sequence */
res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */ res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
c <<= 1; /* to test next bit */
} }
res |= ((c & 0x7F) << (count * 5)); /* add first byte */ res |= ((utfint)(c & 0x7F) << (count * 5)); /* add first byte */
if (count > 3 || res > MAXUNICODE || res <= limits[count]) if (count > 5 || res > MAXUTF || res < limits[count])
return NULL; /* invalid byte sequence */ return NULL; /* invalid byte sequence */
s += count; /* skip continuation bytes read */ s += count; /* skip continuation bytes read */
} }
if (strict) {
/* check for invalid code points; too large or surrogates */
if (res > MAXUNICODE || (0xD800u <= res && res <= 0xDFFFu))
return NULL;
}
if (val) *val = res; if (val) *val = res;
return (const char *)s + 1; /* +1 to include first byte */ return s + 1; /* +1 to include first byte */
} }
/* /*
** utf8len(s [, i [, j]]) --> number of characters that start in the ** utf8len(s [, i [, j [, lax]]]) --> number of characters that
** range [i,j], or nil + current position if 's' is not well formed in ** start in the range [i,j], or nil + current position if 's' is not
** that interval ** well formed in that interval
*/ */
static int utflen (lua_State *L) { static int utflen (lua_State *L) {
int n = 0; lua_Integer n = 0; /* counter for the number of characters */
size_t len; size_t len; /* string length in bytes */
const char *s = luaL_checklstring(L, 1, &len); const char *s = luaL_checklstring(L, 1, &len);
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len); lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
int lax = lua_toboolean(L, 4);
luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2, luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
"initial position out of string"); "initial position out of bounds");
luaL_argcheck(L, --posj < (lua_Integer)len, 3, luaL_argcheck(L, --posj < (lua_Integer)len, 3,
"final position out of string"); "final position out of bounds");
while (posi <= posj) { while (posi <= posj) {
const char *s1 = utf8_decode(s + posi, NULL); const char *s1 = utf8_decode(s + posi, NULL, !lax);
if (s1 == NULL) { /* conversion error? */ if (s1 == NULL) { /* conversion error? */
lua_pushnil(L); /* return nil ... */ luaL_pushfail(L); /* return fail ... */
lua_pushinteger(L, posi + 1); /* ... and current position */ lua_pushinteger(L, posi + 1); /* ... and current position */
return 2; return 2;
} }
@ -94,28 +116,29 @@ static int utflen (lua_State *L) {
/* /*
** codepoint(s, [i, [j]]) -> returns codepoints for all characters ** codepoint(s, [i, [j [, lax]]]) -> returns codepoints for all
** that start in the range [i,j] ** characters that start in the range [i,j]
*/ */
static int codepoint (lua_State *L) { static int codepoint (lua_State *L) {
size_t len; size_t len;
const char *s = luaL_checklstring(L, 1, &len); const char *s = luaL_checklstring(L, 1, &len);
lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len); lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len); lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
int lax = lua_toboolean(L, 4);
int n; int n;
const char *se; const char *se;
luaL_argcheck(L, posi >= 1, 2, "out of range"); luaL_argcheck(L, posi >= 1, 2, "out of bounds");
luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range"); luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of bounds");
if (posi > pose) return 0; /* empty interval; return no values */ if (posi > pose) return 0; /* empty interval; return no values */
if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */ if (pose - posi >= INT_MAX) /* (lua_Integer -> int) overflow? */
return luaL_error(L, "string slice too long"); return luaL_error(L, "string slice too long");
n = (int)(pose - posi) + 1; n = (int)(pose - posi) + 1; /* upper bound for number of returns */
luaL_checkstack(L, n, "string slice too long"); luaL_checkstack(L, n, "string slice too long");
n = 0; n = 0; /* count the number of returns */
se = s + pose; se = s + pose; /* string end */
for (s += posi - 1; s < se;) { for (s += posi - 1; s < se;) {
int code; utfint code;
s = utf8_decode(s, &code); s = utf8_decode(s, &code, !lax);
if (s == NULL) if (s == NULL)
return luaL_error(L, "invalid UTF-8 code"); return luaL_error(L, "invalid UTF-8 code");
lua_pushinteger(L, code); lua_pushinteger(L, code);
@ -126,8 +149,8 @@ static int codepoint (lua_State *L) {
static void pushutfchar (lua_State *L, int arg) { static void pushutfchar (lua_State *L, int arg) {
lua_Integer code = luaL_checkinteger(L, arg); lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg);
luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range"); luaL_argcheck(L, code <= MAXUTF, arg, "value out of range");
lua_pushfstring(L, "%U", (long)code); lua_pushfstring(L, "%U", (long)code);
} }
@ -164,7 +187,7 @@ static int byteoffset (lua_State *L) {
lua_Integer posi = (n >= 0) ? 1 : len + 1; lua_Integer posi = (n >= 0) ? 1 : len + 1;
posi = u_posrelat(luaL_optinteger(L, 3, posi), len); posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
"position out of range"); "position out of bounds");
if (n == 0) { if (n == 0) {
/* find beginning of current byte sequence */ /* find beginning of current byte sequence */
while (posi > 0 && iscont(s + posi)) posi--; while (posi > 0 && iscont(s + posi)) posi--;
@ -193,12 +216,12 @@ static int byteoffset (lua_State *L) {
if (n == 0) /* did it find given character? */ if (n == 0) /* did it find given character? */
lua_pushinteger(L, posi + 1); lua_pushinteger(L, posi + 1);
else /* no such character */ else /* no such character */
lua_pushnil(L); luaL_pushfail(L);
return 1; return 1;
} }
static int iter_aux (lua_State *L) { static int iter_aux (lua_State *L, int strict) {
size_t len; size_t len;
const char *s = luaL_checklstring(L, 1, &len); const char *s = luaL_checklstring(L, 1, &len);
lua_Integer n = lua_tointeger(L, 2) - 1; lua_Integer n = lua_tointeger(L, 2) - 1;
@ -211,9 +234,9 @@ static int iter_aux (lua_State *L) {
if (n >= (lua_Integer)len) if (n >= (lua_Integer)len)
return 0; /* no more codepoints */ return 0; /* no more codepoints */
else { else {
int code; utfint code;
const char *next = utf8_decode(s + n, &code); const char *next = utf8_decode(s + n, &code, strict);
if (next == NULL || iscont(next)) if (next == NULL)
return luaL_error(L, "invalid UTF-8 code"); return luaL_error(L, "invalid UTF-8 code");
lua_pushinteger(L, n + 1); lua_pushinteger(L, n + 1);
lua_pushinteger(L, code); lua_pushinteger(L, code);
@ -222,9 +245,19 @@ static int iter_aux (lua_State *L) {
} }
/*
** Strict iteration step: forwards to 'iter_aux' with its 'strict'
** flag set, so the flag handed on to 'utf8_decode' is nonzero.
*/
static int iter_auxstrict (lua_State *L) {
  const int strict = 1;
  return iter_aux(L, strict);
}
/*
** Lax iteration step: forwards to 'iter_aux' with its 'strict' flag
** cleared, so the flag handed on to 'utf8_decode' is zero.
*/
static int iter_auxlax (lua_State *L) {
  const int strict = 0;
  return iter_aux(L, strict);
}
static int iter_codes (lua_State *L) { static int iter_codes (lua_State *L) {
int lax = lua_toboolean(L, 2);
luaL_checkstring(L, 1); luaL_checkstring(L, 1);
lua_pushcfunction(L, iter_aux); lua_pushcfunction(L, lax ? iter_auxlax : iter_auxstrict);
lua_pushvalue(L, 1); lua_pushvalue(L, 1);
lua_pushinteger(L, 0); lua_pushinteger(L, 0);
return 3; return 3;
@ -232,7 +265,7 @@ static int iter_codes (lua_State *L) {
/* pattern to match a single UTF-8 character */ /* pattern to match a single UTF-8 character */
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*" #define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*"
static const luaL_Reg funcs[] = { static const luaL_Reg funcs[] = {

@ -56,16 +56,23 @@ assert("abc\z
assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0)) assert("\u{0}\u{00000000}\x00\0" == string.char(0, 0, 0, 0))
-- limits for 1-byte sequences -- limits for 1-byte sequences
assert("\u{0}\u{7F}" == "\x00\z\x7F") assert("\u{0}\u{7F}" == "\x00\x7F")
-- limits for 2-byte sequences -- limits for 2-byte sequences
assert("\u{80}\u{7FF}" == "\xC2\x80\z\xDF\xBF") assert("\u{80}\u{7FF}" == "\xC2\x80\xDF\xBF")
-- limits for 3-byte sequences -- limits for 3-byte sequences
assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\z\xEF\xBF\xBF") assert("\u{800}\u{FFFF}" == "\xE0\xA0\x80\xEF\xBF\xBF")
-- limits for 4-byte sequences -- limits for 4-byte sequences
assert("\u{10000}\u{10FFFF}" == "\xF0\x90\x80\x80\z\xF4\x8F\xBF\xBF") assert("\u{10000}\u{1FFFFF}" == "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF")
-- limits for 5-byte sequences
assert("\u{200000}\u{3FFFFFF}" == "\xF8\x88\x80\x80\x80\xFB\xBF\xBF\xBF\xBF")
-- limits for 6-byte sequences
assert("\u{4000000}\u{7FFFFFFF}" ==
"\xFC\x84\x80\x80\x80\x80\xFD\xBF\xBF\xBF\xBF\xBF")
-- Error in escape sequences -- Error in escape sequences
@ -94,7 +101,7 @@ lexerror([["xyz\300"]], [[\300"]])
lexerror([[" \256"]], [[\256"]]) lexerror([[" \256"]], [[\256"]])
-- errors in UTF-8 sequences -- errors in UTF-8 sequences
lexerror([["abc\u{110000}"]], [[abc\u{110000]]) -- too large lexerror([["abc\u{100000000}"]], [[abc\u{100000000]]) -- too large
lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{' lexerror([["abc\u11r"]], [[abc\u1]]) -- missing '{'
lexerror([["abc\u"]], [[abc\u"]]) -- missing '{' lexerror([["abc\u"]], [[abc\u"]]) -- missing '{'
lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}' lexerror([["abc\u{11r"]], [[abc\u{11r]]) -- missing '}'

@ -21,62 +21,59 @@ local justone = "^" .. utf8.charpattern .. "$"
-- 't' is the list of codepoints of 's' -- 't' is the list of codepoints of 's'
local function checksyntax (s, t) local function checksyntax (s, t)
-- creates a string "return '\u{t[1]}...\u{t[n]}'"
local ts = {"return '"} local ts = {"return '"}
for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end for i = 1, #t do ts[i + 1] = string.format("\\u{%x}", t[i]) end
ts[#t + 2] = "'" ts[#t + 2] = "'"
ts = table.concat(ts) ts = table.concat(ts)
-- its execution should result in 's'
assert(assert(load(ts))() == s) assert(assert(load(ts))() == s)
end end
assert(utf8.offset("alo", 5) == nil) assert(not utf8.offset("alo", 5))
assert(utf8.offset("alo", -4) == nil) assert(not utf8.offset("alo", -4))
-- 't' is the list of codepoints of 's' -- 'check' makes several tests over the validity of string 's'.
local function check (s, t) -- 't' is the list of codepoints of 's'.
local l = utf8.len(s) local function check (s, t, nonstrict)
local l = utf8.len(s, 1, -1, nonstrict)
assert(#t == l and len(s) == l) assert(#t == l and len(s) == l)
assert(utf8.char(table.unpack(t)) == s) assert(utf8.char(table.unpack(t)) == s) -- 't' and 's' are equivalent
assert(utf8.offset(s, 0) == 1) assert(utf8.offset(s, 0) == 1)
checksyntax(s, t) checksyntax(s, t)
local t1 = {utf8.codepoint(s, 1, -1)} -- creates new table with all codepoints of 's'
local t1 = {utf8.codepoint(s, 1, -1, nonstrict)}
assert(#t == #t1) assert(#t == #t1)
for i = 1, #t do assert(t[i] == t1[i]) end for i = 1, #t do assert(t[i] == t1[i]) end -- 't' is equal to 't1'
for i = 1, l do for i = 1, l do -- for all codepoints
local pi = utf8.offset(s, i) -- position of i-th char local pi = utf8.offset(s, i) -- position of i-th char
local pi1 = utf8.offset(s, 2, pi) -- position of next char local pi1 = utf8.offset(s, 2, pi) -- position of next char
assert(string.find(string.sub(s, pi, pi1 - 1), justone)) assert(string.find(string.sub(s, pi, pi1 - 1), justone))
assert(utf8.offset(s, -1, pi1) == pi) assert(utf8.offset(s, -1, pi1) == pi)
assert(utf8.offset(s, i - l - 1) == pi) assert(utf8.offset(s, i - l - 1) == pi)
assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi))) assert(pi1 - pi == #utf8.char(utf8.codepoint(s, pi, pi, nonstrict)))
for j = pi, pi1 - 1 do for j = pi, pi1 - 1 do
assert(utf8.offset(s, 0, j) == pi) assert(utf8.offset(s, 0, j) == pi)
end end
for j = pi + 1, pi1 - 1 do for j = pi + 1, pi1 - 1 do
assert(not utf8.len(s, j)) assert(not utf8.len(s, j))
end end
assert(utf8.len(s, pi, pi) == 1) assert(utf8.len(s, pi, pi, nonstrict) == 1)
assert(utf8.len(s, pi, pi1 - 1) == 1) assert(utf8.len(s, pi, pi1 - 1, nonstrict) == 1)
assert(utf8.len(s, pi) == l - i + 1) assert(utf8.len(s, pi, -1, nonstrict) == l - i + 1)
assert(utf8.len(s, pi1) == l - i) assert(utf8.len(s, pi1, -1, nonstrict) == l - i)
assert(utf8.len(s, 1, pi) == i) assert(utf8.len(s, 1, pi, nonstrict) == i)
end end
local i = 0 local i = 0
for p, c in utf8.codes(s) do for p, c in utf8.codes(s, nonstrict) do
i = i + 1 i = i + 1
assert(c == t[i] and p == utf8.offset(s, i)) assert(c == t[i] and p == utf8.offset(s, i))
assert(utf8.codepoint(s, p) == c) assert(utf8.codepoint(s, p, p, nonstrict) == c)
end
assert(i == #t)
i = 0
for p, c in utf8.codes(s) do
i = i + 1
assert(c == t[i] and p == utf8.offset(s, i))
end end
assert(i == #t) assert(i == #t)
@ -105,23 +102,30 @@ do -- error indication in utf8.len
check("\xF4\x9F\xBF\xBF", 1) check("\xF4\x9F\xBF\xBF", 1)
end end
-- error in utf8.codes -- errors in utf8.codes
checkerror("invalid UTF%-8 code", do
function () local function errorcodes (s)
local s = "ab\xff" checkerror("invalid UTF%-8 code",
for c in utf8.codes(s) do assert(c) end function ()
end) for c in utf8.codes(s) do assert(c) end
end)
end
errorcodes("ab\xff")
errorcodes("\u{110000}")
end
-- error in initial position for offset -- error in initial position for offset
checkerror("position out of range", utf8.offset, "abc", 1, 5) checkerror("position out of bounds", utf8.offset, "abc", 1, 5)
checkerror("position out of range", utf8.offset, "abc", 1, -4) checkerror("position out of bounds", utf8.offset, "abc", 1, -4)
checkerror("position out of range", utf8.offset, "", 1, 2) checkerror("position out of bounds", utf8.offset, "", 1, 2)
checkerror("position out of range", utf8.offset, "", 1, -1) checkerror("position out of bounds", utf8.offset, "", 1, -1)
checkerror("continuation byte", utf8.offset, "𦧺", 1, 2) checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
checkerror("continuation byte", utf8.offset, "𦧺", 1, 2) checkerror("continuation byte", utf8.offset, "𦧺", 1, 2)
checkerror("continuation byte", utf8.offset, "\x80", 1) checkerror("continuation byte", utf8.offset, "\x80", 1)
-- error in indices for len
checkerror("out of bounds", utf8.len, "abc", 0, 2)
checkerror("out of bounds", utf8.len, "abc", 1, 4)
local s = "hello World" local s = "hello World"
@ -136,19 +140,27 @@ do
local t = {utf8.codepoint(s,1,#s - 1)} local t = {utf8.codepoint(s,1,#s - 1)}
assert(#t == 3 and t[1] == 225 and t[2] == 233 and t[3] == 237) assert(#t == 3 and t[1] == 225 and t[2] == 233 and t[3] == 237)
checkerror("invalid UTF%-8 code", utf8.codepoint, s, 1, #s) checkerror("invalid UTF%-8 code", utf8.codepoint, s, 1, #s)
checkerror("out of range", utf8.codepoint, s, #s + 1) checkerror("out of bounds", utf8.codepoint, s, #s + 1)
t = {utf8.codepoint(s, 4, 3)} t = {utf8.codepoint(s, 4, 3)}
assert(#t == 0) assert(#t == 0)
checkerror("out of range", utf8.codepoint, s, -(#s + 1), 1) checkerror("out of bounds", utf8.codepoint, s, -(#s + 1), 1)
checkerror("out of range", utf8.codepoint, s, 1, #s + 1) checkerror("out of bounds", utf8.codepoint, s, 1, #s + 1)
-- surrogates
assert(utf8.codepoint("\u{D7FF}") == 0xD800 - 1)
assert(utf8.codepoint("\u{E000}") == 0xDFFF + 1)
assert(utf8.codepoint("\u{D800}", 1, 1, true) == 0xD800)
assert(utf8.codepoint("\u{DFFF}", 1, 1, true) == 0xDFFF)
assert(utf8.codepoint("\u{7FFFFFFF}", 1, 1, true) == 0x7FFFFFFF)
end end
assert(utf8.char() == "") assert(utf8.char() == "")
assert(utf8.char(97, 98, 99) == "abc") assert(utf8.char(0, 97, 98, 99, 1) == "\0abc\1")
assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF) assert(utf8.codepoint(utf8.char(0x10FFFF)) == 0x10FFFF)
assert(utf8.codepoint(utf8.char(0x7FFFFFFF), 1, 1, true) == (1<<31) - 1)
checkerror("value out of range", utf8.char, 0x10FFFF + 1) checkerror("value out of range", utf8.char, 0x7FFFFFFF + 1)
checkerror("value out of range", utf8.char, -1)
local function invalid (s) local function invalid (s)
checkerror("invalid UTF%-8 code", utf8.codepoint, s) checkerror("invalid UTF%-8 code", utf8.codepoint, s)
@ -158,6 +170,10 @@ end
-- UTF-8 representation for 0x11ffff (value out of valid range) -- UTF-8 representation for 0x11ffff (value out of valid range)
invalid("\xF4\x9F\xBF\xBF") invalid("\xF4\x9F\xBF\xBF")
-- surrogates
invalid("\u{D800}")
invalid("\u{DFFF}")
-- overlong sequences -- overlong sequences
invalid("\xC0\x80") -- zero invalid("\xC0\x80") -- zero
invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte) invalid("\xC1\xBF") -- 0x7F (should be coded in 1 byte)
@ -183,6 +199,21 @@ s = "\0 \x7F\z
s = string.gsub(s, " ", "") s = string.gsub(s, " ", "")
check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF}) check(s, {0,0x7F, 0x80,0x7FF, 0x800,0xFFFF, 0x10000,0x10FFFF})
do
  -- original UTF-8 values
  -- Each case: encoded string, expected byte length, expected codepoints.
  -- Cases run from the longest encoding (6 bytes per character) down to
  -- the 4-byte range, and are validated in lax (non-strict) mode.
  local cases = {
    {"\u{4000000}\u{7FFFFFFF}", 12, {0x4000000, 0x7FFFFFFF}},
    {"\u{200000}\u{3FFFFFF}", 10, {0x200000, 0x3FFFFFF}},
    {"\u{10000}\u{1fffff}", 8, {0x10000, 0x1FFFFF}},
  }
  for k = 1, #cases do
    local s, nbytes, codes = cases[k][1], cases[k][2], cases[k][3]
    assert(#s == nbytes)
    check(s, codes, true)
  end
end
x = "日本語a-4\0éó" x = "日本語a-4\0éó"
check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243}) check(x, {26085, 26412, 35486, 97, 45, 52, 0, 233, 243})

Loading…
Cancel
Save