Commit 595ae31c authored by Chris Müller
Browse files

add character literal parsing

parent 6ab26875
......@@ -59,8 +59,9 @@ enum CyTOK {
TOK_OCT,
TOK_BIN,
TOK_FLOAT,
TOK_DEFINE,
TOK_IDENTIFIER
TOK_SYMBOL,
TOK_TRUE,
TOK_FALSE
};
......
......@@ -25,11 +25,6 @@
#include <string.h>
#include <assert.h>
/* Associates a byte string with a lexer token kind. */
/* NOTE(review): presumably one row of a string -> token lookup table; no use of it is visible in this excerpt. */
struct Mapping {
byte_t* string;      /* text paired with the token */
enum CyTOK value;    /* token kind paired with the text */
};
struct CyContext*
cy_context_new(const byte_t* source, const char* filename)
......@@ -82,6 +77,14 @@ cy_error(struct CyContext* context, const char* format, ...)
va_end(args);
}
/* Associates a byte string with a lexer token kind. */
/* NOTE(review): presumably one row of a string -> token lookup table; no use of it is visible in this excerpt. */
struct Mapping {
byte_t* string;      /* text paired with the token */
enum CyTOK value;    /* token kind paired with the text */
};
const byte_t*
cy_tok_to_string(enum CyTOK token)
{
......@@ -114,10 +117,8 @@ cy_tok_to_string(enum CyTOK token)
return "BIN";
case TOK_FLOAT:
return "FLOAT";
case TOK_DEFINE:
return "DEFINE";
case TOK_IDENTIFIER:
return "IDENTIFIER";
case TOK_SYMBOL:
return "SYMBOL";
}
return "UNKNOWN";
......@@ -284,17 +285,13 @@ lex_number(struct CyContext* context)
case INT_BIN_READ:
token = TOK_BIN;
if(ch == '_')
goto NO_APPEND_BUFFER;
else if(ch != '0' && ch != '1')
if(ch != '0' && ch != '1')
goto RETURN_TOKEN;
break;
case INT_OCT_READ:
token = TOK_OCT;
if(ch == '_')
goto NO_APPEND_BUFFER;
else if('0' > ch || ch > '7')
if('0' > ch || ch > '7')
goto RETURN_TOKEN;
break;
......@@ -303,17 +300,13 @@ lex_number(struct CyContext* context)
if(ch == '.' || ch == 'e' || ch == 'E') {
context->src = p;
return lex_float(context);
} else if(ch == '_')
goto NO_APPEND_BUFFER;
else if('0' > ch || ch > '9')
} else if('0' > ch || ch > '9')
goto RETURN_TOKEN;
break;
case INT_HEX_READ:
token = TOK_HEX;
if(ch == '_')
goto NO_APPEND_BUFFER;
else if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F'))
if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F'))
goto RETURN_TOKEN;
break;
......@@ -336,6 +329,235 @@ RETURN_TOKEN:
}
/* Scanner states used by lex_character(). */
enum CharState {
    /* Initial state: the first code point after the backslash decides
     * whether a unicode escape ('u' / 'U') or a plain literal follows. */
    CHAR_EAT     = 0,
    /* Plain literal: every further code point is copied unchanged. */
    CHAR_ESCAPE  = 1,
    /* Inside a 'u' / 'U' escape: hex digits are being counted down. */
    CHAR_UNICODE = 2
};
static enum CyTOK
lex_character(struct CyContext* context)
{
assert(context != 0);
struct CryArray* buffer = context->buffer;
byte_t* p = cry_utf8_next(context->src);
unicode_t ch = cry_utf8_get(p);
enum CharState state = CHAR_EAT;
int unicount = 0;
cry_array_append(buffer, "\\", 1);
while(!cry_unicode_isspace(ch) && ch != '\0') {
switch(state) {
case CHAR_EAT:
if(ch == 'u') {
state = CHAR_UNICODE;
unicount = 4;
} else if(ch == 'U') {
state = CHAR_UNICODE;
unicount = 6;
} else {
state = CHAR_ESCAPE;
}
break;
case CHAR_UNICODE:
if(unicount-- == 0)
goto RETURN_TOKEN;
if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
cy_error(context, "Unexpected hex sequence in unicode escape sequence");
cry_array_append(buffer, "0", 1);
goto NO_BUFFER_APPEND;
}
break;
case CHAR_ESCAPE:
break;
}
cry_array_append(buffer, p, cry_utf8_codepoints(p));
NO_BUFFER_APPEND:
p = cry_utf8_next(p);
ch = cry_utf8_get(p);
}
RETURN_TOKEN:
if(state == CHAR_UNICODE && unicount > 0) {
cy_error(context, "Improper unicode escape sequence found in character literal");
while(unicount-- > 0)
cry_array_append(buffer, "0", 1);
} else if(cry_array_size(buffer) == 1) {
cy_error(context, "No character symbol is given in character literal");
cry_array_append(buffer, "0", 1);
}
cry_array_append(buffer, "\0", 1);
context->src = p;
return TOK_CHAR;
}
/* Scanner states used by lex_string(). */
enum StringState {
    /* Copying ordinary string content. */
    STR_EAT     = 0,
    /* A backslash was just read; the next code point selects the escape. */
    STR_ESCAPE  = 1,
    /* Inside a 'u' / 'U' escape: hex digits are being counted down. */
    STR_UNICODE = 2,
    /* The literal has ended (closing quote, or a reported line break). */
    STR_FINAL   = 3
};
static enum CyTOK
lex_string(struct CyContext* context)
{
assert(context != 0);
struct CryArray* buffer = context->buffer;
enum StringState state = STR_EAT;
byte_t* p = cry_utf8_next(context->src);
unicode_t ch = cry_utf8_get(p);
int unicount = 0;
while(ch != '\0') {
switch(state) {
case STR_EAT:
if(ch == '\\')
state = STR_ESCAPE;
else if(ch == '\"') {
state = STR_FINAL;
goto NO_BUFFER_APPEND;
} else if(ch == '\r' || ch == '\n') {
cy_error(context, "Unexpected newline/carriage return found in string literal");
state = STR_FINAL;
goto RETURN_TOKEN;
}
break;
case STR_ESCAPE:
switch(ch) {
case 'a': case 'b': case 'f': case 'n': case 'r':
case 't': case 'v': case '0': case '\"':
case '\\':
state = STR_EAT;
break;
case 'u':
unicount = 4;
state = STR_UNICODE;
break;
case 'U':
unicount = 6;
state = STR_UNICODE;
break;
default:
cy_error(context, "Unknown escape sequence found in this string literal");
state = STR_EAT;
cry_array_append(buffer, "t", 1);
goto NO_BUFFER_APPEND;
}
break;
case STR_UNICODE:
if(--unicount == 0)
state = STR_EAT;
if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
cy_error(context, "Unexpected hex number in unicode escape sequence found");
cry_array_append(buffer, "0", 1);
goto NO_BUFFER_APPEND;
}
break;
case STR_FINAL:
goto RETURN_TOKEN;
}
cry_array_append(buffer, p, cry_utf8_codepoints(p));
NO_BUFFER_APPEND:
p = cry_utf8_next(p);
ch = cry_utf8_get(p);
}
RETURN_TOKEN:
if(state != STR_FINAL) {
cy_error(context, "Unexpected end of file found in unclosed string");
while(unicount-- > 0)
cry_array_append(buffer, "0", 1);
if(state == STR_ESCAPE)
cry_array_append(buffer, "0", 1);
}
context->src = p;
cry_array_append(buffer, "\0", 1);
return TOK_STRING;
}
/*
 * Scan a line comment into context->buffer and return TOK_COMMENT.
 *
 * Copying starts at context->src and stops at the first '\r', '\n' or
 * NUL terminator, which is left unconsumed. The buffer is
 * NUL-terminated.
 */
static enum CyTOK
lex_comment(struct CyContext* context)
{
    assert(context != 0);

    struct CryArray* buffer = context->buffer;
    byte_t* p = context->src;
    unicode_t ch = cry_utf8_get(p);

    while(ch != '\0' && ch != '\r' && ch != '\n') {
        cry_array_append(buffer, p, cry_utf8_codepoints(p));
        p = cry_utf8_next(p);
        ch = cry_utf8_get(p);
    }

    /* Write the new position back, as the other lex_* helpers do;
     * otherwise the next scan would start on this same comment again. */
    context->src = p;
    cry_array_append(buffer, "\0", 1);
    return TOK_COMMENT;
}
/*
 * Return non-zero when `ch` may appear inside a symbol: anything
 * cry_unicode_isalnum() accepts, plus the punctuation listed below.
 *
 * The punctuation set must include every character on which cy_lex
 * dispatches to lex_symbol. '^' is one of those; if it were rejected
 * here, lex_symbol would consume nothing and return an empty symbol
 * without advancing the input.
 */
static int
is_symbol_character(unicode_t ch)
{
    return cry_unicode_isalnum(ch) ||
        ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == '%' ||
        ch == '^' ||
        ch == '<' || ch == '>' || ch == '=' || ch == '!' || ch == '?' ||
        ch == '#' || ch == ':' || ch == '.' || ch == '~' || ch == '_';
}
static enum CyTOK
lex_symbol(struct CyContext* context)
{
struct CryArray* buffer = context->buffer;
byte_t* p = context->src;
unicode_t ch = cry_utf8_get(p);
while(is_symbol_character(ch)) {
cry_array_append(buffer, p, cry_utf8_codepoints(p));
p = cry_utf8_next(p);
ch = cry_utf8_get(p);
}
context->src = p;
cry_array_append(buffer, "\0", 1);
byte_t* sym = cry_array_get(buffer, 0);
if (strcmp(sym, "true") == 0)
return TOK_TRUE;
else if(strcmp(sym, "false") == 0)
return TOK_FALSE;
else
return TOK_SYMBOL;
}
enum CyTOK
cy_lex(struct CyContext* context)
{
......@@ -343,7 +565,7 @@ cy_lex(struct CyContext* context)
byte_t* p = context->src;
while(1) {
while(TRUE) {
cry_array_clear(context->buffer);
unicode_t ch = cry_utf8_get(p);
......@@ -357,6 +579,8 @@ cy_lex(struct CyContext* context)
switch(ch) {
case '\0':
return TOK_EOF;
case ';':
return lex_comment(context);
case '(':
return TOK_ROUNDLEFTBRACE;
case ')':
......@@ -371,8 +595,20 @@ cy_lex(struct CyContext* context)
case '5': case '6': case '7': case '8': case '9':
return lex_number(context);
case '\\':
// TODO: parsing
return TOK_CHAR;
return lex_character(context);
case '"':
return lex_string(context);
case '+': case '-': case '*': case '/': case '^':
case '<': case '>': case '=': case '?': case '!':
case ':': case '_': case '%': case '~': case '#':
return lex_symbol(context);
default:
if(cry_unicode_isalpha(ch))
return lex_symbol(context);
else {
cy_error(context, "Unknown character found in input scanning");
p = cry_utf8_next(p);
}
}
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment