Newer
Older
/*
* Cherry programming language
* Copyright (C) 2013 Christoph Mueller
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "cherry/unicode.h"
Chris Müller
committed
#include "cherry/array.h"
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
Chris Müller
committed
#include <gc.h>
struct org_cherry_context*
org_cherry_context(const cy_byte_t* source, const char* filename, cy_flags_t flags)
Chris Müller
committed
struct org_cherry_context* context = GC_MALLOC(sizeof(struct org_cherry_context));
context->filename = filename;
context->begin = source;
context->src = (cy_byte_t*) source;
context->buffer = org_cherry_array_new(sizeof(cy_byte_t));
struct org_cherry_context*
org_cherry_context_repl(const cy_byte_t* source)
return org_cherry_context(source, 0, CY_SUPRESS_COMMENTS);
org_cherry_error(struct org_cherry_context* context, const char* format, ...)
{
va_list args;
va_start(args, format);
if(context->filename != 0)
fprintf(stderr, "ERROR %s", context->filename);
fprintf(stderr, "ERROR (console)");
fprintf(stderr, ": ");
vfprintf(stderr, format, args);
fprintf(stderr, "\n");
enum org_cherry_tok value;
org_cherry_tok_to_string(enum org_cherry_tok token)
{
switch(token) {
case TOK_EOF:
return "EOF";
case TOK_COMMENT:
return "COMMENT";
case TOK_ROUNDLEFTBRACE:
return "(";
case TOK_ROUNDRIGHTBRACE:
return ")";
case TOK_SQUARELEFTBRACE:
return "[";
case TOK_SQUARERIGHTBRACE:
return "]";
case TOK_STRING:
return "STRING";
case TOK_DOT:
return ".";
case TOK_CHAR:
return "CHAR";
case TOK_HEX:
return "HEX";
case TOK_DEC:
return "DEC";
case TOK_OCT:
return "OCT";
case TOK_BIN:
return "BIN";
case TOK_FLOAT:
return "FLOAT";
}
return "UNKNOWN";
}
enum FloatState {
FP_START,
FP_EXPONENT,
FP_DOT,
FP_DECIMAL,
FP_MINUSPLUS,
FP_FINAL
};
static enum org_cherry_tok
lex_float(struct org_cherry_context* context)
Chris Müller
committed
struct org_cherry_array* buffer = context->buffer;
const cy_byte_t* p = context->src;
cy_unicode_t ch = org_cherry_utf8_get(p);
enum FloatState state = FP_START;
do {
switch(state) {
case FP_START:
if(ch == 'e' || ch == 'E')
state = FP_EXPONENT;
else if(ch == '.')
state = FP_DOT;
break;
case FP_EXPONENT:
if(ch == '+' || ch == '-')
state = FP_MINUSPLUS;
else if('0' > ch || ch > '9') {
org_cherry_error(context, "Unexpected character found in float literal after +/-");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
goto RETURN_TOKEN;
} else
state = FP_FINAL;
break;
case FP_DOT:
state = FP_DECIMAL;
if('0' > ch || ch > '9') {
org_cherry_error(context, "Unexpected character found in float literal after dot");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
goto RETURN_TOKEN;
}
break;
case FP_DECIMAL:
if(ch == 'e' || ch == 'E')
state = FP_EXPONENT;
else if('0' > ch || ch > '9')
goto RETURN_TOKEN;
break;
case FP_MINUSPLUS:
state = FP_FINAL;
if('0' > ch || ch > '9') {
org_cherry_error(context, "Unexpected character found in float literal");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
goto RETURN_TOKEN;
}
break;
case FP_FINAL:
if('0' > ch || ch > '9')
goto RETURN_TOKEN;
break;
}
Chris Müller
committed
org_cherry_array_append(buffer, p, 1);
p = org_cherry_utf8_next(p);
ch = org_cherry_utf8_get(p);
} while (ch != '\0');
RETURN_TOKEN:
context->src = p;
Chris Müller
committed
org_cherry_array_append(buffer, "\0", 1);
return TOK_FLOAT;
}
enum NumberState {
INT_START,
INT_BASE,
INT_BIN_WAIT,
INT_HEX_WAIT,
INT_OCT_WAIT,
INT_BIN_READ,
INT_HEX_READ,
INT_OCT_READ,
INT_DEC_READ
};
static enum org_cherry_tok
lex_number(struct org_cherry_context* context)
Chris Müller
committed
struct org_cherry_array* buffer = context->buffer;
const cy_byte_t* p = context->src;
cy_unicode_t ch = org_cherry_utf8_get(p);
enum org_cherry_tok token = TOK_DEC;
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
enum NumberState state = INT_START;
do {
switch(state) {
case INT_START:
if(ch == '0')
state = INT_BASE;
else
state = INT_DEC_READ;
break;
case INT_BASE:
if(ch == 'x') {
state = INT_HEX_WAIT;
} else if(ch == 'b') {
state = INT_BIN_WAIT;
} else if('0' <= ch && ch <= '7') {
state = INT_OCT_READ;
break;
} else if(ch == '.' || ch == 'e' || ch == 'E') {
context->src = p;
return lex_float(context);
} else {
token = TOK_DEC;
goto RETURN_TOKEN;
}
break;
case INT_HEX_WAIT:
if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
org_cherry_error(context, "Unexpected character found in hex literal");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
token = TOK_HEX;
goto RETURN_TOKEN;
}
state = INT_HEX_READ;
break;
case INT_BIN_WAIT:
if(ch != '0' && ch != '1') {
org_cherry_error(context, "Unexpected character found in binary literal");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
token = TOK_BIN;
goto RETURN_TOKEN;
}
state = INT_BIN_READ;
break;
case INT_BIN_READ:
token = TOK_BIN;
goto RETURN_TOKEN;
break;
case INT_OCT_READ:
token = TOK_OCT;
goto RETURN_TOKEN;
break;
case INT_DEC_READ:
token = TOK_DEC;
if(ch == '.' || ch == 'e' || ch == 'E') {
context->src = p;
return lex_float(context);
goto RETURN_TOKEN;
break;
case INT_HEX_READ:
token = TOK_HEX;
if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F'))
goto RETURN_TOKEN;
break;
default:
break;
}
Chris Müller
committed
org_cherry_array_append(buffer, p, 1);
p = org_cherry_utf8_next(p);
ch = org_cherry_utf8_get(p);
} while(ch != '\0');
RETURN_TOKEN:
context->src = p;
Chris Müller
committed
org_cherry_array_append(buffer, "\0", 1);
enum CharState {
CHAR_EAT,
CHAR_ESCAPE,
CHAR_UNICODE
};
static enum org_cherry_tok
lex_character(struct org_cherry_context* context)
Chris Müller
committed
struct org_cherry_array* buffer = context->buffer;
cy_byte_t* p = org_cherry_utf8_next(context->src);
cy_unicode_t ch = org_cherry_utf8_get(p);
enum CharState state = CHAR_EAT;
int unicount = 0;
Chris Müller
committed
org_cherry_array_append(buffer, "\\", 1);
while(!org_cherry_unicode_isspace(ch) && ch != '\0') {
switch(state) {
case CHAR_EAT:
if(ch == 'u') {
state = CHAR_UNICODE;
unicount = 4;
} else if(ch == 'U') {
state = CHAR_UNICODE;
unicount = 6;
} else {
state = CHAR_ESCAPE;
}
break;
case CHAR_UNICODE:
if(unicount-- == 0)
goto RETURN_TOKEN;
if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
org_cherry_error(context, "Unexpected hex sequence in unicode escape sequence");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
goto NO_BUFFER_APPEND;
}
break;
case CHAR_ESCAPE:
break;
}
Chris Müller
committed
org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));
p = org_cherry_utf8_next(p);
ch = org_cherry_utf8_get(p);
}
RETURN_TOKEN:
if(state == CHAR_UNICODE && unicount > 0) {
org_cherry_error(context, "Improper unicode escape sequence found in character literal");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
} else if(org_cherry_array_size(buffer) == 1) {
org_cherry_error(context, "No character symbol is given in character literal");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
Chris Müller
committed
org_cherry_array_append(buffer, "\0", 1);
context->src = p;
return TOK_CHAR;
}
enum StringState {
STR_EAT,
STR_ESCAPE,
STR_UNICODE,
STR_FINAL
};
static enum org_cherry_tok
lex_string(struct org_cherry_context* context)
Chris Müller
committed
struct org_cherry_array* buffer = context->buffer;
cy_byte_t* p = org_cherry_utf8_next(context->src);
cy_unicode_t ch = org_cherry_utf8_get(p);
int unicount = 0;
while(ch != '\0') {
switch(state) {
case STR_EAT:
if(ch == '\\')
state = STR_ESCAPE;
else if(ch == '\"') {
state = STR_FINAL;
goto NO_BUFFER_APPEND;
} else if(ch == '\r' || ch == '\n') {
org_cherry_error(context, "Unexpected newline/carriage return found in string literal");
state = STR_FINAL;
goto RETURN_TOKEN;
}
break;
case STR_ESCAPE:
switch(ch) {
case 'a': case 'b': case 'f': case 'n': case 'r':
case 't': case 'v': case '0': case '\"':
case '\\':
state = STR_EAT;
break;
case 'u':
unicount = 4;
state = STR_UNICODE;
break;
case 'U':
unicount = 6;
state = STR_UNICODE;
break;
default:
org_cherry_error(context, "Unknown escape sequence found in this string literal");
Chris Müller
committed
org_cherry_array_append(buffer, "t", 1);
goto NO_BUFFER_APPEND;
}
break;
case STR_UNICODE:
if(--unicount == 0)
state = STR_EAT;
if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
org_cherry_error(context, "Unexpected hex number in unicode escape sequence found");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
goto NO_BUFFER_APPEND;
}
break;
case STR_FINAL:
goto RETURN_TOKEN;
}
Chris Müller
committed
org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));
p = org_cherry_utf8_next(p);
ch = org_cherry_utf8_get(p);
}
RETURN_TOKEN:
if(state != STR_FINAL) {
org_cherry_error(context, "Unexpected end of file found in unclosed string");
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
Chris Müller
committed
org_cherry_array_append(buffer, "0", 1);
Chris Müller
committed
org_cherry_array_append(buffer, "\0", 1);
static enum org_cherry_tok
lex_comment(struct org_cherry_context* context)
Chris Müller
committed
struct org_cherry_array* buffer = context->buffer;
const cy_byte_t* p = context->src;
cy_unicode_t ch = org_cherry_utf8_get(p);
while(ch != '\0' && ch != '\r' && ch != '\n') {
Chris Müller
committed
org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));
p = org_cherry_utf8_next(p);
ch = org_cherry_utf8_get(p);
Chris Müller
committed
org_cherry_array_append(buffer, "\0", 1);
return TOK_COMMENT;
}
static int
is_symbol_character(cy_unicode_t ch)
return org_cherry_unicode_isalnum(ch) ||
ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == '%' ||
ch == '<' || ch == '>' || ch == '=' || ch == '!' || ch == '?' ||
ch == '#' || ch == ':' || ch == '.' || ch == '~' || ch == '_';
}
static enum org_cherry_tok
lex_symbol(struct org_cherry_context* context)
Chris Müller
committed
struct org_cherry_array* buffer = context->buffer;
const cy_byte_t* p = context->src;
cy_unicode_t ch = org_cherry_utf8_get(p);
Chris Müller
committed
org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));
p = org_cherry_utf8_next(p);
ch = org_cherry_utf8_get(p);
Chris Müller
committed
org_cherry_array_append(buffer, "\0", 1);
cy_byte_t* sym = org_cherry_array_get(buffer, 0);
if (strcmp(sym, "true") == 0)
return TOK_TRUE;
else if(strcmp(sym, "false") == 0)
return TOK_FALSE;
else
return TOK_SYMBOL;
}
#define LEX_RETURN(tok) \
p = org_cherry_utf8_next(p); \
context->src = p; \
return tok;
enum org_cherry_tok
org_cherry_lex(struct org_cherry_context* context)
const cy_byte_t* p = context->src;
Chris Müller
committed
org_cherry_array_clear(context->buffer);
cy_unicode_t ch = org_cherry_utf8_get(p);
if(org_cherry_unicode_isspace(ch)) {
p = org_cherry_utf8_next(p);
context->src = p;
continue;
}
switch(ch) {
case '\0':
return TOK_EOF;
case '(':
LEX_RETURN(TOK_ROUNDLEFTBRACE);
LEX_RETURN(TOK_ROUNDRIGHTBRACE);
LEX_RETURN(TOK_SQUARELEFTBRACE);
LEX_RETURN(TOK_SQUARERIGHTBRACE);
LEX_RETURN(TOK_DOT);
case '\'':
LEX_RETURN(TOK_QUOTE);
case ';':
if(context->flags & CY_SUPRESS_COMMENTS) {
lex_comment(context);
continue;
}
return lex_comment(context);
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return lex_number(context);
case '\\':
return lex_character(context);
case '"':
return lex_string(context);
case '+': case '-': case '*': case '/': case '^':
case '<': case '>': case '=': case '?': case '!':
case ':': case '_': case '%': case '~': case '#':
return lex_symbol(context);
default:
if(org_cherry_unicode_isalpha(ch))
org_cherry_error(context, "Unknown character found in input scanning");
p = org_cherry_utf8_next(p);
}
}
return TOK_EOF;
}
org_cherry_pos(struct org_cherry_context* context)
{
assert(context != 0);
return context->src;
}
void
org_cherry_rewind(struct org_cherry_context* context, const cy_byte_t* pos)
assert(context->begin <= pos && pos <= context->src);
context->src = pos;
}
org_cherry_token_string(struct org_cherry_context* context)
{
assert(context->buffer != 0);
return (const cy_byte_t*) org_cherry_array_get(context->buffer, 0);
org_cherry_token_length(struct org_cherry_context* context)
{
assert(context->buffer != 0);
Chris Müller
committed
return org_cherry_array_size(context->buffer);
static struct org_cherry_value*
org_cherry_read_pair(struct org_cherry_context* context)
{
enum org_cherry_tok tok;
struct org_cherry_value* head;
struct org_cherry_value* tail;
pos = org_cherry_pos(context);
tok = org_cherry_lex(context);
if(tok == TOK_ROUNDRIGHTBRACE)
return org_cherry_emptylist;
org_cherry_rewind(context, pos);
head = org_cherry_read(context);
pos = org_cherry_pos(context);
tok = org_cherry_lex(context);
if(tok == TOK_DOT) {
tail = org_cherry_read(context);
if(org_cherry_lex(context) != TOK_ROUNDRIGHTBRACE) {
org_cherry_error(context, "No trailing right parenthesis in improper list literal");
}
return (struct org_cherry_value*) org_cherry_list_cons(head, tail);
}
org_cherry_rewind(context, pos);
tail = org_cherry_read_pair(context);
return (struct org_cherry_value*) org_cherry_list_cons(head, tail);
struct org_cherry_value*
org_cherry_read(struct org_cherry_context* context)
{
assert(context != 0);
enum org_cherry_tok tok = org_cherry_lex(context);
while(tok != TOK_EOF) {
switch(tok) {
case TOK_TRUE:
return org_cherry_true;
case TOK_FALSE:
return org_cherry_false;
case TOK_HEX:
return org_cherry_fixnum_from_string(org_cherry_token_string(context), 16);
case TOK_DEC:
return org_cherry_fixnum_from_string(org_cherry_token_string(context), 10);
case TOK_OCT:
return org_cherry_fixnum_from_string(org_cherry_token_string(context), 8);
case TOK_BIN:
return org_cherry_fixnum_from_string(org_cherry_token_string(context), 2);
case TOK_FLOAT:
return org_cherry_float_from_string(org_cherry_token_string(context));
case TOK_STRING:
return org_cherry_string_from_string(org_cherry_token_string(context));
case TOK_CHAR:
return org_cherry_char_from_string(org_cherry_token_string(context));
case TOK_SYMBOL:
return org_cherry_symbol_from_string(org_cherry_token_string(context));
case TOK_ROUNDLEFTBRACE:
return org_cherry_read_pair(context);
case TOK_QUOTE:
return TO_VALUE(org_cherry_list_cons(org_cherry_symbol_quote,
TO_VALUE(org_cherry_list_cons(org_cherry_read(context), org_cherry_emptylist))));