Skip to content
Snippets Groups Projects
read.c 16.4 KiB
Newer Older
/*
 * Cherry programming language
 * Copyright (C) 2013 Christoph Mueller
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include "cherry/unicode.h"
#include <stdarg.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
struct org_cherry_context*   
org_cherry_context(const cy_byte_t* source, const char* filename, cy_flags_t flags)
	struct org_cherry_context* context = GC_MALLOC(sizeof(struct org_cherry_context));
	context->filename = filename;
	context->begin = source;
	context->src = (cy_byte_t*) source;
	context->buffer = org_cherry_array_new(sizeof(cy_byte_t));
	context->flags = flags;
struct org_cherry_context*   
org_cherry_context_repl(const cy_byte_t* source)
	return org_cherry_context(source, 0, CY_SUPRESS_COMMENTS);
org_cherry_error(struct org_cherry_context* context, const char* format, ...)
{
	va_list args;
	va_start(args, format);

	if(context->filename != 0)
		fprintf(stderr, "ERROR %s", context->filename); 
		fprintf(stderr, "ERROR (console)");

	fprintf(stderr, ": ");
	vfprintf(stderr, format, args);
	fprintf(stderr, "\n");

struct Mapping {
	cy_byte_t* string;
	enum org_cherry_tok value;
org_cherry_tok_to_string(enum org_cherry_tok token)
{
	switch(token) {
		case TOK_EOF:
			return "EOF";
		case TOK_COMMENT:
			return "COMMENT";
		case TOK_ROUNDLEFTBRACE:
			return "(";
		case TOK_ROUNDRIGHTBRACE:
			return ")";
		case TOK_SQUARELEFTBRACE:
			return "[";
		case TOK_SQUARERIGHTBRACE:
			return "]";
		case TOK_STRING:
			return "STRING";
		case TOK_DOT:
			return ".";
		case TOK_CHAR:
			return "CHAR";
		case TOK_HEX:
			return "HEX";
		case TOK_DEC:
			return "DEC";
		case TOK_OCT:
			return "OCT";
		case TOK_BIN:
			return "BIN";
		case TOK_FLOAT:
			return "FLOAT";
		case TOK_SYMBOL:
			return "SYMBOL";
	}

	return "UNKNOWN";
}


enum FloatState {
	FP_START, 
	FP_EXPONENT, 
	FP_DOT, 
	FP_DECIMAL, 
	FP_MINUSPLUS, 
	FP_FINAL
};


static enum org_cherry_tok
lex_float(struct org_cherry_context* context)
	struct org_cherry_array* buffer = context->buffer;
	const cy_byte_t* p = context->src;
	cy_unicode_t ch = org_cherry_utf8_get(p);

	enum FloatState state = FP_START;

	do {
		switch(state) {
			case FP_START:
				if(ch == 'e' || ch == 'E')
					state = FP_EXPONENT;
				else if(ch == '.')
					state = FP_DOT;
				break;

			case FP_EXPONENT:
				if(ch == '+' || ch == '-')
					state = FP_MINUSPLUS;
				else if('0' > ch || ch > '9') {
					org_cherry_error(context, "Unexpected character found in float literal after +/-");
					org_cherry_array_append(buffer, "0", 1);
					goto RETURN_TOKEN;
				} else
					state = FP_FINAL;
				break;

			case FP_DOT:
				state = FP_DECIMAL;
				if('0' > ch || ch > '9') {
					org_cherry_error(context, "Unexpected character found in float literal after dot");
					org_cherry_array_append(buffer, "0", 1);
					goto RETURN_TOKEN;
				}
				break;

			case FP_DECIMAL:
				if(ch == 'e' || ch == 'E') 
					state = FP_EXPONENT;
				else if('0' > ch || ch > '9')
					goto RETURN_TOKEN;
				break;

			case FP_MINUSPLUS:
				state = FP_FINAL;
				if('0' > ch || ch > '9') {
					org_cherry_error(context, "Unexpected character found in float literal");
					org_cherry_array_append(buffer, "0", 1);
					goto RETURN_TOKEN;
				}
				break;

			case FP_FINAL:
				if('0' > ch || ch > '9')
					goto RETURN_TOKEN;
				break;
		}


NO_APPEND_BUFFER:
		p = org_cherry_utf8_next(p);
		ch = org_cherry_utf8_get(p);

	} while (ch != '\0');

RETURN_TOKEN:
	context->src = p;

	org_cherry_array_append(buffer, "\0", 1);

	return TOK_FLOAT;
}


enum NumberState {
	INT_START, 
	INT_BASE, 
	INT_BIN_WAIT, 
	INT_HEX_WAIT, 
	INT_OCT_WAIT, 
	INT_BIN_READ, 
	INT_HEX_READ, 
	INT_OCT_READ, 
	INT_DEC_READ
};


static enum org_cherry_tok
lex_number(struct org_cherry_context* context)
	struct org_cherry_array* buffer = context->buffer;
	const cy_byte_t* p = context->src;
	cy_unicode_t ch = org_cherry_utf8_get(p);
	enum org_cherry_tok token = TOK_DEC;
	enum NumberState state = INT_START;

	do {
		switch(state) {
			case INT_START:
				if(ch == '0') 
					state = INT_BASE;
				else
					state = INT_DEC_READ;
				break;

			case INT_BASE:
				if(ch == 'x') {
					state = INT_HEX_WAIT;
				} else if(ch == 'b') {
					state = INT_BIN_WAIT;
				} else if('0' <= ch && ch <= '7') {
					state = INT_OCT_READ;
					break;
				} else if(ch == '.' || ch == 'e' || ch == 'E') {
					context->src = p;
					return lex_float(context);
				} else {
					token = TOK_DEC;
					goto RETURN_TOKEN;
				}
				break;

			case INT_HEX_WAIT:
				if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
					org_cherry_error(context, "Unexpected character found in hex literal");
					org_cherry_array_append(buffer, "0", 1);
					token = TOK_HEX;
					goto RETURN_TOKEN;
				}
				state = INT_HEX_READ;
				break;

			case INT_BIN_WAIT:
				if(ch != '0' && ch != '1') {
					org_cherry_error(context, "Unexpected character found in binary literal");
					org_cherry_array_append(buffer, "0", 1);
					token = TOK_BIN;
					goto RETURN_TOKEN;
				}
				state = INT_BIN_READ;
				break;

			case INT_BIN_READ:
				token = TOK_BIN;
				if(ch != '0' && ch != '1')
					goto RETURN_TOKEN;
				break;

			case INT_OCT_READ:
				token = TOK_OCT;
				if('0' > ch || ch > '7')
					goto RETURN_TOKEN;
				break;

			case INT_DEC_READ:
				token = TOK_DEC;
				if(ch == '.' || ch == 'e' || ch == 'E') {
					context->src = p;
					return lex_float(context);
				} else if('0' > ch || ch > '9')
					goto RETURN_TOKEN;
				break;

			case INT_HEX_READ:
				token = TOK_HEX;
				if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F'))
					goto RETURN_TOKEN;
				break;

			default:
				break;
		}


NO_APPEND_BUFFER:
		p = org_cherry_utf8_next(p);
		ch = org_cherry_utf8_get(p);
	} while(ch != '\0');

RETURN_TOKEN:
	context->src = p;
	org_cherry_array_append(buffer, "\0", 1);
enum CharState {
	CHAR_EAT,
	CHAR_ESCAPE,
	CHAR_UNICODE
};

static enum org_cherry_tok
lex_character(struct org_cherry_context* context)
{
	assert(context != 0);

	struct org_cherry_array* buffer = context->buffer;
	cy_byte_t* p = org_cherry_utf8_next(context->src);
	cy_unicode_t ch = org_cherry_utf8_get(p);
	enum CharState state = CHAR_EAT;
	int unicount = 0;

    org_cherry_array_append(buffer, "\\", 1);
	while(!org_cherry_unicode_isspace(ch) && ch != '\0') {
		switch(state) {
			case CHAR_EAT:
				if(ch == 'u') {
					state = CHAR_UNICODE;
					unicount = 4;
				} else if(ch == 'U') {
					state = CHAR_UNICODE;
					unicount = 6;					
				} else {
					state = CHAR_ESCAPE;
				}
				break;
			case CHAR_UNICODE:
				if(unicount-- == 0)
					goto RETURN_TOKEN;

				if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
					org_cherry_error(context, "Unexpected hex sequence in unicode escape sequence");
					org_cherry_array_append(buffer, "0", 1);
					goto NO_BUFFER_APPEND;
				}
				break;

			case CHAR_ESCAPE:
				break;
		}

		org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));
NO_BUFFER_APPEND:

		p = org_cherry_utf8_next(p);
		ch = org_cherry_utf8_get(p);
	}

RETURN_TOKEN:
	if(state == CHAR_UNICODE && unicount > 0) {
		org_cherry_error(context, "Improper unicode escape sequence found in character literal");

		while(unicount-- > 0)
			org_cherry_array_append(buffer, "0", 1);
	} else if(org_cherry_array_size(buffer) == 1) {
		org_cherry_error(context, "No character symbol is given in character literal");
		org_cherry_array_append(buffer, "0", 1);
	org_cherry_array_append(buffer, "\0", 1);

	context->src = p;

	return TOK_CHAR;
}


enum StringState {
	STR_EAT, 
	STR_ESCAPE, 
	STR_UNICODE, 
	STR_FINAL
};


static enum org_cherry_tok
lex_string(struct org_cherry_context* context)
{
	assert(context != 0);

	struct org_cherry_array* buffer = context->buffer;
	enum StringState state = STR_EAT;
	cy_byte_t* p = org_cherry_utf8_next(context->src);
	cy_unicode_t ch = org_cherry_utf8_get(p);
	int unicount = 0;

	while(ch != '\0') {
		switch(state) {
			case STR_EAT:
				if(ch == '\\') 
					state = STR_ESCAPE;
				else if(ch == '\"') {
					state = STR_FINAL;
					goto NO_BUFFER_APPEND;
				} else if(ch == '\r' || ch == '\n') {
					org_cherry_error(context, "Unexpected newline/carriage return found in string literal");
					state = STR_FINAL;
					goto RETURN_TOKEN;
				}
				break;

			case STR_ESCAPE:
				switch(ch) {
					case 'a': case 'b': case 'f': case 'n': case 'r':
					case 't': case 'v': case '0': case '\"':
					case '\\':
						state = STR_EAT;
						break;
					case 'u':
						unicount = 4;
						state = STR_UNICODE;
						break;
					case 'U':
						unicount = 6;
						state = STR_UNICODE;
						break;
					default:
						org_cherry_error(context, "Unknown escape sequence found in this string literal");
						state = STR_EAT;
						org_cherry_array_append(buffer, "t", 1);
						goto NO_BUFFER_APPEND;

				}
				break;

			case STR_UNICODE:
				if(--unicount == 0)
					state = STR_EAT;

				if(('0' > ch || ch > '9') && ('A' > ch || ch > 'F')) {
					org_cherry_error(context, "Unexpected hex number in unicode escape sequence found");
					org_cherry_array_append(buffer, "0", 1);
					goto NO_BUFFER_APPEND;
				}
				break;

			case STR_FINAL:
				goto RETURN_TOKEN;
		}

		org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));

NO_BUFFER_APPEND:
		p = org_cherry_utf8_next(p);
		ch = org_cherry_utf8_get(p);
	}

RETURN_TOKEN:
	if(state != STR_FINAL) {
		org_cherry_error(context, "Unexpected end of file found in unclosed string");

		while(unicount-- > 0)
			org_cherry_array_append(buffer, "0", 1);

		if(state == STR_ESCAPE)
			org_cherry_array_append(buffer, "0", 1);
	}

	context->src = p;

	org_cherry_array_append(buffer, "\0", 1);
static enum org_cherry_tok
lex_comment(struct org_cherry_context* context)
	struct org_cherry_array* buffer = context->buffer;
	const cy_byte_t* p = context->src;
	cy_unicode_t ch = org_cherry_utf8_get(p);

	while(ch != '\0' && ch != '\r' && ch != '\n') {
		org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));
		p = org_cherry_utf8_next(p);
		ch = org_cherry_utf8_get(p);
	org_cherry_array_append(buffer, "\0", 1);
	context->src = p;

	return TOK_COMMENT;
}


static int 
is_symbol_character(cy_unicode_t ch)
	return org_cherry_unicode_isalnum(ch) || 
		ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == '%' ||
		ch == '<' || ch == '>' || ch == '=' || ch == '!' || ch == '?' ||
		ch == '#' || ch == ':' || ch == '.' || ch == '~' || ch == '_';
}

static enum org_cherry_tok
lex_symbol(struct org_cherry_context* context)
	struct org_cherry_array* buffer = context->buffer;
	const cy_byte_t* p = context->src;
	cy_unicode_t ch = org_cherry_utf8_get(p);

	while(is_symbol_character(ch)) {
		org_cherry_array_append(buffer, p, org_cherry_utf8_codepoints(p));
		p = org_cherry_utf8_next(p);
		ch = org_cherry_utf8_get(p);
	}

	context->src = p;

	org_cherry_array_append(buffer, "\0", 1);
	cy_byte_t* sym = org_cherry_array_get(buffer, 0);

	if (strcmp(sym, "true") == 0)
		return TOK_TRUE;
	else if(strcmp(sym, "false") == 0)
		return TOK_FALSE;
	else
		return TOK_SYMBOL;
}

    p = org_cherry_utf8_next(p); \
enum org_cherry_tok      
org_cherry_lex(struct org_cherry_context* context)
{
	assert(context != 0);

	while(TRUE) {
		const cy_byte_t* p = context->src;
		org_cherry_array_clear(context->buffer);
		cy_unicode_t ch = org_cherry_utf8_get(p);
		if(org_cherry_unicode_isspace(ch)) {
			p = org_cherry_utf8_next(p);
			context->src = p;
			continue;
		}

		switch(ch) {
			case '\0':
				return TOK_EOF;
			case '(':
				LEX_RETURN(TOK_ROUNDLEFTBRACE);
				LEX_RETURN(TOK_ROUNDRIGHTBRACE);
				LEX_RETURN(TOK_SQUARELEFTBRACE);
				LEX_RETURN(TOK_SQUARERIGHTBRACE);
				LEX_RETURN(TOK_DOT);
			case '\'':
				LEX_RETURN(TOK_QUOTE);
			case ';':
				if(context->flags & CY_SUPRESS_COMMENTS) {
					lex_comment(context);
					continue;
				}
				return lex_comment(context);
	    	case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				return lex_number(context);
			case '\\':
				return lex_character(context);
			case '"':
				return lex_string(context);
			case '+': case '-': case '*': case '/': case '^':
			case '<': case '>': case '=': case '?': case '!':
			case ':': case '_': case '%': case '~': case '#':
				return lex_symbol(context);
			default:
				if(org_cherry_unicode_isalpha(ch))
					return lex_symbol(context);
				else {
					org_cherry_error(context, "Unknown character found in input scanning");
					p = org_cherry_utf8_next(p);
org_cherry_pos(struct org_cherry_context* context)
{
	assert(context != 0);
	return context->src;
}

void                        
org_cherry_rewind(struct org_cherry_context* context, const cy_byte_t* pos)
{
	assert(context != 0);
	assert(context->begin <= pos && pos <= context->src);
org_cherry_token_string(struct org_cherry_context* context)
{
	assert(context->buffer != 0);

	return (const cy_byte_t*) org_cherry_array_get(context->buffer, 0);
org_cherry_token_length(struct org_cherry_context* context)
{
	assert(context->buffer != 0);

	return org_cherry_array_size(context->buffer);
static struct org_cherry_value*
org_cherry_read_pair(struct org_cherry_context* context)
{
	const cy_byte_t* pos;
	enum org_cherry_tok tok;
	struct org_cherry_value* head;
	struct org_cherry_value* tail;
	
	pos = org_cherry_pos(context);
	tok = org_cherry_lex(context);
	if(tok == TOK_ROUNDRIGHTBRACE)
		return org_cherry_emptylist;

	org_cherry_rewind(context, pos);

	head = org_cherry_read(context);

	pos = org_cherry_pos(context);
	tok = org_cherry_lex(context);

	if(tok == TOK_DOT) {
		tail = org_cherry_read(context);
		
		if(org_cherry_lex(context) != TOK_ROUNDRIGHTBRACE) {
			org_cherry_error(context, "No trailing right parenthesis in improper list literal");
		}
		return (struct org_cherry_value*) org_cherry_list_cons(head, tail);
	} 
	
	org_cherry_rewind(context, pos);
	tail = org_cherry_read_pair(context);

	return (struct org_cherry_value*) org_cherry_list_cons(head, tail);
struct org_cherry_value*
org_cherry_read(struct org_cherry_context* context)
{
	assert(context != 0);

	enum org_cherry_tok tok = org_cherry_lex(context);

	while(tok != TOK_EOF) {
		switch(tok) {
			case TOK_TRUE:
				return org_cherry_true;
			case TOK_FALSE:
				return org_cherry_false;
			case TOK_HEX:
				return org_cherry_fixnum_from_string(org_cherry_token_string(context), 16);
			case TOK_DEC:
				return org_cherry_fixnum_from_string(org_cherry_token_string(context), 10);
			case TOK_OCT:
				return org_cherry_fixnum_from_string(org_cherry_token_string(context), 8);
			case TOK_BIN:
				return org_cherry_fixnum_from_string(org_cherry_token_string(context), 2);
			case TOK_FLOAT:
				return org_cherry_float_from_string(org_cherry_token_string(context));
			case TOK_STRING:
				return org_cherry_string_from_string(org_cherry_token_string(context));
            case TOK_CHAR:
                return org_cherry_char_from_string(org_cherry_token_string(context));
            case TOK_SYMBOL:
                return org_cherry_symbol_from_string(org_cherry_token_string(context));
            case TOK_ROUNDLEFTBRACE:
				return org_cherry_read_pair(context);
			case TOK_QUOTE:
				return TO_VALUE(org_cherry_list_cons(org_cherry_symbol_quote, 
                        TO_VALUE(org_cherry_list_cons(org_cherry_read(context), org_cherry_emptylist))));

			default:
				org_cherry_error(context, "bad input with token %s", 
						org_cherry_tok_to_string(tok));
				return org_cherry_false;
		}
	}

	return org_cherry_false;
}