Commit 9683cd38 authored by Chris Müller's avatar Chris Müller
Browse files

Migrate Reading part from bootstrap-cherry


Signed-off-by: Chris Müller's avatarChris Mueller <ruunsmail@gmail.com>
parent 8dd02e1e
......@@ -12,7 +12,7 @@ set(CHERRY_VERSION 0.1)
include_directories(crystal/include)
include_directories(include)
#add_subdirectory(source)
add_subdirectory(bootstrap)
add_subdirectory(source)
#add_subdirectory(bootstrap)
#add_subdirectory(test)
......@@ -18,6 +18,7 @@
#include "bootstrap.h"
#include <gc.h>
#include <getopt.h>
#include <stdarg.h>
// ----------------------------------------------------------------------------
......@@ -114,6 +115,17 @@ lambda(struct value* param, struct value* body)
return v;
}
struct value*
character(unicode_t code)
{
struct value* v = alloc_value();
v->tag = CHARACTER;
v->character_value = code;
return v;
}
struct value*
tuple(size_t size, ...)
{
......@@ -161,11 +173,6 @@ cons(struct value* head, struct value* tail)
}
int main(int argc, char** argv)
{
return 0;
}
void
initialize(void)
......@@ -194,15 +201,579 @@ initialize(void)
}
// ----------------------------------------------------------------------------
// compile
// Reading
// ----------------------------------------------------------------------------
enum token {
TOK_EOF, TOK_ROUNDLEFTBRACE, TOK_ROUNDRIGHTBRACE,
TOK_SQUARELEFTBRACE, TOK_SQUARERIGHTBRACE,
TOK_STRING, TOK_DOT, TOK_CHAR, TOK_HEX, TOK_DEC,
TOK_OCT, TOK_BIN, TOK_FLOAT, TOK_SYMBOL,
TOK_QUOTE, TOK_TRUE, TOK_FALSE
};
static int
issymbol(int ch)
{
return isalnum(ch) ||
ch == '+' || ch == '-' || ch == '*' || ch == '/' || ch == '%' ||
ch == '<' || ch == '>' || ch == '=' || ch == '!' || ch == '?' ||
ch == '#' || ch == ':' || ch == '.' || ch == '~' || ch == '_';
}
enum float_state {
FP_START,
FP_EXPONENT,
FP_DOT,
FP_DECIMAL,
FP_MINUSPLUS,
FP_FINAL
};
static enum token
lex_float(byte_t** begin, byte_t* buffer, size_t buffer_size)
{
byte_t* p = *begin;
byte_t* buf_beg = buffer;
byte_t* buf_end = buffer + buffer_size;
enum float_state state = FP_START;
do {
switch(state) {
case FP_START:
if(*p == 'e' || *p == 'E')
state = FP_EXPONENT;
else if(*p == '.')
state = FP_DOT;
break;
case FP_EXPONENT:
if(*p == '+' || *p == '-')
state = FP_MINUSPLUS;
else if('0' > *p || *p > '9') {
failure("Unexpected character found in float literal after +/-");
} else
state = FP_FINAL;
break;
case FP_DOT:
state = FP_DECIMAL;
if('0' > *p || *p > '9')
failure("Unexpected character found in float literal after dot");
break;
case FP_DECIMAL:
if(*p == 'e' || *p == 'E')
state = FP_EXPONENT;
else if('0' > *p || *p > '9')
goto RETURN_TOKEN;
break;
case FP_MINUSPLUS:
state = FP_FINAL;
if('0' > *p || *p > '9')
failure("Unexpected character found in float literal");
break;
case FP_FINAL:
if('0' > *p || *p > '9')
goto RETURN_TOKEN;
break;
}
*buffer = *p;
if(buffer + 1 < buf_end) {
buffer++;
p++;
} else
failure("Bufferoverflow in number literal");
} while (*p != '\0');
RETURN_TOKEN:
*buffer = '\0';
*begin = p;
return TOK_FLOAT;
}
enum number_state {
INT_START,
INT_BASE,
INT_BIN_WAIT,
INT_HEX_WAIT,
INT_OCT_WAIT,
INT_BIN_READ,
INT_HEX_READ,
INT_OCT_READ,
INT_DEC_READ
};
static enum token
lex_number(byte_t** begin, byte_t* buffer, size_t buffer_size)
{
byte_t* p = *begin;
byte_t* buf_beg = buffer;
byte_t* buf_end = buffer + buffer_size;
enum token token = TOK_DEC;
enum number_state state = INT_START;
do {
switch(state) {
case INT_START:
if(*p == '0')
state = INT_BASE;
else
state = INT_DEC_READ;
break;
case INT_BASE:
if(*p == 'x') {
state = INT_HEX_WAIT;
} else if(*p == 'b') {
state = INT_BIN_WAIT;
} else if('0' <= *p && *p <= '7') {
state = INT_OCT_READ;
break;
} else if(*p == '.' || *p == 'e' || *p == 'E') {
return lex_float(&p, buffer, buffer_size);
} else {
token = TOK_DEC;
goto RETURN_TOKEN;
}
break;
case INT_HEX_WAIT:
if(('0' > *p || *p > '9') && ('A' > *p || *p > 'F'))
failure("Unexpected character found in hex literal");
state = INT_HEX_READ;
break;
case INT_BIN_WAIT:
if(*p != '0' && *p != '1')
failure("Unexpected character found in binary literal");
state = INT_BIN_READ;
break;
case INT_BIN_READ:
token = TOK_BIN;
if(*p != '0' && *p != '1')
goto RETURN_TOKEN;
break;
case INT_OCT_READ:
token = TOK_OCT;
if('0' > *p || *p > '7')
goto RETURN_TOKEN;
break;
case INT_DEC_READ:
token = TOK_DEC;
if(*p == '.' || *p == 'e' || *p == 'E') {
return lex_float(&p, buffer, buffer_size);
} else if('0' > *p || *p > '9')
goto RETURN_TOKEN;
break;
case INT_HEX_READ:
token = TOK_HEX;
if(('0' > *p || *p > '9') && ('A' > *p || *p > 'F'))
goto RETURN_TOKEN;
break;
default:
break;
}
*buffer = *p;
if(buffer + 1 < buf_end) {
buffer++;
p++;
} else
failure("Bufferoverflow in number literal");
} while(*p != '\0');
RETURN_TOKEN:
*buffer = '\0';
*begin = p;
return token;
}
static enum token
lex_char(byte_t** begin, byte_t* buffer, size_t buffer_size)
{
byte_t* p = *begin;
byte_t* buf_beg = buffer;
byte_t* buf_end = buffer + buffer_size;
p++; // remove beginning delimeter
while(!isspace(*p) && *p != '\0') {
*buffer = *p;
if(buffer + 1 < buf_end) {
buffer++;
p++;
} else
failure("Bufferoverflow in raw string literal");
}
*buffer = '\0';
*begin = p;
return TOK_CHAR;
};
static enum token
lex_raw_string(byte_t** begin, byte_t* buffer, size_t buffer_size)
{
byte_t* p = *begin;
byte_t* buf_beg = buffer;
byte_t* buf_end = buffer + buffer_size;
p++; // remove beginning delimeter
while(*p != '\0' && *p != '~') {
if(*p == '\r' || *p == '\n')
failure("Unexpected newline/carriage return found in raw string");
*buffer = *p;
if(buffer + 1 < buf_end) {
buffer++;
p++;
} else
failure("Bufferoverflow in raw string literal");
buffer++;
p++;
}
*buffer = '\0';
*begin = (*p != '\0') ? p : p + 1;
return TOK_STRING;
}
static enum token
lex_symbol(byte_t** begin, byte_t* buffer, size_t buffer_size)
{
byte_t* p = *begin;
byte_t* buf_beg = buffer;
byte_t* buf_end = buffer + buffer_size;
while(issymbol(*p)) {
*buffer = *p;
if(buffer < buf_end) {
buffer++;
p++;
} else
failure("Bufferoverflow in symbol scanning");
}
*buffer = '\0';
*begin = p;
if(strcmp(buf_beg, "true") == 0)
return TOK_TRUE;
else if(strcmp(buf_beg, "false") == 0)
return TOK_FALSE;
else
return TOK_SYMBOL;
}
enum string_state {
STR_EAT,
STR_ESCAPE,
STR_FINAL
};
static enum token
lex_string(byte_t** begin, byte_t* buffer, size_t buffer_size)
{
byte_t* p = *begin;
byte_t* buf_beg = buffer;
byte_t* buf_end = buffer + buffer_size;
enum string_state state = STR_EAT;
p++;
while(*p != '\0') {
switch(state) {
case STR_EAT:
if(*p == '\\') {
state = STR_ESCAPE;
goto NO_BUFFER_APPEND;
} else if(*p == '\"') {
state = STR_FINAL;
goto NO_BUFFER_APPEND;
} else if(*p == '\r' || *p == '\n')
failure("Unexpected newline/carriage return found in string literal");
break;
case STR_ESCAPE:
switch(*p) {
case 'a': *buffer = '\a'; break;
case 'b': *buffer = '\b'; break;
case 'f': *buffer = '\f'; break;
case 'n': *buffer = '\n'; break;
case 'r': *buffer = '\r'; break;
case 't': *buffer = '\t'; break;
case 'v': *buffer = '\v'; break;
case '0': *buffer = '0'; break;
case '"': *buffer = '"'; break;
case '\\': *buffer = '\\'; break;
state = STR_EAT;
goto NO_BUFFER_APPEND;
default:
failure("Unknown escape sequence found in this string literal");
}
break;
case STR_FINAL:
goto RETURN_TOKEN;
}
*buffer = *p;
NO_BUFFER_APPEND:
if(buffer < buf_end) {
buffer++;
p++;
} else
failure("Bufferoverflow in symbol scanning");
}
RETURN_TOKEN:
if(state != STR_FINAL)
failure("Unexpected end of file found in unclosed string");
*begin = p;
*buffer = '\0';
return TOK_STRING;
}
static enum token
lex(byte_t** begin, byte_t* buffer, size_t buffer_size)
{
#define SET_RETURN(token, p) \
begin = *(++p);
byte_t* p = *begin;
enum token tok = TOK_EOF;
while(TRUE) {
while(isspace(*p))
p++;
switch(*p) {
case '\0':
tok = TOK_EOF;
goto RETURN;
case '(':
tok = TOK_ROUNDLEFTBRACE;
goto RETURN_AND_INC;
case ')':
tok = TOK_ROUNDRIGHTBRACE;
goto RETURN_AND_INC;
case '.':
tok = TOK_DOT;
goto RETURN_AND_INC;
case '[':
tok = TOK_SQUARELEFTBRACE;
goto RETURN_AND_INC;
case ']':
tok = TOK_SQUARERIGHTBRACE;
goto RETURN_AND_INC;
case '\'':
tok = TOK_QUOTE;
goto RETURN_AND_INC;
case ';':
while(*p != '\n' || *p != '\0')
p++;
continue;
case '~':
tok = lex_raw_string(&p, buffer, buffer_size);
goto RETURN;
case '\\':
tok = lex_char(&p, buffer, buffer_size);
goto RETURN;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
tok = lex_number(&p, buffer, buffer_size);
goto RETURN;
case '"':
tok = lex_string(&p, buffer, buffer_size);
goto RETURN;
default:
if(issymbol(*p))
tok = lex_symbol(&p, buffer, buffer_size);
else
failure("Unexpected character found in lex stream");
goto RETURN;
}
}
RETURN_AND_INC:
++p;
*begin = p;
RETURN:
*begin = p;
return tok;
}
byte_t*
text(FILE* file)
{
size_t filesize;
fseek(file, 0, SEEK_END);
filesize = ftell(file);
rewind(file);
byte_t* data = GC_MALLOC(filesize * sizeof(byte_t));
fread(data, sizeof(byte_t), filesize, file);
fclose(file);
return data;
}
static byte_t*
string_dup(const byte_t* buffer)
{
size_t size = strlen(buffer) + 1;
byte_t* p = GC_MALLOC(sizeof(byte_t) * size);
if(!p)
failure("Boehm GC: string allocation failed (out of memory)");
memcpy(p, buffer, size);
return p;
}
static struct value*
cherry_read_pair(byte_t** src, byte_t* buffer, size_t buffer_size)
{
byte_t* pos = *src;
enum token tok = lex(src, buffer, buffer_size);
if(tok == TOK_ROUNDRIGHTBRACE || tok == TOK_EOF) {
return Emptylist;
}
*src = pos;
struct value* head = cherry_read(src, buffer, buffer_size);
struct value* tail = cherry_read_pair(src, buffer, buffer_size);
return cons(head, tail);
}
struct value*
read(FILE* filename)
cherry_read(byte_t** src, byte_t* buffer, size_t buffer_size)
{
return NULL;
struct value* last = alloc_value();
struct value* begin = last;
struct value* val = 0;
enum token tok = lex(src, buffer, buffer_size);
if (tok != TOK_EOF) {
switch(tok) {
case TOK_FALSE:
val = False;
break;
case TOK_TRUE:
val = True;
break;
case TOK_HEX:
val = fixnum(strtol(buffer + 2, 0, 16));
break;
case TOK_DEC:
val = fixnum(strtol(buffer, 0, 10));
break;
case TOK_OCT:
val = fixnum(strtol(buffer + 2, 0, 8));
break;
case TOK_BIN:
val = fixnum(strtol(buffer + 2, 0, 2));
break;
case TOK_FLOAT:
val = floatpoint(strtod(buffer + 2, 0));
break;
case TOK_STRING:
val = string(string_dup(buffer));
case TOK_SYMBOL:
val = symbol(string_dup(buffer));
break;
case TOK_CHAR:
val = character(buffer[0]);
break;
case TOK_QUOTE:
val = cons(Quote, cons(cherry_read(src, buffer, buffer_size), Emptylist));
break;
case TOK_ROUNDLEFTBRACE:
val = cherry_read_pair(src, buffer, buffer_size);
break;
default:
failure("Bad input token found in read");
}
}
return val;
}
// ----------------------------------------------------------------------------
......@@ -211,14 +782,14 @@ read(FILE* filename)
struct closure*
compile(struct value* ast, struct value* env)