373 lines
6.5 KiB
C
373 lines
6.5 KiB
C
#include "lex.h"

#include <stdint.h>

#include <mlimits.h>
#include <merror.h>
|
|
|
|
/* Position handed to ERROR_POS by the lexing helpers in this file;
 * lexer_next() refreshes it from the lexer before lexing each token. */
static struct {
	int x, y;
} pos;
|
|
|
|
/* get next char in lexer */
|
|
static int lex_next(struct lexer *lexer)
|
|
{
|
|
if (lexer->peek != EOF) {
|
|
int c = lexer->peek;
|
|
lexer->peek = EOF;
|
|
return c;
|
|
}
|
|
|
|
int c = getc(lexer->file);
|
|
if (c == '\n') {
|
|
lexer->x = 1;
|
|
lexer->y++;
|
|
} else {
|
|
lexer->x++;
|
|
}
|
|
return c;
|
|
}
|
|
|
|
/* peek next char in lexer */
|
|
static int lex_peek(struct lexer *lexer)
|
|
{
|
|
if (lexer->peek == EOF)
|
|
lexer->peek = lex_next(lexer);
|
|
return lexer->peek;
|
|
}
|
|
|
|
/* Discard characters up to and including the next newline,
 * or until EOF is reached. */
static void skip_comment(struct lexer *lexer)
{
	int ch;

	do {
		ch = lex_next(lexer);
	} while (ch != EOF && ch != '\n');
}
|
|
|
|
/* lexes text until whitespace
|
|
* returns error on zero length or too long */
|
|
static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])
|
|
{
|
|
int len = 0;
|
|
char *ptr = text;
|
|
int c;
|
|
|
|
while (1) {
|
|
c = lex_peek(lexer);
|
|
if (!(
|
|
(c >= 'a' && c <= 'z') ||
|
|
(c >= 'A' && c <= 'Z') ||
|
|
(c >= '0' && c <= '9') ||
|
|
(c == '_')
|
|
)) {
|
|
break;
|
|
}
|
|
|
|
// pop char out of lexer
|
|
lex_next(lexer);
|
|
|
|
if (len + 1 == MAX_LEX_LENGTH) {
|
|
ERROR_POS(pos, "ident has max length of %d",
|
|
MAX_LEX_LENGTH);
|
|
return M_ERROR;
|
|
}
|
|
|
|
*ptr++ = c;
|
|
len++;
|
|
}
|
|
|
|
if (len == 0) {
|
|
ERROR_POS(pos, "attempted to lex empty ident %d",
|
|
MAX_LEX_LENGTH);
|
|
return M_ERROR;
|
|
}
|
|
|
|
*ptr = '\0';
|
|
return M_SUCCESS;
|
|
}
|
|
|
|
/* lexes a string until closing quote
|
|
* returns error if string is too long or hit newline */
|
|
static int lex_string(struct lexer *lexer,char text[MAX_LEX_LENGTH])
|
|
{
|
|
int len = 0;
|
|
char *ptr = text;
|
|
int c;
|
|
|
|
while (1) {
|
|
c = lex_next(lexer);
|
|
if (c == '"')
|
|
break;
|
|
|
|
// match escape character
|
|
if (c == '\\') {
|
|
switch (lex_peek(lexer)) {
|
|
case 'n':
|
|
c = '\n';
|
|
lex_next(lexer);
|
|
break;
|
|
case 't':
|
|
c = '\t';
|
|
lex_next(lexer);
|
|
break;
|
|
case '\\':
|
|
c = '\\';
|
|
lex_next(lexer);
|
|
break;
|
|
case '"':
|
|
c = '"';
|
|
lex_next(lexer);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// strings cannot span multiple lines
|
|
if (c == '\n') {
|
|
ERROR_POS(pos, "reached newline before end of string");
|
|
return M_ERROR;
|
|
}
|
|
|
|
if (len + 1 == MAX_LEX_LENGTH) {
|
|
ERROR_POS(pos, "string has max length of %d",
|
|
MAX_LEX_LENGTH);
|
|
return M_ERROR;
|
|
}
|
|
|
|
*ptr++ = c;
|
|
len++;
|
|
}
|
|
|
|
*ptr = '\0';
|
|
return M_SUCCESS;
|
|
}
|
|
|
|
/* lexes a integer number in base 2,8,10, or 16,
|
|
* uses base 10 by default but chan be changed by 0b, 0o, and 0x */
|
|
static int lex_number(struct lexer *lexer, int64_t *n)
|
|
{
|
|
int64_t number = 0;
|
|
int base = 10;
|
|
int neg = 0;
|
|
|
|
// check if negative
|
|
if (lex_peek(lexer) == '-') {
|
|
lex_next(lexer);
|
|
neg = 1;
|
|
}
|
|
|
|
|
|
// skip all leading zeros, they dont do anything.
|
|
// this also allows us to directly check for 0b, 0o, and 0x
|
|
// right away!
|
|
while (1) {
|
|
if (lex_peek(lexer) == '0')
|
|
lex_next(lexer);
|
|
else
|
|
break;
|
|
}
|
|
|
|
// match change of base
|
|
switch (lex_peek(lexer)) {
|
|
case 'b':
|
|
base = 2;
|
|
lex_next(lexer);
|
|
break;
|
|
case 'o':
|
|
base = 8;
|
|
lex_next(lexer);
|
|
break;
|
|
case 'x':
|
|
base = 16;
|
|
lex_next(lexer);
|
|
break;
|
|
}
|
|
|
|
while (1) {
|
|
char c = lex_peek(lexer);
|
|
int n = 0;
|
|
if (c >= '0' && c <= '9') {
|
|
n = c - '0';
|
|
} else if (c >= 'a' && c <= 'z') { // match A-Z so we can
|
|
n = c - 'a' + 10; // catch the errors
|
|
} else if (c >= 'A' && c <= 'Z') { // here instead of later
|
|
n = c - 'A' + 10;
|
|
} else {
|
|
break; // no longer a number
|
|
}
|
|
// if number provided is bigger than my base,
|
|
// error !
|
|
if (n >= base) {
|
|
ERROR_POS(pos, "character '%c' is bigger than number base"
|
|
"'%d'", c, base);
|
|
return M_ERROR;
|
|
}
|
|
lex_next(lexer);
|
|
number *= base;
|
|
number += n;
|
|
}
|
|
|
|
if (neg)
|
|
number = -number;
|
|
|
|
*n = number;
|
|
return M_SUCCESS;
|
|
}
|
|
|
|
/* lex the next token on the file */
|
|
int lexer_next(struct lexer *lexer, struct token *token)
|
|
{
|
|
again: // use label to avoid whitespace recursion
|
|
token->x = lexer->x;
|
|
token->y = lexer->y;
|
|
pos.x = lexer->x;
|
|
pos.y = lexer->y;
|
|
token->type = TOK_EOF;
|
|
|
|
int c = lex_peek(lexer);
|
|
int res = M_SUCCESS;
|
|
|
|
switch (c) {
|
|
|
|
case EOF:
|
|
case '\0':
|
|
token->type = TOK_EOF;
|
|
break;
|
|
case ';':
|
|
case '#':
|
|
skip_comment(lexer);
|
|
token->type = TOK_NL;
|
|
break;
|
|
case ' ':
|
|
case '\t':
|
|
// skip white space
|
|
lex_next(lexer);
|
|
goto again;
|
|
case '\n':
|
|
lex_next(lexer);
|
|
token->type = TOK_NL;
|
|
break;
|
|
case ',':
|
|
lex_next(lexer);
|
|
token->type = TOK_COMMA;
|
|
break;
|
|
case '=':
|
|
lex_next(lexer);
|
|
token->type = TOK_EQUAL;
|
|
break;
|
|
case '(':
|
|
lex_next(lexer);
|
|
token->type = TOK_LPAREN;
|
|
break;
|
|
case ')':
|
|
token->type = TOK_RPAREN;
|
|
lex_next(lexer);
|
|
break;
|
|
case '$':
|
|
token->type = TOK_REG;
|
|
lex_next(lexer);
|
|
res = lex_ident(lexer, token->text);
|
|
break;
|
|
case '.':
|
|
token->type = TOK_DIRECTIVE;
|
|
lex_next(lexer);
|
|
res = lex_ident(lexer, token->text);
|
|
break;
|
|
case '"':
|
|
token->type = TOK_STRING;
|
|
lex_next(lexer);
|
|
res = lex_string(lexer, token->text);
|
|
break;
|
|
case '-':
|
|
case '0':
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
case '8':
|
|
case '9':
|
|
token->type = TOK_NUMBER;
|
|
res = lex_number(lexer, &token->number);
|
|
break;
|
|
default:
|
|
token->type = TOK_IDENT;
|
|
res = lex_ident(lexer, token->text);
|
|
if (lex_peek(lexer) == ':') {
|
|
lex_next(lexer);
|
|
token->type = TOK_LABEL;
|
|
}
|
|
break;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
int lexer_init(const char *path, struct lexer *lexer)
|
|
{
|
|
FILE *file = fopen(path, "r");
|
|
if (file == NULL) {
|
|
ERROR("cannot read '%s'", path);
|
|
return M_ERROR;
|
|
}
|
|
lexer->file = file;
|
|
lexer->peek = EOF;
|
|
lexer->x = 1;
|
|
lexer->y = 1;
|
|
return M_SUCCESS;
|
|
}
|
|
|
|
int lexer_free(struct lexer *lexer)
|
|
{
|
|
return fclose(lexer->file);
|
|
}
|
|
|
|
char *token_str(enum token_type type)
|
|
{
|
|
switch (type) {
|
|
case TOK_IDENT:
|
|
return "ident";
|
|
case TOK_REG:
|
|
return "register";
|
|
case TOK_LABEL:
|
|
return "label";
|
|
case TOK_STRING:
|
|
return "string";
|
|
case TOK_COMMA:
|
|
return "comma";
|
|
case TOK_EQUAL:
|
|
return "equal";
|
|
case TOK_LPAREN:
|
|
return "left parentheses";
|
|
case TOK_RPAREN:
|
|
return "right parentheses";
|
|
case TOK_NUMBER:
|
|
return "number";
|
|
case TOK_EOF:
|
|
return "end of file";
|
|
case TOK_NL:
|
|
return "new line";
|
|
case TOK_DIRECTIVE:
|
|
return "directive";
|
|
}
|
|
return "unknown";
|
|
}
|
|
|
|
void lexer_save(struct lexer *lexer, struct lexer_state *state)
|
|
{
|
|
state->x = lexer->x;
|
|
state->y = lexer->y;
|
|
state->peek = lexer->peek;
|
|
state->offset = ftell(lexer->file);
|
|
}
|
|
|
|
/* load a different state into a lexer */
|
|
void lexer_load(struct lexer *lexer, const struct lexer_state *state)
|
|
{
|
|
lexer->x = state->x;
|
|
lexer->y = state->y;
|
|
lexer->peek = state->peek;
|
|
fseek(lexer->file, state->offset, SEEK_SET);
|
|
}
|