mips/masm/lex.c
2024-09-13 11:11:18 -04:00

374 lines
6.5 KiB
C

#include "lex.h"
#include <mlimits.h>
#include <merror.h>
/* position of the token currently being lexed
 * set by lexer_next() from the lexer position before each token and
 * passed to ERROR_POS when a lexing error is reported */
static struct {
int x; // copy of lexer->x, which is reset to 1 on newline
int y; // copy of lexer->y, which is incremented on newline
} pos;
/* consume and return the next char in the lexer
 *
 * hands out the pending peeked char if there is one, otherwise reads a
 * char from the file. the position (x, y) is advanced only here, when a
 * char is actually consumed, so it always names the next unconsumed
 * char even while a lookahead char is pending. returns EOF at the end
 * of the file, which does not move the position. */
static int lex_next(struct lexer *lexer)
{
	int c;

	if (lexer->peek != EOF) {
		c = lexer->peek;
		lexer->peek = EOF;
	} else {
		c = getc(lexer->file);
	}

	if (c == '\n') {
		lexer->x = 1;
		lexer->y++;
	} else if (c != EOF) {
		lexer->x++;
	}

	return c;
}

/* return the next char in the lexer without consuming it
 *
 * the char is cached in lexer->peek (EOF means nothing is cached) and is
 * handed out by the following lex_next(). peeking reads the file
 * directly so that it does not move the position. */
static int lex_peek(struct lexer *lexer)
{
	if (lexer->peek == EOF)
		lexer->peek = getc(lexer->file);
	return lexer->peek;
}
/* discard the rest of the current line
 * consumes chars up to and including the next newline, or until EOF */
static void skip_comment(struct lexer *lexer)
{
	int ch;

	do {
		ch = lex_next(lexer);
	} while (ch != EOF && ch != '\n');
}
/* lexes an identifier made of [A-Za-z0-9_] into text
 *
 * consumes chars until the first one that cannot be part of an
 * identifier, which is left in the lexer. returns M_ERROR if no chars
 * were lexed or if the identifier does not fit in text
 * (MAX_LEX_LENGTH - 1 chars plus the terminator), M_SUCCESS otherwise.
 * text is only null terminated on success. */
static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])
{
	int len = 0;

	while (1) {
		int c = lex_peek(lexer);
		int is_ident = (c >= 'a' && c <= 'z') ||
			       (c >= 'A' && c <= 'Z') ||
			       (c >= '0' && c <= '9') ||
			       (c == '_');

		if (!is_ident)
			break;

		// pop char out of lexer
		lex_next(lexer);

		// always keep one slot free for the null terminator
		if (len + 1 == MAX_LEX_LENGTH) {
			ERROR_POS(pos, "ident has max length of %d",
				  MAX_LEX_LENGTH - 1);
			return M_ERROR;
		}

		text[len++] = c;
	}

	if (len == 0) {
		ERROR_POS(pos, "attempted to lex empty ident");
		return M_ERROR;
	}

	text[len] = '\0';
	return M_SUCCESS;
}
/* lexes a string until the closing quote into text
 *
 * the opening quote must already be consumed; the closing quote is
 * consumed but not stored. the escapes \n, \t, \\ and \" are
 * translated, a backslash in front of any other char is kept as is.
 * returns M_ERROR if the string does not fit in text
 * (MAX_LEX_LENGTH - 1 chars plus the terminator), or if a newline or
 * the end of the file is reached before the closing quote. text is only
 * null terminated on success. */
static int lex_string(struct lexer *lexer, char text[MAX_LEX_LENGTH])
{
	int len = 0;

	while (1) {
		int c = lex_next(lexer);

		if (c == '"')
			break;

		// without this check an unterminated string at the end of the
		// file would fill text with EOF until the length limit is hit
		if (c == EOF) {
			ERROR_POS(pos, "reached end of file before end of string");
			return M_ERROR;
		}

		// strings cannot span multiple lines
		// this is checked on the raw char, before escapes are
		// translated, so that an escaped newline is still accepted
		if (c == '\n') {
			ERROR_POS(pos, "reached newline before end of string");
			return M_ERROR;
		}

		// match escape character
		if (c == '\\') {
			switch (lex_peek(lexer)) {
			case 'n':
				c = '\n';
				lex_next(lexer);
				break;
			case 't':
				c = '\t';
				lex_next(lexer);
				break;
			case '\\':
				c = '\\';
				lex_next(lexer);
				break;
			case '"':
				c = '"';
				lex_next(lexer);
				break;
			default:
				// not a known escape, keep the backslash itself
				break;
			}
		}

		// always keep one slot free for the null terminator
		if (len + 1 == MAX_LEX_LENGTH) {
			ERROR_POS(pos, "string has max length of %d",
				  MAX_LEX_LENGTH - 1);
			return M_ERROR;
		}

		text[len++] = c;
	}

	text[len] = '\0';
	return M_SUCCESS;
}
/* lexes an integer number in base 2, 8, 10, or 16 into n
 *
 * uses base 10 by default but can be changed by a 0b, 0o, or 0x prefix.
 * a single leading '-' negates the number. returns M_ERROR if the
 * number has no digits or if a digit is too big for the base; n is only
 * written on success. the value is accumulated as an unsigned 64 bit
 * integer, so a literal that is too large wraps around instead of
 * causing signed overflow. */
static int lex_number(struct lexer *lexer, int64_t *n)
{
	uint64_t value = 0;
	int base = 10;
	int neg = 0;
	int ndigits = 0;

	// check if negative
	if (lex_peek(lexer) == '-') {
		lex_next(lexer);
		neg = 1;
	}

	// skip all leading zeros, they dont change the value.
	// this also allows us to directly check for 0b, 0o, and 0x
	// right away!
	while (lex_peek(lexer) == '0') {
		lex_next(lexer);
		ndigits++;
	}

	// match change of base, which is only valid right after a zero
	if (ndigits > 0) {
		switch (lex_peek(lexer)) {
		case 'b':
			base = 2;
			break;
		case 'o':
			base = 8;
			break;
		case 'x':
			base = 16;
			break;
		default:
			break;
		}

		if (base != 10) {
			lex_next(lexer);
			// the zeros belonged to the prefix, digits must follow
			ndigits = 0;
		}
	}

	while (1) {
		int c = lex_peek(lexer);
		int digit;

		// match all letters, not just the ones valid in hex, so that
		// the error is caught here instead of later
		if (c >= '0' && c <= '9')
			digit = c - '0';
		else if (c >= 'a' && c <= 'z')
			digit = c - 'a' + 10;
		else if (c >= 'A' && c <= 'Z')
			digit = c - 'A' + 10;
		else
			break; // no longer a number

		// if digit provided is bigger than my base,
		// error !
		if (digit >= base) {
			ERROR_POS(pos, "character '%c' is bigger than number base "
				  "'%d'", c, base);
			return M_ERROR;
		}

		lex_next(lexer);
		value = value * (uint64_t)base + (uint64_t)digit;
		ndigits++;
	}

	// a lone '-' or a base prefix without digits is not a number
	if (ndigits == 0) {
		ERROR_POS(pos, "expected at least one digit in number");
		return M_ERROR;
	}

	if (neg)
		value = 0 - value; // unsigned negation is well defined

	*n = (int64_t)value;
	return M_SUCCESS;
}
/* lex the next token on the file
 *
 * skips spaces, tabs and carriage returns (so files with CRLF line
 * endings lex the same as LF ones), then fills in token with its type,
 * the lexer position (x, y) at which lexing of it started, and its text
 * or number where the type carries one. a comment (';' or '#') is
 * consumed up to and including its newline and reported as one TOK_NL.
 * returns M_SUCCESS, or M_ERROR if the token is malformed. */
int lexer_next(struct lexer *lexer, struct token *token)
{
	int res = M_SUCCESS;
	int c;

again: // use label to avoid whitespace recursion
	token->x = lexer->x;
	token->y = lexer->y;
	// remembered for ERROR_POS in the lex_* helpers
	pos.x = lexer->x;
	pos.y = lexer->y;
	token->type = TOK_EOF;

	c = lex_peek(lexer);
	switch (c) {
	case EOF:
	case '\0':
		token->type = TOK_EOF;
		break;
	case ';':
	case '#':
		// the comment swallows its newline, so stand in for it
		skip_comment(lexer);
		token->type = TOK_NL;
		break;
	case ' ':
	case '\t':
	case '\r':
		// skip white space
		lex_next(lexer);
		goto again;
	case '\n':
		lex_next(lexer);
		token->type = TOK_NL;
		break;
	case ',':
		lex_next(lexer);
		token->type = TOK_COMMA;
		break;
	case '=':
		lex_next(lexer);
		token->type = TOK_EQUAL;
		break;
	case '(':
		lex_next(lexer);
		token->type = TOK_LPAREN;
		break;
	case ')':
		lex_next(lexer);
		token->type = TOK_RPAREN;
		break;
	case '$':
		// the sigil is dropped, only the name is stored in text
		lex_next(lexer);
		token->type = TOK_REG;
		res = lex_ident(lexer, token->text);
		break;
	case '.':
		lex_next(lexer);
		token->type = TOK_DIRECTIVE;
		res = lex_ident(lexer, token->text);
		break;
	case '"':
		lex_next(lexer);
		token->type = TOK_STRING;
		res = lex_string(lexer, token->text);
		break;
	case '-':
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		// lex_number consumes the sign and digits itself
		token->type = TOK_NUMBER;
		res = lex_number(lexer, &token->number);
		break;
	default:
		token->type = TOK_IDENT;
		res = lex_ident(lexer, token->text);
		// an ident directly followed by ':' is a label
		if (res == M_SUCCESS && lex_peek(lexer) == ':') {
			lex_next(lexer);
			token->type = TOK_LABEL;
		}
		break;
	}

	return res;
}
/* open the file at path for reading and point lexer at its start
 * returns M_ERROR if the file cannot be opened, in which case lexer is
 * left untouched, M_SUCCESS otherwise */
int lexer_init(const char *path, struct lexer *lexer)
{
	FILE *fp = fopen(path, "r");

	if (!fp) {
		ERROR("cannot read '%s'", path);
		return M_ERROR;
	}

	// line 1, column 1, no peeked char pending
	lexer->x = 1;
	lexer->y = 1;
	lexer->peek = EOF;
	lexer->file = fp;

	return M_SUCCESS;
}
/* close the file held by the lexer
 * returns the result of fclose */
int lexer_free(struct lexer *lexer)
{
	FILE *fp = lexer->file;

	return fclose(fp);
}
/* get a human readable name for a token type
 * returns a string literal that must not be modified or freed,
 * "unknown" if the type is not one of the known token types */
char *token_str(enum token_type type)
{
	char *name = "unknown";

	switch (type) {
	case TOK_IDENT:
		name = "ident";
		break;
	case TOK_REG:
		name = "register";
		break;
	case TOK_LABEL:
		name = "label";
		break;
	case TOK_STRING:
		name = "string";
		break;
	case TOK_COMMA:
		name = "comma";
		break;
	case TOK_EQUAL:
		name = "equal";
		break;
	case TOK_LPAREN:
		name = "left parentheses";
		break;
	case TOK_RPAREN:
		name = "right parentheses";
		break;
	case TOK_NUMBER:
		name = "number";
		break;
	case TOK_EOF:
		name = "end of file";
		break;
	case TOK_NL:
		name = "new line";
		break;
	case TOK_DIRECTIVE:
		name = "directive";
		break;
	}

	return name;
}
/* save the current state of a lexer
 * records the file offset, the pending peeked char and the position
 * into state */
void lexer_save(struct lexer *lexer, struct lexer_state *state)
{
	state->offset = ftell(lexer->file);
	state->peek = lexer->peek;
	state->y = lexer->y;
	state->x = lexer->x;
}
/* put a lexer into the given state
 * seeks the file to the offset stored in state and takes over its
 * pending peeked char and position */
void lexer_load(struct lexer *lexer, const struct lexer_state *state)
{
	fseek(lexer->file, state->offset, SEEK_SET);
	lexer->peek = state->peek;
	lexer->y = state->y;
	lexer->x = state->x;
}