mips/masm/lex.c

442 lines
7.6 KiB
C

#include "lex.h"
#include <mlimits.h>
#include <merror.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
/* position at which the token currently being lexed starts: x is the
 * column and y is the line. lexer_next() refreshes it for every token and
 * it is handed to ERROR_POS when reporting lexing errors */
static struct {
	int x, y;
} pos;
/* consume and return the next character of the input
 *
 * a character buffered by lex_peek is handed out first; otherwise one
 * character is read from the file and the line/column counters advance.
 * returns EOF once the file is exhausted */
static int lex_next(struct lexer *lexer)
{
	int ch = lexer->peek;

	if (ch == EOF) {
		// nothing buffered, pull a fresh character from the file
		ch = getc(lexer->file);
		if (ch != '\n') {
			lexer->x += 1;
		} else {
			// a newline moves to the first column of the next line
			lexer->y += 1;
			lexer->x = 1;
		}
		return ch;
	}

	// hand out the buffered character and mark the buffer empty; the
	// position was already advanced when it was first read
	lexer->peek = EOF;
	return ch;
}
/* return the next character without consuming it
 *
 * the character is read through lex_next and parked in lexer->peek, so
 * the following lex_next call returns the same character */
static int lex_peek(struct lexer *lexer)
{
	// something is already buffered, just report it
	if (lexer->peek != EOF)
		return lexer->peek;

	lexer->peek = lex_next(lexer);
	return lexer->peek;
}
/* discard characters up to and including the next newline, or up to
 * EOF if the file ends first */
static void skip_comment(struct lexer *lexer)
{
	int ch;

	do {
		ch = lex_next(lexer);
	} while (ch != '\n' && ch != EOF);
}
/* lexes the body of a string literal up to its closing quote; the caller
 * has already consumed the opening quote
 *
 * the escapes \n, \t, \\ and \" are translated. any other backslash is
 * stored as is and the character after it is lexed normally
 *
 * returns M_SUCCESS with a NUL terminated `string`, or M_ERROR (after
 * releasing `string` with string_free) when a newline or the end of the
 * file is reached before the closing quote, or when string_push fails */
static int lex_string(struct lexer *lexer, struct string *string)
{
	string_init(string);

	while (1) {
		// keep the full int returned by lex_next so that EOF cannot be
		// mistaken for a data byte
		int c = lex_next(lexer);

		// stop on ending quote
		if (c == '"')
			break;

		// an unterminated string at the end of the file must be
		// reported, otherwise EOF would be pushed over and over
		if (c == EOF) {
			ERROR_POS(pos, "reached end of file before end of string");
			string_free(string);
			return M_ERROR;
		}

		// strings cannot span multiple lines
		if (c == '\n') {
			ERROR_POS(pos, "reached newline before end of string");
			string_free(string);
			return M_ERROR;
		}

		// match escape character
		if (c == '\\') {
			switch (lex_peek(lexer)) {
			case 'n':
				c = '\n';
				lex_next(lexer);
				break;
			case 't':
				c = '\t';
				lex_next(lexer);
				break;
			case '\\':
				c = '\\';
				lex_next(lexer);
				break;
			case '"':
				c = '"';
				lex_next(lexer);
				break;
			default:
				// not a known escape: keep the backslash itself and
				// leave the following character in the lexer
				break;
			}
		}

		// push char into string
		if (string_push(string, (char)c)) {
			string_free(string);
			return M_ERROR;
		}
	}

	// null terminate string
	if (string_push(string, '\0')) {
		string_free(string);
		return M_ERROR;
	}

	return M_SUCCESS;
}
/* lexes an identifier: the longest run of [A-Za-z0-9_] at the current
 * position. the first character that is not part of the run is left in
 * the lexer
 *
 * when `prefix` is not '\0' it is stored in front of the lexed text
 *
 * returns M_SUCCESS with a NUL terminated `string`, or M_ERROR (after
 * releasing `string` with string_free) when nothing was collected or
 * string_push fails
 *
 * NOTE(review): presumably string_push counts the prefix in string->len,
 * in which case a prefix on its own passes the empty check -- confirm */
static int lex_ident(struct lexer *lexer, struct string *string,
		     char prefix)
{
	string_init(string);

	if (prefix != '\0' && string_push(string, prefix)) {
		string_free(string);
		return M_ERROR;
	}

	while (1) {
		// int, not char: lex_peek may return EOF
		int c = lex_peek(lexer);
		int is_ident = (c >= 'a' && c <= 'z') ||
			       (c >= 'A' && c <= 'Z') ||
			       (c >= '0' && c <= '9') ||
			       (c == '_');

		if (!is_ident)
			break;

		// pop char out of lexer
		lex_next(lexer);

		// push char into string; release it through string_free like
		// every other error path in this function
		if (string_push(string, (char)c)) {
			string_free(string);
			return M_ERROR;
		}
	}

	// empty idents are not allowed
	if (string->len < 1) {
		string_free(string);
		ERROR("empty ident tokens are not allowed");
		return M_ERROR;
	}

	// null terminate string
	if (string_push(string, '\0')) {
		string_free(string);
		return M_ERROR;
	}

	return M_SUCCESS;
}
/* lexes an integer number in base 2, 8, 10 or 16
 *
 * an optional leading '-' negates the value. leading zeros are skipped,
 * after which a 'b', 'o' or 'x' switches the base away from the default
 * of 10
 *
 * returns M_SUCCESS and stores the value in *n, or M_ERROR when a digit
 * or letter is not valid in the selected base. literals that do not fit
 * in 64 bits wrap around instead of being rejected */
static int lex_number(struct lexer *lexer, int64_t *n)
{
	// accumulate unsigned: overflowing a signed integer is undefined
	// behavior, unsigned arithmetic simply wraps
	uint64_t magnitude = 0;
	int base = 10;
	int neg = 0;

	// check if negative
	if (lex_peek(lexer) == '-') {
		lex_next(lexer);
		neg = 1;
	}

	// skip all leading zeros, they dont do anything.
	// this also allows us to directly check for 0b, 0o, and 0x
	// right away!
	while (lex_peek(lexer) == '0')
		lex_next(lexer);

	// match change of base
	switch (lex_peek(lexer)) {
	case 'b':
		base = 2;
		lex_next(lexer);
		break;
	case 'o':
		base = 8;
		lex_next(lexer);
		break;
	case 'x':
		base = 16;
		lex_next(lexer);
		break;
	default:
		break;
	}

	while (1) {
		int c = lex_peek(lexer);
		int digit;

		// every letter is accepted as a candidate digit so that
		// invalid ones are reported here instead of later
		if (c >= '0' && c <= '9')
			digit = c - '0';
		else if (c >= 'a' && c <= 'z')
			digit = c - 'a' + 10;
		else if (c >= 'A' && c <= 'Z')
			digit = c - 'A' + 10;
		else
			break; // no longer a number

		// if number provided is bigger than my base,
		// error !
		if (digit >= base) {
			ERROR_POS(pos, "character '%c' is bigger than number base"
				  "'%d'", c, base);
			return M_ERROR;
		}

		lex_next(lexer);
		magnitude = magnitude * (uint64_t)base + (uint64_t)digit;
	}

	// negation is done unsigned so it can never overflow
	if (neg)
		magnitude = 0 - magnitude;

	*n = (int64_t)magnitude;
	return M_SUCCESS;
}
/* lex the next token on the file
 *
 * fills in the start position, file offset and type of `token`, plus its
 * string or number payload for the token kinds that carry one.
 * returns M_SUCCESS, or the error returned by the helper that lexed the
 * payload */
int lexer_next(struct lexer *lexer, struct token *token)
{
	int status;
	int ch;

	// blanks are skipped in a loop rather than by recursing. the start
	// position is captured again on every pass, always before peeking,
	// so it reflects the state right after the last skipped blank
	for (;;) {
		token->x = lexer->x;
		token->y = lexer->y;
		token->off = ftell(lexer->file);
		pos.x = lexer->x;
		pos.y = lexer->y;
		token->type = TOK_EOF;

		ch = lex_peek(lexer);
		if (ch != ' ' && ch != '\t')
			break;
		lex_next(lexer);
	}

	// a minus sign or a digit starts a number token; the character is
	// left in the lexer for lex_number to consume
	if (ch == '-' || (ch >= '0' && ch <= '9')) {
		token->type = TOK_NUMBER;
		return lex_number(lexer, &token->number);
	}

	switch (ch) {
	// end of input, a NUL byte is treated the same way
	case EOF:
	case '\0':
		token->type = TOK_EOF;
		return M_SUCCESS;

	// a comment is skipped and reported as a NL token
	case ';':
	case '#':
		skip_comment(lexer);
		token->type = TOK_NL;
		return M_SUCCESS;

	// single character tokens
	case '\n':
		token->type = TOK_NL;
		lex_next(lexer);
		return M_SUCCESS;
	case ',':
		token->type = TOK_COMMA;
		lex_next(lexer);
		return M_SUCCESS;
	case '=':
		token->type = TOK_EQUAL;
		lex_next(lexer);
		return M_SUCCESS;
	case '(':
		token->type = TOK_LPAREN;
		lex_next(lexer);
		return M_SUCCESS;
	case ')':
		token->type = TOK_RPAREN;
		lex_next(lexer);
		return M_SUCCESS;

	// register: the '$' is dropped from the stored name
	case '$':
		lex_next(lexer);
		token->type = TOK_REG;
		return lex_ident(lexer, &token->string, '\0');

	// directive: the '.' is kept as the prefix of the stored name
	case '.':
		lex_next(lexer);
		token->type = TOK_DIRECTIVE;
		return lex_ident(lexer, &token->string, '.');

	// string: drop the opening quote, lex_string takes the rest
	case '"':
		lex_next(lexer);
		token->type = TOK_STRING;
		return lex_string(lexer, &token->string);

	// anything else is an ident, or a label when a colon follows it
	default:
		token->type = TOK_IDENT;
		status = lex_ident(lexer, &token->string, '\0');
		if (lex_peek(lexer) == ':') {
			lex_next(lexer);
			token->type = TOK_LABEL;
		}
		return status;
	}
}
/* set up `lexer` to read tokens from the file at `path`
 *
 * the position starts at line 1, column 1 with no peeked character.
 * returns M_SUCCESS, or M_ERROR (after reporting through PERROR) when
 * the file cannot be opened */
int lexer_init(const char *path, struct lexer *lexer)
{
	/// defaults
	lexer->peek = EOF;
	lexer->x = 1;
	lexer->y = 1;

	/// load file, the handle stays NULL when opening fails
	lexer->file = fopen(path, "r");
	if (lexer->file != NULL)
		return M_SUCCESS;

	PERROR("cannot read");
	return M_ERROR;
}
/* close the file held by `lexer`, if there is one
 *
 * the handle is cleared afterwards so that calling this again on the
 * same lexer does not close the stream a second time */
void lexer_free(struct lexer *lexer)
{
	if (lexer->file == NULL)
		return;

	fclose(lexer->file);
	lexer->file = NULL;
}
/* return a human readable name for a token type
 *
 * the result points at a string literal and must not be modified or
 * freed. values not listed below yield "unknown" */
char *token_str(enum token_type type)
{
	char *name = "unknown";

	// deliberately no default label, matching the listed kinds only
	switch (type) {
	case TOK_IDENT:
		name = "ident";
		break;
	case TOK_REG:
		name = "register";
		break;
	case TOK_LABEL:
		name = "label";
		break;
	case TOK_STRING:
		name = "string";
		break;
	case TOK_COMMA:
		name = "comma";
		break;
	case TOK_EQUAL:
		name = "equal";
		break;
	case TOK_LPAREN:
		name = "left parentheses";
		break;
	case TOK_RPAREN:
		name = "right parentheses";
		break;
	case TOK_NUMBER:
		name = "number";
		break;
	case TOK_EOF:
		name = "end of file";
		break;
	case TOK_NL:
		name = "new line";
		break;
	case TOK_DIRECTIVE:
		name = "directive";
		break;
	}

	return name;
}
/* capture the current state of `lexer` into `state`
 *
 * the peeked character is stored next to the file offset because it has
 * already been read from the file and the offset alone would miss it */
void lexer_save(struct lexer *lexer, struct lexer_state *state)
{
	state->offset = ftell(lexer->file);
	state->peek = lexer->peek;
	state->y = lexer->y;
	state->x = lexer->x;
}
/* restore `lexer` to a state captured earlier by lexer_save
 *
 * the file is repositioned to the stored offset and the line, column
 * and peeked character are put back */
void lexer_load(struct lexer *lexer, const struct lexer_state *state)
{
	fseek(lexer->file, state->offset, SEEK_SET);
	lexer->peek = state->peek;
	lexer->y = state->y;
	lexer->x = state->x;
}
/* release the string buffer of a token, for the token kinds that carry
 * one
 *
 * the pointer is cleared after freeing so that calling this again on the
 * same token is harmless */
void token_free(struct token *token)
{
	switch (token->type) {
	case TOK_REG:
	case TOK_IDENT:
	case TOK_LABEL:
	case TOK_STRING:
	case TOK_DIRECTIVE:
		// free(NULL) is a no-op, no guard needed
		free(token->string.str);
		token->string.str = NULL;
		break;
	default:
		// nothing is freed for the remaining token kinds. the break is
		// required: before C23 a label cannot end a block on its own
		break;
	}
}