mips/masm/lex.c

#include "lex.h"

#include <mlimits.h>
#include <merror.h>

static struct {
	int x;
	int y;
} pos;

/* get next char in lexer */
static int lex_next(struct lexer *lexer)
{
	if (lexer->peek != EOF) {
		int c = lexer->peek;
		lexer->peek = EOF;
		return c;
	}

	int c = getc(lexer->file);
	if (c == '\n') {
		lexer->x = 1;
		lexer->y++;
	} else {
		lexer->x++;
	}
	return c;
}

/* peek next char in lexer */
static int lex_peek(struct lexer *lexer)
{
	if (lexer->peek == EOF)
		lexer->peek = lex_next(lexer);
	return lexer->peek;
}

/* skip all characters until EOF or newline */
static void skip_comment(struct lexer *lexer)
{
	int c;
	while (1) {
		c = lex_next(lexer);
		if (c == EOF || c == '\n')
			break;
	}
}

/* lexes text until whitespace
 * returns error on zero length or too long */
static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])
{
	int len = 0;
	char *ptr = text;
	int c;

	while (1) {
		c = lex_peek(lexer);
		if (!(
			(c >= 'a' && c <= 'z') ||
			(c >= 'A' && c <= 'Z') ||
			(c >= '0' && c <= '9') ||
			(c == '_')
		)) {
			break;
		}

		// pop char out of lexer
		lex_next(lexer);

		if (len + 1 == MAX_LEX_LENGTH) {
			ERROR_POS(pos, "ident has max length of %d",
				MAX_LEX_LENGTH);
			return M_ERROR;
		}

		*ptr++ = c;
		len++;
	}

	if (len == 0) {
		ERROR_POS(pos, "attempted to lex empty ident %d",
			MAX_LEX_LENGTH);
		return M_ERROR;
	}

	*ptr = '\0';
	return M_SUCCESS;
}

/* lexes a string until closing quote
 * returns error if string is too long or hit newline */
static int lex_string(struct lexer *lexer,char text[MAX_LEX_LENGTH])
{
	int len = 0;
	char *ptr = text;
	int c;

	while (1) {
		c = lex_next(lexer);
		if (c == '"')
			break;

		// match escape character
		if (c == '\\') {
			switch (lex_peek(lexer)) {
			case 'n':
				c = '\n';
				lex_next(lexer);
				break;
			case 't':
				c = '\t';
				lex_next(lexer);
				break;
			case '\\':
				c = '\\';
				lex_next(lexer);
				break;
			case '"':
				c = '"';
				lex_next(lexer);
				break;
			}
		}

		// strings cannot span multiple lines
		if (c == '\n') {
			ERROR_POS(pos, "reached newline before end of string");
			return M_ERROR;
		}

		if (len + 1 == MAX_LEX_LENGTH) {
			ERROR_POS(pos, "string has max length of %d",
				MAX_LEX_LENGTH);
			return M_ERROR;
		}

		*ptr++ = c;
		len++;
	}

	*ptr = '\0';
	return M_SUCCESS;
}

/* lexes a integer number in base 2,8,10, or 16,
 * uses base 10 by default but chan be changed by 0b, 0o, and 0x */
static int lex_number(struct lexer *lexer, int64_t *n)
{
	int64_t number = 0;
	int base = 10;
	int neg = 0;

	// check if negative
	if (lex_peek(lexer) == '-') {
		lex_next(lexer);
		neg = 1;
	}


	// skip all leading zeros, they dont do anything.
	// this also allows us to directly check for 0b, 0o, and 0x
	// right away!
	while (1) {
		if (lex_peek(lexer) == '0')
			lex_next(lexer);
		else
			break;
	}

	// match change of base
	switch (lex_peek(lexer)) {
	case 'b':
		base = 2;
		lex_next(lexer);
		break;
	case 'o':
		base = 8;
		lex_next(lexer);
		break;
	case 'x':
		base = 16;
		lex_next(lexer);
		break;
	}

	while (1) {
		char c = lex_peek(lexer);
		int n = 0;
		if (c >= '0' && c <= '9') {
			n = c - '0';
		} else if (c >= 'a' && c <= 'z') { // match A-Z so we can
			n = c - 'a' + 10;          // catch the errors
		} else if (c >= 'A' && c <= 'Z') { // here instead of later
			n = c - 'A' + 10;
		} else {
			break; // no longer a number
		}
		// if number provided is bigger than my base,
		// error !
		if (n >= base) {
			ERROR_POS(pos, "character '%c' is bigger than number base"
				"'%d'", c, base);
			return M_ERROR;
		}
		lex_next(lexer);
		number *= base;
		number += n;
	}

	if (neg)
		number = -number;

	*n = number;
	return M_SUCCESS;
}

/* lex the next token on the file */
int lexer_next(struct lexer *lexer, struct token *token)
{
again: // use label to avoid whitespace recursion
	token->x = lexer->x;
	token->y = lexer->y;
	pos.x = lexer->x;
	pos.y = lexer->y;
	token->type = TOK_EOF;

	int c = lex_peek(lexer);
	int res = M_SUCCESS;

	switch (c) {

	case EOF:
	case '\0':
		token->type = TOK_EOF;
		break;
	case ';':
	case '#':
		skip_comment(lexer);
		token->type = TOK_NL;
		break;
	case ' ':
	case '\t':
		// skip white space
		lex_next(lexer);
		goto again;
	case '\n':
		lex_next(lexer);
		token->type = TOK_NL;
		break;
	case ',':
		lex_next(lexer);
		token->type = TOK_COMMA;
		break;
	case '=':
		lex_next(lexer);
		token->type = TOK_EQUAL;
		break;
	case '(':
		lex_next(lexer);
		token->type = TOK_LPAREN;
		break;
	case ')':
		token->type = TOK_RPAREN;
		lex_next(lexer);
		break;
	case '$':
		token->type = TOK_REG;
		lex_next(lexer);
		res = lex_ident(lexer, token->text);
		break;
	case '.':
		token->type = TOK_DIRECTIVE;
		lex_next(lexer);
		res = lex_ident(lexer, token->text);
		break;
	case '"':
		token->type = TOK_STRING;
		lex_next(lexer);
		res = lex_string(lexer, token->text);
		break;
	case '-':
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		token->type = TOK_NUMBER;
		res = lex_number(lexer, &token->number);
		break;
	default:
		token->type = TOK_IDENT;
		res = lex_ident(lexer, token->text);
		if (lex_peek(lexer) == ':') {
			lex_next(lexer);
			token->type = TOK_LABEL;
		}
		break;
	}
	return res;
}

int lexer_init(const char *path, struct lexer *lexer)
{
	FILE *file = fopen(path, "r");
	if (file == NULL) {
		ERROR("cannot read '%s'", path);
		return M_ERROR;
	}
	lexer->file = file;
	lexer->peek = EOF;
	lexer->x = 1;
	lexer->y = 1;
	return M_SUCCESS;
}

int lexer_free(struct lexer *lexer)
{
	return fclose(lexer->file);
}

char *token_str(enum token_type type)
{
	switch (type) {
        case TOK_IDENT:
		return "ident";
        case TOK_REG:
		return "register";
        case TOK_LABEL:
		return "label";
        case TOK_STRING:
		return "string";
        case TOK_COMMA:
		return "comma";
        case TOK_EQUAL:
		return "equal";
        case TOK_LPAREN:
		return "left parentheses";
        case TOK_RPAREN:
		return "right parentheses";
        case TOK_NUMBER:
		return "number";
        case TOK_EOF:
		return "end of file";
        case TOK_NL:
		return "new line";
        case TOK_DIRECTIVE:
		return "directive";
        }
	return "unknown";
}

void lexer_save(struct lexer *lexer, struct lexer_state *state)
{
	state->x = lexer->x;
	state->y = lexer->y;
	state->peek = lexer->peek;
	state->offset = ftell(lexer->file);
}

/* load a different state into a lexer */
void lexer_load(struct lexer *lexer, const struct lexer_state *state)
{
	lexer->x = state->x;
	lexer->y = state->y;
	lexer->peek = state->peek;
	fseek(lexer->file, state->offset, SEEK_SET);
}