mips/masm/lex.c

#include "lex.h"

#include <stdint.h>
#include <mlimits.h>
#include <merror.h>

/* Position snapshot handed to ERROR_POS by the lexing helpers.
 * lexer_next() copies the lexer's current x/y into it before it starts
 * lexing each token. */
static struct {
	int x; /* copy of lexer->x (characters read since the last newline) */
	int y; /* copy of lexer->y (number of newlines read so far) */
} pos;

/* Return the next character of the input and consume it.
 *
 * A character stashed by lex_peek() is handed back first and the stash is
 * cleared; its position was already accounted for when it was read.
 * Otherwise one character is read from lexer->file: a newline resets x to
 * 0 and increments y, any other character increments x.
 *
 * Returns EOF at end of input. EOF does not move the position, so reading
 * or peeking repeatedly at end of file does not inflate the column. */
static int lex_next(struct lexer *lexer)
{
	int c;

	if (lexer->peek != EOF) {
		c = lexer->peek;
		lexer->peek = EOF;
		return c;
	}

	c = getc(lexer->file);
	if (c == '\n') {
		lexer->x = 0;
		lexer->y++;
	} else if (c != EOF) {
		// only real characters occupy a column
		lexer->x++;
	}
	return c;
}

/* Look at the next character without consuming it.
 * If nothing is stashed yet (lexer->peek holds EOF), one character is
 * pulled with lex_next() and remembered in lexer->peek so that the
 * following lex_next() call returns that same character. */
static int lex_peek(struct lexer *lexer)
{
	// something is already stashed, hand it back as is
	if (lexer->peek != EOF)
		return lexer->peek;

	lexer->peek = lex_next(lexer);
	return lexer->peek;
}

/* Skip the rest of a comment.
 * Consumes characters up to, but NOT including, the next newline (or up
 * to EOF). The newline is left in the input so that the caller still
 * lexes it as a regular end-of-line; swallowing it here would make a
 * line that ends in a comment run into the following line. */
static void skip_comment(struct lexer *lexer)
{
	for (;;) {
		int c = lex_peek(lexer);
		if (c == EOF || c == '\n')
			break;
		// part of the comment, drop it
		lex_next(lexer);
	}
}

/* Lex an identifier into text.
 *
 * Consumes characters while they are ASCII letters, digits or '_' and
 * stops (without consuming) at the first character that is anything else.
 * text always ends up NUL terminated, also when an error is returned.
 *
 * Returns M_SUCCESS, or M_ERROR (after reporting through ERROR_POS) when
 * no character was accepted or when the identifier does not fit in
 * MAX_LEX_LENGTH bytes including the terminator. */
static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])
{
	int len = 0;

	for (;;) {
		int c = lex_peek(lexer);
		int is_ident_char =
			(c >= 'a' && c <= 'z') ||
			(c >= 'A' && c <= 'Z') ||
			(c >= '0' && c <= '9') ||
			(c == '_');

		if (!is_ident_char)
			break;

		// pop char out of lexer
		lex_next(lexer);

		// one byte must stay free for the terminator
		if (len + 1 == MAX_LEX_LENGTH) {
			text[len] = '\0';
			ERROR_POS(pos, "ident has max length of %d",
				MAX_LEX_LENGTH);
			return M_ERROR;
		}

		text[len++] = (char)c;
	}

	text[len] = '\0';

	if (len == 0) {
		ERROR_POS(pos, "attempted to lex empty ident");
		return M_ERROR;
	}

	return M_SUCCESS;
}

/* Lex the body of a string literal into text.
 *
 * Expects the opening quote to be consumed already and reads up to and
 * including the closing quote. The escapes \n, \t, \\ and \" are
 * translated; a backslash followed by anything else is kept as a literal
 * backslash. text always ends up NUL terminated, also on error.
 *
 * Returns M_SUCCESS, or M_ERROR (after reporting through ERROR_POS) when
 * a raw newline or the end of the file is reached before the closing
 * quote, or when the string does not fit in MAX_LEX_LENGTH bytes
 * including the terminator. */
static int lex_string(struct lexer *lexer,char text[MAX_LEX_LENGTH])
{
	int len = 0;

	for (;;) {
		int c = lex_next(lexer);
		if (c == '"')
			break;

		// an unterminated string at the end of the file
		if (c == EOF) {
			text[len] = '\0';
			ERROR_POS(pos, "reached end of file before end of string");
			return M_ERROR;
		}

		// strings cannot span multiple lines
		// this must be checked on the raw character, before escapes
		// are translated, otherwise the \n escape would be rejected
		if (c == '\n') {
			text[len] = '\0';
			ERROR_POS(pos, "reached newline before end of string");
			return M_ERROR;
		}

		// match escape character
		if (c == '\\') {
			switch (lex_peek(lexer)) {
			case 'n':
				c = '\n';
				lex_next(lexer);
				break;
			case 't':
				c = '\t';
				lex_next(lexer);
				break;
			case '\\':
				c = '\\';
				lex_next(lexer);
				break;
			case '"':
				c = '"';
				lex_next(lexer);
				break;
			default:
				// unknown escape, keep the backslash itself
				break;
			}
		}

		// one byte must stay free for the terminator
		if (len + 1 == MAX_LEX_LENGTH) {
			text[len] = '\0';
			ERROR_POS(pos, "string has max length of %d",
				MAX_LEX_LENGTH);
			return M_ERROR;
		}

		text[len++] = (char)c;
	}

	text[len] = '\0';
	return M_SUCCESS;
}

/* Lex an unsigned integer literal in base 2, 8, 10 or 16.
 *
 * Base 10 is the default; it can be changed with a 0b, 0o or 0x prefix.
 * Digits are consumed until the first character that is neither a digit
 * nor a letter. Letters are always treated as digits so that a digit
 * which is too large for the current base is reported here.
 *
 * On success the value is stored in *n and M_SUCCESS is returned.
 * Returns M_ERROR (after reporting through ERROR_POS) when a digit is
 * not valid for the base or when the value does not fit in an int64_t;
 * *n is left untouched in that case. */
static int lex_number(struct lexer *lexer, int64_t *n)
{
	int64_t number = 0;
	int base = 10;

	// skip all leading zeros, they dont do anything.
	// this also allows us to directly check for 0b, 0o, and 0x
	// right away!
	while (lex_peek(lexer) == '0')
		lex_next(lexer);

	// match change of base
	switch (lex_peek(lexer)) {
	case 'b':
		base = 2;
		lex_next(lexer);
		break;
	case 'o':
		base = 8;
		lex_next(lexer);
		break;
	case 'x':
		base = 16;
		lex_next(lexer);
		break;
	default:
		break;
	}

	for (;;) {
		int c = lex_peek(lexer);
		int digit;

		if (c >= '0' && c <= '9') {
			digit = c - '0';
		} else if (c >= 'a' && c <= 'z') { // match a-z and A-Z so we
			digit = c - 'a' + 10;      // can catch the errors
		} else if (c >= 'A' && c <= 'Z') { // here instead of later
			digit = c - 'A' + 10;
		} else {
			break; // no longer a number
		}

		// if digit provided is bigger than my base,
		// error !
		if (digit >= base) {
			ERROR_POS(pos, "character '%c' is bigger than number base "
				"'%d'", c, base);
			return M_ERROR;
		}

		// number * base + digit must not overflow, signed overflow
		// is undefined behavior
		if (number > (INT64_MAX - digit) / base) {
			ERROR_POS(pos, "number does not fit in 64 bits");
			return M_ERROR;
		}

		lex_next(lexer);
		number = number * base + digit;
	}

	*n = number;
	return M_SUCCESS;
}

/* lex the next token on the file */
int lexer_next(struct lexer *lexer, struct token *token)
{
again: // use label to avoid whitespace recursion
	token->x = lexer->x;
	token->y = lexer->y;
	pos.x = lexer->x;
	pos.y = lexer->y;
	token->type = TOK_EOF;

	int c = lex_peek(lexer);
	int res = M_SUCCESS;

	switch (c) {

	case EOF:
	case '\0':
		token->type = TOK_EOF;
		break;
	case ';':
	case '#':
		skip_comment(lexer);
		goto again;
	case ' ':
	case '\t':
		// skip white space
		lex_next(lexer);
		goto again;
	case '\n':
		lex_next(lexer);
		token->type = TOK_NL;
		break;
	case ',':
		lex_next(lexer);
		token->type = TOK_COMMA;
		break;
	case '=':
		lex_next(lexer);
		token->type = TOK_EQUAL;
		break;
	case '(':
		lex_next(lexer);
		token->type = TOK_LPAREN;
		break;
	case ')':
		token->type = TOK_RPAREN;
		lex_next(lexer);
		break;
	case '$':
		token->type = TOK_REG;
		lex_next(lexer);
		res = lex_ident(lexer, token->text);
		break;
	case '.':
		token->type = TOK_DIRECTIVE;
		lex_next(lexer);
		res = lex_ident(lexer, token->text);
		break;
	case '"':
		token->type = TOK_STRING;
		lex_next(lexer);
		res = lex_string(lexer, token->text);
		break;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		token->type = TOK_NUMBER;
		res = lex_number(lexer, &token->number);
		break;
	default:
		token->type = TOK_IDENT;
		res = lex_ident(lexer, token->text);
		if (lex_peek(lexer) == ':') {
			lex_next(lexer);
			token->type = TOK_LABEL;
		}
		break;
	}
	return res;
}

/* Open the file at path for reading and reset the lexer state.
 *
 * On success lexer->file holds the open stream, the peek slot is empty
 * (EOF) and the position is 0/0; returns M_SUCCESS.
 * If the file cannot be opened an error is reported through ERROR_POS,
 * lexer->file is set to NULL and M_ERROR is returned. */
int lexer_init(const char *path, struct lexer *lexer)
{
	FILE *file = fopen(path, "r");
	if (file == NULL) {
		// do not leave an indeterminate handle behind
		lexer->file = NULL;
		ERROR_POS(pos, "cannot open file '%s'", path);
		return M_ERROR;
	}

	lexer->file = file;
	lexer->peek = EOF;
	lexer->x = 0;
	lexer->y = 0;
	return M_SUCCESS;
}

/* Close the file owned by the lexer.
 *
 * Returns the result of fclose() (0 on success, EOF on failure). The
 * handle is cleared afterwards so that a second call does not close the
 * same stream twice; when there is no open file nothing is done and 0 is
 * returned. */
int lexer_free(struct lexer *lexer)
{
	int res;

	if (lexer->file == NULL)
		return 0;

	res = fclose(lexer->file);
	// the stream is invalid after fclose, even when it failed
	lexer->file = NULL;
	return res;
}

/* Map a token type to a human readable name.
 * Returns a pointer to a string literal (must not be modified or freed);
 * any value that is not one of the known token types gives "unknown". */
char *token_str(enum token_type type)
{
	char *name = "unknown";

	// no default case on purpose, unknown values keep the fallback
	switch (type) {
	case TOK_IDENT:
		name = "ident";
		break;
	case TOK_REG:
		name = "register";
		break;
	case TOK_LABEL:
		name = "label";
		break;
	case TOK_STRING:
		name = "string";
		break;
	case TOK_COMMA:
		name = "comma";
		break;
	case TOK_EQUAL:
		name = "equal";
		break;
	case TOK_LPAREN:
		name = "left parentheses";
		break;
	case TOK_RPAREN:
		name = "right parentheses";
		break;
	case TOK_NUMBER:
		name = "number";
		break;
	case TOK_EOF:
		name = "end of file";
		break;
	case TOK_NL:
		name = "new line";
		break;
	case TOK_DIRECTIVE:
		name = "directive";
		break;
	}

	return name;
}
initial mips32 (r2000ish mips32r6) assembler 2024-09-09 16:41:49 +00:00			`#include "lex.h"`

			`#include <mlimits.h>`
			`#include <merror.h>`

			`static struct {`
			`int x;`
			`int y;`
			`} pos;`

			`/* get next char in lexer */`
			`static int lex_next(struct lexer *lexer)`
			`{`
			`if (lexer->peek != EOF) {`
			`int c = lexer->peek;`
			`lexer->peek = EOF;`
			`return c;`
			`}`

			`int c = getc(lexer->file);`
			`if (c == '\n') {`
			`lexer->x = 0;`
			`lexer->y++;`
			`} else {`
			`lexer->x++;`
			`}`
			`return c;`
			`}`

			`/* peek next char in lexer */`
			`static int lex_peek(struct lexer *lexer)`
			`{`
			`if (lexer->peek == EOF)`
			`lexer->peek = lex_next(lexer);`
			`return lexer->peek;`
			`}`

			`/* skip all characters until EOF or newline */`
			`static void skip_comment(struct lexer *lexer)`
			`{`
			`int c;`
			`while (1) {`
			`c = lex_next(lexer);`
			`if (c == EOF \|\| c == '\n')`
			`break;`
			`}`
			`}`

			`/* lexes text until whitespace`
			`* returns error on zero length or too long */`
			`static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])`
			`{`
			`int len = 0;`
			`char *ptr = text;`
			`int c;`

			`while (1) {`
			`c = lex_peek(lexer);`
			`if (!(`
			`(c >= 'a' && c <= 'z') \|\|`
			`(c >= 'A' && c <= 'Z') \|\|`
			`(c >= '0' && c <= '9') \|\|`
			`(c == '_')`
			`)) {`
			`break;`
			`}`

			`// pop char out of lexer`
			`lex_next(lexer);`

			`if (len + 1 == MAX_LEX_LENGTH) {`
			`ERROR_POS(pos, "ident has max length of %d",`
			`MAX_LEX_LENGTH);`
			`return M_ERROR;`
			`}`

			`*ptr++ = c;`
			`len++;`
			`}`

			`if (len == 0) {`
			`ERROR_POS(pos, "attempted to lex empty ident %d",`
			`MAX_LEX_LENGTH);`
			`return M_ERROR;`
			`}`

			`*ptr = '\0';`
			`return M_SUCCESS;`
			`}`

			`/* lexes a string until closing quote`
			`* returns error if string is too long or hit newline */`
			`static int lex_string(struct lexer *lexer,char text[MAX_LEX_LENGTH])`
			`{`
			`int len = 0;`
			`char *ptr = text;`
			`int c;`

			`while (1) {`
			`c = lex_next(lexer);`
			`if (c == '"')`
			`break;`

			`// match escape character`
			`if (c == '\\') {`
			`switch (lex_peek(lexer)) {`
			`case 'n':`
			`c = '\n';`
			`lex_next(lexer);`
			`break;`
			`case 't':`
			`c = '\t';`
			`lex_next(lexer);`
			`break;`
			`case '\\':`
			`c = '\\';`
			`lex_next(lexer);`
			`break;`
			`case '"':`
			`c = '"';`
			`lex_next(lexer);`
			`break;`
			`}`
			`}`

			`// strings cannot span multiple lines`
			`if (c == '\n') {`
			`ERROR_POS(pos, "reached newline before end of string");`
			`return M_ERROR;`
			`}`

			`if (len + 1 == MAX_LEX_LENGTH) {`
			`ERROR_POS(pos, "string has max length of %d",`
			`MAX_LEX_LENGTH);`
			`return M_ERROR;`
			`}`

			`*ptr++ = c;`
			`len++;`
			`}`

			`*ptr = '\0';`
			`return M_SUCCESS;`
			`}`

			`/* lexes a integer number in base 2,8,10, or 16,`
			`* uses base 10 by default but chan be changed by 0b, 0o, and 0x */`
			`static int lex_number(struct lexer lexer, int64_t n)`
			`{`
			`int64_t number = 0;`
			`int base = 10;`

			`// skip all leading zeros, they dont do anything.`
			`// this also allows us to directly check for 0b, 0o, and 0x`
			`// right away!`
			`while (1) {`
			`if (lex_peek(lexer) == '0')`
			`lex_next(lexer);`
			`else`
			`break;`
			`}`

			`// match change of base`
			`switch (lex_peek(lexer)) {`
			`case 'b':`
			`base = 2;`
			`lex_next(lexer);`
			`break;`
			`case 'o':`
			`base = 8;`
			`lex_next(lexer);`
			`break;`
			`case 'x':`
			`base = 16;`
			`lex_next(lexer);`
			`break;`
			`}`

			`while (1) {`
			`char c = lex_peek(lexer);`
			`int n = 0;`
			`if (c >= '0' && c <= '9') {`
			`n = c - '0';`
			`} else if (c >= 'a' && c <= 'z') { // match A-Z so we can`
			`n = c - 'a' + 10; // catch the errors`
			`} else if (c >= 'A' && c <= 'Z') { // here instead of later`
			`n = c - 'A' + 10;`
			`} else {`
			`break; // no longer a number`
			`}`
			`// if number provided is bigger than my base,`
			`// error !`
			`if (n >= base) {`
			`ERROR_POS(pos, "character '%c' is bigger than number base"`
			`"'%d'", c, base);`
			`return M_ERROR;`
			`}`
			`lex_next(lexer);`
			`number *= base;`
			`number += n;`
			`}`

			`*n = number;`
			`return M_SUCCESS;`
			`}`

			`/* lex the next token on the file */`
			`int lexer_next(struct lexer lexer, struct token token)`
			`{`
			`again: // use label to avoid whitespace recursion`
			`token->x = lexer->x;`
			`token->y = lexer->y;`
			`pos.x = lexer->x;`
			`pos.y = lexer->y;`
			`token->type = TOK_EOF;`

			`int c = lex_peek(lexer);`
			`int res = M_SUCCESS;`

			`switch (c) {`

			`case EOF:`
			`case '\0':`
			`token->type = TOK_EOF;`
			`break;`
			`case ';':`
			`case '#':`
			`skip_comment(lexer);`
			`goto again;`
			`case ' ':`
			`case '\t':`
			`// skip white space`
			`lex_next(lexer);`
			`goto again;`
			`case '\n':`
			`lex_next(lexer);`
			`token->type = TOK_NL;`
			`break;`
			`case ',':`
			`lex_next(lexer);`
			`token->type = TOK_COMMA;`
			`break;`
			`case '=':`
			`lex_next(lexer);`
			`token->type = TOK_EQUAL;`
			`break;`
			`case '(':`
			`lex_next(lexer);`
			`token->type = TOK_LPAREN;`
			`break;`
			`case ')':`
			`token->type = TOK_RPAREN;`
			`lex_next(lexer);`
			`break;`
			`case '$':`
			`token->type = TOK_REG;`
			`lex_next(lexer);`
			`res = lex_ident(lexer, token->text);`
			`break;`
			`case '.':`
			`token->type = TOK_DIRECTIVE;`
			`lex_next(lexer);`
			`res = lex_ident(lexer, token->text);`
			`break;`
			`case '"':`
			`token->type = TOK_STRING;`
			`lex_next(lexer);`
			`res = lex_string(lexer, token->text);`
			`break;`
			`case '0':`
			`case '1':`
			`case '2':`
			`case '3':`
			`case '4':`
			`case '5':`
			`case '6':`
			`case '7':`
			`case '8':`
			`case '9':`
			`token->type = TOK_NUMBER;`
			`res = lex_number(lexer, &token->number);`
			`break;`
			`default:`
			`token->type = TOK_IDENT;`
			`res = lex_ident(lexer, token->text);`
			`if (lex_peek(lexer) == ':') {`
			`lex_next(lexer);`
			`token->type = TOK_LABEL;`
			`}`
			`break;`
			`}`
			`return res;`
			`}`

			`int lexer_init(const char path, struct lexer lexer)`
			`{`
			`FILE *file = fopen(path, "r");`
			`if (file == NULL) {`
			`ERROR_POS(pos, "cannot file '%s'", path);`
			`return M_ERROR;`
			`}`
			`lexer->file = file;`
			`lexer->peek = EOF;`
			`lexer->x = 0;`
			`lexer->y = 0;`
			`return M_SUCCESS;`
			`}`

			`int lexer_free(struct lexer *lexer)`
			`{`
			`return fclose(lexer->file);`
			`}`

			`char *token_str(enum token_type type)`
			`{`
			`switch (type) {`
			`case TOK_IDENT:`
			`return "ident";`
			`case TOK_REG:`
			`return "register";`
			`case TOK_LABEL:`
			`return "label";`
			`case TOK_STRING:`
			`return "string";`
			`case TOK_COMMA:`
			`return "comma";`
			`case TOK_EQUAL:`
			`return "equal";`
			`case TOK_LPAREN:`
			`return "left parentheses";`
			`case TOK_RPAREN:`
			`return "right parentheses";`
			`case TOK_NUMBER:`
			`return "number";`
			`case TOK_EOF:`
			`return "end of file";`
			`case TOK_NL:`
			`return "new line";`
			`case TOK_DIRECTIVE:`
			`return "directive";`
			`}`
			`return "unknown";`
			`}`