From 2ed275821676a0d5baea6c7fd843d71c72c2342c Mon Sep 17 00:00:00 2001 From: Freya Murphy Date: Mon, 9 Sep 2024 12:41:49 -0400 Subject: initial mips32 (r2000ish mips32r6) assembler --- masm/lex.c | 343 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 343 insertions(+) create mode 100644 masm/lex.c (limited to 'masm/lex.c') diff --git a/masm/lex.c b/masm/lex.c new file mode 100644 index 0000000..06c7114 --- /dev/null +++ b/masm/lex.c @@ -0,0 +1,343 @@ +#include "lex.h" + +#include +#include + +static struct { + int x; + int y; +} pos; + +/* get next char in lexer */ +static int lex_next(struct lexer *lexer) +{ + if (lexer->peek != EOF) { + int c = lexer->peek; + lexer->peek = EOF; + return c; + } + + int c = getc(lexer->file); + if (c == '\n') { + lexer->x = 0; + lexer->y++; + } else { + lexer->x++; + } + return c; +} + +/* peek next char in lexer */ +static int lex_peek(struct lexer *lexer) +{ + if (lexer->peek == EOF) + lexer->peek = lex_next(lexer); + return lexer->peek; +} + +/* skip all characters until EOF or newline */ +static void skip_comment(struct lexer *lexer) +{ + int c; + while (1) { + c = lex_next(lexer); + if (c == EOF || c == '\n') + break; + } +} + +/* lexes text until whitespace + * returns error on zero length or too long */ +static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH]) +{ + int len = 0; + char *ptr = text; + int c; + + while (1) { + c = lex_peek(lexer); + if (!( + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || + (c == '_') + )) { + break; + } + + // pop char out of lexer + lex_next(lexer); + + if (len + 1 == MAX_LEX_LENGTH) { + ERROR_POS(pos, "ident has max length of %d", + MAX_LEX_LENGTH); + return M_ERROR; + } + + *ptr++ = c; + len++; + } + + if (len == 0) { + ERROR_POS(pos, "attempted to lex empty ident %d", + MAX_LEX_LENGTH); + return M_ERROR; + } + + *ptr = '\0'; + return M_SUCCESS; +} + +/* lexes a string until closing quote + * returns error if string is too long or hit newline */ +static int lex_string(struct lexer *lexer,char text[MAX_LEX_LENGTH]) +{ + int len = 0; + char *ptr = text; + int c; + + while (1) { + c = lex_next(lexer); + if (c == '"') + break; + + // match escape character + if (c == '\\') { + switch (lex_peek(lexer)) { + case 'n': + c = '\n'; + lex_next(lexer); + break; + case 't': + c = '\t'; + lex_next(lexer); + break; + case '\\': + c = '\\'; + lex_next(lexer); + break; + case '"': + c = '"'; + lex_next(lexer); + break; + } + } + + // strings cannot span multiple lines + if (c == '\n') { + ERROR_POS(pos, "reached newline before end of string"); + return M_ERROR; + } + + if (len + 1 == MAX_LEX_LENGTH) { + ERROR_POS(pos, "string has max length of %d", + MAX_LEX_LENGTH); + return M_ERROR; + } + + *ptr++ = c; + len++; + } + + *ptr = '\0'; + return M_SUCCESS; +} + +/* lexes a integer number in base 2,8,10, or 16, + * uses base 10 by default but chan be changed by 0b, 0o, and 0x */ +static int lex_number(struct lexer *lexer, int64_t *n) +{ + int64_t number = 0; + int base = 10; + + // skip all leading zeros, they dont do anything. + // this also allows us to directly check for 0b, 0o, and 0x + // right away! + while (1) { + if (lex_peek(lexer) == '0') + lex_next(lexer); + else + break; + } + + // match change of base + switch (lex_peek(lexer)) { + case 'b': + base = 2; + lex_next(lexer); + break; + case 'o': + base = 8; + lex_next(lexer); + break; + case 'x': + base = 16; + lex_next(lexer); + break; + } + + while (1) { + char c = lex_peek(lexer); + int n = 0; + if (c >= '0' && c <= '9') { + n = c - '0'; + } else if (c >= 'a' && c <= 'z') { // match A-Z so we can + n = c - 'a' + 10; // catch the errors + } else if (c >= 'A' && c <= 'Z') { // here instead of later + n = c - 'A' + 10; + } else { + break; // no longer a number + } + // if number provided is bigger than my base, + // error ! + if (n >= base) { + ERROR_POS(pos, "character '%c' is bigger than number base" + "'%d'", c, base); + return M_ERROR; + } + lex_next(lexer); + number *= base; + number += n; + } + + *n = number; + return M_SUCCESS; +} + +/* lex the next token on the file */ +int lexer_next(struct lexer *lexer, struct token *token) +{ +again: // use label to avoid whitespace recursion + token->x = lexer->x; + token->y = lexer->y; + pos.x = lexer->x; + pos.y = lexer->y; + token->type = TOK_EOF; + + int c = lex_peek(lexer); + int res = M_SUCCESS; + + switch (c) { + + case EOF: + case '\0': + token->type = TOK_EOF; + break; + case ';': + case '#': + skip_comment(lexer); + goto again; + case ' ': + case '\t': + // skip white space + lex_next(lexer); + goto again; + case '\n': + lex_next(lexer); + token->type = TOK_NL; + break; + case ',': + lex_next(lexer); + token->type = TOK_COMMA; + break; + case '=': + lex_next(lexer); + token->type = TOK_EQUAL; + break; + case '(': + lex_next(lexer); + token->type = TOK_LPAREN; + break; + case ')': + token->type = TOK_RPAREN; + lex_next(lexer); + break; + case '$': + token->type = TOK_REG; + lex_next(lexer); + res = lex_ident(lexer, token->text); + break; + case '.': + token->type = TOK_DIRECTIVE; + lex_next(lexer); + res = lex_ident(lexer, token->text); + break; + case '"': + token->type = TOK_STRING; + lex_next(lexer); + res = lex_string(lexer, token->text); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + token->type = TOK_NUMBER; + res = lex_number(lexer, &token->number); + break; + default: + token->type = TOK_IDENT; + res = lex_ident(lexer, token->text); + if (lex_peek(lexer) == ':') { + lex_next(lexer); + token->type = TOK_LABEL; + } + break; + } + return res; +} + +int lexer_init(const char *path, struct lexer *lexer) +{ + FILE *file = fopen(path, "r"); + if (file == NULL) { + ERROR_POS(pos, "cannot file '%s'", path); + return M_ERROR; + } + lexer->file = file; + lexer->peek = EOF; + lexer->x = 0; + lexer->y = 0; + return M_SUCCESS; +} + +int lexer_free(struct lexer *lexer) +{ + return fclose(lexer->file); +} + +char *token_str(enum token_type type) +{ + switch (type) { + case TOK_IDENT: + return "ident"; + case TOK_REG: + return "register"; + case TOK_LABEL: + return "label"; + case TOK_STRING: + return "string"; + case TOK_COMMA: + return "comma"; + case TOK_EQUAL: + return "equal"; + case TOK_LPAREN: + return "left parentheses"; + case TOK_RPAREN: + return "right parentheses"; + case TOK_NUMBER: + return "number"; + case TOK_EOF: + return "end of file"; + case TOK_NL: + return "new line"; + case TOK_DIRECTIVE: + return "directive"; + } + return "unknown"; +} -- cgit v1.2.3-freya