summaryrefslogtreecommitdiff
path: root/masm/lex.c
diff options
context:
space:
mode:
authorFreya Murphy <freya@freyacat.org>2024-09-09 12:41:49 -0400
committerFreya Murphy <freya@freyacat.org>2024-09-09 12:41:49 -0400
commit2ed275821676a0d5baea6c7fd843d71c72c2342c (patch)
tree480297f28e5c42d02a47b3b94027a7abe507d010 /masm/lex.c
downloadmips-2ed275821676a0d5baea6c7fd843d71c72c2342c.tar.gz
mips-2ed275821676a0d5baea6c7fd843d71c72c2342c.tar.bz2
mips-2ed275821676a0d5baea6c7fd843d71c72c2342c.zip
initial mips32 (r2000ish mips32r6) assembler
Diffstat (limited to 'masm/lex.c')
-rw-r--r--masm/lex.c343
1 files changed, 343 insertions, 0 deletions
diff --git a/masm/lex.c b/masm/lex.c
new file mode 100644
index 0000000..06c7114
--- /dev/null
+++ b/masm/lex.c
@@ -0,0 +1,343 @@
+#include "lex.h"
+
+#include <stdint.h>
+
+#include <mlimits.h>
+#include <merror.h>
+
+/* Position handed to ERROR_POS by every diagnostic in this file.
+ * lexer_next() copies lexer->x / lexer->y into it before each token is
+ * read, so errors raised by the lex_* helpers point at the token that was
+ * being lexed. y counts newlines seen, x counts chars since the last
+ * newline. */
+static struct {
+ int x;
+ int y;
+} pos;
+
+/* Get the next char in the lexer, consuming it.
+ *
+ * Returns the pending lookahead char if lex_peek() buffered one, otherwise
+ * reads a fresh char from the file. lexer->x / lexer->y are advanced here,
+ * at the moment a char is consumed, and never when it is only peeked, so
+ * the stored position always describes the next unconsumed char.
+ * Returns EOF at end of input; the position is left untouched then. */
+static int lex_next(struct lexer *lexer)
+{
+	int c = lexer->peek;
+
+	if (c != EOF)
+		lexer->peek = EOF; // lookahead slot is empty again
+	else
+		c = getc(lexer->file);
+
+	if (c == '\n') {
+		lexer->x = 0;
+		lexer->y++;
+	} else if (c != EOF) {
+		lexer->x++;
+	}
+	return c;
+}
+
+/* Peek at the next char in the lexer without consuming it.
+ *
+ * The char is read straight from the file rather than through lex_next()
+ * so that looking ahead does not move the reported position. EOF doubles
+ * as the "nothing buffered" marker, which means peeking at end of input
+ * simply asks the file again on every call. */
+static int lex_peek(struct lexer *lexer)
+{
+	if (lexer->peek == EOF)
+		lexer->peek = getc(lexer->file);
+	return lexer->peek;
+}
+
+/* discard the remainder of the current line
+ * chars are consumed up to and including the terminating newline,
+ * or until EOF is reached */
+static void skip_comment(struct lexer *lexer)
+{
+	int ch;
+
+	do {
+		ch = lex_next(lexer);
+	} while (ch != EOF && ch != '\n');
+}
+
+/* lexes an identifier: the longest run of [A-Za-z0-9_] chars
+ *
+ * The char that ends the identifier is only peeked, never consumed.
+ * On success text holds the NUL terminated identifier and M_SUCCESS is
+ * returned. M_ERROR is returned, after reporting through ERROR_POS, when
+ * no identifier char is present or when the identifier does not fit in
+ * MAX_LEX_LENGTH bytes including the terminator. text is NUL terminated
+ * on the error paths as well, so it is always safe to print. */
+static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])
+{
+	int len = 0;
+
+	while (1) {
+		int c = lex_peek(lexer);
+		int is_ident = (c >= 'a' && c <= 'z') ||
+			       (c >= 'A' && c <= 'Z') ||
+			       (c >= '0' && c <= '9') ||
+			       (c == '_');
+
+		if (!is_ident)
+			break;
+
+		// pop char out of lexer
+		lex_next(lexer);
+
+		// the last byte of text is reserved for the terminator
+		if (len + 1 == MAX_LEX_LENGTH) {
+			text[len] = '\0';
+			ERROR_POS(pos, "ident has max length of %d",
+				  MAX_LEX_LENGTH);
+			return M_ERROR;
+		}
+
+		text[len++] = c;
+	}
+
+	text[len] = '\0';
+
+	if (len == 0) {
+		ERROR_POS(pos, "attempted to lex empty ident");
+		return M_ERROR;
+	}
+
+	return M_SUCCESS;
+}
+
+/* lexes the body of a string literal up to and including the closing quote
+ *
+ * The opening quote must already have been consumed by the caller.
+ * The escapes \n, \t, \\ and \" are translated; a backslash followed by
+ * any other char is stored as a plain backslash and the following char is
+ * lexed normally.
+ *
+ * Returns M_SUCCESS with the NUL terminated contents in text, or M_ERROR
+ * (after reporting through ERROR_POS) if a raw newline or the end of the
+ * file is hit before the closing quote, or if the contents do not fit in
+ * MAX_LEX_LENGTH bytes including the terminator. text is NUL terminated
+ * on the error paths as well. */
+static int lex_string(struct lexer *lexer, char text[MAX_LEX_LENGTH])
+{
+	int len = 0;
+
+	while (1) {
+		int c = lex_next(lexer);
+
+		if (c == '"')
+			break;
+
+		// strings cannot span multiple lines. this is checked on the
+		// raw char, before escapes are translated, so that an escaped
+		// newline inside the string is not mistaken for a line break
+		if (c == '\n') {
+			text[len] = '\0';
+			ERROR_POS(pos, "reached newline before end of string");
+			return M_ERROR;
+		}
+
+		// without this check an unterminated string at the end of the
+		// file would keep storing EOF until the buffer filled up
+		if (c == EOF) {
+			text[len] = '\0';
+			ERROR_POS(pos, "reached end of file before end of string");
+			return M_ERROR;
+		}
+
+		// match escape character
+		if (c == '\\') {
+			switch (lex_peek(lexer)) {
+			case 'n':
+				c = '\n';
+				lex_next(lexer);
+				break;
+			case 't':
+				c = '\t';
+				lex_next(lexer);
+				break;
+			case '\\':
+				c = '\\';
+				lex_next(lexer);
+				break;
+			case '"':
+				c = '"';
+				lex_next(lexer);
+				break;
+			default:
+				// unknown escape: keep the backslash itself
+				break;
+			}
+		}
+
+		// the last byte of text is reserved for the terminator
+		if (len + 1 == MAX_LEX_LENGTH) {
+			text[len] = '\0';
+			ERROR_POS(pos, "string has max length of %d",
+				  MAX_LEX_LENGTH);
+			return M_ERROR;
+		}
+
+		text[len++] = c;
+	}
+
+	text[len] = '\0';
+	return M_SUCCESS;
+}
+
+/* lexes an unsigned integer literal in base 2, 8, 10 or 16
+ *
+ * Base 10 is the default; a 0b, 0o or 0x prefix selects base 2, 8 or 16.
+ * Any number of leading zeros is accepted and ignored. Lexing stops at
+ * the first char that is not a letter or digit; that char is only peeked.
+ *
+ * On success the value is stored in *n and M_SUCCESS is returned.
+ * M_ERROR is returned, after reporting through ERROR_POS, when a digit
+ * is not valid for the selected base or when the value does not fit in
+ * an int64_t; *n is left untouched in that case.
+ *
+ * NOTE(review): a bare prefix with no digits after it (e.g. "0x") still
+ * lexes as 0 -- confirm whether that should be rejected. */
+static int lex_number(struct lexer *lexer, int64_t *n)
+{
+	int64_t number = 0;
+	int base = 10;
+
+	// skip all leading zeros, they dont do anything.
+	// this also allows us to directly check for 0b, 0o, and 0x
+	// right away!
+	while (lex_peek(lexer) == '0')
+		lex_next(lexer);
+
+	// match change of base
+	switch (lex_peek(lexer)) {
+	case 'b':
+		base = 2;
+		lex_next(lexer);
+		break;
+	case 'o':
+		base = 8;
+		lex_next(lexer);
+		break;
+	case 'x':
+		base = 16;
+		lex_next(lexer);
+		break;
+	default:
+		break;
+	}
+
+	while (1) {
+		int c = lex_peek(lexer);
+		int digit;
+
+		// every letter is matched, not just a-f, so that a bad
+		// digit is reported here instead of by a later token
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'a' && c <= 'z')
+			digit = c - 'a' + 10;
+		else if (c >= 'A' && c <= 'Z')
+			digit = c - 'A' + 10;
+		else
+			break; // no longer a number
+
+		if (digit >= base) {
+			ERROR_POS(pos, "character '%c' is bigger than number base "
+				  "'%d'", c, base);
+			return M_ERROR;
+		}
+
+		// number * base + digit must stay <= INT64_MAX, signed
+		// overflow is undefined behavior
+		if (number > (INT64_MAX - digit) / base) {
+			ERROR_POS(pos, "number does not fit in 64 bits");
+			return M_ERROR;
+		}
+
+		lex_next(lexer);
+		number = number * base + digit;
+	}
+
+	*n = number;
+	return M_SUCCESS;
+}
+
+/* lex the next token on the file
+ *
+ * Spaces, tabs and carriage returns are skipped, and so are comments
+ * introduced by ';' or '#' (skip_comment also swallows the newline that
+ * ends the comment). token->x / token->y are copied from the lexer
+ * position before the token is read, and the same position is stored in
+ * pos for error reporting.
+ *
+ * token->type is set for every token. token->text is filled for
+ * registers ($name), directives (.name), strings, identifiers and labels
+ * (an identifier directly followed by ':'; the colon is consumed), and
+ * token->number for numbers. EOF and a NUL char both produce TOK_EOF.
+ *
+ * Returns M_SUCCESS, or the M_ERROR reported by the lex_* helper that
+ * failed. Any char not handled explicitly is lexed as an identifier, so
+ * an unexpected char is reported as an empty ident. */
+int lexer_next(struct lexer *lexer, struct token *token)
+{
+again: // use label to avoid whitespace recursion
+	token->x = lexer->x;
+	token->y = lexer->y;
+	pos.x = lexer->x;
+	pos.y = lexer->y;
+	token->type = TOK_EOF;
+
+	int c = lex_peek(lexer);
+	int res = M_SUCCESS;
+
+	switch (c) {
+
+	case EOF:
+	case '\0':
+		token->type = TOK_EOF;
+		break;
+	case ';':
+	case '#':
+		skip_comment(lexer);
+		goto again;
+	case ' ':
+	case '\t':
+	case '\r': // tolerate CRLF line endings
+		// skip white space
+		lex_next(lexer);
+		goto again;
+	case '\n':
+		lex_next(lexer);
+		token->type = TOK_NL;
+		break;
+	case ',':
+		lex_next(lexer);
+		token->type = TOK_COMMA;
+		break;
+	case '=':
+		lex_next(lexer);
+		token->type = TOK_EQUAL;
+		break;
+	case '(':
+		lex_next(lexer);
+		token->type = TOK_LPAREN;
+		break;
+	case ')':
+		lex_next(lexer);
+		token->type = TOK_RPAREN;
+		break;
+	case '$':
+		// the sigil is dropped, only the register name is kept
+		token->type = TOK_REG;
+		lex_next(lexer);
+		res = lex_ident(lexer, token->text);
+		break;
+	case '.':
+		// the dot is dropped, only the directive name is kept
+		token->type = TOK_DIRECTIVE;
+		lex_next(lexer);
+		res = lex_ident(lexer, token->text);
+		break;
+	case '"':
+		// consume the opening quote, lex_string eats the closing one
+		token->type = TOK_STRING;
+		lex_next(lexer);
+		res = lex_string(lexer, token->text);
+		break;
+	case '0':
+	case '1':
+	case '2':
+	case '3':
+	case '4':
+	case '5':
+	case '6':
+	case '7':
+	case '8':
+	case '9':
+		token->type = TOK_NUMBER;
+		res = lex_number(lexer, &token->number);
+		break;
+	default:
+		token->type = TOK_IDENT;
+		res = lex_ident(lexer, token->text);
+		// an ident directly followed by a colon is a label
+		if (lex_peek(lexer) == ':') {
+			lex_next(lexer);
+			token->type = TOK_LABEL;
+		}
+		break;
+	}
+	return res;
+}
+
+/* open the file at path for reading and reset the lexer state
+ *
+ * On success the lexer owns the open file (release it with lexer_free),
+ * has no buffered lookahead char and starts at position 0,0.
+ * Returns M_SUCCESS, or M_ERROR after reporting through ERROR_POS if the
+ * file could not be opened; lexer is left untouched in that case. */
+int lexer_init(const char *path, struct lexer *lexer)
+{
+	FILE *file = fopen(path, "r");
+	if (file == NULL) {
+		ERROR_POS(pos, "cannot open file '%s'", path);
+		return M_ERROR;
+	}
+	lexer->file = file;
+	lexer->peek = EOF; // EOF marks "no lookahead char buffered"
+	lexer->x = 0;
+	lexer->y = 0;
+	return M_SUCCESS;
+}
+
+/* close the file held by the lexer
+ * returns whatever fclose returns */
+int lexer_free(struct lexer *lexer)
+{
+	FILE *file = lexer->file;
+	int status = fclose(file);
+
+	return status;
+}
+
+/* human readable name of a token type, for use in diagnostics
+ * returns "unknown" for a value that is not listed below */
+char *token_str(enum token_type type)
+{
+	char *name = "unknown";
+
+	switch (type) {
+	case TOK_IDENT:     name = "ident";             break;
+	case TOK_REG:       name = "register";          break;
+	case TOK_LABEL:     name = "label";             break;
+	case TOK_STRING:    name = "string";            break;
+	case TOK_COMMA:     name = "comma";             break;
+	case TOK_EQUAL:     name = "equal";             break;
+	case TOK_LPAREN:    name = "left parentheses";  break;
+	case TOK_RPAREN:    name = "right parentheses"; break;
+	case TOK_NUMBER:    name = "number";            break;
+	case TOK_EOF:       name = "end of file";       break;
+	case TOK_NL:        name = "new line";          break;
+	case TOK_DIRECTIVE: name = "directive";         break;
+	}
+
+	return name;
+}