1 files changed, 146 insertions, 77 deletions
diff --git a/masm/lex.c b/masm/lex.c
index a7707d6..b835a7f 100644
--- a/masm/lex.c
+++ b/masm/lex.c
@@ -2,6 +2,10 @@
 
 #include <mlimits.h>
 #include <merror.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 
 static struct {
 	int x;
@@ -46,64 +50,24 @@ static void skip_comment(struct lexer *lexer)
 	}
 }
 
-/* lexes text until whitespace
- * returns error on zero length or too long */
-static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])
-{
-	int len = 0;
-	char *ptr = text;
-	int c;
-
-	while (1) {
-		c = lex_peek(lexer);
-		if (!(
-			(c >= 'a' && c <= 'z') ||
-			(c >= 'A' && c <= 'Z') ||
-			(c >= '0' && c <= '9') ||
-			(c == '_')
-		)) {
-			break;
-		}
-
-		// pop char out of lexer
-		lex_next(lexer);
-
-		if (len + 1 == MAX_LEX_LENGTH) {
-			ERROR_POS(pos, "ident has max length of %d",
-				MAX_LEX_LENGTH);
-			return M_ERROR;
-		}
-
-		*ptr++ = c;
-		len++;
-	}
-
-	if (len == 0) {
-		ERROR_POS(pos, "attempted to lex empty ident %d",
-			MAX_LEX_LENGTH);
-		return M_ERROR;
-	}
-
-	*ptr = '\0';
-	return M_SUCCESS;
-}
-
 /* lexes a string until closing quote
  * returns error if string is too long or hit newline */
-static int lex_string(struct lexer *lexer,char text[MAX_LEX_LENGTH])
+static int lex_string(struct lexer *lexer, struct string *string)
 {
-	int len = 0;
-	char *ptr = text;
-	int c;
+	char c;
+	string_init(string);
 
 	while (1) {
 		c = lex_next(lexer);
+
+		// stop on ending quote
 		if (c == '"')
 			break;
 
 		// strings cannot span multiple lines
 		if (c == '\n') {
 			ERROR_POS(pos, "reached newline before end of string");
+			string_free(string);
 			return M_ERROR;
 		}
 
@@ -129,20 +93,73 @@ static int lex_string(struct lexer *lexer,char text[MAX_LEX_LENGTH])
 			}
 		}
 
-		if (len + 1 == MAX_LEX_LENGTH) {
-			ERROR_POS(pos, "string has max length of %d",
-				MAX_LEX_LENGTH);
+		// push char into string
+		if (string_push(string, c)) {
+			string_free(string);
+			return M_ERROR;
+		}
+	}
+
+	// null terminate string, cleaning up like the other error paths
+	if (string_push(string, '\0')) {
+		string_free(string);
+		return M_ERROR;
+	}
+
+	return M_SUCCESS;
+}
+
+/* lexes [A-Za-z0-9_]+ into string, after prefix unless it is '\0'
+ * returns error on an empty ident or when string_push fails */
+static int lex_ident(struct lexer *lexer, struct string *string,
+		     char prefix)
+{
+	char c;
+	string_init(string);
+
+	if (prefix != '\0' && string_push(string, prefix)) {
+		string_free(string);
+		return M_ERROR;
+	}
+
+	while (1) {
+		c = lex_peek(lexer);
+		if (!(
+			(c >= 'a' && c <= 'z') ||
+			(c >= 'A' && c <= 'Z') ||
+			(c >= '0' && c <= '9') ||
+			(c == '_')
+		)) {
+			break;
+		}
+
+		// pop char out of lexer
+		lex_next(lexer);
+
+		// push char into string
+		if (string_push(string, c)) {
+			string_free(string);
 			return M_ERROR;
 		}
+	}
+
+	// empty idents are not allowed, the prefix alone does not count
+	if (string->len < (prefix != '\0' ? 2 : 1)) {
+		string_free(string);
+		ERROR("empty ident tokens are not allowed");
+		return M_ERROR;
+	}
 
-		*ptr++ = c;
-		len++;
+	// null terminate string
+	if (string_push(string, '\0')) {
+		string_free(string);
+		return M_ERROR;
 	}
 
-	*ptr = '\0';
 	return M_SUCCESS;
 }
 
+
 /* lexes a integer number in base 2,8,10, or 16,
  * uses base 10 by default but chan be changed by 0b, 0o, and 0x */
 static int lex_number(struct lexer *lexer, int64_t *n)
@@ -221,6 +238,7 @@ int lexer_next(struct lexer *lexer, struct token *token)
 again: // use label to avoid whitespace recursion
 	token->x = lexer->x;
 	token->y = lexer->y;
+	token->off = ftell(lexer->file);
 	pos.x = lexer->x;
 	pos.y = lexer->y;
 	token->type = TOK_EOF;
@@ -231,54 +249,80 @@ again: // use label to avoid whitespace recursion
 	switch (c) {
 
 	case EOF:
+
+	// return an EOF token
 	case '\0':
 		token->type = TOK_EOF;
 		break;
+
+	// skip the comment
+	// .. and return a NL token
 	case ';':
 	case '#':
 		skip_comment(lexer);
 		token->type = TOK_NL;
 		break;
+
+	// skip the whitespace and
+	// try to parse the next character
 	case ' ':
 	case '\t':
 		// skip white space
 		lex_next(lexer);
 		goto again;
+
+	// return a NL token
 	case '\n':
 		lex_next(lexer);
 		token->type = TOK_NL;
 		break;
+
+	// return a comma token
 	case ',':
 		lex_next(lexer);
 		token->type = TOK_COMMA;
 		break;
+
+	// return an equal token
 	case '=':
 		lex_next(lexer);
 		token->type = TOK_EQUAL;
 		break;
+
+	// return a left paren token
 	case '(':
 		lex_next(lexer);
 		token->type = TOK_LPAREN;
 		break;
+
+	// return a right paren token
 	case ')':
 		token->type = TOK_RPAREN;
 		lex_next(lexer);
 		break;
+
+	// return a register token
 	case '$':
 		token->type = TOK_REG;
 		lex_next(lexer);
-		res = lex_ident(lexer, token->text);
+		res = lex_ident(lexer, &token->string, '\0');
 		break;
+
+	// return a directive token
 	case '.':
 		token->type = TOK_DIRECTIVE;
 		lex_next(lexer);
-		res = lex_ident(lexer, token->text);
+		res = lex_ident(lexer, &token->string, '.');
 		break;
+
+	// return a string token
 	case '"':
 		token->type = TOK_STRING;
 		lex_next(lexer);
-		res = lex_string(lexer, token->text);
+		res = lex_string(lexer, &token->string);
 		break;
+
+	// return a number token
 	case '-':
 	case '0':
 	case '1':
@@ -293,68 +337,78 @@ again: // use label to avoid whitespace recursion
 		token->type = TOK_NUMBER;
 		res = lex_number(lexer, &token->number);
 		break;
+
+	// return an ident or label token depending
+	// on whether it ends with a colon
 	default:
 		token->type = TOK_IDENT;
-		res = lex_ident(lexer, token->text);
+		res = lex_ident(lexer, &token->string, '\0');
 		if (lex_peek(lexer) == ':') {
 			lex_next(lexer);
 			token->type = TOK_LABEL;
 		}
 		break;
 	}
+
 	return res;
 }
 
 int lexer_init(const char *path, struct lexer *lexer)
 {
-	FILE *file = fopen(path, "r");
-	if (file == NULL) {
-		PERROR("cannot read '%s'", path);
-		return M_ERROR;
-	}
-	lexer->file = file;
+	/// defaults
+	lexer->file = NULL;
 	lexer->peek = EOF;
 	lexer->x = 1;
 	lexer->y = 1;
+
+	/// open the source file, naming the path if that fails
+	lexer->file = fopen(path, "r");
+	if (lexer->file == NULL) {
+		PERROR("cannot read '%s'", path);
+		return M_ERROR;
+	}
+
 	return M_SUCCESS;
 }
 
-int lexer_free(struct lexer *lexer)
+void lexer_free(struct lexer *lexer)
 {
-	return fclose(lexer->file);
+	if (lexer->file)
+		fclose(lexer->file);
 }
 
 char *token_str(enum token_type type)
 {
 	switch (type) {
-        case TOK_IDENT:
+	case TOK_IDENT:
 		return "ident";
-        case TOK_REG:
+	case TOK_REG:
 		return "register";
-        case TOK_LABEL:
+	case TOK_LABEL:
 		return "label";
-        case TOK_STRING:
+	case TOK_STRING:
 		return "string";
-        case TOK_COMMA:
+	case TOK_COMMA:
 		return "comma";
-        case TOK_EQUAL:
+	case TOK_EQUAL:
 		return "equal";
-        case TOK_LPAREN:
+	case TOK_LPAREN:
 		return "left parentheses";
-        case TOK_RPAREN:
+	case TOK_RPAREN:
 		return "right parentheses";
-        case TOK_NUMBER:
+	case TOK_NUMBER:
 		return "number";
-        case TOK_EOF:
+	case TOK_EOF:
 		return "end of file";
-        case TOK_NL:
+	case TOK_NL:
 		return "new line";
-        case TOK_DIRECTIVE:
+	case TOK_DIRECTIVE:
 		return "directive";
-        }
+	}
 	return "unknown";
 }
 
+/* save the current state from the lexer */
 void lexer_save(struct lexer *lexer, struct lexer_state *state)
 {
 	state->x = lexer->x;
@@ -371,3 +425,18 @@ void lexer_load(struct lexer *lexer, const struct lexer_state *state)
 	lexer->peek = state->peek;
 	fseek(lexer->file, state->offset, SEEK_SET);
 }
+
+void token_free(struct token *token)
+{
+	switch (token->type) {
+	case TOK_REG:
+	case TOK_IDENT:
+	case TOK_LABEL:
+	case TOK_STRING:
+	case TOK_DIRECTIVE:
+		free(token->string.str); // free(NULL) is a no-op
+		break;
+	default: // a label needs a statement before '}' prior to C23
+		break;
+	}
+}