#include "lex.h"

/*
 * NOTE(review): the two angle-bracket include targets were unreadable in the
 * reviewed copy of this file. <stdint.h> and <stdio.h> are the standard
 * headers the code below visibly needs (int64_t, FILE, getc, EOF, fopen,
 * fclose). ERROR, ERROR_POS, M_SUCCESS, M_ERROR and MAX_LEX_LENGTH are
 * presumably provided through lex.h -- if not, restore the project header
 * that declares them here.
 */
#include <stdint.h>
#include <stdio.h>

/* position of the token currently being lexed, handed to ERROR_POS */
static struct {
	int x;
	int y;
} pos;

/*
 * get next char in lexer
 *
 * Returns the pending peeked character if there is one, otherwise reads a
 * fresh character from the file and advances the lexer's x/y counters
 * (a newline resets x to 1 and increments y).
 */
static int lex_next(struct lexer *lexer)
{
	if (lexer->peek != EOF) {
		int c = lexer->peek;
		lexer->peek = EOF;
		return c;
	}

	int c = getc(lexer->file);
	if (c == '\n') {
		lexer->x = 1;
		lexer->y++;
	} else {
		lexer->x++;
	}
	return c;
}

/*
 * peek next char in lexer
 *
 * The character is read once and cached in lexer->peek until lex_next
 * consumes it.
 */
static int lex_peek(struct lexer *lexer)
{
	if (lexer->peek == EOF)
		lexer->peek = lex_next(lexer);
	return lexer->peek;
}

/*
 * skip all characters until EOF or newline
 *
 * The terminating newline itself is consumed as well.
 */
static void skip_comment(struct lexer *lexer)
{
	int c;

	do {
		c = lex_next(lexer);
	} while (c != EOF && c != '\n');
}

/* returns non-zero if c may appear in an identifier: [a-zA-Z0-9_] */
static int is_ident_char(int c)
{
	return (c >= 'a' && c <= 'z') ||
	       (c >= 'A' && c <= 'Z') ||
	       (c >= '0' && c <= '9') ||
	       (c == '_');
}

/*
 * lexes text until the first non identifier character
 *
 * Writes the NUL terminated identifier into text.
 * Returns M_ERROR on zero length or when the identifier does not fit in
 * MAX_LEX_LENGTH bytes (terminator included), M_SUCCESS otherwise.
 */
static int lex_ident(struct lexer *lexer, char text[MAX_LEX_LENGTH])
{
	int len = 0;
	char *ptr = text;

	while (1) {
		int c = lex_peek(lexer);

		if (!is_ident_char(c))
			break;

		// pop char out of lexer
		lex_next(lexer);

		// keep one byte free for the terminator
		if (len + 1 == MAX_LEX_LENGTH) {
			ERROR_POS(pos, "ident has max length of %d",
				  MAX_LEX_LENGTH);
			return M_ERROR;
		}

		*ptr++ = c;
		len++;
	}

	if (len == 0) {
		ERROR_POS(pos, "attempted to lex empty ident %d",
			  MAX_LEX_LENGTH);
		return M_ERROR;
	}

	*ptr = '\0';
	return M_SUCCESS;
}

/*
 * lexes a string until closing quote
 *
 * The opening quote must already have been consumed by the caller.
 * Recognised escapes are \n, \t, \\ and \"; any other backslash is stored
 * literally.
 *
 * Returns M_ERROR if the string is too long, contains a raw newline, or the
 * file ends before the closing quote; M_SUCCESS otherwise.
 */
static int lex_string(struct lexer *lexer, char text[MAX_LEX_LENGTH])
{
	int len = 0;
	char *ptr = text;

	while (1) {
		int c = lex_next(lexer);

		if (c == '"')
			break;

		// without this check an unterminated string would store EOF
		// bytes until the buffer filled up and then report the wrong
		// error
		if (c == EOF) {
			ERROR_POS(pos,
				  "reached end of file before end of string");
			return M_ERROR;
		}

		// strings cannot span multiple lines
		// (checked on the raw character, before escapes are decoded,
		// so that the "\n" escape is not mistaken for a line break)
		if (c == '\n') {
			ERROR_POS(pos, "reached newline before end of string");
			return M_ERROR;
		}

		// match escape character
		if (c == '\\') {
			switch (lex_peek(lexer)) {
			case 'n':
				c = '\n';
				lex_next(lexer);
				break;
			case 't':
				c = '\t';
				lex_next(lexer);
				break;
			case '\\':
				c = '\\';
				lex_next(lexer);
				break;
			case '"':
				c = '"';
				lex_next(lexer);
				break;
			default:
				// unknown escape: keep the backslash as is
				break;
			}
		}

		// keep one byte free for the terminator
		if (len + 1 == MAX_LEX_LENGTH) {
			ERROR_POS(pos, "string has max length of %d",
				  MAX_LEX_LENGTH);
			return M_ERROR;
		}

		*ptr++ = c;
		len++;
	}

	*ptr = '\0';
	return M_SUCCESS;
}

/*
 * lexes a integer number in base 2, 8, 10, or 16,
 * uses base 10 by default but can be changed by 0b, 0o, and 0x
 *
 * On success the value is stored in *n and M_SUCCESS is returned.
 * Returns M_ERROR when a digit or letter is not valid for the base.
 */
static int lex_number(struct lexer *lexer, int64_t *n)
{
	// accumulate unsigned: overflow then wraps instead of being
	// undefined behaviour as it would be on int64_t
	uint64_t number = 0;
	int base = 10;

	// skip all leading zeros, they dont do anything.
	// this also allows us to directly check for 0b, 0o, and 0x
	// right away!
	while (lex_peek(lexer) == '0')
		lex_next(lexer);

	// match change of base
	switch (lex_peek(lexer)) {
	case 'b':
		base = 2;
		lex_next(lexer);
		break;
	case 'o':
		base = 8;
		lex_next(lexer);
		break;
	case 'x':
		base = 16;
		lex_next(lexer);
		break;
	default:
		break;
	}

	while (1) {
		int c = lex_peek(lexer);
		int digit;

		// every letter is accepted as a digit here so that an
		// out of range one is reported as an error right away
		// instead of being left for a later token
		if (c >= '0' && c <= '9')
			digit = c - '0';
		else if (c >= 'a' && c <= 'z')
			digit = c - 'a' + 10;
		else if (c >= 'A' && c <= 'Z')
			digit = c - 'A' + 10;
		else
			break; // no longer a number

		// if number provided is bigger than my base,
		// error !
		if (digit >= base) {
			ERROR_POS(pos, "character '%c' is bigger than number base"
				  "'%d'", c, base);
			return M_ERROR;
		}

		lex_next(lexer);
		number = number * (uint64_t)base + (uint64_t)digit;
	}

	*n = (int64_t)number;
	return M_SUCCESS;
}

/*
 * lex the next token on the file
 *
 * Fills in token->type, token->x and token->y, and depending on the type
 * token->text or token->number.
 * Returns M_SUCCESS, or M_ERROR when one of the helper lexers fails.
 *
 * NOTE(review): skip_comment consumes the newline that ends a comment, so
 * no TOK_NL is produced for a line that ends in a comment -- confirm that
 * the parser does not rely on it.
 */
int lexer_next(struct lexer *lexer, struct token *token)
{
again: // use label to avoid whitespace recursion
	token->x = lexer->x;
	token->y = lexer->y;
	pos.x = lexer->x;
	pos.y = lexer->y;
	token->type = TOK_EOF;

	int c = lex_peek(lexer);
	int res = M_SUCCESS;

	switch (c) {
	case EOF:
	case '\0':
		token->type = TOK_EOF;
		break;
	case ';':
	case '#':
		skip_comment(lexer);
		goto again;
	case ' ':
	case '\t':
		// skip white space
		lex_next(lexer);
		goto again;
	case '\n':
		lex_next(lexer);
		token->type = TOK_NL;
		break;
	case ',':
		lex_next(lexer);
		token->type = TOK_COMMA;
		break;
	case '=':
		lex_next(lexer);
		token->type = TOK_EQUAL;
		break;
	case '(':
		lex_next(lexer);
		token->type = TOK_LPAREN;
		break;
	case ')':
		token->type = TOK_RPAREN;
		lex_next(lexer);
		break;
	case '$':
		token->type = TOK_REG;
		lex_next(lexer);
		res = lex_ident(lexer, token->text);
		break;
	case '.':
		token->type = TOK_DIRECTIVE;
		lex_next(lexer);
		res = lex_ident(lexer, token->text);
		break;
	case '"':
		token->type = TOK_STRING;
		lex_next(lexer);
		res = lex_string(lexer, token->text);
		break;
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		token->type = TOK_NUMBER;
		res = lex_number(lexer, &token->number);
		break;
	default:
		token->type = TOK_IDENT;
		res = lex_ident(lexer, token->text);
		// an identifier directly followed by ':' is a label
		if (lex_peek(lexer) == ':') {
			lex_next(lexer);
			token->type = TOK_LABEL;
		}
		break;
	}

	return res;
}

/*
 * open path for reading and reset the lexer state to line 1, column 1
 *
 * Returns M_ERROR if the file cannot be opened, M_SUCCESS otherwise.
 */
int lexer_init(const char *path, struct lexer *lexer)
{
	FILE *file = fopen(path, "r");
	if (file == NULL) {
		ERROR("cannot read '%s'", path);
		return M_ERROR;
	}

	lexer->file = file;
	lexer->peek = EOF;
	lexer->x = 1;
	lexer->y = 1;
	return M_SUCCESS;
}

/* close the lexer's file, returns the result of fclose */
int lexer_free(struct lexer *lexer)
{
	return fclose(lexer->file);
}

/* human readable name of a token type, "unknown" for unlisted values */
char *token_str(enum token_type type)
{
	switch (type) {
	case TOK_IDENT:
		return "ident";
	case TOK_REG:
		return "register";
	case TOK_LABEL:
		return "label";
	case TOK_STRING:
		return "string";
	case TOK_COMMA:
		return "comma";
	case TOK_EQUAL:
		return "equal";
	case TOK_LPAREN:
		return "left parentheses";
	case TOK_RPAREN:
		return "right parentheses";
	case TOK_NUMBER:
		return "number";
	case TOK_EOF:
		return "end of file";
	case TOK_NL:
		return "new line";
	case TOK_DIRECTIVE:
		return "directive";
	}
	return "unknown";
}