#include "lex.h"

#include <mlimits.h>
#include <merror.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>

/* position of the token currently being lexed, used for error messages */
static struct {
	int x;
	int y;
} pos;

/* get next char in lexer */
static int lex_next(struct lexer *lexer)
{
	if (lexer->peek != EOF) {
		int c = lexer->peek;
		lexer->peek = EOF;
		return c;
	}

	int c = getc(lexer->file);
	if (c == '\n') {
		lexer->x = 1;
		lexer->y++;
	} else {
		lexer->x++;
	}
	return c;
}

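/* Note: lex_next keeps lexer->x and lexer->y as the 1-based column and line
 * of the cursor, so tokens and error messages can point at the exact
 * character that produced them. */
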
/* peek next char in lexer */
static int lex_peek(struct lexer *lexer)
{
	if (lexer->peek == EOF)
		lexer->peek = lex_next(lexer);
	return lexer->peek;
}

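/* Note: lexer->peek is a one-character lookahead buffer that uses EOF as its
 * "empty" marker. A real end-of-file is therefore never cached; each peek at
 * the end of input simply re-reads EOF from the stream, which is harmless. */
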
/* skip all characters until EOF or newline */
static void skip_comment(struct lexer *lexer)
{
	int c;
	while (1) {
		c = lex_next(lexer);
		if (c == EOF || c == '\n')
			break;
	}
}

/* lexes a string until the closing quote;
 * returns an error if the string is too long, or a newline or EOF is hit */
static int lex_string(struct lexer *lexer, struct string *string)
{
	int c;

	string_init(string);

	while (1) {
		c = lex_next(lexer);

		// stop on ending quote
		if (c == '"')
			break;

		// strings cannot span multiple lines or run past end of file
		if (c == '\n' || c == EOF) {
			ERROR_POS(pos, "reached newline or EOF before end of string");
			string_free(string);
			return M_ERROR;
		}

		// match escape character
		if (c == '\\') {
			switch (lex_peek(lexer)) {
			case 'n':
				c = '\n';
				lex_next(lexer);
				break;
			case 't':
				c = '\t';
				lex_next(lexer);
				break;
			case '\\':
				c = '\\';
				lex_next(lexer);
				break;
			case '"':
				c = '"';
				lex_next(lexer);
				break;
			}
		}

		// push char into string
		if (string_push(string, c)) {
			string_free(string);
			return M_ERROR;
		}
	}

	// null terminate string
	if (string_push(string, '\0')) {
		string_free(string);
		return M_ERROR;
	}

	return M_SUCCESS;
}

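/* Note on lex_string above: only \n, \t, \\ and \" are recognized escapes;
 * any other backslash sequence is kept as-is (the backslash and the following
 * character are both pushed into the string literally). */
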
/* lexes an identifier made of letters, digits, and underscores;
 * prefix, if non-zero, is pushed first (e.g. the '.' of a directive);
 * returns an error on zero length or if the identifier is too long */
static int lex_ident(struct lexer *lexer, struct string *string,
		char prefix)
{
	char c;
	string_init(string);

	if (prefix != '\0' && string_push(string, prefix)) {
		string_free(string);
		return M_ERROR;
	}

	while (1) {
		c = lex_peek(lexer);
		if (!(
			(c >= 'a' && c <= 'z') ||
			(c >= 'A' && c <= 'Z') ||
			(c >= '0' && c <= '9') ||
			(c == '_')
		)) {
			break;
		}

		// pop char out of lexer
		lex_next(lexer);

		// push char into string
		if (string_push(string, c)) {
			string_free(string);
			return M_ERROR;
		}
	}

	// empty idents are not allowed
	if (string->len < 1) {
		string_free(string);
		ERROR("empty ident tokens are not allowed");
		return M_ERROR;
	}

	// null terminate string
	if (string_push(string, '\0')) {
		string_free(string);
		return M_ERROR;
	}

	return M_SUCCESS;
}

/* lexes an integer number in base 2, 8, 10, or 16;
 * base 10 is used by default but can be changed by a 0b, 0o, or 0x prefix */
static int lex_number(struct lexer *lexer, int64_t *n)
{
	int64_t number = 0;
	int base = 10;
	int neg = 0;

	// check if negative
	if (lex_peek(lexer) == '-') {
		lex_next(lexer);
		neg = 1;
	}

	// skip all leading zeros, they don't do anything.
	// this also allows us to directly check for 0b, 0o, and 0x
	// right away!
	while (1) {
		if (lex_peek(lexer) == '0')
			lex_next(lexer);
		else
			break;
	}

	// match change of base
	switch (lex_peek(lexer)) {
	case 'b':
		base = 2;
		lex_next(lexer);
		break;
	case 'o':
		base = 8;
		lex_next(lexer);
		break;
	case 'x':
		base = 16;
		lex_next(lexer);
		break;
	}

	while (1) {
		char c = lex_peek(lexer);
		int digit = 0;

		if (c >= '0' && c <= '9') {
			digit = c - '0';
		} else if (c >= 'a' && c <= 'z') { // match the full letter range
			digit = c - 'a' + 10;      // so out-of-base characters
		} else if (c >= 'A' && c <= 'Z') { // are caught here instead
			digit = c - 'A' + 10;      // of later
		} else {
			break; // no longer a number
		}

		// a digit outside the current base is an error
		if (digit >= base) {
			ERROR_POS(pos, "character '%c' is bigger than number base "
					"'%d'", c, base);
			return M_ERROR;
		}

		lex_next(lexer);
		number *= base;
		number += digit;
	}

	if (neg)
		number = -number;

	*n = number;
	return M_SUCCESS;
}

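/* Worked example for lex_number above (illustrative): for the input "0x1f",
 * the leading zero is skipped, 'x' switches the base to 16, and the digits
 * accumulate as number = 1 * 16 + 15 = 31. */
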
/* lex the next token in the file */
int lexer_next(struct lexer *lexer, struct token *token)
{
again: // use label to avoid whitespace recursion
	token->x = lexer->x;
	token->y = lexer->y;
	token->off = ftell(lexer->file);
	pos.x = lexer->x;
	pos.y = lexer->y;
	token->type = TOK_EOF;

	int c = lex_peek(lexer);
	int res = M_SUCCESS;

	switch (c) {

	// return an EOF token
	case EOF:
	case '\0':
		token->type = TOK_EOF;
		break;

	// skip the comment
	// .. and return a NL token
	case ';':
	case '#':
		skip_comment(lexer);
		token->type = TOK_NL;
		break;

	// skip the whitespace and
	// try to parse the next character
	case ' ':
	case '\t':
		// skip white space
		lex_next(lexer);
		goto again;

	// return a NL token
	case '\n':
		lex_next(lexer);
		token->type = TOK_NL;
		break;

	// return a comma token
	case ',':
		lex_next(lexer);
		token->type = TOK_COMMA;
		break;

	// return an equal token
	case '=':
		lex_next(lexer);
		token->type = TOK_EQUAL;
		break;

	// return a left paren token
	case '(':
		lex_next(lexer);
		token->type = TOK_LPAREN;
		break;

	// return a right paren token
	case ')':
		token->type = TOK_RPAREN;
		lex_next(lexer);
		break;

	// return a register token
	case '$':
		token->type = TOK_REG;
		lex_next(lexer);
		res = lex_ident(lexer, &token->string, '\0');
		break;

	// return a directive token
	case '.':
		token->type = TOK_DIRECTIVE;
		lex_next(lexer);
		res = lex_ident(lexer, &token->string, '.');
		break;

	// return a string token
	case '"':
		token->type = TOK_STRING;
		lex_next(lexer);
		res = lex_string(lexer, &token->string);
		break;

	// return a number token
	case '-':
	case '0':
	case '1':
	case '2':
	case '3':
	case '4':
	case '5':
	case '6':
	case '7':
	case '8':
	case '9':
		token->type = TOK_NUMBER;
		res = lex_number(lexer, &token->number);
		break;

	// return an ident or label token depending
	// on whether it ends with a colon
	default:
		token->type = TOK_IDENT;
		res = lex_ident(lexer, &token->string, '\0');
		if (lex_peek(lexer) == ':') {
			lex_next(lexer);
			token->type = TOK_LABEL;
		}
		break;
	}

	return res;
}

int lexer_init(const char *path, struct lexer *lexer)
{
	// defaults
	lexer->file = NULL;
	lexer->peek = EOF;
	lexer->x = 1;
	lexer->y = 1;

	// load file
	lexer->file = fopen(path, "r");
	if (lexer->file == NULL) {
		PERROR("cannot read");
		return M_ERROR;
	}

	return M_SUCCESS;
}

void lexer_free(struct lexer *lexer)
{
	if (lexer->file)
		fclose(lexer->file);
}

char *token_str(enum token_type type)
{
	switch (type) {
	case TOK_IDENT:
		return "ident";
	case TOK_REG:
		return "register";
	case TOK_LABEL:
		return "label";
	case TOK_STRING:
		return "string";
	case TOK_COMMA:
		return "comma";
	case TOK_EQUAL:
		return "equal";
	case TOK_LPAREN:
		return "left parenthesis";
	case TOK_RPAREN:
		return "right parenthesis";
	case TOK_NUMBER:
		return "number";
	case TOK_EOF:
		return "end of file";
	case TOK_NL:
		return "new line";
	case TOK_DIRECTIVE:
		return "directive";
	}
	return "unknown";
}

/* save the current state from the lexer */
void lexer_save(struct lexer *lexer, struct lexer_state *state)
{
	state->x = lexer->x;
	state->y = lexer->y;
	state->peek = lexer->peek;
	state->offset = ftell(lexer->file);
}

/* load a different state into a lexer */
void lexer_load(struct lexer *lexer, const struct lexer_state *state)
{
	lexer->x = state->x;
	lexer->y = state->y;
	lexer->peek = state->peek;
	fseek(lexer->file, state->offset, SEEK_SET);
}

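/* Note: lexer_save/lexer_load capture and restore the whole lexer state
 * (position, lookahead, and file offset); presumably this is what lets a
 * caller such as the parser rewind and re-lex from a saved point. */
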
void token_free(struct token *token)
{
	switch (token->type) {
	case TOK_REG:
	case TOK_IDENT:
	case TOK_LABEL:
	case TOK_STRING:
	case TOK_DIRECTIVE:
		if (token->string.str)
			free(token->string.str);
		break;
	default:
		break;
	}
}

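/* Illustrative usage sketch (not part of this file; the input path below is
 * hypothetical):
 *
 *	struct lexer lexer;
 *	struct token token;
 *
 *	if (lexer_init("input.s", &lexer))
 *		return M_ERROR;
 *
 *	do {
 *		if (lexer_next(&lexer, &token))
 *			break;
 *		printf("%s\n", token_str(token.type));
 *		token_free(&token);
 *	} while (token.type != TOK_EOF);
 *
 *	lexer_free(&lexer);
 */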