use std::fmt::{Debug, Display}; use crate::prelude::*; pub struct RegexToken { regex: Regex } impl Debug for RegexToken { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self.regex) } } impl PartialEq for RegexToken { fn eq(&self, other: &Self) -> bool { self.regex.as_str().eq(other.regex.as_str()) } } impl From for RegexToken { fn from(regex: Regex) -> Self { Self { regex } } } impl From for Regex { fn from(value: RegexToken) -> Self { value.regex } } #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub struct Position { pub row: usize, pub col: usize, } impl Default for Position { fn default() -> Self { Self { row: 1, col: 1 } } } impl Display for Position { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}:{}", self.row, self.col) } } impl Display for TokenData { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{self:?}") } } impl Display for Token { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.data) } } #[derive(Debug, PartialEq)] pub enum TokenData { //syntax LeftParen, RightParen, LeftBrack, RightBrack, LeftBrace, RightBrace, Assign, Access, SemiColon, Arrow, ThinArrow, Comma, Range, RangeEq, Colon, Backslash, Varadic, Pipe, // math Regex(RegexToken), Int(i64), Float(f64), Complex(f64), String(Rc), Ident(Rc), // equality Equal, NotEqual, GreaterEqual, LessEqual, GreaterThan, LessThan, And, Or, Not, BitwiseShiftLeft, BitwiseShiftRight, BitwiseAnd, BitwiseOr, BitwiseXor, Add, Subtract, Multiply, Divide, Modulo, Power, AssignAnd, AssignOr, AssignBitwiseShiftLeft, AssignBitwiseShiftRight, AssignBitwiseAnd, AssignBitwiseOr, AssignBitwiseXor, AssignAdd, AssignSubtract, AssignMultiply, AssignDivide, AssignModulo, AssignPower, // keywords If, Else, While, Let, Const, Function, True, False, Nil, Continue, Break, Do, Loop, Return, For, In, Try, Catch, // eof Eof, // comment Comment, } #[derive(Debug, PartialEq)] pub struct Token { pub data: TokenData, pub pos: Position, pub str: String, pub bidx: usize, pub blen: usize, } pub struct Lexer { pub index: usize, len: usize, data: Vec, pos: Position, byte_len: usize, } impl Default for Token { fn default() -> Self { Self { data: TokenData::Eof, pos: Position {row: 0, col: 0}, str: "".into(), bidx: 0, blen: 0, } } } trait IsIdent { fn is_initial_ident(&self) -> bool; fn is_ident(&self) -> bool; } impl IsIdent for char { fn is_initial_ident(&self) -> bool { self.is_alphabetic() || *self == '_' } fn is_ident(&self) -> bool { self.is_alphanumeric() || *self == '_' } } impl> From for Lexer { fn from(value: T) -> Self { Self::new(value) } } macro_rules! error { ($($arg:tt)*) => { exception!(PARSE_EXCEPTION, $($arg)*) }; } impl Lexer { pub fn new>(input: T) -> Self { let data: Vec = input.into().chars().collect(); Self { index: 0, len: data.len(), data, pos: Position::default(), byte_len: 0 } } fn peek(&self) -> char { if self.index >= self.len { return '\0'; } self.data[self.index] } fn next(&mut self) -> char { let c = self.peek(); self.index += 1; self.byte_len += c.len_utf8(); self.pos.col += 1; if c == '\n' { self.pos.col = 1; self.pos.row += 1; } c } fn next_not_eof(&mut self) -> Result { let c = self.next(); if c == '\0' { return Err(error!("unexpected end of file")) } Ok(c) } fn next_expect(&mut self, expected: char) -> Result { let c = self.next(); if c != expected { return Err(error!("expected character '{c}'")) } Ok(c) } fn skip_whitespace(&mut self, ignore_newlines: bool) { while self.peek().is_whitespace() && (ignore_newlines || self.peek() != '\n') { self.next(); } } fn lex_string(&mut self, delimit: char) -> Result> { let mut buf = String::new(); loop { let c = self.next_not_eof()?; if c == delimit { break; } if c != '\\' { buf.push(c); continue; } let next = self.next_not_eof()?; match next { '\'' | '\"' | '\\' => buf.push(c), '0' => buf.push('\0'), 'a' => buf.push('\x07'), 'b' => buf.push('\x08'), 't' => buf.push('\t'), 'n' => buf.push('\n'), 'v' => buf.push('\x0b'), 'f' => buf.push('\x0c'), 'r' => buf.push('\r'), 'e' => buf.push('\x1b'), 'x' => { let n1 = self.next_not_eof()?; let n2 = self.next_not_eof()?; buf.push(char::from_u32( n1.to_digit(16).ok_or(error!("invalid digit '{n1}'"))? * 16 + n2.to_digit(16).ok_or(error!("invalid digit '{n2}'"))? ).expect("bypassed digit check")); }, 'u' => { self.next_expect('{')?; let mut n = 0u32; loop { let c = self.next_not_eof()?; if c == '}' { break } if n >= 0x1000_0000_u32 { return Err(error!("invalid utf8 codepoint '{n}'")) } n = n * 16 + c.to_digit(16).ok_or(error!("invalid digit '{c}'"))?; } let ch = char::from_u32(n).ok_or(error!("invalid codepoint '{n}'"))?; buf.push(ch); }, _ => return Err(error!("invalid string escape '\\{next}'")) } } Ok(buf.into()) } fn lex_ident(&mut self, initial: char) -> Result { use TokenData as T; let mut buf = std::string::String::new(); if !initial.is_initial_ident() { return Err(error!("unexpected character '{initial}'")) } buf.push(initial); loop { if self.peek().is_ident() { buf.push(self.next()); } else { break; } } Ok(match buf.as_str() { "if" => T::If, "else" => T::Else, "while" => T::While, "let" => T::Let, "const" => T::Const, "fn" | "function" => T::Function, "true" => T::True, "false" => T::False, "nil" => T::Nil, "continue" => T::Continue, "break" => T::Break, "do" => T::Do, "loop" => T::Loop, "and" => T::And, "or" => T::Or, "not" => T::Not, "return" => T::Return, "for" => T::For, "in" => T::In, "try" => T::Try, "catch" => T::Catch, _ => T::Ident(buf.into()) }) } fn lex_radix(&mut self, radix: i64, radix_char: char) -> Result { use TokenData as T; let mut n = 0i64; let mut char_found = false; loop { if let Some(i) = self.peek().to_digit(radix as u32) { self.next(); n = n * radix + (i as i64); char_found = true; } else if self.peek().is_ident() { return Err(error!("invalid digit '{}'", self.peek())) } else { break; } } if char_found { Ok(T::Int(n)) } else { Err(error!("invalid number radix specifier '0{radix_char}'")) } } fn lex_number(&mut self, initial: char) -> Result { if initial == '0' { match self.peek() { 'x' => { self.next(); return self.lex_radix(16, 'x') } 'o' => { self.next(); return self.lex_radix(8, 'o'); } _ => () } } let mut buf = String::new(); buf.push(initial); let mut pos = self.pos; let mut idx = self.index; let mut bidx = self.byte_len; if initial != '.' { loop { if !self.peek().is_ascii_digit() { break; } buf.push(self.next()); } if self.peek() == '.' { pos = self.pos; idx = self.index; bidx = self.byte_len; buf.push(self.next()); } } let last: char = buf.chars().last().unwrap_or('\0'); let is_range = initial != '.' && last == '.' && self.peek() == '.'; if is_range { self.pos = pos; self.index = idx; self.byte_len = bidx; buf.pop(); } else { loop { if !self.peek().is_ascii_digit() { break; } buf.push(self.next()); } if self.peek() == 'e' || self.peek() == 'E' { buf.push(self.next()); if self.peek() == '+' || self.peek() == '-' { buf.push(self.next()); } loop { if !self.peek().is_ascii_digit() { break; } buf.push(self.next()); } } } let complex = self.peek() == 'i'; if complex { self.next(); } if self.peek().is_ident() { return Err(error!("unexpected character '{}'", self.peek())) } if let Ok(int) = buf.parse::() { use TokenData as T; if complex { return Ok(T::Complex(int as f64)) } return Ok(T::Int(int)) } if let Ok(float) = buf.parse::() { use TokenData as T; if complex { return Ok(T::Complex(float)) } return Ok(T::Float(float)) } Err(error!("invalid number '{buf}'")) } fn lex_comment(&mut self) -> TokenData { loop { match self.peek() { '\0' | '\n' => break, _ => self.next() }; } TokenData::Comment } fn lex_block_comment(&mut self) -> TokenData { loop { match self.peek() { '\0' => break, '*' => { self.next(); if self.peek() == '/' { self.next(); break; } } _ => {self.next();} }; } TokenData::Comment } fn read_token_impl(&mut self, ignore_newlines: bool) -> Result { use TokenData as T; self.skip_whitespace(ignore_newlines); let str_start = self.index; let byte_start = self.byte_len; let pos = self.pos; let char = self.next(); let next = self.peek(); if char == '\0' { let data = if ignore_newlines { T::Eof } else { T::SemiColon }; return Ok(Token { data, pos, str: String::new(), bidx: byte_start, blen: 0, }) } let data = match char { '\n' => T::SemiColon, '(' => T::LeftParen, ')' => T::RightParen, '[' => T::LeftBrack, ']' => T::RightBrack, '{' => T::LeftBrace, '}' => T::RightBrace, ':' => T::Colon, '\\' => T::Backslash, ';' => T::SemiColon, '+' => { match next { '=' => { self.next(); T::AssignAdd } _ => T::Add } }, '/' => { match next { '=' => { self.next(); T::AssignDivide }, '/' => { self.next(); self.lex_comment() }, '*' => { self.next(); self.lex_block_comment() } _ => T::Divide } }, '%' => { match next { '=' => { self.next(); T::AssignModulo } _ => T::Modulo } }, ',' => T::Comma, '*' => { match next { '*' => { self.next(); match self.peek() { '=' => { self.next(); T::AssignPower }, _ => T::Power } }, '=' => { self.next(); T::AssignMultiply } _ => T::Multiply } }, '!' => { match next { '=' => { self.next(); T::NotEqual } _ => T::Not } } '&' => { match next { '&' => { self.next(); match self.peek() { '=' => { self.next(); T::AssignAnd }, _ => T::And } }, '=' => { self.next(); T::AssignBitwiseAnd }, _ => T::BitwiseAnd } }, '|' => { match next { '|' => { self.next(); match self.peek() { '=' => { self.next(); T::AssignOr }, _ => T::Or } }, '=' => { self.next(); T::AssignBitwiseOr }, '>' => { self.next(); T::Pipe }, _ => T::BitwiseOr } }, '-' => { match next { '>' => { self.next(); T::ThinArrow }, '=' => { self.next(); T::AssignSubtract }, _ => T::Subtract } }, '=' => { match next { '>' => { self.next(); T::Arrow } '=' => { self.next(); T::Equal } _ => T::Assign } }, '>' => { match next { '>' => { self.next(); match self.peek() { '=' => { self.next(); T::AssignBitwiseShiftRight }, _ => T::BitwiseShiftRight } } '=' => { self.next(); T::GreaterEqual } _ => T::GreaterThan } }, '<' => { match next { '<' => { self.next(); match self.peek() { '=' => { self.next(); T::AssignBitwiseShiftLeft }, _ => T::BitwiseShiftLeft } } '=' => { self.next(); T::LessEqual } _ => T::LessThan } }, '^' => { match next { '=' => { self.next(); T::AssignBitwiseXor }, _ => T::BitwiseXor } } '\'' | '\"' => T::String(self.lex_string(char)?), 'r' => { match next { '\'' | '\"' => { self.next(); T::Regex(regex::Regex::new(&self.lex_string(next)?) .map(|e| e.into()) .map_err(|e| error!("invalid regex: '{e}'"))?) } _ => { self.lex_ident(char)? } } }, '.' => { if next == '.' { self.next(); match self.peek() { '.' => { self.next(); T::Varadic }, '=' => { self.next(); T::RangeEq }, _ => T::Range } } else if next.is_digit(10) { self.lex_number(char)? } else { T::Access } }, _ => { if char.is_digit(10) { self.lex_number(char)? } else { self.lex_ident(char)? } }, }; let str_end = self.index; let byte_end = self.byte_len; let str = self.data[str_start..str_end].to_owned().into_iter().collect(); Ok(Token { data, pos, str, bidx: byte_start, blen: byte_end - byte_start }) } fn save_state(&self) -> (Position, usize, usize) { (self.pos, self.index, self.byte_len) } fn restore_state(&mut self, state: (Position, usize, usize)) { self.pos = state.0; self.index = state.1; self.byte_len = state.2; } pub fn read_token( &mut self, peek: bool, // reset state to undo read keep_comments: bool, // dont ignore comment tokens significant_newlines: bool // newlines become semicolons ) -> Result { let state = self.save_state(); loop { let token = self.read_token_impl(!significant_newlines)?; if !keep_comments && token.data == TokenData::Comment { continue; } if peek { self.restore_state(state); } return Ok(token) } } pub fn next_token(&mut self) -> Result { let pos = self.pos; match self.read_token(false, false, false) { Ok(token) => Ok(token), Err(e) => Err(e.pos(pos)), } } pub fn next_token_nl(&mut self) -> Result { let pos = self.pos; match self.read_token(false, false, true) { Ok(token) => Ok(token), Err(e) => Err(e.pos(pos)), } } pub fn peek_token(&mut self) -> Result { let pos = self.pos; match self.read_token(true, false, false) { Ok(token) => Ok(token), Err(e) => Err(e.pos(pos)), } } pub fn peek_token_nl(&mut self) -> Result { let pos = self.pos; match self.read_token(true, false, true) { Ok(token) => Ok(token), Err(e) => Err(e.pos(pos)), } } pub fn next_token_cmt(&mut self) -> Result { let pos = self.pos; match self.read_token(false, true, false) { Ok(token) => Ok(token), Err(e) => Err(e.pos(pos)), } } }