diff options
Diffstat (limited to 'matrix-lang/src/lex.rs')
-rw-r--r-- | matrix-lang/src/lex.rs | 777 |
1 files changed, 777 insertions, 0 deletions
diff --git a/matrix-lang/src/lex.rs b/matrix-lang/src/lex.rs new file mode 100644 index 0000000..b2487ad --- /dev/null +++ b/matrix-lang/src/lex.rs @@ -0,0 +1,777 @@ +use std::fmt::{Debug, Display}; +use crate::prelude::*; + +pub struct RegexToken { + regex: Regex +} + +impl Debug for RegexToken { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.regex) + } +} + +impl PartialEq for RegexToken { + fn eq(&self, other: &Self) -> bool { + self.regex.as_str().eq(other.regex.as_str()) + } +} + +impl From<Regex> for RegexToken { + fn from(regex: Regex) -> Self { + Self { regex } + } +} + +impl From<RegexToken> for Regex { + fn from(value: RegexToken) -> Self { + value.regex + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Copy)] +pub struct Position { + pub row: usize, + pub col: usize, +} + +impl Default for Position { + fn default() -> Self { + Self { row: 1, col: 1 } + } +} + +impl Display for Position { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.row, self.col) + } +} + +impl Display for TokenData { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } +} + +impl Display for Token { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.data) + } +} + +#[derive(Debug, PartialEq)] +pub enum TokenData { + //syntax + LeftParen, + RightParen, + LeftBrack, + RightBrack, + LeftBrace, + RightBrace, + Assign, + Access, + SemiColon, + Arrow, + ThinArrow, + Comma, + Range, + RangeEq, + Colon, + Backslash, + Varadic, + Pipe, + + // math + Regex(RegexToken), + Int(i64), + Float(f64), + Complex(f64), + String(Rc<str>), + Ident(Rc<str>), + + // equality + Equal, + NotEqual, + GreaterEqual, + LessEqual, + GreaterThan, + LessThan, + + And, + Or, + Not, + + BitwiseShiftLeft, + BitwiseShiftRight, + BitwiseAnd, + BitwiseOr, + BitwiseXor, + + Add, + Subtract, + Multiply, + Divide, + Modulo, + Power, + + AssignAnd, + AssignOr, + + AssignBitwiseShiftLeft, + AssignBitwiseShiftRight, + AssignBitwiseAnd, + AssignBitwiseOr, + AssignBitwiseXor, + + AssignAdd, + AssignSubtract, + AssignMultiply, + AssignDivide, + AssignModulo, + AssignPower, + + // keywords + If, + Else, + While, + Let, + Const, + Function, + True, + False, + Nil, + Continue, + Break, + Do, + Loop, + Return, + For, + In, + Try, + Catch, + + // eof + Eof, +} + +#[derive(Debug, PartialEq)] +pub struct Token { + pub data: TokenData, + pub pos: Position, + pub str: String, + pub bidx: usize, + pub blen: usize, +} + +pub struct Lexer { + pub index: usize, + len: usize, + data: Vec<char>, + pos: Position, + byte_len: usize, +} + +trait IsIdent { + fn is_initial_ident(&self) -> bool; + fn is_ident(&self) -> bool; +} + +impl IsIdent for char { + fn is_initial_ident(&self) -> bool { + self.is_alphabetic() || *self == '_' + } + fn is_ident(&self) -> bool { + self.is_alphanumeric() || *self == '_' + } +} + +impl<T: Into<String>> From<T> for Lexer { + fn from(value: T) -> Self { + Self::new(value) + } +} + +macro_rules! error { + ($($arg:tt)*) => { + exception!(PARSE_EXCEPTION, $($arg)*) + }; +} + +impl Lexer { + pub fn new<T: Into<String>>(input: T) -> Self { + let data: Vec<char> = input.into().chars().collect(); + Self { + index: 0, + len: data.len(), + data, + pos: Position::default(), + byte_len: 0 + } + } + + fn peek(&self) -> char { + if self.index >= self.len { + return '\0'; + } + self.data[self.index] + } + + fn next(&mut self) -> char { + let c = self.peek(); + self.index += 1; + self.byte_len += c.len_utf8(); + + self.pos.col += 1; + if c == '\n' { + self.pos.col = 1; + self.pos.row += 1; + } + + c + } + + fn next_not_eof(&mut self) -> Result<char> { + let c = self.next(); + if c == '\0' { + return Err(error!("unexpected end of file")) + } + Ok(c) + } + + fn next_expect(&mut self, expected: char) -> Result<char> { + let c = self.next(); + if c != expected { + return Err(error!("expected character '{c}'")) + } + Ok(c) + } + + fn skip_whitespace(&mut self, ignore_newlines: bool) { + while self.peek().is_whitespace() && (ignore_newlines || self.peek() != '\n') { + self.next(); + } + } + + fn lex_string(&mut self, delimit: char) -> Result<Rc<str>> { + + let mut buf = String::new(); + + loop { + let c = self.next_not_eof()?; + + if c == delimit { + break; + } + + if c != '\\' { + buf.push(c); + continue; + } + + let next = self.next_not_eof()?; + match next { + '\'' | '\"' | '\\' => buf.push(c), + '0' => buf.push('\0'), + 'a' => buf.push('\x07'), + 'b' => buf.push('\x08'), + 't' => buf.push('\t'), + 'n' => buf.push('\n'), + 'v' => buf.push('\x0b'), + 'f' => buf.push('\x0c'), + 'r' => buf.push('\r'), + 'e' => buf.push('\x1b'), + 'x' => { + let n1 = self.next_not_eof()?; + let n2 = self.next_not_eof()?; + buf.push(char::from_u32( + n1.to_digit(16).ok_or(error!("invalid digit '{n1}'"))? * 16 + + n2.to_digit(16).ok_or(error!("invalid digit '{n2}'"))? + ).unwrap()); + }, + 'u' => { + self.next_expect('{')?; + let mut n = 0u32; + loop { + let c = self.next_not_eof()?; + if c == '}' { break } + if n >= 0x1000_0000_u32 { + return Err(error!("invalid utf8 codepoint '{n}'")) + } + n = n * 16 + c.to_digit(16).ok_or(error!("invalid digit '{c}'"))?; + } + let ch = char::from_u32(n).ok_or(error!("invalid codepoint '{n}'"))?; + buf.push(ch); + + }, + _ => return Err(error!("invalid string escape '\\{next}'")) + } + } + + Ok(buf.into()) + } + + fn lex_ident(&mut self, initial: char) -> Result<TokenData> { + use TokenData as T; + + let mut buf = std::string::String::new(); + + if !initial.is_initial_ident() { + return Err(error!("unexpected character '{initial}'")) + } + + buf.push(initial); + + loop { + if self.peek().is_ident() { + buf.push(self.next()); + } else { + break; + } + } + + Ok(match buf.as_str() { + "if" => T::If, + "else" => T::Else, + "while" => T::While, + "let" => T::Let, + "const" => T::Const, + "fn" | "function" => T::Function, + "true" => T::True, + "false" => T::False, + "nil" => T::Nil, + "continue" => T::Continue, + "break" => T::Break, + "do" => T::Do, + "loop" => T::Loop, + "and" => T::And, + "or" => T::Or, + "not" => T::Not, + "return" => T::Return, + "for" => T::For, + "in" => T::In, + "try" => T::Try, + "catch" => T::Catch, + _ => T::Ident(buf.into()) + }) + } + + fn lex_radix(&mut self, radix: i64, radix_char: char) -> Result<TokenData> { + use TokenData as T; + + let mut n = 0i64; + let mut char_found = false; + + loop { + if let Some(i) = self.peek().to_digit(radix as u32) { + self.next(); + n = n * radix + (i as i64); + char_found = true; + } else if self.peek().is_ident() { + return Err(error!("invalid digit '{}'", self.peek())) + } else { + break; + } + } + + if char_found { + Ok(T::Int(n)) + } else { + Err(error!("invalid number radix specifier '0{radix_char}'")) + } + } + + fn lex_number(&mut self, initial: char) -> Result<TokenData> { + if initial == '0' { + match self.peek() { + 'x' => { + self.next(); + return self.lex_radix(16, 'x') + } + 'o' => { + self.next(); + return self.lex_radix(8, 'o'); + } + _ => () + } + } + + let mut buf = String::new(); + buf.push(initial); + + let mut pos = self.pos; + let mut idx = self.index; + let mut bidx = self.byte_len; + + if initial != '.' { + loop { + if !self.peek().is_ascii_digit() { break; } + buf.push(self.next()); + } + + if self.peek() == '.' { + pos = self.pos; + idx = self.index; + bidx = self.byte_len; + buf.push(self.next()); + } + } + + let last: char = buf.chars().last().unwrap(); + let is_range = initial != '.' && last == '.' && self.peek() == '.'; + + if is_range { + self.pos = pos; + self.index = idx; + self.byte_len = bidx; + buf.pop(); + } else { + loop { + if !self.peek().is_ascii_digit() { break; } + buf.push(self.next()); + } + + if self.peek() == 'e' || self.peek() == 'E' { + buf.push(self.next()); + if self.peek() == '+' || self.peek() == '-' { + buf.push(self.next()); + } + + loop { + if !self.peek().is_ascii_digit() { break; } + buf.push(self.next()); + } + } + } + + let complex = self.peek() == 'i'; + if complex { + self.next(); + } + + if self.peek().is_ident() { + return Err(error!("unexpected character '{}'", self.peek())) + } + + if let Ok(int) = buf.parse::<i64>() { + use TokenData as T; + if complex { + return Ok(T::Complex(int as f64)) + } + return Ok(T::Int(int)) + } + + if let Ok(float) = buf.parse::<f64>() { + use TokenData as T; + if complex { + return Ok(T::Complex(float)) + } + return Ok(T::Float(float)) + } + + Err(error!("invalid number '{buf}'")) + } + + fn read_token(&mut self, ignore_newlines: bool) -> Result<Token> { + use TokenData as T; + + self.skip_whitespace(ignore_newlines); + + let str_start = self.index; + let byte_start = self.byte_len; + + let pos = self.pos; + let char = self.next(); + let next = self.peek(); + + if char == '\0' { + let data = if ignore_newlines { T::Eof } else { T::SemiColon }; + return Ok(Token { + data, + pos, + str: String::new(), + bidx: byte_start, + blen: 0, + }) + } + + let data = match char { + '\n' => T::SemiColon, + '(' => T::LeftParen, + ')' => T::RightParen, + '[' => T::LeftBrack, + ']' => T::RightBrack, + '{' => T::LeftBrace, + '}' => T::RightBrace, + ':' => T::Colon, + '\\' => T::Backslash, + ';' => T::SemiColon, + '+' => { + match next { + '=' => { + self.next(); + T::AssignAdd + } + _ => T::Add + } + }, + '/' => { + match next { + '=' => { + self.next(); + T::AssignDivide + } + _ => T::Divide + } + }, + '%' => { + match next { + '=' => { + self.next(); + T::AssignModulo + } + _ => T::Modulo + } + }, + ',' => T::Comma, + '*' => { + match next { + '*' => { + self.next(); + match self.peek() { + '=' => { + self.next(); + T::AssignPower + }, + _ => T::Power + } + }, + '=' => { + self.next(); + T::AssignMultiply + } + _ => T::Multiply + } + }, + '!' => { + match next { + '=' => { + self.next(); + T::NotEqual + } + _ => T::Not + } + } + '&' => { + match next { + '&' => { + self.next(); + match self.peek() { + '=' => { + self.next(); + T::AssignAnd + }, + _ => T::And + } + }, + '=' => { + self.next(); + T::AssignBitwiseAnd + }, + _ => T::BitwiseAnd + } + }, + '|' => { + match next { + '|' => { + self.next(); + match self.peek() { + '=' => { + self.next(); + T::AssignOr + }, + _ => T::Or + } + }, + '=' => { + self.next(); + T::AssignBitwiseOr + }, + '>' => { + self.next(); + T::Pipe + }, + _ => T::BitwiseOr + } + }, + '-' => { + match next { + '>' => { + self.next(); + T::ThinArrow + }, + '=' => { + self.next(); + T::AssignSubtract + }, + _ => T::Subtract + } + }, + '=' => { + match next { + '>' => { + self.next(); + T::Arrow + } + '=' => { + self.next(); + T::Equal + } + _ => T::Assign + } + }, + '>' => { + match next { + '>' => { + self.next(); + match self.peek() { + '=' => { + self.next(); + T::AssignBitwiseShiftRight + }, + _ => T::BitwiseShiftRight + } + } + '=' => { + self.next(); + T::GreaterEqual + } + _ => T::GreaterThan + } + }, + '<' => { + match next { + '<' => { + self.next(); + match self.peek() { + '=' => { + self.next(); + T::AssignBitwiseShiftLeft + }, + _ => T::BitwiseShiftLeft + } + } + '=' => { + self.next(); + T::LessEqual + } + _ => T::LessThan + } + }, + '^' => { + match next { + '=' => { + self.next(); + T::AssignBitwiseXor + }, + _ => T::BitwiseXor + } + } + '\'' | '\"' => T::String(self.lex_string(char)?), + 'r' => { + match next { + '\'' | '\"' => { + self.next(); + T::Regex(regex::Regex::new(&self.lex_string(next)?) + .map(|e| e.into()) + .map_err(|e| error!("invalid regex: '{e}'"))?) + } + _ => { + self.lex_ident(char)? + } + } + }, + '.' => { + if next == '.' { + self.next(); + match self.peek() { + '.' => { + self.next(); + T::Varadic + }, + '=' => { + self.next(); + T::RangeEq + }, + _ => T::Range + } + } else if next.is_digit(10) { + self.lex_number(char)? + } else { + T::Access + } + }, + _ => { + if char.is_digit(10) { + self.lex_number(char)? + } else { + self.lex_ident(char)? + } + }, + }; + + let str_end = self.index; + let byte_end = self.byte_len; + let str = self.data[str_start..str_end].to_owned().into_iter().collect(); + Ok(Token { + data, + pos, + str, + bidx: byte_start, + blen: byte_end - byte_start + }) + } + + pub fn next_token(&mut self) -> Result<Token> { + let pos = self.pos; + match self.read_token(true) { + Ok(token) => Ok(token), + Err(e) => Err(e.pos(pos)), + } + } + + pub fn next_token_nl(&mut self) -> Result<Token> { + let pos = self.pos; + match self.read_token(false) { + Ok(token) => Ok(token), + Err(e) => Err(e.pos(pos)), + } + } + + pub fn peek_token(&mut self) -> Result<Token> { + let idx = self.index; + let pos = self.pos; + let bidx = self.byte_len; + let token = self.read_token(true); + self.index = idx; + self.pos = pos; + self.byte_len = bidx; + match token { + Ok(token) => Ok(token), + Err(e) => Err(e.pos(pos)), + } + } + + pub fn peek_token_nl(&mut self) -> Result<Token> { + let idx = self.index; + let pos = self.pos; + let bidx = self.byte_len; + let token = self.read_token(false); + self.index = idx; + self.pos = pos; + self.byte_len = bidx; + match token { + Ok(token) => Ok(token), + Err(e) => Err(e.pos(pos)), + } + } +} |