summary refs log tree commit diff
path: root/matrix-lang/src/lex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'matrix-lang/src/lex.rs')
-rw-r--r-- matrix-lang/src/lex.rs 777
1 files changed, 777 insertions, 0 deletions
diff --git a/matrix-lang/src/lex.rs b/matrix-lang/src/lex.rs
new file mode 100644
index 0000000..b2487ad
--- /dev/null
+++ b/matrix-lang/src/lex.rs
@@ -0,0 +1,777 @@
+use std::fmt::{Debug, Display};
+use crate::prelude::*;
+
+/// Wrapper around a compiled `Regex`, used as the payload of `TokenData::Regex`.
+pub struct RegexToken {
+ /// The compiled regular expression.
+ regex: Regex
+}
+
+impl Debug for RegexToken {
+    /// Prints the `Debug` representation of the wrapped regex.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let inner = &self.regex;
+        f.write_fmt(format_args!("{inner:?}"))
+    }
+}
+
+impl PartialEq for RegexToken {
+    /// Two regex tokens are equal when their pattern source strings are identical.
+    fn eq(&self, other: &Self) -> bool {
+        let (lhs, rhs) = (self.regex.as_str(), other.regex.as_str());
+        lhs == rhs
+    }
+}
+
+impl From<Regex> for RegexToken {
+    /// Wraps a compiled regex in a `RegexToken`.
+    fn from(regex: Regex) -> Self {
+        RegexToken { regex }
+    }
+}
+
+impl From<RegexToken> for Regex {
+    /// Unwraps the token, handing back the compiled regex it carried.
+    fn from(value: RegexToken) -> Self {
+        let RegexToken { regex } = value;
+        regex
+    }
+}
+
+/// A row/column location in the source text.
+///
+/// The `Default` value is row 1, column 1, and it is rendered as `row:col` by `Display`.
+#[derive(Debug, Clone, PartialEq, Eq, Copy)]
+pub struct Position {
+ /// Line number.
+ pub row: usize,
+ /// Column number within the line, counted in characters.
+ pub col: usize,
+}
+
+impl Default for Position {
+    /// The starting position: first row, first column.
+    fn default() -> Self {
+        let (row, col) = (1, 1);
+        Position { row, col }
+    }
+}
+
+impl Display for Position {
+    /// Renders the position as `row:col`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Position { row, col } = *self;
+        f.write_fmt(format_args!("{row}:{col}"))
+    }
+}
+
+impl Display for TokenData {
+    /// Displays the token kind using its `Debug` representation.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!("{self:?}"))
+    }
+}
+
+impl Display for Token {
+    /// Displays a token by displaying its `data`; position and source text are not shown.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let data = &self.data;
+        f.write_fmt(format_args!("{data}"))
+    }
+}
+
+/// The kind of a lexed token, together with the payload of literal and identifier tokens.
+///
+/// The spelling noted next to each variant is the one the lexer in this file recognises.
+#[derive(Debug, PartialEq)]
+pub enum TokenData {
+ // punctuation and structural syntax
+ LeftParen, // `(`
+ RightParen, // `)`
+ LeftBrack, // `[`
+ RightBrack, // `]`
+ LeftBrace, // `{`
+ RightBrace, // `}`
+ Assign, // `=`
+ Access, // `.`
+ SemiColon, // `;` or a newline; also produced at end of input by the newline-sensitive readers
+ Arrow, // `=>`
+ ThinArrow, // `->`
+ Comma, // `,`
+ Range, // `..`
+ RangeEq, // `..=`
+ Colon, // `:`
+ Backslash, // a single backslash
+ Varadic, // `...`
+ Pipe, // `|>`
+
+ // literals and identifiers
+ Regex(RegexToken), // `r'...'` or `r"..."`
+ Int(i64),
+ Float(f64),
+ Complex(f64), // a number written with an `i` suffix; holds the number before the suffix
+ String(Rc<str>), // contents of a quoted string with escapes already decoded
+ Ident(Rc<str>),
+
+ // comparison
+ Equal, // `==`
+ NotEqual, // `!=`
+ GreaterEqual, // `>=`
+ LessEqual, // `<=`
+ GreaterThan, // `>`
+ LessThan, // `<`
+
+ // logical
+ And, // `&&` or `and`
+ Or, // `||` or `or`
+ Not, // `!` or `not`
+
+ // bitwise
+ BitwiseShiftLeft, // `<<`
+ BitwiseShiftRight, // `>>`
+ BitwiseAnd, // `&`
+ BitwiseOr, // `|`
+ BitwiseXor, // `^`
+
+ // arithmetic
+ Add, // `+`
+ Subtract, // `-`
+ Multiply, // `*`
+ Divide, // `/`
+ Modulo, // `%`
+ Power, // `**`
+
+ // compound assignment: logical
+ AssignAnd, // `&&=`
+ AssignOr, // `||=`
+
+ // compound assignment: bitwise
+ AssignBitwiseShiftLeft, // `<<=`
+ AssignBitwiseShiftRight, // `>>=`
+ AssignBitwiseAnd, // `&=`
+ AssignBitwiseOr, // `|=`
+ AssignBitwiseXor, // `^=`
+
+ // compound assignment: arithmetic
+ AssignAdd, // `+=`
+ AssignSubtract, // `-=`
+ AssignMultiply, // `*=`
+ AssignDivide, // `/=`
+ AssignModulo, // `%=`
+ AssignPower, // `**=`
+
+ // keywords
+ If,
+ Else,
+ While,
+ Let,
+ Const,
+ Function, // `fn` or `function`
+ True,
+ False,
+ Nil,
+ Continue,
+ Break,
+ Do,
+ Loop,
+ Return,
+ For,
+ In,
+ Try,
+ Catch,
+
+ // end of input
+ Eof,
+}
+
+/// A lexed token: its kind plus where in the source it was read from.
+#[derive(Debug, PartialEq)]
+pub struct Token {
+ /// The token's kind and any literal payload.
+ pub data: TokenData,
+ /// Row/column recorded after leading whitespace was skipped, before the token's first character.
+ pub pos: Position,
+ /// The source characters consumed for this token (empty for the end-of-input token).
+ pub str: String,
+ /// Value of the lexer's byte counter when the token started.
+ pub bidx: usize,
+ /// Number of UTF-8 bytes consumed while reading the token (0 for the end-of-input token).
+ pub blen: usize,
+}
+
+/// Tokenizer state over an input that has been decoded into characters up front.
+pub struct Lexer {
+ /// Index into `data` of the next character to read.
+ pub index: usize,
+ /// Number of characters in `data`.
+ len: usize,
+ /// The whole input as a vector of characters.
+ data: Vec<char>,
+ /// Row/column of the next character to read.
+ pos: Position,
+ /// Running total of the UTF-8 byte lengths of the characters consumed so far.
+ byte_len: usize,
+}
+
+/// Character classification used when lexing identifiers.
+trait IsIdent {
+ /// Whether the character may start an identifier.
+ fn is_initial_ident(&self) -> bool;
+ /// Whether the character may appear after the first character of an identifier.
+ fn is_ident(&self) -> bool;
+}
+
+impl IsIdent for char {
+    /// An identifier may start with an underscore or any alphabetic character.
+    fn is_initial_ident(&self) -> bool {
+        let c = *self;
+        c == '_' || c.is_alphabetic()
+    }
+
+    /// Later identifier characters may be an underscore or any alphanumeric character.
+    fn is_ident(&self) -> bool {
+        let c = *self;
+        c == '_' || c.is_alphanumeric()
+    }
+}
+
+impl<T> From<T> for Lexer
+where
+    T: Into<String>,
+{
+    /// Builds a lexer from anything convertible into a `String`; same as `Lexer::new`.
+    fn from(value: T) -> Self {
+        Lexer::new(value)
+    }
+}
+
+// Shorthand used by the lexer to build an error value: forwards its format-style
+// arguments to `exception!` with `PARSE_EXCEPTION` as the first argument.
+// NOTE(review): `exception!` and `PARSE_EXCEPTION` are not defined in this file;
+// presumably they come from `crate::prelude` — confirm.
+macro_rules! error {
+ ($($arg:tt)*) => {
+ exception!(PARSE_EXCEPTION, $($arg)*)
+ };
+}
+
+impl Lexer {
+    /// Creates a lexer positioned at the start of `input`.
+    ///
+    /// The input is decoded into a vector of characters once, so later reads index by
+    /// character rather than by byte.
+    pub fn new<T: Into<String>>(input: T) -> Self {
+        let source: String = input.into();
+        let chars = source.chars().collect::<Vec<char>>();
+        let total = chars.len();
+        Lexer {
+            index: 0,
+            len: total,
+            data: chars,
+            pos: Position::default(),
+            byte_len: 0,
+        }
+    }
+
+    /// Returns the next character without consuming it, or `'\0'` once the input is exhausted.
+    fn peek(&self) -> char {
+        if self.index < self.len {
+            self.data[self.index]
+        } else {
+            '\0'
+        }
+    }
+
+    /// Consumes and returns the next character, updating the character index, the byte
+    /// counter and the row/column position.
+    ///
+    /// At end of input this returns `'\0'` and leaves all state untouched, so reading the
+    /// end of input repeatedly keeps reporting the same position and byte offset.
+    fn next(&mut self) -> char {
+        let c = self.peek();
+        if self.index >= self.len {
+            // Nothing left to consume; do not let the counters run past the input.
+            return c;
+        }
+
+        self.index += 1;
+        self.byte_len += c.len_utf8();
+
+        if c == '\n' {
+            // A newline moves to the first column of the following row.
+            self.pos.col = 1;
+            self.pos.row += 1;
+        } else {
+            self.pos.col += 1;
+        }
+
+        c
+    }
+
+    /// Consumes the next character, failing with "unexpected end of file" when the
+    /// character read is `'\0'`.
+    fn next_not_eof(&mut self) -> Result<char> {
+        match self.next() {
+            '\0' => Err(error!("unexpected end of file")),
+            c => Ok(c),
+        }
+    }
+
+    /// Consumes the next character and checks that it is `expected`.
+    ///
+    /// # Errors
+    /// Returns a parse error naming the expected character when a different character
+    /// (or the end of input) is read instead.
+    fn next_expect(&mut self, expected: char) -> Result<char> {
+        let c = self.next();
+        if c != expected {
+            // Report the character that was required, not the one that was found.
+            return Err(error!("expected character '{expected}'"))
+        }
+        Ok(c)
+    }
+
+    /// Consumes consecutive whitespace characters.
+    ///
+    /// When `ignore_newlines` is false, skipping stops in front of a `'\n'` so the caller
+    /// can see it.
+    fn skip_whitespace(&mut self, ignore_newlines: bool) {
+        loop {
+            let c = self.peek();
+            if !c.is_whitespace() {
+                break;
+            }
+            if !ignore_newlines && c == '\n' {
+                break;
+            }
+            self.next();
+        }
+    }
+
+    /// Reads the body of a quoted literal up to and including the closing `delimit`
+    /// and returns its decoded contents. The caller has already consumed the opening quote.
+    ///
+    /// Supported escapes: `\'`, `\"`, `\\`, `\0`, `\a`, `\b`, `\t`, `\n`, `\v`, `\f`, `\r`,
+    /// `\e`, `\xHH` (exactly two hex digits) and `\u{H...}` (hex code point).
+    ///
+    /// # Errors
+    /// Fails when the input ends before the closing delimiter, on an unknown escape, on a
+    /// non-hex digit inside `\x` / `\u{}`, or when `\u{}` does not name a valid code point.
+    fn lex_string(&mut self, delimit: char) -> Result<Rc<str>> {
+        let mut buf = String::new();
+
+        loop {
+            let c = self.next_not_eof()?;
+
+            if c == delimit {
+                break;
+            }
+
+            if c != '\\' {
+                buf.push(c);
+                continue;
+            }
+
+            let next = self.next_not_eof()?;
+            match next {
+                // An escaped quote or backslash stands for that character itself.
+                '\'' | '\"' | '\\' => buf.push(next),
+                '0' => buf.push('\0'),
+                'a' => buf.push('\x07'),
+                'b' => buf.push('\x08'),
+                't' => buf.push('\t'),
+                'n' => buf.push('\n'),
+                'v' => buf.push('\x0b'),
+                'f' => buf.push('\x0c'),
+                'r' => buf.push('\r'),
+                'e' => buf.push('\x1b'),
+                'x' => {
+                    let n1 = self.next_not_eof()?;
+                    let n2 = self.next_not_eof()?;
+                    let hi = n1.to_digit(16).ok_or_else(|| error!("invalid digit '{n1}'"))?;
+                    let lo = n2.to_digit(16).ok_or_else(|| error!("invalid digit '{n2}'"))?;
+                    let ch = char::from_u32(hi * 16 + lo)
+                        .expect("two hex digits are at most 0xFF, which is always a valid char");
+                    buf.push(ch);
+                },
+                'u' => {
+                    self.next_expect('{')?;
+                    let mut n = 0u32;
+                    loop {
+                        let c = self.next_not_eof()?;
+                        if c == '}' { break }
+                        // Guard before multiplying so `n * 16` cannot overflow a u32.
+                        if n >= 0x1000_0000_u32 {
+                            return Err(error!("invalid utf8 codepoint '{n}'"))
+                        }
+                        n = n * 16 + c.to_digit(16).ok_or_else(|| error!("invalid digit '{c}'"))?;
+                    }
+                    let ch = char::from_u32(n).ok_or_else(|| error!("invalid codepoint '{n}'"))?;
+                    buf.push(ch);
+                },
+                _ => return Err(error!("invalid string escape '\\{next}'"))
+            }
+        }
+
+        Ok(buf.into())
+    }
+
+    /// Lexes an identifier or keyword whose first character, `initial`, was already consumed.
+    ///
+    /// Returns the matching keyword or word-operator token, or `Ident` for any other word.
+    /// Fails with "unexpected character" when `initial` cannot start an identifier.
+    fn lex_ident(&mut self, initial: char) -> Result<TokenData> {
+        use TokenData as T;
+
+        if !initial.is_initial_ident() {
+            return Err(error!("unexpected character '{initial}'"))
+        }
+
+        let mut word = String::new();
+        word.push(initial);
+        while self.peek().is_ident() {
+            let c = self.next();
+            word.push(c);
+        }
+
+        let data = match word.as_str() {
+            // word-form operators
+            "and" => T::And,
+            "or" => T::Or,
+            "not" => T::Not,
+            // literals
+            "true" => T::True,
+            "false" => T::False,
+            "nil" => T::Nil,
+            // declarations
+            "let" => T::Let,
+            "const" => T::Const,
+            "fn" | "function" => T::Function,
+            // control flow
+            "if" => T::If,
+            "else" => T::Else,
+            "while" => T::While,
+            "do" => T::Do,
+            "loop" => T::Loop,
+            "for" => T::For,
+            "in" => T::In,
+            "continue" => T::Continue,
+            "break" => T::Break,
+            "return" => T::Return,
+            "try" => T::Try,
+            "catch" => T::Catch,
+            _ => T::Ident(word.into())
+        };
+
+        Ok(data)
+    }
+
+    /// Lexes the digits of an integer literal written with a radix prefix (`0x`, `0o`).
+    /// The prefix itself has already been consumed by the caller.
+    ///
+    /// `radix` is the numeric base and `radix_char` the prefix letter, used only in
+    /// error messages.
+    ///
+    /// # Errors
+    /// Fails when no digit follows the prefix, when the digits are followed by an
+    /// identifier character that is not a digit in this radix, or when the value does not
+    /// fit in an `i64`.
+    fn lex_radix(&mut self, radix: i64, radix_char: char) -> Result<TokenData> {
+        use TokenData as T;
+
+        let mut n = 0i64;
+        let mut char_found = false;
+        // The callers in this file only pass 8 or 16, so the cast cannot truncate.
+        let digit_radix = radix as u32;
+
+        while let Some(digit) = self.peek().to_digit(digit_radix) {
+            self.next();
+            // Checked arithmetic: an over-long literal is reported instead of
+            // panicking in debug builds or wrapping in release builds.
+            n = n
+                .checked_mul(radix)
+                .and_then(|v| v.checked_add(i64::from(digit)))
+                .ok_or_else(|| error!("integer literal with radix specifier '0{radix_char}' is too large"))?;
+            char_found = true;
+        }
+
+        // A trailing letter, digit or underscore means the literal is malformed.
+        if self.peek().is_ident() {
+            return Err(error!("invalid digit '{}'", self.peek()))
+        }
+
+        if char_found {
+            Ok(T::Int(n))
+        } else {
+            Err(error!("invalid number radix specifier '0{radix_char}'"))
+        }
+    }
+
+    /// Lexes a numeric literal whose first character, `initial` (a digit or `.`), was
+    /// already consumed.
+    ///
+    /// Handles `0x` / `0o` prefixed integers, decimal integers, floats with an optional
+    /// fraction and exponent, and an `i` suffix that produces a `Complex` token. When an
+    /// integer is directly followed by `..`, the first dot is given back so the caller can
+    /// lex a range operator next.
+    fn lex_number(&mut self, initial: char) -> Result<TokenData> {
+        use TokenData as T;
+
+        if initial == '0' {
+            let prefix = match self.peek() {
+                'x' => Some((16, 'x')),
+                'o' => Some((8, 'o')),
+                _ => None,
+            };
+            if let Some((radix, radix_char)) = prefix {
+                self.next();
+                return self.lex_radix(radix, radix_char);
+            }
+        }
+
+        let mut text = String::new();
+        text.push(initial);
+
+        // Lexer state to roll back to if a consumed '.' turns out to start a range.
+        let mut saved_pos = self.pos;
+        let mut saved_index = self.index;
+        let mut saved_bytes = self.byte_len;
+
+        if initial != '.' {
+            while self.peek().is_ascii_digit() {
+                let digit = self.next();
+                text.push(digit);
+            }
+
+            if self.peek() == '.' {
+                saved_pos = self.pos;
+                saved_index = self.index;
+                saved_bytes = self.byte_len;
+                let dot = self.next();
+                text.push(dot);
+            }
+        }
+
+        let is_range = initial != '.' && text.ends_with('.') && self.peek() == '.';
+
+        if is_range {
+            // Un-read the dot: it belongs to the range operator, not to this number.
+            self.pos = saved_pos;
+            self.index = saved_index;
+            self.byte_len = saved_bytes;
+            text.pop();
+        } else {
+            // Fractional digits.
+            while self.peek().is_ascii_digit() {
+                let digit = self.next();
+                text.push(digit);
+            }
+
+            // Optional exponent with an optional sign.
+            if matches!(self.peek(), 'e' | 'E') {
+                let marker = self.next();
+                text.push(marker);
+                if matches!(self.peek(), '+' | '-') {
+                    let sign = self.next();
+                    text.push(sign);
+                }
+
+                while self.peek().is_ascii_digit() {
+                    let digit = self.next();
+                    text.push(digit);
+                }
+            }
+        }
+
+        let complex = self.peek() == 'i';
+        if complex {
+            self.next();
+        }
+
+        if self.peek().is_ident() {
+            return Err(error!("unexpected character '{}'", self.peek()))
+        }
+
+        // Prefer an integer; fall back to a float when the text is not a valid i64.
+        if let Ok(int) = text.parse::<i64>() {
+            return Ok(if complex { T::Complex(int as f64) } else { T::Int(int) });
+        }
+
+        match text.parse::<f64>() {
+            Ok(float) if complex => Ok(T::Complex(float)),
+            Ok(float) => Ok(T::Float(float)),
+            Err(_) => Err(error!("invalid number '{text}'")),
+        }
+    }
+
+    /// Reads one token starting at the current position.
+    ///
+    /// Leading whitespace is skipped first. When `ignore_newlines` is false a newline is
+    /// not skipped and is returned as a `SemiColon` token, and the end of input is
+    /// reported as `SemiColon` instead of `Eof`.
+    ///
+    /// The returned token records the position before its first character, the source
+    /// characters it was read from, and its starting byte offset and byte length.
+    ///
+    /// # Errors
+    /// Propagates errors from the string, regex, number and identifier sub-lexers; a
+    /// character that starts no known token is reported by `lex_ident`.
+    fn read_token(&mut self, ignore_newlines: bool) -> Result<Token> {
+        use TokenData as T;
+
+        self.skip_whitespace(ignore_newlines);
+
+        let str_start = self.index;
+        let byte_start = self.byte_len;
+
+        let pos = self.pos;
+        let char = self.next();
+        // One character of lookahead, used to recognise multi-character operators.
+        let next = self.peek();
+
+        if char == '\0' {
+            let data = if ignore_newlines { T::Eof } else { T::SemiColon };
+            return Ok(Token {
+                data,
+                pos,
+                str: String::new(),
+                bidx: byte_start,
+                blen: 0,
+            })
+        }
+
+        let data = match char {
+            '\n' => T::SemiColon,
+            '(' => T::LeftParen,
+            ')' => T::RightParen,
+            '[' => T::LeftBrack,
+            ']' => T::RightBrack,
+            '{' => T::LeftBrace,
+            '}' => T::RightBrace,
+            ':' => T::Colon,
+            '\\' => T::Backslash,
+            ';' => T::SemiColon,
+            ',' => T::Comma,
+            // `+` `+=`
+            '+' => {
+                match next {
+                    '=' => {
+                        self.next();
+                        T::AssignAdd
+                    }
+                    _ => T::Add
+                }
+            },
+            // `/` `/=`
+            '/' => {
+                match next {
+                    '=' => {
+                        self.next();
+                        T::AssignDivide
+                    }
+                    _ => T::Divide
+                }
+            },
+            // `%` `%=`
+            '%' => {
+                match next {
+                    '=' => {
+                        self.next();
+                        T::AssignModulo
+                    }
+                    _ => T::Modulo
+                }
+            },
+            // `*` `*=` `**` `**=`
+            '*' => {
+                match next {
+                    '*' => {
+                        self.next();
+                        match self.peek() {
+                            '=' => {
+                                self.next();
+                                T::AssignPower
+                            },
+                            _ => T::Power
+                        }
+                    },
+                    '=' => {
+                        self.next();
+                        T::AssignMultiply
+                    }
+                    _ => T::Multiply
+                }
+            },
+            // `!` `!=`
+            '!' => {
+                match next {
+                    '=' => {
+                        self.next();
+                        T::NotEqual
+                    }
+                    _ => T::Not
+                }
+            }
+            // `&` `&=` `&&` `&&=`
+            '&' => {
+                match next {
+                    '&' => {
+                        self.next();
+                        match self.peek() {
+                            '=' => {
+                                self.next();
+                                T::AssignAnd
+                            },
+                            _ => T::And
+                        }
+                    },
+                    '=' => {
+                        self.next();
+                        T::AssignBitwiseAnd
+                    },
+                    _ => T::BitwiseAnd
+                }
+            },
+            // `|` `|=` `||` `||=` `|>`
+            '|' => {
+                match next {
+                    '|' => {
+                        self.next();
+                        match self.peek() {
+                            '=' => {
+                                self.next();
+                                T::AssignOr
+                            },
+                            _ => T::Or
+                        }
+                    },
+                    '=' => {
+                        self.next();
+                        T::AssignBitwiseOr
+                    },
+                    '>' => {
+                        self.next();
+                        T::Pipe
+                    },
+                    _ => T::BitwiseOr
+                }
+            },
+            // `-` `-=` `->`
+            '-' => {
+                match next {
+                    '>' => {
+                        self.next();
+                        T::ThinArrow
+                    },
+                    '=' => {
+                        self.next();
+                        T::AssignSubtract
+                    },
+                    _ => T::Subtract
+                }
+            },
+            // `=` `==` `=>`
+            '=' => {
+                match next {
+                    '>' => {
+                        self.next();
+                        T::Arrow
+                    }
+                    '=' => {
+                        self.next();
+                        T::Equal
+                    }
+                    _ => T::Assign
+                }
+            },
+            // `>` `>=` `>>` `>>=`
+            '>' => {
+                match next {
+                    '>' => {
+                        self.next();
+                        match self.peek() {
+                            '=' => {
+                                self.next();
+                                T::AssignBitwiseShiftRight
+                            },
+                            _ => T::BitwiseShiftRight
+                        }
+                    }
+                    '=' => {
+                        self.next();
+                        T::GreaterEqual
+                    }
+                    _ => T::GreaterThan
+                }
+            },
+            // `<` `<=` `<<` `<<=`
+            '<' => {
+                match next {
+                    '<' => {
+                        self.next();
+                        match self.peek() {
+                            '=' => {
+                                self.next();
+                                T::AssignBitwiseShiftLeft
+                            },
+                            _ => T::BitwiseShiftLeft
+                        }
+                    }
+                    '=' => {
+                        self.next();
+                        T::LessEqual
+                    }
+                    _ => T::LessThan
+                }
+            },
+            // `^` `^=`
+            '^' => {
+                match next {
+                    '=' => {
+                        self.next();
+                        T::AssignBitwiseXor
+                    },
+                    _ => T::BitwiseXor
+                }
+            }
+            // String literal; the opening quote doubles as the closing delimiter.
+            '\'' | '\"' => T::String(self.lex_string(char)?),
+            // `r` directly followed by a quote starts a regex literal; otherwise it is
+            // the first letter of an ordinary identifier or keyword.
+            'r' => {
+                match next {
+                    '\'' | '\"' => {
+                        self.next();
+                        T::Regex(regex::Regex::new(&self.lex_string(next)?)
+                            .map(RegexToken::from)
+                            .map_err(|e| error!("invalid regex: '{e}'"))?)
+                    }
+                    _ => {
+                        self.lex_ident(char)?
+                    }
+                }
+            },
+            // `.` `..` `..=` `...`, or a float written with a leading dot such as `.5`
+            '.' => {
+                if next == '.' {
+                    self.next();
+                    match self.peek() {
+                        '.' => {
+                            self.next();
+                            T::Varadic
+                        },
+                        '=' => {
+                            self.next();
+                            T::RangeEq
+                        },
+                        _ => T::Range
+                    }
+                } else if next.is_ascii_digit() {
+                    self.lex_number(char)?
+                } else {
+                    T::Access
+                }
+            },
+            // Anything else is a number, an identifier/keyword, or an error raised by
+            // `lex_ident` for a character that cannot start an identifier.
+            _ => {
+                if char.is_ascii_digit() {
+                    self.lex_number(char)?
+                } else {
+                    self.lex_ident(char)?
+                }
+            },
+        };
+
+        let str_end = self.index;
+        let byte_end = self.byte_len;
+        // Collect straight from the slice; no intermediate Vec copy is needed.
+        let str = self.data[str_start..str_end].iter().collect();
+        Ok(Token {
+            data,
+            pos,
+            str,
+            bidx: byte_start,
+            blen: byte_end - byte_start
+        })
+    }
+
+    /// Consumes and returns the next token, treating newlines as ordinary whitespace.
+    ///
+    /// On failure the error is tagged with the lexer position from before the read began.
+    pub fn next_token(&mut self) -> Result<Token> {
+        let start = self.pos;
+        self.read_token(true).map_err(|e| e.pos(start))
+    }
+
+    /// Consumes and returns the next token in newline-sensitive mode: a newline is
+    /// returned as a token instead of being skipped.
+    ///
+    /// On failure the error is tagged with the lexer position from before the read began.
+    pub fn next_token_nl(&mut self) -> Result<Token> {
+        let start = self.pos;
+        self.read_token(false).map_err(|e| e.pos(start))
+    }
+
+    /// Returns the next token without consuming it, treating newlines as ordinary whitespace.
+    ///
+    /// The token is read normally and the lexer's index, position and byte counter are
+    /// then restored, whether or not the read succeeded. Errors are tagged with the
+    /// position from before the read.
+    pub fn peek_token(&mut self) -> Result<Token> {
+        let saved_index = self.index;
+        let saved_pos = self.pos;
+        let saved_bytes = self.byte_len;
+
+        let outcome = self.read_token(true);
+
+        self.index = saved_index;
+        self.pos = saved_pos;
+        self.byte_len = saved_bytes;
+
+        outcome.map_err(|e| e.pos(saved_pos))
+    }
+
+    /// Returns the next token without consuming it, in newline-sensitive mode: a newline
+    /// is returned as a token instead of being skipped.
+    ///
+    /// The token is read normally and the lexer's index, position and byte counter are
+    /// then restored, whether or not the read succeeded. Errors are tagged with the
+    /// position from before the read.
+    pub fn peek_token_nl(&mut self) -> Result<Token> {
+        let saved_index = self.index;
+        let saved_pos = self.pos;
+        let saved_bytes = self.byte_len;
+
+        let outcome = self.read_token(false);
+
+        self.index = saved_index;
+        self.pos = saved_pos;
+        self.byte_len = saved_bytes;
+
+        outcome.map_err(|e| e.pos(saved_pos))
+    }
+}