lexer/
lib.rs

1mod cursor;
2mod error;
3pub mod unescaped;
4
5use std::{collections::HashMap, sync::OnceLock};
6
7pub use cursor::Cursor;
8pub mod token;
9use error::{LexerError, LexerErrorKind};
10use error_manager::ErrorManager;
11use token::{Token, TokenKind};
12
/// A stream of [tokens](Token)
pub struct TokenStream<'lex, 'src> {
    // The lexer that produces the tokens of this stream.
    lexer: Lexer<'lex, 'src>,
    // The most recently yielded token; None before the first `next()` call.
    prev: Option<Token>,
    // One-token lookahead: the token the next `next()` call will yield.
    next: Option<Token>,
}
19
20impl TokenStream<'_, '_> {
21    /// Returns true if the lexer reached EOF
22    #[inline]
23    pub fn is_finished(&self) -> bool { self.lexer.c.is_finished() }
24
25    /// Returns the previous token, if present
26    #[inline]
27    pub fn previous(&self) -> Option<&Token> { self.prev.as_ref() }
28
29    /// Returns the next token, if present, without consuming it
30    #[inline]
31    pub fn peek(&self) -> Option<&Token> { self.next.as_ref() }
32}
33
/// Tokenizer for a single source string: walks the text through a
/// [Cursor] and reports lexical errors to an [`ErrorManager`].
pub struct Lexer<'lex, 'src> {
    // Cursor over the source text being tokenized.
    c: Cursor<'src>,
    // Sink that receives every [LexerError] this lexer emits.
    em: &'lex mut ErrorManager,
}
38
39fn as_keyword(ident: &str) -> Option<TokenKind> {
40    static KEYWORKDS: OnceLock<HashMap<&str, TokenKind>> = OnceLock::new();
41    KEYWORKDS
42        .get_or_init(|| {
43            let mut map = HashMap::new();
44            map.insert("struct", TokenKind::Struct);
45            map.insert("else", TokenKind::Else);
46            map.insert("false", TokenKind::False);
47            map.insert("fn", TokenKind::Fn);
48            map.insert("for", TokenKind::For);
49            map.insert("if", TokenKind::If);
50            map.insert("i8", TokenKind::I8);
51            map.insert("i16", TokenKind::I16);
52            map.insert("i32", TokenKind::I32);
53            map.insert("i64", TokenKind::I64);
54            map.insert("u8", TokenKind::U8);
55            map.insert("u16", TokenKind::U16);
56            map.insert("u32", TokenKind::U32);
57            map.insert("u64", TokenKind::U64);
58            map.insert("f32", TokenKind::F32);
59            map.insert("f64", TokenKind::F64);
60            map.insert("char", TokenKind::Char);
61            map.insert("bool", TokenKind::Bool);
62            map.insert("return", TokenKind::Return);
63            map.insert("true", TokenKind::True);
64            map.insert("let", TokenKind::Let);
65            map.insert("use", TokenKind::Use);
66            map.insert("as", TokenKind::As);
67            map.insert("mod", TokenKind::Mod);
68            map.insert("const", TokenKind::Const);
69            map.insert("while", TokenKind::While);
70            map.insert("break", TokenKind::Break);
71            map.insert("continue", TokenKind::Continue);
72            map.insert("extern", TokenKind::Extern);
73            map
74        })
75        .get(ident)
76        .copied()
77}
78
79impl Iterator for TokenStream<'_, '_> {
80    type Item = Token;
81
82    /// Returns the next token from the stream.
83    ///
84    /// This is similar to [`Lexer::next_token`], but also
85    /// keeps track of the previous and next [tokens](Token)
86    fn next(&mut self) -> Option<Self::Item> {
87        let ret = self.next.take();
88        self.prev.clone_from(&ret);
89        self.next = self.lexer.next_token();
90
91        ret
92    }
93}
94
95impl<'lex, 'src> IntoIterator for Lexer<'lex, 'src> {
96    type Item = Token;
97    type IntoIter = TokenStream<'lex, 'src>;
98
99    /// Converts this [Lexer] into an iterator of [tokens](Token)
100    ///
101    /// This is the same as calling [`Self::into_token_stream`]
102    #[inline]
103    fn into_iter(self) -> Self::IntoIter { self.into_token_stream() }
104}
105
106impl<'lex, 'src> Lexer<'lex, 'src> {
107    /// Builds a new [Lexer]
108    ///
109    /// # Arguments
110    /// - text: Source code to tokenize
111    /// - em: An [`ErrorManager`], where all the errors will be sent
112    pub fn new(text: &'src str, base_offset: usize, em: &'lex mut ErrorManager) -> Self {
113        Self {
114            c: Cursor::new(text, base_offset),
115            em,
116        }
117    }
118
119    /// Turns this [Lexer] into a [`TokenStream`]
120    pub fn into_token_stream(mut self) -> TokenStream<'lex, 'src> {
121        let next = self.next_token();
122        TokenStream {
123            lexer: self,
124            prev: None,
125            next,
126        }
127    }
128
129    /// Parses the next [token](Token)
130    ///
131    /// Returns None if the EOF was reached
132    pub fn next_token(&mut self) -> Option<Token> {
133        if self.c.is_finished() {
134            return None;
135        }
136        self.c.step();
137        if let Some(t) = self.scan_token() {
138            Some(t)
139        } else {
140            self.next_token()
141        }
142    }
143
144    #[allow(clippy::unnecessary_wraps)]
145    fn add_token(&self, kind: TokenKind) -> Option<Token> {
146        Some(Token {
147            kind,
148            span: self.c.current_span(),
149        })
150    }
151    #[allow(clippy::too_many_lines)]
152    fn scan_token(&mut self) -> Option<Token> {
153        match self.c.advance() {
154            '(' => self.add_token(TokenKind::LeftParen),
155            ')' => self.add_token(TokenKind::RightParen),
156            '{' => self.add_token(TokenKind::LeftBrace),
157            '}' => self.add_token(TokenKind::RightBrace),
158            '[' => self.add_token(TokenKind::LeftBracket),
159            ']' => self.add_token(TokenKind::RightBracket),
160            ',' => self.add_token(TokenKind::Comma),
161            '.' => {
162                if self.c.match_next('.') && self.c.match_next('.') {
163                    self.add_token(TokenKind::ThreeDot)
164                }
165                else if self.c.peek().is_numeric() {
166                    self.error(LexerErrorKind::FloatLitWithoutIntegralPart);
167                    None
168                } else {
169                    self.add_token(TokenKind::Dot)
170                }
171            }
172            '-' => {
173                if self.c.match_next('>') {
174                    self.add_token(TokenKind::Arrow)
175                } else {
176                    self.add_token(TokenKind::Minus)
177                }
178            }
179            '+' => self.add_token(TokenKind::Plus),
180            ';' => self.add_token(TokenKind::Semicolon),
181            ':' => {
182                if self.c.match_next(':') {
183                    self.add_token(TokenKind::DoubleColon)
184                } else {
185                    self.add_token(TokenKind::Colon)
186                }
187            }
188            '?' => self.add_token(TokenKind::Question),
189            '*' => self.add_token(TokenKind::Star),
190            '\'' => self.char_literal(),
191            '!' => {
192                if self.c.match_next('=') {
193                    self.add_token(TokenKind::BangEqual)
194                } else {
195                    self.add_token(TokenKind::Bang)
196                }
197            }
198            '=' => {
199                if self.c.match_next('=') {
200                    self.add_token(TokenKind::EqualEqual)
201                } else {
202                    self.add_token(TokenKind::Equal)
203                }
204            }
205            '<' => {
206                if self.c.match_next('=') {
207                    self.add_token(TokenKind::LessEqual)
208                } else {
209                    self.add_token(TokenKind::Less)
210                }
211            }
212            '>' => {
213                if self.c.match_next('=') {
214                    self.add_token(TokenKind::GreaterEqual)
215                } else {
216                    self.add_token(TokenKind::Greater)
217                }
218            }
219            '/' => {
220                if self.c.match_next('/') {
221                    self.comment()
222                } else if self.c.match_next('*') {
223                    self.ml_comment()
224                } else {
225                    self.add_token(TokenKind::Slash)
226                }
227            }
228            '&' => {
229                if self.c.match_next('&') {
230                    self.add_token(TokenKind::And)
231                } else {
232                    self.add_token(TokenKind::Ampersand)
233                }
234            }
235            '|' => {
236                if self.c.match_next('|') {
237                    self.add_token(TokenKind::Or)
238                } else {
239                    self.add_token(TokenKind::VerticalPipe)
240                }
241            }
242            '%' => self.add_token(TokenKind::Modulo),
243            '"' => self.string(),
244            ' ' | '\n' | '\r' | '\t' => None, // Ignore whitespace.
245            c => {
246                if c.is_numeric() {
247                    self.number()
248                } else if c.is_alphabetic() || c == '_' {
249                    self.identifier()
250                } else {
251                    self.error(LexerErrorKind::UnexpectedCharacter(c));
252                    None
253                }
254            }
255        }
256    }
257    fn char_literal(&mut self) -> Option<Token> {
258        if self.c.advance() == '\\' {
259            self.c.advance();
260        }
261        if !self.c.match_next('\'') {
262            self.error(LexerErrorKind::ExpectedClosingTickOnCharLiteral);
263            self.c.advance_while(|c| *c != '\'');
264            self.c.advance();
265        }
266        self.add_token(TokenKind::CharLiteral)
267    }
268    fn comment(&mut self) -> Option<Token> {
269        self.c.advance_while(|c| *c != '\n');
270        None
271    }
272    fn ml_comment(&mut self) -> Option<Token> {
273        while self.c.advance() != '*' || self.c.peek() != '/' {
274            if self.c.is_finished() {
275                self.error(LexerErrorKind::UnterminatedComment);
276            }
277        }
278        self.c.advance(); /* Consume the / */
279        None
280    }
281    fn string(&mut self) -> Option<Token> {
282        while self.c.advance() != '"' {
283            if self.c.is_finished() {
284                self.error(LexerErrorKind::UnterminatedString);
285                return None;
286            }
287            if self.c.peek() == '\\' {
288                self.c.advance();
289                self.c.advance();
290            }
291        }
292        self.add_token(TokenKind::String)
293    }
294    fn floating(&mut self) -> Option<Token> {
295        self.c.advance(); /* Consume the . */
296        if self.c.peek().is_numeric() {
297            self.c.advance_while(char::is_ascii_digit);
298            self.add_token(TokenKind::FloatLiteral)
299        } else {
300            self.error(LexerErrorKind::FloatLitWithoutFloatingPart);
301            None
302        }
303    }
304    fn number(&mut self) -> Option<Token> {
305        self.c.advance_while(|n| n.is_numeric());
306        if self.c.peek() == '.' {
307            self.floating()
308        } else {
309            self.add_token(TokenKind::IntLiteral)
310        }
311    }
312    fn identifier(&mut self) -> Option<Token> {
313        self.c.advance_while(|c| c.is_alphanumeric() || *c == '_');
314        let lexem = self.c.current_lexem();
315        let token_type = as_keyword(lexem).unwrap_or(TokenKind::Identifier);
316        self.add_token(token_type)
317    }
318    fn error(&mut self, kind: LexerErrorKind) {
319        self.em.emit_error(LexerError {
320            kind,
321            span: self.c.current_span(),
322        });
323    }
324}