1
use crate::char_stream::CharStream;
2
use crate::diagnostics::Diag;
3
use crate::source_map::{Span, Spanned};
4

            
5
pub(crate) struct Scanner<'input> {
6
    chars: CharStream<'input>,
7
}
8

            
9
impl<'input> Scanner<'input> {
10
6958
    pub(crate) fn with_input(source_text: &'input str) -> Scanner<'input> {
11
6958
        Scanner {
12
6958
            chars: CharStream::with_text(source_text),
13
6958
        }
14
6958
    }
15

            
16
22406
    pub(crate) fn scan_next_token(&mut self) -> Result<Spanned<Token>, Diag> {
17
38263
        while is_space(self.peek()) {
18
15857
            self.consume();
19
15857
        }
20

            
21
22406
        let span_start = self.chars.peek_byte_pos();
22

            
23
22406
        let token_kind = match self.consume() {
24
6435
            CharStream::EOF_CHAR => return Ok(Token::eof()),
25
29
            '(' => TokenKind::Open(Bracket::Round),
26
34
            ')' => TokenKind::Closed(Bracket::Round),
27
120
            '[' => TokenKind::Open(Bracket::Square),
28
112
            ']' => TokenKind::Closed(Bracket::Square),
29
125
            '{' => TokenKind::Open(Bracket::Curly),
30
18
            '}' => TokenKind::Closed(Bracket::Curly),
31
            '.' => {
32
2062
                if is_digit(self.peek()) {
33
445
                    let first_digit = self.consume();
34
445
                    self.scan_numeric_constant(first_digit)
35
1617
                } else if self.peek() == '.' && self.lookahead(1) == '.' {
36
1419
                    self.consume();
37
1419
                    self.consume();
38
1419
                    TokenKind::Ellipsis
39
                } else {
40
198
                    TokenKind::Period
41
                }
42
            }
43
            '-' => {
44
154
                if self.try_consume('>') {
45
1
                    TokenKind::Arrow
46
153
                } else if self.try_consume('-') {
47
1
                    TokenKind::MinusMinus
48
152
                } else if self.try_consume('=') {
49
2
                    TokenKind::MinusEqual
50
                } else {
51
150
                    TokenKind::Minus
52
                }
53
            }
54
            '+' => {
55
163
                if self.try_consume('+') {
56
1
                    TokenKind::PlusPlus
57
162
                } else if self.try_consume('=') {
58
1
                    TokenKind::PlusEqual
59
                } else {
60
161
                    TokenKind::Plus
61
                }
62
            }
63
            '&' => {
64
131
                if self.try_consume('&') {
65
1
                    TokenKind::AmpAmp
66
130
                } else if self.try_consume('=') {
67
2
                    TokenKind::AmpEqual
68
                } else {
69
128
                    TokenKind::Ampersand
70
                }
71
            }
72
            '*' => {
73
114
                if self.try_consume('=') {
74
3
                    TokenKind::StarEqual
75
                } else {
76
111
                    TokenKind::Star
77
                }
78
            }
79
15
            '~' => TokenKind::Tilde,
80
            '!' => {
81
55
                if self.try_consume('=') {
82
1
                    TokenKind::ExclaEqual
83
                } else {
84
54
                    TokenKind::Exclamation
85
                }
86
            }
87
            '/' => {
88
766
                if self.try_consume('=') {
89
2
                    TokenKind::SlashEqual
90
764
                } else if self.try_consume('*') {
91
6
                    self.skip_block_comment()?;
92

            
93
4
                    return self.scan_next_token();
94
758
                } else if self.try_consume('/') {
95
520
                    self.skip_line_comment()?;
96

            
97
520
                    return self.scan_next_token();
98
                } else {
99
238
                    TokenKind::Slash
100
                }
101
            }
102
            '%' => {
103
130
                if self.try_consume('=') {
104
2
                    TokenKind::PercentEqual
105
                } else {
106
128
                    TokenKind::Percent
107
                }
108
            }
109
            '<' => {
110
153
                if self.try_consume('<') {
111
5
                    if self.try_consume('=') {
112
2
                        TokenKind::LessLessEqual
113
                    } else {
114
3
                        TokenKind::LessLess
115
                    }
116
148
                } else if self.try_consume('=') {
117
2
                    TokenKind::LessEqual
118
                } else {
119
146
                    TokenKind::Less
120
                }
121
            }
122
            '>' => {
123
69
                if self.try_consume('>') {
124
2
                    if self.try_consume('=') {
125
1
                        TokenKind::GreaterGreaterEqual
126
                    } else {
127
1
                        TokenKind::GreaterGreater
128
                    }
129
67
                } else if self.try_consume('=') {
130
1
                    TokenKind::GreaterEqual
131
                } else {
132
66
                    TokenKind::Greater
133
                }
134
            }
135
            '=' => {
136
384
                if self.try_consume('=') {
137
3
                    TokenKind::EqualEqual
138
                } else {
139
381
                    TokenKind::Equal
140
                }
141
            }
142
            '^' => {
143
103
                if self.try_consume('=') {
144
4
                    TokenKind::CaretEqual
145
                } else {
146
99
                    TokenKind::Caret
147
                }
148
            }
149
            '|' => {
150
29
                if self.try_consume('|') {
151
1
                    TokenKind::PipePipe
152
28
                } else if self.try_consume('=') {
153
2
                    TokenKind::PipeEqual
154
                } else {
155
26
                    TokenKind::Pipe
156
                }
157
            }
158
180
            '?' => TokenKind::Question,
159
168
            ':' => TokenKind::Colon,
160
345
            ';' => TokenKind::Semicolon,
161
29
            ',' => TokenKind::Comma,
162
            '#' => {
163
62
                if self.try_consume('#') {
164
1
                    TokenKind::HashHash
165
                } else {
166
61
                    TokenKind::Hash
167
                }
168
            }
169
1676
            '\'' => self.scan_character_constant(CharEncoding::Byte)?,
170
71
            prefix @ ('L' | 'u' | 'U') if self.peek() == '\'' => {
171
3
                self.consume();
172

            
173
3
                let char_encoding = match prefix {
174
1
                    'L' => CharEncoding::Wide,
175
1
                    'u' => CharEncoding::Utf16,
176
1
                    'U' => CharEncoding::Utf32,
177
                    _ => unreachable!("invalid character constant prefix '{}'!", prefix),
178
                };
179

            
180
3
                self.scan_character_constant(char_encoding)?
181
            }
182
1162
            '"' => self.scan_string_literal(CharEncoding::Byte)?,
183
26
            'u' if self.peek() == '8' && self.lookahead(1) == '"' => {
184
1
                self.consume();
185
1
                self.consume();
186
1

            
187
1
                self.scan_string_literal(CharEncoding::Utf8)?
188
            }
189
67
            prefix @ ('L' | 'u' | 'U') if self.peek() == '"' => {
190
3
                self.consume();
191

            
192
3
                let char_encoding = match prefix {
193
1
                    'L' => CharEncoding::Wide,
194
1
                    'u' => CharEncoding::Utf16,
195
1
                    'U' => CharEncoding::Utf32,
196
                    _ => unreachable!("invalid string literal prefix '{}'!", prefix),
197
                };
198

            
199
3
                self.scan_string_literal(char_encoding)?
200
            }
201
7576
            ch if is_digit(ch) => self.scan_numeric_constant(ch),
202
5345
            ch if is_start_of_identifier(ch) => self.scan_identifier_or_keyword(ch),
203
2209
            unrecognized_char => return Err(Diag::UnrecognizedChar(unrecognized_char)),
204
        };
205

            
206
11467
        let span_end = self.chars.peek_byte_pos();
207
11467

            
208
11467
        let token_span = Span {
209
11467
            start: span_start,
210
11467
            end: span_end,
211
11467
        };
212
11467

            
213
11467
        Ok(Spanned::new(Token { kind: token_kind }, token_span))
214
22406
    }
215

            
216
6
    fn skip_block_comment(&mut self) -> Result<(), Diag> {
217
281
        loop {
218
281
            if self.peek() == CharStream::EOF_CHAR {
219
2
                break Err(Diag::UnterminatedBlockComment);
220
279
            }
221
279

            
222
279
            if self.consume() == '*' && self.peek() == '/' {
223
4
                self.consume();
224
4
                break Ok(());
225
275
            }
226
        }
227
6
    }
228

            
229
520
    fn skip_line_comment(&mut self) -> Result<(), Diag> {
230
        // If we get an EOF while scanning a line comment, the behavior is undefined as
231
        // per [C17 6.4.9/2], because it doesn't specify what happens if there is no new
232
        // line character to be found. That's quite counterproductive, so here we just
233
        // stop scanning as if we would have found a new line.
234
9415
        while self.peek() != CharStream::EOF_CHAR && !is_newline(self.peek()) {
235
8895
            self.consume();
236
8895
        }
237

            
238
520
        Ok(())
239
520
    }
240

            
241
1679
    fn scan_character_constant(&mut self, encoding: CharEncoding) -> Result<TokenKind, Diag> {
242
        debug_assert_ne!(
243
            encoding,
244
            CharEncoding::Utf8,
245
            "C17 standard doesn't define utf-8 character constants"
246
        );
247

            
248
1679
        if self.peek() == '\'' {
249
2
            self.consume();
250
2
            return Err(Diag::EmptyCharacterConstant);
251
1677
        }
252

            
253
19475
        while self.peek() != '\'' {
254
18681
            if is_newline(self.peek()) || self.peek() == CharStream::EOF_CHAR {
255
883
                return Err(Diag::UnterminatedCharacterConstant);
256
17798
            }
257
17798

            
258
17798
            // Skips backslashes. This effectively escapes single-quotes. Validation of
259
17798
            // escape sequences occurs later on during parsing, which means
260
17798
            // character-constant tokens may be semantically invalid before parsing.
261
17798
            // NOTE: Review this during implementation of escaping newlines in code.
262
17798
            // @escape-newline.
263
17798
            if self.peek() == '\\' {
264
19
                self.consume();
265
17779
            }
266

            
267
17798
            self.consume();
268
        }
269

            
270
794
        let terminating_quote = self.consume();
271
794
        debug_assert_eq!(terminating_quote, '\'');
272

            
273
794
        Ok(TokenKind::CharacterConstant { encoding })
274
1679
    }
275

            
276
1166
    fn scan_string_literal(&mut self, encoding: CharEncoding) -> Result<TokenKind, Diag> {
277
17584
        while self.peek() != '"' {
278
17302
            if is_newline(self.peek()) || self.peek() == CharStream::EOF_CHAR {
279
884
                return Err(Diag::UnterminatedStringLiteral);
280
16418
            }
281
16418

            
282
16418
            // Skips backslashes. This effectively escapes double-quotes. Validation of
283
16418
            // escape sequences occurs later on during parsing, which means
284
16418
            // string-literal tokens may be semantically invalid before parsing.
285
16418
            // NOTE: Review this during implementation of escaping newlines in code.
286
16418
            // @escape-newline.
287
16418
            if self.peek() == '\\' {
288
30
                self.consume();
289
16388
            }
290

            
291
16418
            self.consume();
292
        }
293

            
294
282
        let terminating_quote = self.consume();
295
282
        debug_assert_eq!(terminating_quote, '"');
296

            
297
282
        Ok(TokenKind::StringLiteral { encoding })
298
1166
    }
299

            
300
2676
    fn scan_numeric_constant(&mut self, first_digit: char) -> TokenKind {
301
2676
        debug_assert!(is_numeric_constant_char(first_digit));
302

            
303
2676
        let mut prev_peek = first_digit;
304

            
305
        // Intentionally scans a more general case of a numeric constant, which may turn
306
        // out to be ill-formed as per the standard. This means a numeric-constant token
307
        // is not guaranteed to be valid before parsing.
308
        //
309
        // A more strict and conforming scanning is done during parsing, which is when
310
        // an ill-formed numeric constant is diagnosed. This takes some burden
311
        // away from the scanner, and also makes the parser's job somewhat
312
        // simpler, as the character set to be considered during the parsing of
313
        // a numeric constant token is greatly reduced.
314
        loop {
315
126507
            while is_numeric_constant_char(self.peek()) {
316
121584
                prev_peek = self.consume();
317
121584
            }
318

            
319
4923
            if matches!(self.peek(), '+' | '-') && matches!(prev_peek, 'e' | 'E' | 'p' | 'P') {
320
2247
                self.consume();
321
2247
                continue;
322
2676
            }
323
2676

            
324
2676
            break;
325
2676
        }
326
2676

            
327
2676
        TokenKind::NumericConstant
328
2676
    }
329

            
330
3136
    fn scan_identifier_or_keyword(&mut self, ident_start: char) -> TokenKind {
331
3136
        debug_assert!(is_start_of_identifier(ident_start));
332

            
333
3136
        let mut lexeme_buffer = String::with_capacity(16);
334
3136
        lexeme_buffer.push(ident_start);
335

            
336
26580
        while is_remaining_of_identifier(self.peek()) {
337
23444
            lexeme_buffer.push(self.consume());
338
23444
        }
339

            
340
3136
        get_keyword_kind_for_lexeme(&lexeme_buffer).unwrap_or(TokenKind::Identifier)
341
3136
    }
342

            
343
362822
    fn peek(&mut self) -> char {
344
362822
        self.chars.peek()
345
362822
    }
346

            
347
1522
    fn lookahead(&mut self, n: usize) -> char {
348
1522
        self.chars.lookahead(n)
349
1522
    }
350

            
351
233350
    fn consume(&mut self) -> char {
352
233350
        self.chars.consume()
353
233350
    }
354

            
355
4682
    fn try_consume(&mut self, expected_char: char) -> bool {
356
4682
        self.chars.try_consume(expected_char)
357
4682
    }
358
}
359

            
360
// TODO(feroldi): Add `is_eof()` function.
361
18426
#[derive(PartialEq, Debug, Copy, Clone)]
362
pub struct Token {
363
    pub kind: TokenKind,
364
}
365

            
366
impl Token {
367
24861
    pub fn eof() -> Spanned<Token> {
368
24861
        Spanned::with_dummy_span(Token {
369
24861
            kind: TokenKind::Eof,
370
24861
        })
371
24861
    }
372
}
373

            
374
25922
#[derive(PartialEq, Debug, Copy, Clone)]
375
pub enum TokenKind {
376
    Open(Bracket),
377
    Closed(Bracket),
378
    Period,
379
    Arrow,
380
    PlusPlus,
381
    MinusMinus,
382
    Ampersand,
383
    Star,
384
    Plus,
385
    Minus,
386
    Tilde,
387
    Exclamation,
388
    Slash,
389
    Percent,
390
    LessLess,
391
    GreaterGreater,
392
    Less,
393
    Greater,
394
    LessEqual,
395
    GreaterEqual,
396
    EqualEqual,
397
    ExclaEqual,
398
    Caret,
399
    Pipe,
400
    AmpAmp,
401
    PipePipe,
402
    Question,
403
    Colon,
404
    Semicolon,
405
    Ellipsis,
406
    Equal,
407
    StarEqual,
408
    SlashEqual,
409
    PercentEqual,
410
    PlusEqual,
411
    MinusEqual,
412
    LessLessEqual,
413
    GreaterGreaterEqual,
414
    AmpEqual,
415
    CaretEqual,
416
    PipeEqual,
417
    Comma,
418
    Hash,
419
    HashHash,
420
    Identifier,
421
    KwAuto,
422
    KwBreak,
423
    KwCase,
424
    KwChar,
425
    KwConst,
426
    KwContinue,
427
    KwDefault,
428
    KwDo,
429
    KwDouble,
430
    KwElse,
431
    KwEnum,
432
    KwExtern,
433
    KwFloat,
434
    KwFor,
435
    KwGoto,
436
    KwIf,
437
    KwInline,
438
    KwInt,
439
    KwLong,
440
    KwRegister,
441
    KwRestrict,
442
    KwReturn,
443
    KwShort,
444
    KwSigned,
445
    KwSizeof,
446
    KwStatic,
447
    KwStruct,
448
    KwSwitch,
449
    KwTypedef,
450
    KwUnion,
451
    KwUnsigned,
452
    KwVoid,
453
    KwVolatile,
454
    KwWhile,
455
    KwAlignas,
456
    KwAlignof,
457
    KwAtomic,
458
    KwBool,
459
    KwComplex,
460
    KwGeneric,
461
    KwImaginary,
462
    KwNoreturn,
463
    KwStaticAssert,
464
    KwThreadLocal,
465
    NumericConstant,
466
    CharacterConstant { encoding: CharEncoding },
467
    StringLiteral { encoding: CharEncoding },
468
    Eof,
469
}
470

            
471
20
/// The shape of a bracket token: `()`, `[]` or `{}`.
#[derive(PartialEq, Debug, Copy, Clone)]
pub enum Bracket {
    Round,
    Square,
    Curly,
}
477

            
478
2742
/// The encoding of a character constant or string literal, determined by its
/// prefix in the source: none (`Byte`), `L` (`Wide`), `u8` (`Utf8`), `u`
/// (`Utf16`) or `U` (`Utf32`).
#[derive(PartialEq, Debug, Copy, Clone)]
pub enum CharEncoding {
    Byte,
    Wide,
    Utf8,
    Utf16,
    Utf32,
}
486

            
487
3136
fn get_keyword_kind_for_lexeme(lexeme: &str) -> Option<TokenKind> {
488
3136
    let keyword_kind = match lexeme {
489
3136
        "auto" => TokenKind::KwAuto,
490
3135
        "break" => TokenKind::KwBreak,
491
3134
        "case" => TokenKind::KwCase,
492
3133
        "char" => TokenKind::KwChar,
493
3132
        "const" => TokenKind::KwConst,
494
3131
        "continue" => TokenKind::KwContinue,
495
3130
        "default" => TokenKind::KwDefault,
496
3129
        "do" => TokenKind::KwDo,
497
3128
        "double" => TokenKind::KwDouble,
498
3127
        "else" => TokenKind::KwElse,
499
3126
        "enum" => TokenKind::KwEnum,
500
3125
        "extern" => TokenKind::KwExtern,
501
3124
        "float" => TokenKind::KwFloat,
502
3123
        "for" => TokenKind::KwFor,
503
3122
        "goto" => TokenKind::KwGoto,
504
3121
        "if" => TokenKind::KwIf,
505
3120
        "inline" => TokenKind::KwInline,
506
3119
        "int" => TokenKind::KwInt,
507
2838
        "long" => TokenKind::KwLong,
508
2583
        "register" => TokenKind::KwRegister,
509
2582
        "restrict" => TokenKind::KwRestrict,
510
2581
        "return" => TokenKind::KwReturn,
511
2580
        "short" => TokenKind::KwShort,
512
2579
        "signed" => TokenKind::KwSigned,
513
2578
        "sizeof" => TokenKind::KwSizeof,
514
2577
        "static" => TokenKind::KwStatic,
515
2576
        "struct" => TokenKind::KwStruct,
516
2575
        "switch" => TokenKind::KwSwitch,
517
2574
        "typedef" => TokenKind::KwTypedef,
518
2573
        "union" => TokenKind::KwUnion,
519
2572
        "unsigned" => TokenKind::KwUnsigned,
520
2571
        "void" => TokenKind::KwVoid,
521
2570
        "volatile" => TokenKind::KwVolatile,
522
2569
        "while" => TokenKind::KwWhile,
523
2568
        "_Alignas" => TokenKind::KwAlignas,
524
2567
        "_Alignof" => TokenKind::KwAlignof,
525
2566
        "_Atomic" => TokenKind::KwAtomic,
526
2565
        "_Bool" => TokenKind::KwBool,
527
2564
        "_Complex" => TokenKind::KwComplex,
528
2563
        "_Generic" => TokenKind::KwGeneric,
529
2562
        "_Imaginary" => TokenKind::KwImaginary,
530
2561
        "_Noreturn" => TokenKind::KwNoreturn,
531
2560
        "_Static_assert" => TokenKind::KwStaticAssert,
532
2559
        "_Thread_local" => TokenKind::KwThreadLocal,
533
2558
        _ => return None,
534
    };
535

            
536
578
    Some(keyword_kind)
537
3136
}
538

            
539
45138
/// Returns `true` if `ch` is a line-terminating character (LF or CR).
const fn is_newline(ch: char) -> bool {
    matches!(ch, '\n' | '\r')
}
542

            
543
129183
const fn is_numeric_constant_char(ch: char) -> bool {
544
129183
    is_alpha(ch) || is_digit(ch) || ch == '.'
545
129183
}
546

            
547
35061
const fn is_start_of_identifier(ch: char) -> bool {
548
35061
    is_alpha(ch) || ch == '_'
549
35061
}
550

            
551
26580
const fn is_remaining_of_identifier(ch: char) -> bool {
552
26580
    is_start_of_identifier(ch) || is_digit(ch)
553
26580
}
554

            
555
38263
/// Returns `true` if `ch` is whitespace: space, tab, newline, carriage
/// return, vertical tab (U+000B) or form feed (U+000C).
const fn is_space(ch: char) -> bool {
    matches!(ch, '\x20' | '\t' | '\n' | '\r' | '\x0b' | '\x0c')
}
558

            
559
138229
/// Returns `true` if `ch` is an ASCII decimal digit (`0`-`9`).
const fn is_digit(ch: char) -> bool {
    ch.is_ascii_digit()
}
562

            
563
164244
/// Returns `true` if `ch` is an ASCII alphabetic character (`a`-`z`,
/// `A`-`Z`).
const fn is_alpha(ch: char) -> bool {
    ch.is_ascii_alphabetic()
}