1
#![cfg(test)]
2

            
3
use proptest::prelude::*;
4
use proptest::string::string_regex;
5

            
6
use crate::diagnostics::Diag;
7
use crate::scanner::{Bracket, CharEncoding, Scanner, Token, TokenKind};
8
use crate::source_map::SourceFile;
9
use crate::tests::*;
10

            
11
1
#[test]
fn scanning_an_empty_input_should_return_an_eof_token() {
    // `try_scan_all` stops at the EOF token without recording it, so an empty
    // input is expected to produce no entries at all.
    let scanned = try_scan_all("");

    assert_eq!(scanned, []);
}
15

            
16
1
#[test]
fn scan_punctuations() {
    // Every punctuator paired with the token kind it must scan to. The lexeme
    // of the first token has to be the whole input.
    let cases = [
        ("(", TokenKind::Open(Bracket::Round)),
        (")", TokenKind::Closed(Bracket::Round)),
        ("[", TokenKind::Open(Bracket::Square)),
        ("]", TokenKind::Closed(Bracket::Square)),
        ("{", TokenKind::Open(Bracket::Curly)),
        ("}", TokenKind::Closed(Bracket::Curly)),
        (".", TokenKind::Period),
        ("->", TokenKind::Arrow),
        ("++", TokenKind::PlusPlus),
        ("--", TokenKind::MinusMinus),
        ("&", TokenKind::Ampersand),
        ("*", TokenKind::Star),
        ("+", TokenKind::Plus),
        ("-", TokenKind::Minus),
        ("~", TokenKind::Tilde),
        ("!", TokenKind::Exclamation),
        ("/", TokenKind::Slash),
        ("%", TokenKind::Percent),
        ("<<", TokenKind::LessLess),
        (">>", TokenKind::GreaterGreater),
        ("<", TokenKind::Less),
        (">", TokenKind::Greater),
        ("<=", TokenKind::LessEqual),
        (">=", TokenKind::GreaterEqual),
        ("==", TokenKind::EqualEqual),
        ("!=", TokenKind::ExclaEqual),
        ("^", TokenKind::Caret),
        ("|", TokenKind::Pipe),
        ("&&", TokenKind::AmpAmp),
        ("||", TokenKind::PipePipe),
        ("?", TokenKind::Question),
        (":", TokenKind::Colon),
        (";", TokenKind::Semicolon),
        ("...", TokenKind::Ellipsis),
        ("=", TokenKind::Equal),
        ("*=", TokenKind::StarEqual),
        ("/=", TokenKind::SlashEqual),
        ("%=", TokenKind::PercentEqual),
        ("+=", TokenKind::PlusEqual),
        ("-=", TokenKind::MinusEqual),
        ("<<=", TokenKind::LessLessEqual),
        (">>=", TokenKind::GreaterGreaterEqual),
        ("&=", TokenKind::AmpEqual),
        ("^=", TokenKind::CaretEqual),
        ("|=", TokenKind::PipeEqual),
        (",", TokenKind::Comma),
        ("#", TokenKind::Hash),
        ("##", TokenKind::HashHash),
    ];

    for (punctuation, expected_kind) in cases {
        assert_eq!(scan_first(punctuation), (expected_kind, punctuation));
    }
}
67

            
68
1
#[test]
fn scan_keywords() {
    // Every keyword paired with its dedicated token kind. The lexeme of the
    // first token has to be the whole input.
    let cases = [
        ("auto", TokenKind::KwAuto),
        ("break", TokenKind::KwBreak),
        ("case", TokenKind::KwCase),
        ("char", TokenKind::KwChar),
        ("const", TokenKind::KwConst),
        ("continue", TokenKind::KwContinue),
        ("default", TokenKind::KwDefault),
        ("do", TokenKind::KwDo),
        ("double", TokenKind::KwDouble),
        ("else", TokenKind::KwElse),
        ("enum", TokenKind::KwEnum),
        ("extern", TokenKind::KwExtern),
        ("float", TokenKind::KwFloat),
        ("for", TokenKind::KwFor),
        ("goto", TokenKind::KwGoto),
        ("if", TokenKind::KwIf),
        ("inline", TokenKind::KwInline),
        ("int", TokenKind::KwInt),
        ("long", TokenKind::KwLong),
        ("register", TokenKind::KwRegister),
        ("restrict", TokenKind::KwRestrict),
        ("return", TokenKind::KwReturn),
        ("short", TokenKind::KwShort),
        ("signed", TokenKind::KwSigned),
        ("sizeof", TokenKind::KwSizeof),
        ("static", TokenKind::KwStatic),
        ("struct", TokenKind::KwStruct),
        ("switch", TokenKind::KwSwitch),
        ("typedef", TokenKind::KwTypedef),
        ("union", TokenKind::KwUnion),
        ("unsigned", TokenKind::KwUnsigned),
        ("void", TokenKind::KwVoid),
        ("volatile", TokenKind::KwVolatile),
        ("while", TokenKind::KwWhile),
        ("_Alignas", TokenKind::KwAlignas),
        ("_Alignof", TokenKind::KwAlignof),
        ("_Atomic", TokenKind::KwAtomic),
        ("_Bool", TokenKind::KwBool),
        ("_Complex", TokenKind::KwComplex),
        ("_Generic", TokenKind::KwGeneric),
        ("_Imaginary", TokenKind::KwImaginary),
        ("_Noreturn", TokenKind::KwNoreturn),
        ("_Static_assert", TokenKind::KwStaticAssert),
        ("_Thread_local", TokenKind::KwThreadLocal),
    ];

    for (keyword, expected_kind) in cases {
        assert_eq!(scan_first(keyword), (expected_kind, keyword));
    }
}
127

            
128
1
#[test]
fn scan_two_adjacent_period_chars_as_two_separate_period_punctuations() {
    // Two periods must come out as two period tokens, one per character.
    let tokens = scan_all("..");

    assert_eq!(tokens, [(TokenKind::Period, "."), (TokenKind::Period, ".")]);
}
135

            
136
1
#[test]
fn scan_single_nondigit_char_as_identifier() {
    // The underscore followed by every ASCII letter, lower case then upper case.
    let nondigit_chars = std::iter::once('_').chain('a'..='z').chain('A'..='Z');

    for nondigit_char in nondigit_chars {
        let input_text = nondigit_char.to_string();

        assert_eq!(
            scan_first(&input_text),
            (TokenKind::Identifier, input_text.as_str())
        );
    }
}
149

            
150
260
proptest! {
    // Any string produced by the `identifier()` strategy must scan as a single
    // identifier token whose lexeme is the whole input.
    #[test]
    fn scan_valid_identifier(input_text in identifier()) {
        // `prop_assert_eq!` hands the failure back to proptest instead of
        // panicking on every case tried while shrinking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::Identifier, &*input_text)
        );
    }
}
159

            
160
// TODO(feroldi): @charset Refactor this characters set into a module.
161
1
fn non_identifier_chars() -> impl Strategy<Value = String> {
162
1
    string_regex("[^_0-9a-zA-Z]+").unwrap()
163
1
}
164

            
165
260
proptest! {
    // An identifier immediately followed by non-identifier characters must be
    // scanned as just the identifier.
    #[test]
    fn scan_identifier_until_it_reaches_a_non_identifier_char(
        ident in identifier(),
        non_ident in non_identifier_chars()
    ) {
        let input_text = format!("{}{}", ident, non_ident);
        // Discard inputs for which `is_start_of_prefixed_char_const_or_str_lit`
        // holds; those are not expected to scan as an identifier.
        prop_assume!(!is_start_of_prefixed_char_const_or_str_lit(&input_text));

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::Identifier, &*ident)
        );
    }
}
180

            
181
260
proptest! {
    // Leading whitespace must not be part of, nor prevent, the first token.
    #[test]
    fn whitespace_at_the_start_of_the_input_should_be_ignored_when_scanned(
        ws in whitespace(),
        ident in identifier(),
    ) {
        let input_text = format!("{}{}", ws, ident);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(scan_first(&input_text), (TokenKind::Identifier, &*ident));
    }
}
192

            
193
260
proptest! {
    // A non-empty run of decimal digits is one numeric constant spanning the
    // whole input.
    #[test]
    fn scan_decimal_digits_as_numeric_constant(decimal_digits in "[0-9]+") {
        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&decimal_digits),
            (TokenKind::NumericConstant, &*decimal_digits)
        );
    }
}
202

            
203
1
#[test]
fn scan_single_decimal_digit_as_numeric_constant() {
    // Each decimal digit on its own is a numeric constant.
    for digit in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] {
        assert_eq!(scan_first(digit), (TokenKind::NumericConstant, digit));
    }
}
216

            
217
6
fn stop_char_for_num_const() -> impl Strategy<Value = String> {
218
6
    string_regex("[^0-9a-zA-Z.]").unwrap()
219
6
}
220

            
221
260
proptest! {
    // Digits, a period, then digits form one numeric constant that ends right
    // before the stop character.
    #[test]
    fn numeric_constant_can_have_a_period_punctuation_in_the_middle(
        num_const in "[0-9]+[.][0-9]+",
        stop_char in stop_char_for_num_const(),
    ) {
        let input_text = format!("{}{}", num_const, stop_char);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::NumericConstant, &*num_const)
        );
    }
}
234

            
235
260
proptest! {
    // Digits followed by a trailing period form one numeric constant that ends
    // right before the stop character.
    #[test]
    fn numeric_constant_can_end_with_a_period_punctuation(
        num_const in "[0-9]+[.]",
        stop_char in stop_char_for_num_const(),
    ) {
        let input_text = format!("{}{}", num_const, stop_char);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::NumericConstant, &*num_const)
        );
    }
}
248

            
249
260
proptest! {
    // A leading period followed by digits forms one numeric constant that ends
    // right before the stop character.
    #[test]
    fn numeric_constant_can_start_with_a_period_punctuation(
        num_const in "[.][0-9]+",
        stop_char in stop_char_for_num_const(),
    ) {
        let input_text = format!("{}{}", num_const, stop_char);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::NumericConstant, &*num_const)
        );
    }
}
262

            
263
260
proptest! {
    // Two or more periods in front of a digit: the first token must not be a
    // numeric constant.
    #[test]
    fn numeric_constant_cannot_start_with_more_than_one_period_punctuation(
        input_text in r"\.\.+[0-9]",
    ) {
        // `prop_assert_ne!` reports the failure to proptest instead of panicking.
        prop_assert_ne!(scan_first(&input_text).0, TokenKind::NumericConstant);
    }
}
271

            
272
260
proptest! {
    // Digits followed by an `e`/`E`/`p`/`P` exponent with an optional sign form
    // one numeric constant that ends right before the stop character.
    #[test]
    fn numeric_constant_can_have_decimal_and_binary_exponent(
        num_const in "[0-9]+[eEpP][+-]?[0-9]+",
        stop_char in stop_char_for_num_const(),
    ) {
        let input_text = format!("{}{}", num_const, stop_char);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::NumericConstant, &*num_const)
        );
    }
}
285

            
286
260
proptest! {
    // When the character before a sign is not one of `e`, `E`, `p`, `P`, the
    // numeric constant must end before the sign.
    #[test]
    fn numeric_constants_should_not_contain_plus_or_minus_if_it_is_not_an_exponent(
        num_const_without_exponent in "[0-9]+[0-9a-dA-Df-oF-Oq-zQ-Z]+",
        incorrect_exponent in "[+-][0-9]+",
        stop_char in stop_char_for_num_const(),
    ) {
        let input_text = format!(
            "{}{}{}",
            num_const_without_exponent, incorrect_exponent, stop_char
        );

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::NumericConstant, &*num_const_without_exponent)
        );
    }
}
303

            
304
260
proptest! {
    // One or more consecutive exponent parts are all absorbed into a single
    // numeric constant that ends right before the stop character.
    #[test]
    fn numeric_constant_can_have_various_decimal_or_binary_exponents(
        num_const in "[0-9]+([eEpP][+-]?[0-9]+)+",
        stop_char in stop_char_for_num_const(),
    ) {
        let input_text = format!("{}{}", num_const, stop_char);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (TokenKind::NumericConstant, &*num_const)
        );
    }
}
317

            
318
260
proptest! {
    // A character sequence wrapped in single quotes is one byte-encoded
    // character constant; whatever follows the closing quote is not part of it.
    #[test]
    fn character_constant_is_wrapped_in_single_quotes(
        c_char_seq in char_const_char_sequence(),
        stop_char in source_char()
    ) {
        let char_const = format!("'{}'", c_char_seq);
        let input_text = format!("{}{}", char_const, stop_char);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (
                TokenKind::CharacterConstant {
                    encoding: CharEncoding::Byte
                },
                &*char_const
            )
        );
    }
}
338

            
339
1
#[test]
fn character_constant_cannot_be_empty() {
    // Two adjacent single quotes enclose nothing, which must be diagnosed.
    let outcome = try_scan_first("''");

    assert_eq!(outcome, Err(Diag::EmptyCharacterConstant));
}
343

            
344
260
proptest! {
    // An opened character constant whose content is followed by a line feed,
    // carriage return or NUL instead of a closing quote is unterminated.
    #[test]
    fn character_constant_cannot_end_in_newline_or_nul(
        c_char_seq in char_const_char_sequence()
    ) {
        for newline_or_nul in ['\n', '\r', '\0'] {
            let input_text = format!("'{}{}", c_char_seq, newline_or_nul);

            // `prop_assert_eq!` reports the failure to proptest instead of
            // panicking.
            prop_assert_eq!(
                try_scan_first(&input_text),
                Err(Diag::UnterminatedCharacterConstant)
            );
        }
    }
}
359

            
360
1
#[test]
fn character_constant_cannot_abruptly_end_in_newline_or_nul() {
    // An opening quote directly followed by a line feed, carriage return or NUL.
    for input_text in ["'\n", "'\r", "'\0"] {
        assert_eq!(
            try_scan_first(input_text),
            Err(Diag::UnterminatedCharacterConstant)
        );
    }
}
371

            
372
1
#[test]
fn escape_single_quote_in_character_constant() {
    // The backslash-escaped quote does not close the constant; the last quote does.
    let input = r"'\''";
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
384

            
385
1
#[test]
fn do_not_escape_single_quote_in_character_constant_if_it_follows_two_adjacent_backslashes() {
    // The two backslashes pair up, so the quote after them closes the constant.
    let input = r"'\\'";
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
397

            
398
1
#[test]
fn character_constant_missing_terminating_quote_because_it_was_escaped() {
    // The only candidate closing quote is escaped, so the constant never ends.
    let outcome = try_scan_first(r"'\'");

    assert_eq!(outcome, Err(Diag::UnterminatedCharacterConstant));
}
405

            
406
1
#[test]
fn backslashes_escape_anything_in_character_constant() {
    // TODO(feroldi): Make this test be property-based.
    // A mix of escaped letters and escaped backslashes stays inside one constant.
    let input = r"'\a\\\\b\c'";
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
419

            
420
1
#[test]
fn character_constant_may_not_start_with_utf8_prefix() {
    // `u8` is not a character-constant prefix: it scans as an identifier that
    // is followed by a plain byte-encoded character constant.
    let tokens = scan_all("u8'x'");
    let char_const_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(
        tokens,
        [(TokenKind::Identifier, "u8"), (char_const_kind, "'x'")]
    );
}
435

            
436
1
#[test]
fn character_constant_may_start_with_wide_prefix() {
    // The `L` prefix is part of the lexeme and selects the wide encoding.
    let input = "L'x'";
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Wide,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
448

            
449
1
#[test]
fn character_constant_may_start_with_utf16_prefix() {
    // The `u` prefix is part of the lexeme and selects the UTF-16 encoding.
    let input = "u'x'";
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Utf16,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
461

            
462
1
#[test]
fn character_constant_may_start_with_utf32_prefix() {
    // The `U` prefix is part of the lexeme and selects the UTF-32 encoding.
    let input = "U'x'";
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Utf32,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
474

            
475
260
proptest! {
    // Any single identifier character other than `L`, `u` and `U` placed right
    // before a character constant is a token of its own; the constant that
    // follows stays byte-encoded.
    #[test]
    fn do_not_scan_alphanum_char_adjacent_to_single_quote_as_a_char_const_prefix(
        invalid_prefix in "[_0-9a-zA-Z&&[^LuU]]"
    ) {
        let input = format!("{}'x'", invalid_prefix);
        let tokens = scan_all(&input);

        // `prop_assert_eq!` reports the failure to proptest instead of
        // panicking, and returns before `tokens[1]` could go out of bounds.
        prop_assert_eq!(tokens.len(), 2);

        prop_assert_eq!(
            tokens[1],
            (
                TokenKind::CharacterConstant { encoding: CharEncoding::Byte },
                "'x'"
            )
        );
    }
}
494

            
495
260
proptest! {
    // A punctuation placed right before a character constant is a token of its
    // own; the constant that follows stays byte-encoded.
    #[test]
    fn do_not_scan_punctuation_adjacent_to_single_quote_as_a_char_const_prefix(
        punctuation in source_punctuation(),
    ) {
        let input = format!("{}'x'", punctuation);
        let tokens = scan_all(&input);

        // `prop_assert_eq!` reports the failure to proptest instead of
        // panicking, and returns before `tokens[1]` could go out of bounds.
        prop_assert_eq!(tokens.len(), 2);

        prop_assert_eq!(
            tokens[1],
            (
                TokenKind::CharacterConstant { encoding: CharEncoding::Byte },
                "'x'"
            )
        );
    }
}
514

            
515
2
fn char_const_char_sequence() -> impl Strategy<Value = String> {
516
2
    source_chars_except(&['\'', '\\', '\n', '\r'])
517
2
}
518

            
519
260
proptest! {
    // A character sequence wrapped in double quotes is one byte-encoded string
    // literal; whatever follows the closing quote is not part of it.
    #[test]
    fn string_literal_is_wrapped_in_double_quotes(
        c_char_seq in str_lit_char_sequence(),
        stop_char in source_char()
    ) {
        let str_lit = format!("{quote}{seq}{quote}", quote='"', seq=c_char_seq);
        let input_text = format!("{}{}", str_lit, stop_char);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            scan_first(&input_text),
            (
                TokenKind::StringLiteral {
                    encoding: CharEncoding::Byte
                },
                &*str_lit
            )
        );
    }
}
539

            
540
1
#[test]
fn string_literal_can_be_empty() {
    // Unlike character constants, two adjacent double quotes are a valid literal.
    let input = r#""""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(try_scan_first(input), Ok((expected_kind, input)));
}
552

            
553
260
proptest! {
    // An opened string literal whose content is followed by a line feed,
    // carriage return or NUL instead of a closing quote is unterminated.
    #[test]
    fn string_literal_cannot_end_in_newline_or_nul(
        char_seq in str_lit_char_sequence()
    ) {
        for newline_or_nul in ['\n', '\r', '\0'] {
            let input_text = format!(
                "{quote}{seq}{end}",
                quote='"',
                seq=char_seq,
                end=newline_or_nul
            );

            // `prop_assert_eq!` reports the failure to proptest instead of
            // panicking.
            prop_assert_eq!(
                try_scan_first(&input_text),
                Err(Diag::UnterminatedStringLiteral)
            );
        }
    }
}
573

            
574
1
#[test]
fn string_literal_cannot_abruptly_end_in_newline_or_nul() {
    // An opening quote directly followed by a line feed, carriage return or NUL.
    for input_text in ["\"\n", "\"\r", "\"\0"] {
        assert_eq!(
            try_scan_first(input_text),
            Err(Diag::UnterminatedStringLiteral)
        );
    }
}
585

            
586
1
#[test]
fn escape_double_quote_in_string_literal() {
    // The backslash-escaped quote does not close the literal; the last quote does.
    let input = r#""\"""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
598

            
599
1
#[test]
fn escape_newline_in_string_literal() {
    // A backslash right before any of these newline sequences keeps the
    // literal open until the closing quote.
    for newline in ["\n", "\r", "\n\r", "\r\n"] {
        // Opening quote, backslash, the newline sequence, closing quote.
        let input = ["\"\\", newline, "\""].concat();
        let expected_kind = TokenKind::StringLiteral {
            encoding: CharEncoding::Byte,
        };

        assert_eq!(try_scan_first(&input), Ok((expected_kind, input.as_str())));
    }
}
619

            
620
1
#[test]
fn do_not_escape_double_quote_in_string_literal_if_it_follows_two_adjacent_backslashes() {
    // The two backslashes pair up, so the quote after them closes the literal.
    let input = r#""\\""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(try_scan_first(input), Ok((expected_kind, input)));
}
632

            
633
1
#[test]
fn string_literal_missing_terminating_quote_because_it_was_escaped() {
    // The only candidate closing quote is escaped, so the literal never ends.
    let outcome = try_scan_first(r#""\""#);

    assert_eq!(outcome, Err(Diag::UnterminatedStringLiteral));
}
640

            
641
1
#[test]
fn backslashes_escape_anything_in_string_literal() {
    // TODO(feroldi): Make this test be property-based.
    // A mix of escaped letters and escaped backslashes stays inside one literal.
    let input = r#""\a\\\\b\c""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(try_scan_first(input), Ok((expected_kind, input)));
}
654

            
655
1
#[test]
fn string_literal_may_start_with_utf8_prefix() {
    // The `u8` prefix is part of the lexeme and selects the UTF-8 encoding.
    let input = r#"u8"hello world""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Utf8,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
667

            
668
1
#[test]
fn string_literal_may_start_with_wide_prefix() {
    // The `L` prefix is part of the lexeme and selects the wide encoding.
    let input = r#"L"hello world""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Wide,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
680

            
681
1
#[test]
fn string_literal_may_start_with_utf16_prefix() {
    // The `u` prefix is part of the lexeme and selects the UTF-16 encoding.
    let input = r#"u"hello world""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Utf16,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
693

            
694
1
#[test]
fn string_literal_may_start_with_utf32_prefix() {
    // The `U` prefix is part of the lexeme and selects the UTF-32 encoding.
    let input = r#"U"hello world""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Utf32,
    };

    assert_eq!(scan_first(input), (expected_kind, input));
}
706

            
707
2
fn str_lit_char_sequence() -> impl Strategy<Value = String> {
708
2
    source_chars_except(&['"', '\\', '\n', '\r'])
709
2
}
710

            
711
260
proptest! {
    // Input produced by the `non_source_char()` strategy must be diagnosed as
    // an unrecognized character, and the diagnostic must carry the input's
    // first character.
    #[test]
    fn scanner_should_diagnose_characters_not_in_source_charset(
        non_source_char in non_source_char()
    ) {
        let unrec_char = non_source_char.chars().next().unwrap();

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            try_scan_first(&non_source_char),
            Err(Diag::UnrecognizedChar(unrec_char))
        );
    }
}
724

            
725
1
#[test]
fn scanner_should_skip_block_comments() {
    // An input consisting solely of a block comment yields no tokens.
    let tokens = scan_all("/*this block comment should be skipped*/");

    assert_eq!(tokens, []);
}
729

            
730
1
#[test]
fn block_comments_allow_newlines_inside() {
    // A block comment spanning several lines, surrounded by whitespace only,
    // yields neither tokens nor diagnostics.
    let input = r"
            /*this block comment
               contains many lines
              
               that are separated by line-feed characters
               how nice
            */
            ";

    assert_eq!(try_scan_all(input), []);
}
746

            
747
1
#[test]
fn block_comments_do_not_nest() {
    // The first `*/` closes the comment, so everything after it is scanned as
    // ordinary tokens, including the trailing `*` and `/`.
    let scanned = try_scan_all("/* this is skipped /* this too */ not this */");

    assert_eq!(
        scanned,
        [
            Ok((TokenKind::Identifier, "not")),
            Ok((TokenKind::Identifier, "this")),
            Ok((TokenKind::Star, "*")),
            Ok((TokenKind::Slash, "/"))
        ]
    );
}
759

            
760
1
#[test]
fn block_comments_do_not_form_inside_string_literals() {
    // `/* ... */` inside double quotes is literal content, not a comment.
    let input = r#""foo /* bar */ baz""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(try_scan_first(input), Ok((expected_kind, input)));
}
772

            
773
1
#[test]
fn block_comments_do_not_form_inside_character_constants() {
    // `/* ... */` inside single quotes is constant content, not a comment.
    let input = "'foo /* bar */ baz'";
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(try_scan_first(input), Ok((expected_kind, input)));
}
785

            
786
1
#[test]
fn block_comment_between_two_identifiers_should_scan_them_separately() {
    // The comment separates the identifiers even without any whitespace.
    let tokens = scan_all("foo/*this is a comment*/bar");

    assert_eq!(
        tokens,
        [
            (TokenKind::Identifier, "foo"),
            (TokenKind::Identifier, "bar"),
        ]
    );
}
796

            
797
260
proptest! {
    // Everything from `//` up to the newline is skipped; scanning resumes with
    // the identifier after the newline.
    #[test]
    fn scanner_should_skip_line_comments(
        comment_text in printable_chars(),
        newline in newline()
    ) {
        let input_text = format!("foo//{}{}bar", comment_text, newline);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            try_scan_all(&input_text),
            [
                Ok((TokenKind::Identifier, "foo")),
                Ok((TokenKind::Identifier, "bar")),
            ]
        );
    }
}
814

            
815
1
#[test]
fn line_comments_do_not_form_inside_string_literals() {
    // `//` inside double quotes is literal content, not a comment.
    let input = r#""foo // bar ""#;
    let expected_kind = TokenKind::StringLiteral {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(try_scan_first(input), Ok((expected_kind, input)));
}
827

            
828
1
#[test]
fn line_comments_do_not_form_inside_character_constants() {
    // `//` inside single quotes is constant content, not a comment.
    let input = r#"'foo // bar '"#;
    let expected_kind = TokenKind::CharacterConstant {
        encoding: CharEncoding::Byte,
    };

    assert_eq!(try_scan_first(input), Ok((expected_kind, input)));
}
840

            
841
260
proptest! {
    #[test]
    fn line_comment_can_end_in_eof(comment_text in printable_chars()) {
        // If we get an EOF while scanning a line comment, the behavior is undefined as
        // per [C17 6.4.9/2], because it doesn't specify what happens if there is no new
        // line character to be found. That's quite counterproductive, so here we just
        // stop scanning as if we would have found a new line.
        let input_text = format!("foo//{}", comment_text);

        // `prop_assert_eq!` reports the failure to proptest instead of panicking.
        prop_assert_eq!(
            try_scan_all(&input_text),
            [
                Ok((TokenKind::Identifier, "foo")),
            ]
        );
    }
}
858

            
859
1
#[test]
fn diagnose_missing_block_comment_terminator() {
    // A block comment that reaches the end of input without `*/` produces
    // exactly one diagnostic and no tokens.
    let scanned = try_scan_all("/*this block comment doesn't end");

    assert_eq!(scanned, [Err(Diag::UnterminatedBlockComment)]);
}
866

            
867
1
#[test]
fn diagnose_missing_block_comment_terminator_for_corner_case_of_empty_block_comment() {
    // The input ends right after the comment opener.
    let scanned = try_scan_all("/*");

    assert_eq!(scanned, [Err(Diag::UnterminatedBlockComment)]);
}
871

            
872
6434
fn try_scan_all(input_text: &str) -> Vec<Result<(TokenKind, &str), Diag>> {
873
6434
    let mut scanner = Scanner::with_input(input_text);
874
6434
    let source_file = SourceFile::new(input_text);
875
6434

            
876
6434
    let mut tokens = Vec::new();
877

            
878
    'outer: loop {
879
20024
        match scanner.scan_next_token() {
880
16044
            Ok(token) if token != Token::eof() => {
881
9610
                let lexeme = source_file.get_text_snippet(token);
882
9610
                tokens.push(Ok((token.kind, lexeme)));
883
9610
            }
884
6434
            Ok(_) => break 'outer,
885
3980
            Err(diag) => tokens.push(Err(diag)),
886
        }
887
    }
888

            
889
6434
    tokens
890
6434
}
891

            
892
1840
// Scans `input_text` and returns the first entry produced by `try_scan_all`,
// be it a token or a diagnostic.
//
// Panics if scanning reached EOF without producing anything. The panic names
// the offending input and, thanks to `#[track_caller]`, is attributed to the
// calling test rather than to this helper.
#[track_caller]
fn try_scan_first(input_text: &str) -> Result<(TokenKind, &str), Diag> {
    // Take the first element by value instead of indexing, so nothing has to
    // be copied out of the vector.
    match try_scan_all(input_text).into_iter().next() {
        Some(first) => first,
        None => panic!(
            "expected at least one token or diagnostic, but scanning {:?} produced none",
            input_text
        ),
    }
}
898

            
899
4069
fn scan_all(input_text: &str) -> Vec<(TokenKind, &str)> {
900
4069
    try_scan_all(input_text).into_iter().flatten().collect()
901
4069
}
902

            
903
3545
// Scans `input_text` and returns the first successfully scanned token as a
// `(kind, lexeme)` pair. Diagnostics are ignored, as in `scan_all`.
//
// Panics if no token was scanned at all. The panic names the offending input
// and, thanks to `#[track_caller]`, is attributed to the calling test rather
// than to this helper.
#[track_caller]
fn scan_first(input_text: &str) -> (TokenKind, &str) {
    // Take the first element by value instead of indexing, so nothing has to
    // be copied out of the vector.
    match scan_all(input_text).into_iter().next() {
        Some(first) => first,
        None => panic!(
            "expected at least one token, but scanning {:?} produced none",
            input_text
        ),
    }
}