1
use crate::char_stream::CharStream;
2
use crate::diagnostics::Diag;
3
use crate::source_map::{Span, Spanned};
4

            
5
pub(crate) struct Scanner<'input> {
6
    chars: CharStream<'input>,
7
}
8

            
9
impl<'input> Scanner<'input> {
10
6958
    pub(crate) fn with_input(source_text: &'input str) -> Scanner<'input> {
11
6958
        Scanner {
12
6958
            chars: CharStream::with_text(source_text),
13
6958
        }
14
6958
    }
15

            
16
22406
    pub(crate) fn scan_next_token(&mut self) -> Result<Spanned<Token>, Diag> {
17
38263
        while is_space(self.peek()) {
18
15857
            self.consume();
19
15857
        }
20

            
21
22406
        let span_start = self.chars.peek_byte_pos();
22

            
23
22406
        let token_kind = match self.consume() {
24
6435
            CharStream::EOF_CHAR => return Ok(Token::eof()),
25
29
            '(' => TokenKind::Open(Bracket::Round),
26
34
            ')' => TokenKind::Closed(Bracket::Round),
27
120
            '[' => TokenKind::Open(Bracket::Square),
28
112
            ']' => TokenKind::Closed(Bracket::Square),
29
125
            '{' => TokenKind::Open(Bracket::Curly),
30
18
            '}' => TokenKind::Closed(Bracket::Curly),
31
            '.' => {
32
2062
                if is_digit(self.peek()) {
33
445
                    let first_digit = self.consume();
34
445
                    self.scan_numeric_constant(first_digit)
35
1617
                } else if self.peek() == '.' && self.lookahead(1) == '.' {
36
1419
                    self.consume();
37
1419
                    self.consume();
38
1419
                    TokenKind::Ellipsis
39
                } else {
40
198
                    TokenKind::Period
41
                }
42
            }
43
            '-' => {
44
154
                if self.try_consume('>') {
45
1
                    TokenKind::Arrow
46
153
                } else if self.try_consume('-') {
47
1
                    TokenKind::MinusMinus
48
152
                } else if self.try_consume('=') {
49
2
                    TokenKind::MinusEqual
50
                } else {
51
150
                    TokenKind::Minus
52
                }
53
            }
54
            '+' => {
55
163
                if self.try_consume('+') {
56
1
                    TokenKind::PlusPlus
57
162
                } else if self.try_consume('=') {
58
1
                    TokenKind::PlusEqual
59
                } else {
60
161
                    TokenKind::Plus
61
                }
62
            }
63
            '&' => {
64
131
                if self.try_consume('&') {
65
1
                    TokenKind::AmpAmp
66
130
                } else if self.try_consume('=') {
67
2
                    TokenKind::AmpEqual
68
                } else {
69
128
                    TokenKind::Ampersand
70
                }
71
            }
72
            '*' => {
73
114
                if self.try_consume('=') {
74
3
                    TokenKind::StarEqual
75
                } else {
76
111
                    TokenKind::Star
77
                }
78
            }
79
15
            '~' => TokenKind::Tilde,
80
            '!' => {
81
55
                if self.try_consume('=') {
82
1
                    TokenKind::ExclaEqual
83
                } else {
84
54
                    TokenKind::Exclamation
85
                }
86
            }
87
            '/' => {
88
766
                if self.try_consume('=') {
89
2
                    TokenKind::SlashEqual
90
764
                } else if self.try_consume('*') {
91
6
                    self.skip_block_comment()?;
92

            
93
4
                    return self.scan_next_token();
94
758
                } else if self.try_consume('/') {
95
520
                    self.skip_line_comment()?;
96

            
97
520
                    return self.scan_next_token();
98
                } else {
99
238
                    TokenKind::Slash
100
                }
101
            }
102
            '%' => {
103
130
                if self.try_consume('=') {
104
2
                    TokenKind::PercentEqual
105
                } else {
106
128
                    TokenKind::Percent
107
                }
108
            }
109
            '<' => {
110
153
                if self.try_consume('<') {
111
5
                    if self.try_consume('=') {
112
2
                        TokenKind::LessLessEqual
113
                    } else {
114
3
                        TokenKind::LessLess
115
                    }
116
148
                } else if self.try_consume('=') {
117
2
                    TokenKind::LessEqual
118
                } else {
119
146
                    TokenKind::Less
120
                }
121
            }
122
            '>' => {
123
69
                if self.try_consume('>') {
124
2
                    if self.try_consume('=') {
125
1
                        TokenKind::GreaterGreaterEqual
126
                    } else {
127
1
                        TokenKind::GreaterGreater
128
                    }
129
67
                } else if self.try_consume('=') {
130
1
                    TokenKind::GreaterEqual
131
                } else {
132
66
                    TokenKind::Greater
133
                }
134
            }
135
            '=' => {
136
384
                if self.try_consume('=') {
137
3
                    TokenKind::EqualEqual
138
                } else {
139
381
                    TokenKind::Equal
140
                }
141
            }
142
            '^' => {
143
103
                if self.try_consume('=') {
144
4
                    TokenKind::CaretEqual
145
                } else {
146
99
                    TokenKind::Caret
147
                }
148
            }
149
            '|' => {
150
29
                if self.try_consume('|') {
151
1
                    TokenKind::PipePipe
152
28
                } else if self.try_consume('=') {
153
2
                    TokenKind::PipeEqual
154
                } else {
155
26
                    TokenKind::Pipe
156
                }
157
            }
158
180
            '?' => TokenKind::Question,
159
168
            ':' => TokenKind::Colon,
160
345
            ';' => TokenKind::Semicolon,
161
29
            ',' => TokenKind::Comma,
162
            '#' => {
163
62
                if self.try_consume('#') {
164
1
                    TokenKind::HashHash
165
                } else {
166
61
                    TokenKind::Hash
167
                }
168
            }
169
1676
            '\'' => self.scan_character_constant(CharEncoding::Byte)?,
170
71
            prefix @ ('L' | 'u' | 'U') if self.peek() == '\'' => {
171
3
                self.consume();
172

            
173
3
                let char_encoding = match prefix {
174
1
                    'L' => CharEncoding::Wide,
175
1
                    'u' => CharEncoding::Utf16,
176
1
                    'U' => CharEncoding::Utf32,
177
                    _ => unreachable!("invalid character constant prefix '{}'!", prefix),
178
                };
179

            
180
3
                self.scan_character_constant(char_encoding)?
181
            }
182
1162
            '"' => self.scan_string_literal(CharEncoding::Byte)?,
183
26
            'u' if self.peek() == '8' && self.lookahead(1) == '"' => {
184
1
                self.consume();
185
1
                self.consume();
186
1

            
187
1
                self.scan_string_literal(CharEncoding::Utf8)?
188
            }
189
67
            prefix @ ('L' | 'u' | 'U') if self.peek() == '"' => {
190
3
                self.consume();
191

            
192
3
                let char_encoding = match prefix {
193
1
                    'L' => CharEncoding::Wide,
194
1
                    'u' => CharEncoding::Utf16,
195
1
                    'U' => CharEncoding::Utf32,
196
                    _ => unreachable!("invalid string literal prefix '{}'!", prefix),
197
                };
198

            
199
3
                self.scan_string_literal(char_encoding)?
200
            }
201
7576
            ch if is_digit(ch) => self.scan_numeric_constant(ch),
202
5345
            ch if is_start_of_identifier(ch) => self.scan_identifier_or_keyword(ch),
203
2209
            unrecognized_char => return Err(Diag::UnrecognizedChar(unrecognized_char)),
204
        };
205

            
206
11467
        let span_end = self.chars.peek_byte_pos();
207
11467

            
208
11467
        let token_span = Span {
209
11467
            start: span_start,
210
11467
            end: span_end,
211
11467
        };
212
11467

            
213
11467
        Ok(Spanned::new(Token { kind: token_kind }, token_span))
214
22406
    }
215

            
216
6
    fn skip_block_comment(&mut self) -> Result<(), Diag> {
217
281
        loop {
218
281
            if self.peek() == CharStream::EOF_CHAR {
219
2
                break Err(Diag::UnterminatedBlockComment);
220
279
            }
221
279

            
222
279
            if self.consume() == '*' && self.peek() == '/' {
223
4
                self.consume();
224
4
                break Ok(());
225
275
            }
226
        }
227
6
    }
228

            
229
520
    fn skip_line_comment(&mut self) -> Result<(), Diag> {
230
        // If we get an EOF while scanning a line comment, the behavior is undefined as
231
        // per [C17 6.4.9/2], because it doesn't specify what happens if there is no new
232
        // line character to be found. That's quite counterproductive, so here we just
233
        // stop scanning as if we would have found a new line.
234
9415
        while self.peek() != CharStream::EOF_CHAR && !is_newline(self.peek()) {
235
8895
            self.consume();
236
8895
        }
237

            
238
520
        Ok(())
239
520
    }
240

            
241
1679
    fn scan_character_constant(&mut self, encoding: CharEncoding) -> Result<TokenKind, Diag> {
242
        debug_assert_ne!(
243
            encoding,
244
            CharEncoding::Utf8,
245
            "C17 standard doesn't define utf-8 character constants"
246
        );
247

            
248
1679
        if self.peek() == '\'' {
249
2
            self.consume();
250
2
            return Err(Diag::EmptyCharacterConstant);
251
1677
        }
252

            
253
19475
        while self.peek() != '\'' {
254
18681
            if is_newline(self.peek()) || self.peek() == CharStream::EOF_CHAR {
255
883
                return Err(Diag::UnterminatedCharacterConstant);
256
17798
            }
257
17798

            
258
17798
            // Skips backslashes. This effectively escapes single-quotes. Validation of
259
17798
            // escape sequences occurs later on during parsing, which means
260
17798
            // character-constant tokens may be semantically invalid before parsing.
261
17798
            // NOTE: Review this during implementation of escaping newlines in code.
262
17798
            // @escape-newline.
263
17798
            if self.peek() == '\\' {
264
19
                self.consume();
265
17779
            }
266

            
267
17798
            self.consume();
268
        }
269

            
270
794
        let terminating_quote = self.consume();
271
794
        debug_assert_eq!(terminating_quote, '\'');
272

            
273
794
        Ok(TokenKind::CharacterConstant { encoding })
274
1679
    }
275

            
276
1166
    fn scan_string_literal(&mut self, encoding: CharEncoding) -> Result<TokenKind, Diag> {
277
17584
        while self.peek() != '"' {
278
17302
            if is_newline(self.peek()) || self.peek() == CharStream::EOF_CHAR {
279
884
                return Err(Diag::UnterminatedStringLiteral);
280
16418
            }
281
16418

            
282
16418
            // Skips backslashes. This effectively escapes double-quotes. Validation of
283
16418
            // escape sequences occurs later on during parsing, which means
284
16418
            // string-literal tokens may be semantically invalid before parsing.
285
16418
            // NOTE: Review this during implementation of escaping newlines in code.
286
16418
            // @escape-newline.
287
16418
            if self.peek() == '\\' {
288
30
                self.consume();
289
16388
            }
290

            
291
16418
            self.consume();
292
        }
293

            
294
282
        let terminating_quote = self.consume();
295
282
        debug_assert_eq!(terminating_quote, '"');
296

            
297
282
        Ok(TokenKind::StringLiteral { encoding })
298
1166
    }
299

            
300
2676
    fn scan_numeric_constant(&mut self, first_digit: char) -> TokenKind {
301
2676
        debug_assert!(is_numeric_constant_char(first_digit));
302

            
303
2676
        let mut prev_peek = first_digit;
304

            
305
        // Intentionally scans a more general case of a numeric constant, which may turn
306
        // out to be ill-formed as per the standard. This means a numeric-constant token
307
        // is not guaranteed to be valid before parsing.
308
        //
309
        // A more strict and conforming scanning is done during parsing, which is when
310
        // an ill-formed numeric constant is diagnosed. This takes some burden
311
        // away from the scanner, and also makes the parser's job somewhat
312
        // simpler, as the character set to be considered during the parsing of
313
        // a numeric constant token is greatly reduced.
314
        loop {
315
126507
            while is_numeric_constant_char(self.peek()) {
316
121584
                prev_peek = self.consume();
317
121584
            }
318

            
319
4923
            if matches!(self.peek(), '+' | '-') && matches!(prev_peek, 'e' | 'E' | 'p' | 'P') {
320
2247
                self.consume();
321
2247
                continue;
322
2676
            }
323
2676

            
324
2676
            break;
325
2676
        }
326
2676

            
327
2676
        TokenKind::NumericConstant
328
2676
    }
329

            
330
3136
    fn scan_identifier_or_keyword(&mut self, ident_start: char) -> TokenKind {
331
3136
        debug_assert!(is_start_of_identifier(ident_start));
332

            
333
3136
        let mut lexeme_buffer = String::with_capacity(16);
334
3136
        lexeme_buffer.push(ident_start);
335

            
336
26580
        while is_remaining_of_identifier(self.peek()) {
337
23444
            lexeme_buffer.push(self.consume());
338
23444
        }
339

            
340
3136
        get_keyword_kind_for_lexeme(&lexeme_buffer).unwrap_or(TokenKind::Identifier)
341
3136
    }
342

            
343
362822
    fn peek(&mut self) -> char {
344
362822
        self.chars.peek()
345
362822
    }
346

            
347
1522
    fn lookahead(&mut self, n: usize) -> char {
348
1522
        self.chars.lookahead(n)
349
1522
    }
350

            
351
233350
    fn consume(&mut self) -> char {
352
233350
        self.chars.consume()
353
233350
    }
354

            
355
4682
    fn try_consume(&mut self, expected_char: char) -> bool {
356
4682
        self.chars.try_consume(expected_char)
357
4682
    }
358
}
359

            
360
// TODO(feroldi): Add `is_eof()` function.
361
18426
#[derive(PartialEq, Debug, Copy, Clone)]
362
pub struct Token {
363
    pub kind: TokenKind,
364
}
365

            
366
impl Token {
367
24861
    pub fn eof() -> Spanned<Token> {
368
24861
        Spanned::with_dummy_span(Token {
369
24861
            kind: TokenKind::Eof,
370
24861
        })
371
24861
    }
372
}
373

            
374
25922
#[derive(PartialEq, Debug, Copy, Clone)]
375
pub enum TokenKind {
376
    Open(Bracket),
377
    Closed(Bracket),
378
    Period,
379
    Arrow,
380
    PlusPlus,
381
    MinusMinus,
382
    Ampersand,
383
    Star,
384
    Plus,
385
    Minus,
386
    Tilde,
387
    Exclamation,
388
    Slash,
389
    Percent,
390
    LessLess,
391
    GreaterGreater,
392
    Less,
393
    Greater,
394
    LessEqual,
395
    GreaterEqual,
396
    EqualEqual,
397
    ExclaEqual,
398
    Caret,
399
    Pipe,
400
    AmpAmp,
401
    PipePipe,
402
    Question,
403
    Colon,
404
    Semicolon,
405
    Ellipsis,
406
    Equal,
407
    StarEqual,
408
    SlashEqual,
409
    PercentEqual,
410
    PlusEqual,
411
    MinusEqual,
412
    LessLessEqual,
413
    GreaterGreaterEqual,
414
    AmpEqual,
415
    CaretEqual,
416
    PipeEqual,
417
    Comma,
418
    Hash,
419
    HashHash,
420
    Identifier,
421
    KwAuto,
422
    KwBreak,
423
    KwCase,
424
    KwChar,
425
    KwConst,
426
    KwContinue,
427
    KwDefault,
428
    KwDo,
429
    KwDouble,
430
    KwElse,
431
    KwEnum,
432
    KwExtern,
433
    KwFloat,
434
    KwFor,
435
    KwGoto,
436
    KwIf,
437
    KwInline,
438
    KwInt,
439
    KwLong,
440
    KwRegister,
441
    KwRestrict,
442
    KwReturn,
443
    KwShort,
444
    KwSigned,
445
    KwSizeof,
446
    KwStatic,
447
    KwStruct,
448
    KwSwitch,
449
    KwTypedef,
450
    KwUnion,
451
    KwUnsigned,
452
    KwVoid,
453
    KwVolatile,
454
    KwWhile,
455
    KwAlignas,
456
    KwAlignof,
457
    KwAtomic,
458
    KwBool,
459
    KwComplex,
460
    KwGeneric,
461
    KwImaginary,
462
    KwNoreturn,
463
    KwStaticAssert,
464
    KwThreadLocal,
465
    NumericConstant,
466
    CharacterConstant { encoding: CharEncoding },
467
    StringLiteral { encoding: CharEncoding },
468
    Eof,
469
}
470

            
471
20
/// The shape of a bracket token: `()`, `[]` or `{}`.
#[derive(PartialEq, Debug, Copy, Clone)]
pub enum Bracket {
    Round,
    Square,
    Curly,
}
477

            
478
2742
/// The encoding of a character constant or string literal, determined by its
/// prefix in the source: none (`Byte`), `L` (`Wide`), `u8` (`Utf8`), `u`
/// (`Utf16`) or `U` (`Utf32`).
#[derive(PartialEq, Debug, Copy, Clone)]
pub enum CharEncoding {
    Byte,
    Wide,
    Utf8,
    Utf16,
    Utf32,
}
486

            
487
3136
fn get_keyword_kind_for_lexeme(lexeme: &str) -> Option<TokenKind> {
488
3136
    let keyword_kind = match lexeme {
489
3136
        "auto" => TokenKind::KwAuto,
490
3135
        "break" => TokenKind::KwBreak,
491
3134
        "case" => TokenKind::KwCase,
492
3133
        "char" => TokenKind::KwChar,
493
3132
        "const" => TokenKind::KwConst,
494
3131
        "continue" => TokenKind::KwContinue,
495
3130
        "default" => TokenKind::KwDefault,
496
3129
        "do" => TokenKind::KwDo,
497
3128
        "double" => TokenKind::KwDouble,
498
3127
        "else" => TokenKind::KwElse,
499
3126
        "enum" => TokenKind::KwEnum,
500
3125
        "extern" => TokenKind::KwExtern,
501
3124
        "float" => TokenKind::KwFloat,
502
3123
        "for" => TokenKind::KwFor,
503
3122
        "goto" => TokenKind::KwGoto,
504
3121
        "if" => TokenKind::KwIf,
505
3120
        "inline" => TokenKind::KwInline,
506
3119
        "int" => TokenKind::KwInt,
507
2838
        "long" => TokenKind::KwLong,
508
2583
        "register" => TokenKind::KwRegister,
509
2582
        "restrict" => TokenKind::KwRestrict,
510
2581
        "return" => TokenKind::KwReturn,
511
2580
        "short" => TokenKind::KwShort,
512
2579
        "signed" => TokenKind::KwSigned,
513
2578
        "sizeof" => TokenKind::KwSizeof,
514
2577
        "static" => TokenKind::KwStatic,
515
2576
        "struct" => TokenKind::KwStruct,
516
2575
        "switch" => TokenKind::KwSwitch,
517
2574
        "typedef" => TokenKind::KwTypedef,
518
2573
        "union" => TokenKind::KwUnion,
519
2572
        "unsigned" => TokenKind::KwUnsigned,
520
2571
        "void" => TokenKind::KwVoid,
521
2570
        "volatile" => TokenKind::KwVolatile,
522
2569
        "while" => TokenKind::KwWhile,
523
2568
        "_Alignas" => TokenKind::KwAlignas,
524
2567
        "_Alignof" => TokenKind::KwAlignof,
525
2566
        "_Atomic" => TokenKind::KwAtomic,
526
2565
        "_Bool" => TokenKind::KwBool,
527
2564
        "_Complex" => TokenKind::KwComplex,
528
2563
        "_Generic" => TokenKind::KwGeneric,
529
2562
        "_Imaginary" => TokenKind::KwImaginary,
530
2561
        "_Noreturn" => TokenKind::KwNoreturn,
531
2560
        "_Static_assert" => TokenKind::KwStaticAssert,
532
2559
        "_Thread_local" => TokenKind::KwThreadLocal,
533
2558
        _ => return None,
534
    };
535

            
536
578
    Some(keyword_kind)
537
3136
}
538

            
539
45138
/// Returns `true` if `ch` is a line-terminating character (LF or CR).
const fn is_newline(ch: char) -> bool {
    matches!(ch, '\n' | '\r')
}
542

            
543
129183
const fn is_numeric_constant_char(ch: char) -> bool {
544
129183
    is_alpha(ch) || is_digit(ch) || ch == '.'
545
129183
}
546

            
547
35061
const fn is_start_of_identifier(ch: char) -> bool {
548
35061
    is_alpha(ch) || ch == '_'
549
35061
}
550

            
551
26580
const fn is_remaining_of_identifier(ch: char) -> bool {
552
26580
    is_start_of_identifier(ch) || is_digit(ch)
553
26580
}
554

            
555
38263
/// Returns `true` if `ch` is whitespace: space, tab, newline, carriage
/// return, vertical tab (U+000B) or form feed (U+000C).
const fn is_space(ch: char) -> bool {
    matches!(ch, '\x20' | '\t' | '\n' | '\r' | '\x0b' | '\x0c')
}
558

            
559
138229
/// Returns `true` if `ch` is an ASCII decimal digit (`0`-`9`).
const fn is_digit(ch: char) -> bool {
    ch.is_ascii_digit()
}
562

            
563
164244
/// Returns `true` if `ch` is an ASCII alphabetic character (`a`-`z`,
/// `A`-`Z`).
const fn is_alpha(ch: char) -> bool {
    ch.is_ascii_alphabetic()
}