C (C99) 2025-11-11 c tokenizer

Description

A small table-driven tokenizer for a C-like scripting language. tokenizer_next_token() dispatches on the first character through a 256-entry handler table and tracks line/column positions. It handles identifiers, integer and decimal numbers, single- and double-quoted strings (with escape decoding and adjacent-literal concatenation), raw backtick strings, '#' line comments, and one- and two-character operators (==, !=, <=, >=, ..), plus peek/consume/skip helpers for a parser to build on.
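
Only the .c file is pasted; it assumes a tokenizer.h roughly like the sketch below. The type, field, and function names are exactly the ones tokenizer.c uses; the enum ordering, the buffer size, and the include guard are assumptions.

#ifndef TOKENIZER_H
#define TOKENIZER_H

#include <stdbool.h>
#include <stddef.h>

#define MAX_TOKEN_VALUE_LENGTH 256 /* assumed size; tokenizer.c only needs it >= 2 */

typedef enum {
  TOKEN_EOF,
  TOKEN_IDENTIFIER,
  TOKEN_NUMBER,
  TOKEN_STRING,
  TOKEN_UNKNOWN,
  TOKEN_LEFT_PAREN, TOKEN_RIGHT_PAREN,
  TOKEN_LEFT_BRACKET, TOKEN_RIGHT_BRACKET,
  TOKEN_LEFT_BRACE, TOKEN_RIGHT_BRACE,
  TOKEN_DOT, TOKEN_DOT_DOT, TOKEN_COMMA, TOKEN_SEMICOLON, TOKEN_COLON,
  TOKEN_PLUS, TOKEN_MINUS, TOKEN_STAR, TOKEN_SLASH, TOKEN_MODULO,
  TOKEN_EQUAL, TOKEN_EQUAL_EQUAL, TOKEN_BANG, TOKEN_NOT_EQUAL,
  TOKEN_LEFT_ANGLE, TOKEN_LESS_EQUAL, TOKEN_RIGHT_ANGLE, TOKEN_GREATER_EQUAL,
  TOKEN_QUESTION, TOKEN_HASH, TOKEN_DOLLAR, TOKEN_TILDE, TOKEN_PIPE,
} TokenType;

typedef struct {
  TokenType type;
  const char *start;                  /* points into the source text */
  size_t length;                      /* lexeme length (decoded length for strings) */
  char value[MAX_TOKEN_VALUE_LENGTH]; /* NUL-terminated copy of the lexeme */
} Token;

typedef struct {
  const char *input;   /* start of the source text */
  const char *current; /* read cursor */
  int line;            /* 1-based */
  int column;          /* 1-based */
} Tokenizer;

void tokenizer_init(Tokenizer *tokenizer, const char *input);
Token tokenizer_next_token(Tokenizer *tokenizer);
Token tokenizer_peek_token(Tokenizer *tokenizer);
Token tokenizer_peek_next_token(Tokenizer *tokenizer);
bool tokenizer_consume(Tokenizer *tokenizer, TokenType expected_type);
bool tokenizer_consume_value(Tokenizer *tokenizer, TokenType expected_type,
                             const char *expected_value);
void tokenizer_skip_until(Tokenizer *tokenizer, const TokenType *stop_tokens,
                          int num_stop_tokens);
const char *token_type_to_string(TokenType type);

#endif /* TOKENIZER_H */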

Code

#include "tokenizer.h"#include <ctype.h>#include <stdio.h>#include <string.h>static const bool char_is_alnum[256] = {['A' ... 'Z'] = true, ['a' ... 'z'] = true, ['0' ... '9'] = true, ['_'] = true};static const bool char_is_digit[256] = {['0' ... '9'] = true};static const bool char_is_whitespace[256] = {[' '] = true, ['\t'] = true, ['\n'] = true, ['\r'] = true, ['\v'] = true, ['\f'] = true};void tokenizer_init(Tokenizer *tokenizer, const char *input) {  tokenizer->input = input;  tokenizer->current = input;  tokenizer->line = 1;  tokenizer->column = 1;}static Token read_identifier(Tokenizer *tokenizer);static Token read_number(Tokenizer *tokenizer);static Token read_string(Tokenizer *tokenizer);static Token read_raw_string(Tokenizer *tokenizer);static Token handle_single_char(Tokenizer *tokenizer);static Token handle_equal(Tokenizer *tokenizer);static Token handle_bang(Tokenizer *tokenizer);static Token handle_less(Tokenizer *tokenizer);static Token handle_greater(Tokenizer *tokenizer);static Token handle_dot(Tokenizer *tokenizer);static Token handle_comment(Tokenizer *tokenizer);static Token handle_unknown(Tokenizer *tokenizer);typedef Token (*TokenHandler)(Tokenizer *);static const TokenHandler token_handlers[256] = {    ['A' ... 'Z'] = read_identifier,    ['a' ... 'z'] = read_identifier,    ['_'] = read_identifier,    ['0' ... '9'] = read_number,    ['"'] = read_string,    ['\''] = read_string,    ['`'] = read_raw_string,    ['('] = handle_single_char,    [')'] = handle_single_char,    ['['] = handle_single_char,    [']'] = handle_single_char,    ['{'] = handle_single_char,    ['}'] = handle_single_char,    ['.'] = handle_dot,    [' '] = handle_single_char,    [','] = handle_single_char,    [';'] = handle_single_char,    [':'] = handle_single_char,    ['+'] = handle_single_char,    ['-'] = handle_single_char,    ['*'] = handle_single_char,    ['/'] = handle_single_char,    ['='] = handle_equal,    ['!'] = handle_bang,    ['<'] = handle_less,    ['>'] = handle_greater,    ['?'] = handle_single_char,    ['#'] = handle_comment,    ['$'] = handle_single_char,    ['~'] = handle_single_char,    ['|'] = handle_single_char,    ['%'] = handle_single_char,};static inline char peek_char(Tokenizer *tokenizer) { return *tokenizer->current; }static char next_char(Tokenizer *tokenizer) {  if (*tokenizer->current == '\0') return '\0';  char c = *tokenizer->current++;  if (c == '\n') {    tokenizer->line++;    tokenizer->column = 1;  } else {    tokenizer->column++;  }  return c;}static inline void skip_whitespace(Tokenizer *tokenizer) {  const char *p = tokenizer->current;  if (!char_is_whitespace[(unsigned char)*p]) return;  while (char_is_whitespace[(unsigned char)*p]) {    p++;  }  while (tokenizer->current < p) {    next_char(tokenizer);  }}static inline Token make_token(TokenType type, const char *start, size_t length) {  Token token;  token.type = type;  token.start = start;  token.length = length;  size_t copy_len = (length < MAX_TOKEN_VALUE_LENGTH) ? 
length : MAX_TOKEN_VALUE_LENGTH - 1;  memcpy(token.value, start, copy_len);  token.value[copy_len] = '\0';  return token;}Token tokenizer_next_token(Tokenizer *tokenizer) {  skip_whitespace(tokenizer);  char c = peek_char(tokenizer);  if (c == '\0') return make_token(TOKEN_EOF, tokenizer->current, 0);  TokenHandler handler = token_handlers[(unsigned char)c];  if (handler) {    return handler(tokenizer);  }  return handle_unknown(tokenizer);}static Token read_identifier(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  while (char_is_alnum[(unsigned char)peek_char(tokenizer)]) {    next_char(tokenizer);  }  return make_token(TOKEN_IDENTIFIER, start, tokenizer->current - start);}static Token read_number(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  while (char_is_digit[(unsigned char)peek_char(tokenizer)]) {    next_char(tokenizer);  }  if (peek_char(tokenizer) == '.') {    const char *look = tokenizer->current + 1;    if (char_is_digit[(unsigned char)*look]) {      next_char(tokenizer); // consume '.'      while (char_is_digit[(unsigned char)peek_char(tokenizer)]) {        next_char(tokenizer);      }    }  }  return make_token(TOKEN_NUMBER, start, tokenizer->current - start);}static Token read_string(Tokenizer *tokenizer) {  Token token;  token.type = TOKEN_STRING;  token.start = tokenizer->current;  char *write_ptr = token.value;  do {    const char quote_char = next_char(tokenizer); // opening quote    for (;;) {      char c = next_char(tokenizer);      if (c == '\0' || c == quote_char) break;      if (write_ptr >= token.value + MAX_TOKEN_VALUE_LENGTH - 1) {        if (c == '\\') {          char esc = next_char(tokenizer);          (void)esc;        }        continue;      }      if (c == '\\') {        char esc = next_char(tokenizer);        switch (esc) {        case 'n': *write_ptr++ = '\n'; break;        case 't': *write_ptr++ = '\t'; break;        case 'r': *write_ptr++ = '\r'; break;        case '"': *write_ptr++ = '"'; break;        case '\'': *write_ptr++ = '\''; break;        case '\\': *write_ptr++ = '\\'; break;        case '\0': /* unterminated, stop */ goto string_done;        default: *write_ptr++ = esc; break;        }      } else {        *write_ptr++ = c;      }    }    const char *p = tokenizer->current;    while (char_is_whitespace[(unsigned char)*p]) p++;    if (*p != '"' && *p != '\'') {      break;    }    while (tokenizer->current < p) next_char(tokenizer);  } while (true);string_done:  *write_ptr = '\0';  token.length = (size_t)(write_ptr - token.value);  return token;}static const TokenType single_char_map[256] = {    ['('] = TOKEN_LEFT_PAREN,    [')'] = TOKEN_RIGHT_PAREN,    ['['] = TOKEN_LEFT_BRACKET,    [']'] = TOKEN_RIGHT_BRACKET,    ['{'] = TOKEN_LEFT_BRACE,    ['}'] = TOKEN_RIGHT_BRACE,    ['.'] = TOKEN_DOT,    [','] = TOKEN_COMMA,    [';'] = TOKEN_SEMICOLON,    [':'] = TOKEN_COLON,    ['+'] = TOKEN_PLUS,    ['-'] = TOKEN_MINUS,    ['*'] = TOKEN_STAR,    ['/'] = TOKEN_SLASH,    ['='] = TOKEN_EQUAL,    ['!'] = TOKEN_BANG,    ['<'] = TOKEN_LEFT_ANGLE,    ['>'] = TOKEN_RIGHT_ANGLE,    ['?'] = TOKEN_QUESTION,    ['#'] = TOKEN_HASH,    ['$'] = TOKEN_DOLLAR,    ['~'] = TOKEN_TILDE,    ['|'] = TOKEN_PIPE,    ['%'] = TOKEN_MODULO,};static Token handle_single_char(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  char c = next_char(tokenizer);  return make_token(single_char_map[(unsigned char)c], start, 1);}static Token handle_equal(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  
next_char(tokenizer); // consume '='  if (peek_char(tokenizer) == '=') {    next_char(tokenizer); // consume second '='    return make_token(TOKEN_EQUAL_EQUAL, start, 2);  }  return make_token(TOKEN_EQUAL, start, 1);}static Token handle_bang(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  next_char(tokenizer); // consume '!'  if (peek_char(tokenizer) == '=') {    next_char(tokenizer); // consume '='    return make_token(TOKEN_NOT_EQUAL, start, 2);  }  return make_token(TOKEN_BANG, start, 1);}static Token handle_less(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  next_char(tokenizer); // consume '<'  if (peek_char(tokenizer) == '=') {    next_char(tokenizer); // consume '='    return make_token(TOKEN_LESS_EQUAL, start, 2);  }  return make_token(TOKEN_LEFT_ANGLE, start, 1);}static Token handle_greater(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  next_char(tokenizer); // consume '>'  if (peek_char(tokenizer) == '=') {    next_char(tokenizer); // consume '='    return make_token(TOKEN_GREATER_EQUAL, start, 2);  }  return make_token(TOKEN_RIGHT_ANGLE, start, 1);}static Token handle_dot(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  next_char(tokenizer); // consume '.'  if (peek_char(tokenizer) == '.') {    next_char(tokenizer); // consume second '.'    return make_token(TOKEN_DOT_DOT, start, 2);  }  return make_token(TOKEN_DOT, start, 1);}static Token read_raw_string(Tokenizer *tokenizer) {  Token token;  token.type = TOKEN_STRING;  token.start = tokenizer->current;  char *write_ptr = token.value;  next_char(tokenizer); // Skip opening backtick  for (;;) {    char c = peek_char(tokenizer);    if (c == '\0' || c == '`') break;    c = next_char(tokenizer);    if (write_ptr < token.value + MAX_TOKEN_VALUE_LENGTH - 1) {      *write_ptr++ = c;    }  }  if (peek_char(tokenizer) == '`') {    next_char(tokenizer); // Skip closing backtick  }  *write_ptr = '\0';  token.length = (size_t)(write_ptr - token.value);  return token;}static Token handle_comment(Tokenizer *tokenizer) {  while (peek_char(tokenizer) != '\0' && peek_char(tokenizer) != '\n') {    next_char(tokenizer);  }  return tokenizer_next_token(tokenizer);}static Token handle_unknown(Tokenizer *tokenizer) {  const char *start = tokenizer->current;  next_char(tokenizer);  return make_token(TOKEN_UNKNOWN, start, 1);}Token tokenizer_peek_token(Tokenizer *tokenizer) {  Tokenizer saved_state = *tokenizer;  Token token = tokenizer_next_token(tokenizer);  *tokenizer = saved_state;  return token;}Token tokenizer_peek_next_token(Tokenizer *tokenizer) {  Tokenizer saved_state = *tokenizer;  tokenizer_next_token(tokenizer);  Token next_token = tokenizer_next_token(tokenizer);  *tokenizer = saved_state;  return next_token;}bool tokenizer_consume(Tokenizer *tokenizer, TokenType expected_type) {  if (tokenizer_peek_token(tokenizer).type == expected_type) {    tokenizer_next_token(tokenizer);    return true;  }  return false;}bool tokenizer_consume_value(Tokenizer *tokenizer, TokenType expected_type, const char *expected_value) {  if (expected_value == NULL) return false;  Token token = tokenizer_peek_token(tokenizer);  if (token.type == expected_type && strcmp(token.value, expected_value) == 0) {    tokenizer_next_token(tokenizer);    return true;  }  return false;}void tokenizer_skip_until(Tokenizer *tokenizer, const TokenType *stop_tokens, int num_stop_tokens) {  while (true) {    TokenType current_type = tokenizer_peek_token(tokenizer).type;    if (current_type == TOKEN_EOF) 
break;    for (int i = 0; i < num_stop_tokens; i++) {      if (current_type == stop_tokens[i]) return;    }    tokenizer_next_token(tokenizer);  }}const char *token_type_to_string(TokenType type) {  static const char *strings[] = {      [TOKEN_EOF] = "EOF",      [TOKEN_IDENTIFIER] = "IDENTIFIER",      [TOKEN_STRING] = "STRING",      [TOKEN_NUMBER] = "NUMBER",      [TOKEN_UNKNOWN] = "UNKNOWN",      [TOKEN_LEFT_PAREN] = "(",      [TOKEN_RIGHT_PAREN] = ")",      [TOKEN_LEFT_BRACKET] = "[",      [TOKEN_RIGHT_BRACKET] = "]",      [TOKEN_LEFT_BRACE] = "{",      [TOKEN_RIGHT_BRACE] = "}",      [TOKEN_DOT] = ".",      [TOKEN_DOT_DOT] = "..",      [TOKEN_COMMA] = ",",      [TOKEN_SEMICOLON] = ";",      [TOKEN_COLON] = ":",      [TOKEN_PLUS] = "+",      [TOKEN_MINUS] = "-",      [TOKEN_STAR] = "*",      [TOKEN_SLASH] = "/",      [TOKEN_EQUAL] = "=",      [TOKEN_BANG] = "!",      [TOKEN_LEFT_ANGLE] = "<",      [TOKEN_RIGHT_ANGLE] = ">",      [TOKEN_QUESTION] = "?",      [TOKEN_HASH] = "#",      [TOKEN_DOLLAR] = "$",      [TOKEN_TILDE] = "~",      [TOKEN_PIPE] = "|",      [TOKEN_MODULO] = "%",      [TOKEN_EQUAL_EQUAL] = "==",      [TOKEN_NOT_EQUAL] = "!=",      [TOKEN_LESS_EQUAL] = "<=",      [TOKEN_GREATER_EQUAL] = ">=",  };  if (type >= 0 && type < (int)(sizeof(strings) / sizeof(strings[0])) && strings[type]) {    return strings[type];  }  return "INVALID";}
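
For readers who want to try it, here is a minimal driver, assuming the header sketched above and that tokenizer.c is compiled alongside it; the driver is not part of the paste.

#include <stdio.h>

#include "tokenizer.h"

int main(void) {
  Tokenizer tz;
  tokenizer_init(&tz, "x = 3.14 # set x\nif x >= 1 { print(\"big\\n\") }");

  /* Drain the token stream; the '#' comment is skipped entirely and
     the STRING token carries the decoded escape, not the raw \n. */
  for (;;) {
    Token t = tokenizer_next_token(&tz);
    if (t.type == TOKEN_EOF) break;
    printf("%-12s %s\n", token_type_to_string(t.type), t.value);
  }
  return 0;
}

Because of the range designators in the lookup tables, build with GNU extensions enabled, e.g. gcc -std=gnu99 tokenizer.c main.c.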
