C (C99 with GNU extensions)
2025-11-11
c
tokenizer
Description
A table-driven tokenizer for a small C-like syntax. 256-entry lookup tables classify characters and dispatch on each token's leading byte to a handler that reads identifiers, integer and decimal numbers, quoted strings (with escape sequences and implicit concatenation of adjacent literals), backtick-delimited raw strings, # line comments, and one- or two-character operators such as ==, !=, <=, >=, and .. . The tokenizer tracks line/column positions and provides peek, consume, and skip-until helpers for a parser to build on. Note that the range designators used to fill the tables ('A' ... 'Z') are a GCC/Clang extension, not strict C99. A sketch of the assumed tokenizer.h interface follows.
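The implementation includes a companion tokenizer.h that is not shown. The following is a minimal sketch reconstructed from the calls in the code; the enum order, struct layout, and buffer size are assumptions, though every name appears in the implementation:

/* tokenizer.h -- sketch reconstructed from the implementation;
   the real header may differ in layout and constants. */
#ifndef TOKENIZER_H
#define TOKENIZER_H

#include <stdbool.h>
#include <stddef.h>

#define MAX_TOKEN_VALUE_LENGTH 256 /* assumed size */

typedef enum {
    TOKEN_EOF = 0, /* 0 so zero-initialized table slots are harmless */
    TOKEN_IDENTIFIER, TOKEN_STRING, TOKEN_NUMBER, TOKEN_UNKNOWN,
    TOKEN_LEFT_PAREN, TOKEN_RIGHT_PAREN,
    TOKEN_LEFT_BRACKET, TOKEN_RIGHT_BRACKET,
    TOKEN_LEFT_BRACE, TOKEN_RIGHT_BRACE,
    TOKEN_DOT, TOKEN_DOT_DOT, TOKEN_COMMA, TOKEN_SEMICOLON, TOKEN_COLON,
    TOKEN_PLUS, TOKEN_MINUS, TOKEN_STAR, TOKEN_SLASH,
    TOKEN_EQUAL, TOKEN_BANG, TOKEN_LEFT_ANGLE, TOKEN_RIGHT_ANGLE,
    TOKEN_QUESTION, TOKEN_HASH, TOKEN_DOLLAR, TOKEN_TILDE,
    TOKEN_PIPE, TOKEN_MODULO,
    TOKEN_EQUAL_EQUAL, TOKEN_NOT_EQUAL, TOKEN_LESS_EQUAL, TOKEN_GREATER_EQUAL
} TokenType;

typedef struct {
    TokenType type;
    const char *start;                  /* points into the input buffer */
    size_t length;
    char value[MAX_TOKEN_VALUE_LENGTH]; /* NUL-terminated, escapes decoded */
} Token;

typedef struct {
    const char *input;
    const char *current;
    int line;
    int column;
} Tokenizer;

void tokenizer_init(Tokenizer *tokenizer, const char *input);
Token tokenizer_next_token(Tokenizer *tokenizer);
Token tokenizer_peek_token(Tokenizer *tokenizer);
Token tokenizer_peek_next_token(Tokenizer *tokenizer);
bool tokenizer_consume(Tokenizer *tokenizer, TokenType expected_type);
bool tokenizer_consume_value(Tokenizer *tokenizer, TokenType expected_type, const char *expected_value);
void tokenizer_skip_until(Tokenizer *tokenizer, const TokenType *stop_tokens, int num_stop_tokens);
const char *token_type_to_string(TokenType type);

#endif /* TOKENIZER_H */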
Code
#include "tokenizer.h"#include <ctype.h>#include <stdio.h>#include <string.h>static const bool char_is_alnum[256] = {['A' ... 'Z'] = true, ['a' ... 'z'] = true, ['0' ... '9'] = true, ['_'] = true};static const bool char_is_digit[256] = {['0' ... '9'] = true};static const bool char_is_whitespace[256] = {[' '] = true, ['\t'] = true, ['\n'] = true, ['\r'] = true, ['\v'] = true, ['\f'] = true};void tokenizer_init(Tokenizer *tokenizer, const char *input) { tokenizer->input = input; tokenizer->current = input; tokenizer->line = 1; tokenizer->column = 1;}static Token read_identifier(Tokenizer *tokenizer);static Token read_number(Tokenizer *tokenizer);static Token read_string(Tokenizer *tokenizer);static Token read_raw_string(Tokenizer *tokenizer);static Token handle_single_char(Tokenizer *tokenizer);static Token handle_equal(Tokenizer *tokenizer);static Token handle_bang(Tokenizer *tokenizer);static Token handle_less(Tokenizer *tokenizer);static Token handle_greater(Tokenizer *tokenizer);static Token handle_dot(Tokenizer *tokenizer);static Token handle_comment(Tokenizer *tokenizer);static Token handle_unknown(Tokenizer *tokenizer);typedef Token (*TokenHandler)(Tokenizer *);static const TokenHandler token_handlers[256] = { ['A' ... 'Z'] = read_identifier, ['a' ... 'z'] = read_identifier, ['_'] = read_identifier, ['0' ... '9'] = read_number, ['"'] = read_string, ['\''] = read_string, ['`'] = read_raw_string, ['('] = handle_single_char, [')'] = handle_single_char, ['['] = handle_single_char, [']'] = handle_single_char, ['{'] = handle_single_char, ['}'] = handle_single_char, ['.'] = handle_dot, [' '] = handle_single_char, [','] = handle_single_char, [';'] = handle_single_char, [':'] = handle_single_char, ['+'] = handle_single_char, ['-'] = handle_single_char, ['*'] = handle_single_char, ['/'] = handle_single_char, ['='] = handle_equal, ['!'] = handle_bang, ['<'] = handle_less, ['>'] = handle_greater, ['?'] = handle_single_char, ['#'] = handle_comment, ['$'] = handle_single_char, ['~'] = handle_single_char, ['|'] = handle_single_char, ['%'] = handle_single_char,};static inline char peek_char(Tokenizer *tokenizer) { return *tokenizer->current; }static char next_char(Tokenizer *tokenizer) { if (*tokenizer->current == '\0') return '\0'; char c = *tokenizer->current++; if (c == '\n') { tokenizer->line++; tokenizer->column = 1; } else { tokenizer->column++; } return c;}static inline void skip_whitespace(Tokenizer *tokenizer) { const char *p = tokenizer->current; if (!char_is_whitespace[(unsigned char)*p]) return; while (char_is_whitespace[(unsigned char)*p]) { p++; } while (tokenizer->current < p) { next_char(tokenizer); }}static inline Token make_token(TokenType type, const char *start, size_t length) { Token token; token.type = type; token.start = start; token.length = length; size_t copy_len = (length < MAX_TOKEN_VALUE_LENGTH) ? 
length : MAX_TOKEN_VALUE_LENGTH - 1; memcpy(token.value, start, copy_len); token.value[copy_len] = '\0'; return token;}Token tokenizer_next_token(Tokenizer *tokenizer) { skip_whitespace(tokenizer); char c = peek_char(tokenizer); if (c == '\0') return make_token(TOKEN_EOF, tokenizer->current, 0); TokenHandler handler = token_handlers[(unsigned char)c]; if (handler) { return handler(tokenizer); } return handle_unknown(tokenizer);}static Token read_identifier(Tokenizer *tokenizer) { const char *start = tokenizer->current; while (char_is_alnum[(unsigned char)peek_char(tokenizer)]) { next_char(tokenizer); } return make_token(TOKEN_IDENTIFIER, start, tokenizer->current - start);}static Token read_number(Tokenizer *tokenizer) { const char *start = tokenizer->current; while (char_is_digit[(unsigned char)peek_char(tokenizer)]) { next_char(tokenizer); } if (peek_char(tokenizer) == '.') { const char *look = tokenizer->current + 1; if (char_is_digit[(unsigned char)*look]) { next_char(tokenizer); // consume '.' while (char_is_digit[(unsigned char)peek_char(tokenizer)]) { next_char(tokenizer); } } } return make_token(TOKEN_NUMBER, start, tokenizer->current - start);}static Token read_string(Tokenizer *tokenizer) { Token token; token.type = TOKEN_STRING; token.start = tokenizer->current; char *write_ptr = token.value; do { const char quote_char = next_char(tokenizer); // opening quote for (;;) { char c = next_char(tokenizer); if (c == '\0' || c == quote_char) break; if (write_ptr >= token.value + MAX_TOKEN_VALUE_LENGTH - 1) { if (c == '\\') { char esc = next_char(tokenizer); (void)esc; } continue; } if (c == '\\') { char esc = next_char(tokenizer); switch (esc) { case 'n': *write_ptr++ = '\n'; break; case 't': *write_ptr++ = '\t'; break; case 'r': *write_ptr++ = '\r'; break; case '"': *write_ptr++ = '"'; break; case '\'': *write_ptr++ = '\''; break; case '\\': *write_ptr++ = '\\'; break; case '\0': /* unterminated, stop */ goto string_done; default: *write_ptr++ = esc; break; } } else { *write_ptr++ = c; } } const char *p = tokenizer->current; while (char_is_whitespace[(unsigned char)*p]) p++; if (*p != '"' && *p != '\'') { break; } while (tokenizer->current < p) next_char(tokenizer); } while (true);string_done: *write_ptr = '\0'; token.length = (size_t)(write_ptr - token.value); return token;}static const TokenType single_char_map[256] = { ['('] = TOKEN_LEFT_PAREN, [')'] = TOKEN_RIGHT_PAREN, ['['] = TOKEN_LEFT_BRACKET, [']'] = TOKEN_RIGHT_BRACKET, ['{'] = TOKEN_LEFT_BRACE, ['}'] = TOKEN_RIGHT_BRACE, ['.'] = TOKEN_DOT, [','] = TOKEN_COMMA, [';'] = TOKEN_SEMICOLON, [':'] = TOKEN_COLON, ['+'] = TOKEN_PLUS, ['-'] = TOKEN_MINUS, ['*'] = TOKEN_STAR, ['/'] = TOKEN_SLASH, ['='] = TOKEN_EQUAL, ['!'] = TOKEN_BANG, ['<'] = TOKEN_LEFT_ANGLE, ['>'] = TOKEN_RIGHT_ANGLE, ['?'] = TOKEN_QUESTION, ['#'] = TOKEN_HASH, ['$'] = TOKEN_DOLLAR, ['~'] = TOKEN_TILDE, ['|'] = TOKEN_PIPE, ['%'] = TOKEN_MODULO,};static Token handle_single_char(Tokenizer *tokenizer) { const char *start = tokenizer->current; char c = next_char(tokenizer); return make_token(single_char_map[(unsigned char)c], start, 1);}static Token handle_equal(Tokenizer *tokenizer) { const char *start = tokenizer->current; next_char(tokenizer); // consume '=' if (peek_char(tokenizer) == '=') { next_char(tokenizer); // consume second '=' return make_token(TOKEN_EQUAL_EQUAL, start, 2); } return make_token(TOKEN_EQUAL, start, 1);}static Token handle_bang(Tokenizer *tokenizer) { const char *start = tokenizer->current; next_char(tokenizer); // consume '!' 
if (peek_char(tokenizer) == '=') { next_char(tokenizer); // consume '=' return make_token(TOKEN_NOT_EQUAL, start, 2); } return make_token(TOKEN_BANG, start, 1);}static Token handle_less(Tokenizer *tokenizer) { const char *start = tokenizer->current; next_char(tokenizer); // consume '<' if (peek_char(tokenizer) == '=') { next_char(tokenizer); // consume '=' return make_token(TOKEN_LESS_EQUAL, start, 2); } return make_token(TOKEN_LEFT_ANGLE, start, 1);}static Token handle_greater(Tokenizer *tokenizer) { const char *start = tokenizer->current; next_char(tokenizer); // consume '>' if (peek_char(tokenizer) == '=') { next_char(tokenizer); // consume '=' return make_token(TOKEN_GREATER_EQUAL, start, 2); } return make_token(TOKEN_RIGHT_ANGLE, start, 1);}static Token handle_dot(Tokenizer *tokenizer) { const char *start = tokenizer->current; next_char(tokenizer); // consume '.' if (peek_char(tokenizer) == '.') { next_char(tokenizer); // consume second '.' return make_token(TOKEN_DOT_DOT, start, 2); } return make_token(TOKEN_DOT, start, 1);}static Token read_raw_string(Tokenizer *tokenizer) { Token token; token.type = TOKEN_STRING; token.start = tokenizer->current; char *write_ptr = token.value; next_char(tokenizer); // Skip opening backtick for (;;) { char c = peek_char(tokenizer); if (c == '\0' || c == '`') break; c = next_char(tokenizer); if (write_ptr < token.value + MAX_TOKEN_VALUE_LENGTH - 1) { *write_ptr++ = c; } } if (peek_char(tokenizer) == '`') { next_char(tokenizer); // Skip closing backtick } *write_ptr = '\0'; token.length = (size_t)(write_ptr - token.value); return token;}static Token handle_comment(Tokenizer *tokenizer) { while (peek_char(tokenizer) != '\0' && peek_char(tokenizer) != '\n') { next_char(tokenizer); } return tokenizer_next_token(tokenizer);}static Token handle_unknown(Tokenizer *tokenizer) { const char *start = tokenizer->current; next_char(tokenizer); return make_token(TOKEN_UNKNOWN, start, 1);}Token tokenizer_peek_token(Tokenizer *tokenizer) { Tokenizer saved_state = *tokenizer; Token token = tokenizer_next_token(tokenizer); *tokenizer = saved_state; return token;}Token tokenizer_peek_next_token(Tokenizer *tokenizer) { Tokenizer saved_state = *tokenizer; tokenizer_next_token(tokenizer); Token next_token = tokenizer_next_token(tokenizer); *tokenizer = saved_state; return next_token;}bool tokenizer_consume(Tokenizer *tokenizer, TokenType expected_type) { if (tokenizer_peek_token(tokenizer).type == expected_type) { tokenizer_next_token(tokenizer); return true; } return false;}bool tokenizer_consume_value(Tokenizer *tokenizer, TokenType expected_type, const char *expected_value) { if (expected_value == NULL) return false; Token token = tokenizer_peek_token(tokenizer); if (token.type == expected_type && strcmp(token.value, expected_value) == 0) { tokenizer_next_token(tokenizer); return true; } return false;}void tokenizer_skip_until(Tokenizer *tokenizer, const TokenType *stop_tokens, int num_stop_tokens) { while (true) { TokenType current_type = tokenizer_peek_token(tokenizer).type; if (current_type == TOKEN_EOF) break; for (int i = 0; i < num_stop_tokens; i++) { if (current_type == stop_tokens[i]) return; } tokenizer_next_token(tokenizer); }}const char *token_type_to_string(TokenType type) { static const char *strings[] = { [TOKEN_EOF] = "EOF", [TOKEN_IDENTIFIER] = "IDENTIFIER", [TOKEN_STRING] = "STRING", [TOKEN_NUMBER] = "NUMBER", [TOKEN_UNKNOWN] = "UNKNOWN", [TOKEN_LEFT_PAREN] = "(", [TOKEN_RIGHT_PAREN] = ")", [TOKEN_LEFT_BRACKET] = "[", [TOKEN_RIGHT_BRACKET] = "]", 
[TOKEN_LEFT_BRACE] = "{", [TOKEN_RIGHT_BRACE] = "}", [TOKEN_DOT] = ".", [TOKEN_DOT_DOT] = "..", [TOKEN_COMMA] = ",", [TOKEN_SEMICOLON] = ";", [TOKEN_COLON] = ":", [TOKEN_PLUS] = "+", [TOKEN_MINUS] = "-", [TOKEN_STAR] = "*", [TOKEN_SLASH] = "/", [TOKEN_EQUAL] = "=", [TOKEN_BANG] = "!", [TOKEN_LEFT_ANGLE] = "<", [TOKEN_RIGHT_ANGLE] = ">", [TOKEN_QUESTION] = "?", [TOKEN_HASH] = "#", [TOKEN_DOLLAR] = "$", [TOKEN_TILDE] = "~", [TOKEN_PIPE] = "|", [TOKEN_MODULO] = "%", [TOKEN_EQUAL_EQUAL] = "==", [TOKEN_NOT_EQUAL] = "!=", [TOKEN_LESS_EQUAL] = "<=", [TOKEN_GREATER_EQUAL] = ">=", }; if (type >= 0 && type < (int)(sizeof(strings) / sizeof(strings[0])) && strings[type]) { return strings[type]; } return "INVALID";}
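A minimal driver, assuming the sketched header above, that prints one token per line until EOF. The input string exercises adjacent-literal concatenation (the two quoted pieces come back as a single STRING token with value "hi there") and a trailing # comment, which is skipped:

#include <stdio.h>
#include "tokenizer.h"

int main(void) {
    const char *source = "greet(name) { print(\"hi \" \"there\"); } # demo";
    Tokenizer tokenizer;
    tokenizer_init(&tokenizer, source);
    for (;;) {
        Token token = tokenizer_next_token(&tokenizer);
        if (token.type == TOKEN_EOF) break;
        printf("%-12s '%s'\n", token_type_to_string(token.type), token.value);
    }
    return 0;
}

Because the lookup tables use range designators, build with gcc or clang rather than a strictly conforming C99 compiler, e.g. cc -std=gnu99 tokenizer.c main.c (file names are placeholders).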