/*
 * Source: GitHub gist 2bdd5ac54952d57ec3361935b4029a9a
 * by @caiodanielnunessantos, created August 22, 2022 12:35.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
// Debug helper: prints "<argument text>: <value>" to stderr, formatting the
// value with %d. The expansion already ends in ';', so a call site does not
// need a semicolon of its own.
#define DBGINT(variable) \
fprintf(stderr, "%s: %d\n", #variable, variable);
// Same as DBGINT, but formats the value with %c.
#define DBGCHAR(variable) \
fprintf(stderr, "%s: %c\n", #variable, variable);
// DBG: when FUNCDBG is defined, traces the enclosing function name and the
// current line to stderr; otherwise it expands to an empty statement.
// NOTE: __PRETTY_FUNCTION__ is a GCC/Clang extension, not standard C.
#ifdef FUNCDBG
#define DBG \
fprintf(stderr, "Function %s, line %d\n", __PRETTY_FUNCTION__, __LINE__);
#else
#define DBG ;
#endif
// Character-class predicate generators.
//
// EQ and BT are expression fragments that refer to an identifier named
// `character`, which must be in scope wherever they are expanded (it is the
// parameter of every function produced by MC). Their arguments are
// parenthesized so that an expression argument cannot re-associate with the
// comparison operators.
#define EQ(expected) \
    (character == (expected))
#define BT(lowest, highest) \
    (character >= (lowest) && character <= (highest))
// MC(NAME) expands to the signature `int is_NAME(char character)`.
#define MC(name) \
    int is_ ## name(char character)
// MCEQ(NAME, c): defines is_NAME, true for exactly the character c.
#define MCEQ(name, expected) \
    MC(name) { \
        return EQ(expected); \
    }
// MCBT(NAME, lo, hi): defines is_NAME, true for the inclusive range lo..hi.
#define MCBT(name, lowest, highest) \
    MC(name) { \
        return BT(lowest, highest); \
    }
// MC2(NAME, A, B): defines is_NAME as is_A || is_B.
#define MC2(name, first, second) \
    MC(name) { \
        return (is_ ## first(character) || is_ ## second(character)); \
    }
// MC3(NAME, A, B, C): defines is_NAME as is_A || is_B || is_C.
#define MC3(name, first, second, third) \
    MC(name) { \
        return (is_ ## first(character) || is_ ## second(character) || is_ ## third(character)); \
    }

// Single-character predicates.
MCEQ(UNDERSCORE, '_')
MCEQ(COMMA, ',')
MCEQ(PERIOD, '.')
MCEQ(SEMICOLON, ';')
MCEQ(SLASH, '/')
MCEQ(BACKSLASH, '\\')
MCEQ(LPARENTHESIS, '(')
MCEQ(RPARENTHESIS, ')')
MCEQ(LBRACKET, '{')
MCEQ(RBRACKET, '}')
MCEQ(QUOTE, '\"')

// Range predicates.
MCBT(ALPHA_LOWER, 'a', 'z')
MCBT(ALPHA_UPPER, 'A', 'Z')
MCBT(DIGIT, '0', '9')

// Composite predicates: a name starts with a letter or '_' and continues
// with letters, '_' or digits.
MC2(ALPHA, ALPHA_LOWER, ALPHA_UPPER)
MC2(NAME_HEAD, ALPHA, UNDERSCORE)
MC2(NAME_TAIL, NAME_HEAD, DIGIT)

// Whitespace: space, tab or line feed.
MCEQ(SPACE, ' ')
MCEQ(TABULATION, '\t')
MCEQ(LINEBREAK, '\n')
MC3(WHITESPACE, SPACE, TABULATION, LINEBREAK)
// Assertion shorthands for the is_<NAME> predicates. Both expansions end in
// ';', so invocations are written without a trailing semicolon.
#define TEST(predicate, sample) \
    assert(is_ ## predicate(sample));
#define TESTNOT(predicate, sample) \
    assert(!(is_ ## predicate(sample)));

// Sanity checks for the alphabetic and digit predicates. Any expectation
// that does not hold aborts the program through assert().
void test_simple_matchers() {
    // Letters: both ends of each range, in both cases.
    TEST(ALPHA, 'A')
    TEST(ALPHA_UPPER, 'A')
    TEST(ALPHA, 'a')
    TEST(ALPHA_LOWER, 'a')
    TEST(ALPHA, 'Z')
    TEST(ALPHA, 'z')

    // Punctuation and blanks are not letters.
    TESTNOT(ALPHA, ',')
    TESTNOT(ALPHA, ' ')

    // Digits: both ends of the range, then two non-digits.
    TEST(DIGIT, '0')
    TEST(DIGIT, '9')
    TESTNOT(DIGIT, ' ')
    TESTNOT(DIGIT, 'K')
}
// Name buffer: a fixed-size, always NUL-terminated scratch area that
// accumulates the text of the token currently being read.
static char global_name_buffer[4096];

// Empties the buffer by zero-filling all of it.
void gnb_reset(void) {
    memset(global_name_buffer, 0, sizeof global_name_buffer);
}

// Appends `character` at the first NUL position of the buffer.
//
// The final byte is reserved for the terminator: only the first
// sizeof - 1 bytes are searched for a free slot, and when none is left the
// character is dropped instead of writing through a NULL pointer or
// overwriting the terminator. Appending a 0 leaves the buffer unchanged.
void gnb_add_character(int character) {
    char *slot = memchr(global_name_buffer, 0, sizeof global_name_buffer - 1);
    if (slot == NULL) {
        return;
    }
    *slot = (char)character;
}

// Returns a freshly malloc'ed copy of the buffer contents (including the
// terminator), or NULL when the allocation fails. The caller owns the
// returned memory and must free() it.
char* gnb_to_heap(void) {
    size_t length = strlen(global_name_buffer);
    char *heap = malloc(length + 1);
    if (heap == NULL) {
        return NULL;
    }
    memcpy(heap, global_name_buffer, length + 1);
    return heap;
}
// Tokenizer states. The current state decides which matcher receives the
// next input character.
enum {
    ST_FRESH = 0,            // no token in progress
    ST_COMMENT = 1,          // inside a comment
    ST_NAME = 2,             // accumulating a name
    ST_STRING = 3,           // inside a string literal
    ST_STRING_ESCAPE = 4,    // inside a string, right after the escape introducer
    ST_NUMBER = 5,           // accumulating digits, no '.' seen yet
    ST_NUMBER_FLOATING = 6,  // accumulating digits after a '.'
};

// Current tokenizer state; starts with no token in progress.
int tokenizer_state = ST_FRESH;
// Token kinds reported through the `token` callback of the matchers.
enum {
    TK_NONE = 0,
    TK_NAME = 1,          // identifier
    TK_STRING = 2,        // string literal
    TK_INTEGER = 3,       // integer literal
    TK_FLOATING = 4,      // floating-point literal
    TK_COMMA = 5,         // ','
    TK_SEMICOLON = 6,     // ';'
    TK_LPARENTHESIS = 7,  // '('
    TK_RPARENTHESIS = 8,  // ')'
    TK_LBRACKET = 9,      // '{'
    TK_RBRACKET = 10,     // '}'
};
#define ONECHTK(name) \
if (is_ ## name(character)) { \
token(TK_ ## name); \
return 1; \
}
//IT'S FRESH, FRESH
int match_one_char(int character, void (*token)(int)) {
DBG
ONECHTK(COMMA)
ONECHTK(SEMICOLON)
ONECHTK(LPARENTHESIS)
ONECHTK(RPARENTHESIS)
ONECHTK(LBRACKET)
ONECHTK(RBRACKET)
return 0;
}
int match_whitespace(int character, void (*token)(int)) {
DBG
if (is_WHITESPACE(character)) {
return 1;
}
return 0;
}
int match_name_head(int character, void (*token)(int)) {
DBG
if (is_NAME_HEAD(character)) {
DBG
tokenizer_state = ST_NAME;
gnb_reset();
gnb_add_character(character);
return 1;
}
return 0;
}
int match_string_head(int character, void (*token)(int)) {
DBG
if (is_QUOTE(character)) {
tokenizer_state = ST_STRING;
gnb_reset();
return 1;
}
return 0;
}
// Starts a comment on '/' by switching the tokenizer to ST_COMMENT.
// Returns 1 when a comment was started, 0 otherwise; `token` is not called.
int match_comment_head(int character, void (*token)(int)) {
    DBG
    if (!is_SLASH(character)) {
        return 0;
    }
    tokenizer_state = ST_COMMENT;
    return 1;
}
int match_number_head(int character, void (*token)(int)) {
DBG
if (is_DIGIT(character)) {
DBG
tokenizer_state = ST_NUMBER;
gnb_reset();
gnb_add_character(character);
return 1;
}
return 0;
}
int match_fresh(int character, void (*token)(int)) {
DBG
return (
match_one_char(character, token) ||
match_whitespace(character, token) ||
match_name_head(character, token) ||
match_string_head(character, token) ||
match_comment_head(character, token) ||
match_number_head(character, token)
);
}
// Name state.
//
// Continues a name while `character` is a letter, '_' or digit. Any other
// character ends the name: the tokenizer returns to ST_FRESH, TK_NAME is
// reported, and the same character is then handled as fresh input.
// Returns 1 when the character extended the name, otherwise the result of
// match_fresh for that character.
int match_name_tail(int character, void (*token)(int)) {
    DBG
    if (!is_NAME_TAIL(character)) {
        tokenizer_state = ST_FRESH;
        token(TK_NAME);
        return match_fresh(character, token);
    }
    gnb_add_character(character);
    return 1;
}
// String state.
//
// Handles one character inside a string literal:
//   - a double quote closes the string: the tokenizer returns to ST_FRESH
//     and TK_STRING is reported (the text is in the name buffer);
//   - a backslash introduces an escape sequence (ST_STRING_ESCAPE), matching
//     the C-style escapes decoded by match_string_escape_tail;
//   - anything else is appended to the name buffer.
// Always returns 1.
int match_string_tail(int character, void (*token)(int)) {
    DBG
    if (is_QUOTE(character)) {
        // Leave string mode before reporting, as the name and number
        // matchers do; otherwise every later character would still be
        // treated as string content.
        tokenizer_state = ST_FRESH;
        token(TK_STRING);
        return 1;
    }
    if (is_BACKSLASH(character)) {
        tokenizer_state = ST_STRING_ESCAPE;
        return 1;
    }
    gnb_add_character(character);
    return 1;
}
int match_string_escape_tail(int character, void (*token)(int)) {
DBG
if (character == 'n') {
gnb_add_character('\n');
tokenizer_state = ST_STRING;
return 1;
}
if (character == 't') {
gnb_add_character('\t');
tokenizer_state = ST_STRING;
return 1;
}
if (character == 'v') {
gnb_add_character('\v');
tokenizer_state = ST_STRING;
return 1;
}
if (character == '\\') {
gnb_add_character('\\');
tokenizer_state = ST_STRING;
return 1;
}
if (character == '\'') {
gnb_add_character('\'');
tokenizer_state = ST_STRING;
return 1;
}
if (character == '\"') {
gnb_add_character('\"');
tokenizer_state = ST_STRING;
return 1;
}
return 0;
}
// Consumes one character of a comment. A '/' ends the comment and puts the
// tokenizer back into ST_FRESH; every other character is discarded.
// Always returns 1; `token` is not called.
int match_comment_tail(int character, void (*token)(int)) {
    DBG
    if (character == '/') {
        tokenizer_state = ST_FRESH;
    }
    return 1;
}
int match_number_tail(int character, void(*token)(int)) {
DBG
if (character == '.') {
gnb_add_character('.');
tokenizer_state = ST_NUMBER_FLOATING;
return 1;
}
if (is_DIGIT(character)) {
gnb_add_character(character);
return 1;
}
tokenizer_state = ST_FRESH;
token(TK_INTEGER);
return match_fresh(character, token);
}
// Tokenizer entry point: feeds one character to the matcher that belongs to
// the current tokenizer_state. Completed tokens are reported through
// `token` by the matchers.
// Returns the selected matcher's result, or 0 when tokenizer_state holds a
// value that has no matcher.
int match_token(int character, void (*token)(int)) {
    int (*handler)(int, void (*)(int)) = NULL;
    DBG
    switch (tokenizer_state) {
    case ST_FRESH:
        handler = match_fresh;
        break;
    case ST_COMMENT:
        handler = match_comment_tail;
        break;
    case ST_STRING:
        handler = match_string_tail;
        break;
    case ST_STRING_ESCAPE:
        handler = match_string_escape_tail;
        break;
    case ST_NAME:
        handler = match_name_tail;
        break;
    case ST_NUMBER:
    case ST_NUMBER_FLOATING:
        // Both numeric states share one matcher.
        handler = match_number_tail;
        break;
    default:
        break;
    }
    if (handler == NULL) {
        return 0;
    }
    return handler(character, token);
}
/* End of gist (GitHub page footer omitted). */