-
-
Save caiodanielnunessantos/2bdd5ac54952d57ec3361935b4029a9a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <assert.h> | |
/* Debug print helpers: write the source text of an expression and its value
 * to stderr, formatted as an int (DBGINT) or as a character (DBGCHAR).
 * Each expansion already ends in ';', so call sites may omit their own. */
#define DBGINT(expr) fprintf(stderr, "%s: %d\n", #expr, expr);
#define DBGCHAR(expr) fprintf(stderr, "%s: %c\n", #expr, expr);
/* DBG is a trace point. When FUNCDBG is defined it prints the enclosing
 * function and the current line to stderr; otherwise it expands to an empty
 * statement. Either way the expansion supplies its own ';'. */
#ifndef FUNCDBG
#define DBG ;
#else
#define DBG fprintf(stderr, "Function %s, line %d\n", __PRETTY_FUNCTION__, __LINE__);
#endif
/* Building blocks for the is_<NAME>() character matchers.
 *
 * EQ and BT are conditions over a variable named `character` that must be in
 * scope where they expand: EQ tests equality, BT tests an inclusive range. */
#define EQ(wanted) \
    (character == wanted)
#define BT(low, high) \
    (character >= low && character <= high)

/* MC(NAME) expands to the function header `int is_NAME(char character)`. */
#define MC(name) \
    int is_ ## name(char character)

/* MCEQ defines is_NAME as "character equals `wanted`". */
#define MCEQ(name, wanted) \
    MC(name) { \
        return EQ(wanted); \
    }

/* MCBT defines is_NAME as "character lies in [low, high]". */
#define MCBT(name, low, high) \
    MC(name) { \
        return BT(low, high); \
    }

/* MC2 / MC3 define is_NAME as the logical OR of two / three existing
 * is_<X>() matchers, given by their suffixes. */
#define MC2(name, first, second) \
    MC(name) { \
        return (is_ ## first(character) || is_ ## second(character)); \
    }
#define MC3(name, first, second, third) \
    MC(name) { \
        return (is_ ## first(character) || is_ ## second(character) || is_ ## third(character)); \
    }
/* Character matchers, written out as plain functions. Each returns 1 when
 * `character` satisfies the condition in its name and 0 otherwise. */

/* Exact-character matchers. */
int is_UNDERSCORE(char character) { return character == '_'; }
int is_COMMA(char character) { return character == ','; }
int is_PERIOD(char character) { return character == '.'; }
int is_SEMICOLON(char character) { return character == ';'; }
int is_SLASH(char character) { return character == '/'; }
int is_BACKSLASH(char character) { return character == '\\'; }
int is_LPARENTHESIS(char character) { return character == '('; }
int is_RPARENTHESIS(char character) { return character == ')'; }
int is_LBRACKET(char character) { return character == '{'; }
int is_RBRACKET(char character) { return character == '}'; }
int is_QUOTE(char character) { return character == '\"'; }

/* Inclusive-range matchers. */
int is_ALPHA_LOWER(char character) { return character >= 'a' && character <= 'z'; }
int is_ALPHA_UPPER(char character) { return character >= 'A' && character <= 'Z'; }
int is_DIGIT(char character) { return character >= '0' && character <= '9'; }

/* Composite matchers: a letter; a valid first character of a name (letter or
 * underscore); a valid later character of a name (the former plus digits). */
int is_ALPHA(char character) {
    return is_ALPHA_LOWER(character) || is_ALPHA_UPPER(character);
}
int is_NAME_HEAD(char character) {
    return is_ALPHA(character) || is_UNDERSCORE(character);
}
int is_NAME_TAIL(char character) {
    return is_NAME_HEAD(character) || is_DIGIT(character);
}

/* Whitespace: space, horizontal tab or line feed. */
int is_SPACE(char character) { return character == ' '; }
int is_TABULATION(char character) { return character == '\t'; }
int is_LINEBREAK(char character) { return character == '\n'; }
int is_WHITESPACE(char character) {
    return is_SPACE(character) || is_TABULATION(character) || is_LINEBREAK(character);
}
/* Assertion shorthands for the matcher self-test: TEST asserts that
 * is_<matcher>(input) is true, TESTNOT that it is false. Both expansions end
 * in ';', so they are written without one at the call site. */
#define TEST(matcher, input) assert(is_ ## matcher(input));
#define TESTNOT(matcher, input) assert(!(is_ ## matcher(input)));
/* Self-test for the letter and digit matchers; aborts through assert() on
 * the first expectation that does not hold. */
void test_simple_matchers() {
    /* Letters are accepted by ALPHA and by the matching case-specific matcher. */
    assert(is_ALPHA('A'));
    assert(is_ALPHA_UPPER('A'));
    assert(is_ALPHA('a'));
    assert(is_ALPHA_LOWER('a'));

    /* Upper ends of both letter ranges. */
    assert(is_ALPHA('Z'));
    assert(is_ALPHA('z'));

    /* Punctuation and blanks are not letters. */
    assert(!(is_ALPHA(',')));
    assert(!(is_ALPHA(' ')));

    /* Both ends of the digit range are accepted; blanks and letters are not. */
    assert(is_DIGIT('0'));
    assert(is_DIGIT('9'));
    assert(!(is_DIGIT(' ')));
    assert(!(is_DIGIT('K')));
}
/* Name buffer: a fixed-size, NUL-terminated scratch area that accumulates
 * the text of the token currently being scanned. The last byte of the array
 * is never written by gnb_add_character, so the contents are always a valid
 * C string of at most sizeof(global_name_buffer) - 1 characters. */
static char global_name_buffer[4096];

/* Empties the buffer by zero-filling it. */
void gnb_reset(void) {
    memset(global_name_buffer, 0, sizeof global_name_buffer);
}

/* Appends `character` to the buffered text.
 *
 * The free slot is the first NUL byte. Only the first size - 1 bytes are
 * searched, so the terminating NUL in the last byte is preserved. When the
 * buffer is already full the character is dropped instead of writing through
 * a null pointer or destroying the terminator. */
void gnb_add_character(int character) {
    char *slot = memchr(global_name_buffer, 0, sizeof global_name_buffer - 1);
    if (slot == NULL) {
        return;
    }
    *slot = (char)character;
}

/* Returns a freshly malloc'ed copy of the buffered text, or NULL when the
 * allocation fails. The caller owns the result and must free() it. */
char* gnb_to_heap(void) {
    size_t size = strlen(global_name_buffer) + 1; /* text plus terminator */
    char *heap = malloc(size);
    if (heap == NULL) {
        return NULL;
    }
    memcpy(heap, global_name_buffer, size);
    return heap;
}
/* States of the character-at-a-time tokenizer. */
enum {
    ST_FRESH = 0,           /* not inside any token */
    ST_COMMENT = 1,         /* inside a comment */
    ST_NAME = 2,            /* inside a name */
    ST_STRING = 3,          /* inside a string literal */
    ST_STRING_ESCAPE = 4,   /* just after an escape introducer in a string */
    ST_NUMBER = 5,          /* inside a number, no '.' seen yet */
    ST_NUMBER_FLOATING = 6, /* inside a number, after a '.' */
};

/* Current tokenizer state; scanning starts outside any token. */
int tokenizer_state = ST_FRESH;
/* Token kinds passed to the token callback. */
enum {
    TK_NONE = 0,
    TK_NAME = 1,
    TK_STRING = 2,
    TK_INTEGER = 3,
    TK_FLOATING = 4,
    TK_COMMA = 5,
    TK_SEMICOLON = 6,
    TK_LPARENTHESIS = 7,
    TK_RPARENTHESIS = 8,
    TK_LBRACKET = 9,
    TK_RBRACKET = 10,
};
/* ONECHTK(KIND): if is_KIND(character) holds, report TK_KIND through `token`
 * and return 1 from the enclosing function. Expects `character` and `token`
 * to be in scope where it expands; used without a trailing ';'. */
#define ONECHTK(kind) \
    if (is_ ## kind(character)) { \
        token(TK_ ## kind); \
        return 1; \
    }
//IT'S FRESH, FRESH | |
int match_one_char(int character, void (*token)(int)) { | |
DBG | |
ONECHTK(COMMA) | |
ONECHTK(SEMICOLON) | |
ONECHTK(LPARENTHESIS) | |
ONECHTK(RPARENTHESIS) | |
ONECHTK(LBRACKET) | |
ONECHTK(RBRACKET) | |
return 0; | |
} | |
int match_whitespace(int character, void (*token)(int)) { | |
DBG | |
if (is_WHITESPACE(character)) { | |
return 1; | |
} | |
return 0; | |
} | |
int match_name_head(int character, void (*token)(int)) { | |
DBG | |
if (is_NAME_HEAD(character)) { | |
DBG | |
tokenizer_state = ST_NAME; | |
gnb_reset(); | |
gnb_add_character(character); | |
return 1; | |
} | |
return 0; | |
} | |
int match_string_head(int character, void (*token)(int)) { | |
DBG | |
if (is_QUOTE(character)) { | |
tokenizer_state = ST_STRING; | |
gnb_reset(); | |
return 1; | |
} | |
return 0; | |
} | |
int match_comment_head(int character, void (*token)(int)) { | |
DBG | |
if (is_SLASH(character)) { | |
tokenizer_state = ST_COMMENT; | |
return 1; | |
} | |
return 0; | |
} | |
int match_number_head(int character, void (*token)(int)) { | |
DBG | |
if (is_DIGIT(character)) { | |
DBG | |
tokenizer_state = ST_NUMBER; | |
gnb_reset(); | |
gnb_add_character(character); | |
return 1; | |
} | |
return 0; | |
} | |
int match_fresh(int character, void (*token)(int)) { | |
DBG | |
return ( | |
match_one_char(character, token) || | |
match_whitespace(character, token) || | |
match_name_head(character, token) || | |
match_string_head(character, token) || | |
match_comment_head(character, token) || | |
match_number_head(character, token) | |
); | |
} | |
//It's name | |
int match_name_tail(int character, void (*token)(int)) { | |
DBG | |
if (is_NAME_TAIL(character)) { | |
gnb_add_character(character); | |
return 1; | |
} | |
tokenizer_state = ST_FRESH; | |
token(TK_NAME); | |
return match_fresh(character, token); | |
} | |
//It's string | |
int match_string_tail(int character, void (*token)(int)) { | |
DBG | |
if (is_QUOTE(character)) { | |
token(TK_STRING); | |
return 1; | |
} | |
if (is_SLASH(character)) { | |
tokenizer_state = ST_STRING_ESCAPE; | |
return 1; | |
} | |
gnb_add_character(character); | |
return 1; | |
} | |
int match_string_escape_tail(int character, void (*token)(int)) { | |
DBG | |
if (character == 'n') { | |
gnb_add_character('\n'); | |
tokenizer_state = ST_STRING; | |
return 1; | |
} | |
if (character == 't') { | |
gnb_add_character('\t'); | |
tokenizer_state = ST_STRING; | |
return 1; | |
} | |
if (character == 'v') { | |
gnb_add_character('\v'); | |
tokenizer_state = ST_STRING; | |
return 1; | |
} | |
if (character == '\\') { | |
gnb_add_character('\\'); | |
tokenizer_state = ST_STRING; | |
return 1; | |
} | |
if (character == '\'') { | |
gnb_add_character('\''); | |
tokenizer_state = ST_STRING; | |
return 1; | |
} | |
if (character == '\"') { | |
gnb_add_character('\"'); | |
tokenizer_state = ST_STRING; | |
return 1; | |
} | |
return 0; | |
} | |
int match_comment_tail(int character, void (*token)(int)) { | |
DBG | |
if (character == '/') { | |
tokenizer_state = ST_FRESH; | |
return 1; | |
} | |
return 1; | |
} | |
int match_number_tail(int character, void(*token)(int)) { | |
DBG | |
if (character == '.') { | |
gnb_add_character('.'); | |
tokenizer_state = ST_NUMBER_FLOATING; | |
return 1; | |
} | |
if (is_DIGIT(character)) { | |
gnb_add_character(character); | |
return 1; | |
} | |
tokenizer_state = ST_FRESH; | |
token(TK_INTEGER); | |
return match_fresh(character, token); | |
} | |
int match_token(int character, void (*token)(int)) { | |
DBG | |
switch (tokenizer_state) { | |
case ST_FRESH: | |
return match_fresh(character, token); | |
case ST_COMMENT: | |
return match_comment_tail(character, token); | |
case ST_STRING: | |
return match_string_tail(character, token); | |
case ST_STRING_ESCAPE: | |
return match_string_escape_tail(character, token); | |
case ST_NAME: | |
return match_name_tail(character, token); | |
case ST_NUMBER: | |
return match_number_tail(character, token); | |
case ST_NUMBER_FLOATING: | |
return match_number_tail(character, token); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment