/* Sterling compiler — lexer / tokenizer pass. */
#define LIST_IMPLEMENTATION
|
|
#include "list.h"
|
|
|
|
#include "../include/SterlingCompiler.h"
|
|
|
|
// Single-character symbols; RefineSymbols() splits each occurrence into its own token.
const char *SYMBOLS = ";(){}[]$%&*#@!?:,.<>|-+=~`^";
|
|
|
|
// Common C operators, consumed by MunchTokens().
// Order matters: put longer ones first if you add 3-char ops, so greedy
// munching prefers the longest match. Terminated by the {NULL, 0} sentinel.
MultiOp MUNCH_TABLE[] = {
    {"<<=", 3}, {">>=", 3},
    {"==", 2}, {"!=", 2}, {"<=", 2}, {">=", 2},
    {"++", 2}, {"--", 2}, {"->", 2}, {"+=", 2},
    {"-=", 2}, {"*=", 2}, {"/=", 2}, {"&&", 2}, {"||", 2},
    {"^=", 2}, {"<<", 2}, {">>", 2}, {"|=", 2}, {"&=", 2},
    {NULL, 0}
};
|
|
|
|
// Keyword registry consulted by IdentifyTokens() via exact string match.
// This can be expanded at runtime if you use a dynamic array instead of a
// static one. Terminated by the {NULL, TOK_NONE} sentinel.
KeywordEntry KEYWORD_TABLE[] = {
    {"if", TOK_KEY},
    {"else", TOK_KEY},
    {"while", TOK_KEY},
    {"return", TOK_KEY},
    {"int", TOK_KEY},
    {"float", TOK_KEY},
    {"void", TOK_KEY},
    {"include", TOK_PREPROC},
    {"define", TOK_PREPROC},
    {"@comptime",TOK_KEY}, // Your custom identifier
    // NOTE(review): '@' also appears in SYMBOLS, so RefineSymbols() will
    // split "@comptime" into "@" + "comptime" before this lookup can ever
    // match — verify the intended ordering of the passes.
    {NULL, TOK_NONE}
};
|
|
|
|
#ifndef strndup
/*
 * Portable fallback for POSIX strndup: return a heap-allocated copy of at
 * most `n` characters of `s`, always NUL-terminated. Returns NULL on OOM.
 * Caller owns the result and must free() it.
 *
 * BUGFIX: the previous version did memcpy(str, s, n) unconditionally,
 * reading `n` bytes even when `s` is shorter — an out-of-bounds read
 * whenever n > strlen(s). POSIX strndup stops at the first NUL.
 * Also guards against a NULL calloc result before copying.
 */
char *strndup(const char *s, size_t n) {
    size_t len = 0;
    while (len < n && s[len] != '\0') len++;   // strnlen: never read past NUL
    char *str = calloc(len + 1, sizeof(char));
    if (str) memcpy(str, s, len);
    return str;
}
#endif
|
|
|
|
|
|
/*
 * True when every character of the NUL-terminated string is whitespace.
 * An empty string vacuously counts as all-whitespace.
 */
bool IsWhitespace(const char *s) {
    for (const char *p = s; *p != '\0'; p++) {
        if (!isspace((unsigned char)*p)) {
            return false;
        }
    }
    return true;
}
|
|
|
|
void ClearTokens(void*arg) {
|
|
Token_t *tok = arg;
|
|
free(tok->data);
|
|
free(tok);
|
|
}
|
|
|
|
node_t* NewNode(void* data) {
|
|
node_t* n = calloc(1, sizeof(node_t));
|
|
if(n) n->data = data;
|
|
return n;
|
|
}
|
|
|
|
/*
 * Allocate a token owning a NUL-terminated copy of `len` bytes at `start`.
 * Returns NULL on allocation failure.
 *
 * BUGFIX: the malloc result was previously dereferenced without a check —
 * undefined behavior on OOM. Callers receiving NULL were crashing anyway;
 * now the failure is at least well-defined.
 */
Token_t* NewToken(const char* start, size_t len, TKN_CTX ctx) {
    Token_t *t = malloc(sizeof *t);
    if (!t) return NULL;
    t->data = strndup(start, len);   // ownership: freed by ClearTokens()
    t->size = len;
    t->ctx = ctx;
    return t;
}
|
|
|
|
/*
 * Append a token covering the half-open byte range [start, end) to `lst`
 * with context `ctx`. Empty or inverted ranges are silently ignored.
 */
void PushToken(list_t *lst, const char *start, const char *end, TKN_CTX ctx) {
    if (start >= end) {
        return;
    }
    size_t span = (size_t)(end - start);
    ListPushBack(lst, NewToken(start, span, ctx));
}
|
|
|
|
/*
 * Split the token held by `node` at byte offset `index`, in place:
 * `node` keeps the prefix [0, index) and a freshly spliced successor node
 * receives the suffix [index, size) as a new TOK_RAW token.
 * Assumes 0 < index <= t->size (callers in RefineSymbols guarantee this —
 * TODO confirm no other caller violates it).
 */
void ListSplitToken(list_t *lst, node_t *node, size_t index) {
    Token_t *t = (Token_t *)node->data;

    // Create the suffix node first, while t->data is still intact.
    Token_t *suffix = NewToken(t->data + index, t->size - index, TOK_RAW);
    node_t *new_node = NewNode(suffix);
    new_node->next = node->next;
    node->next = new_node;
    if (lst->last == node) lst->last = new_node;   // keep tail pointer valid
    lst->size++;

    // Truncate the original (prefix): re-duplicate before freeing so the
    // data stays NUL-terminated at the new length.
    char *new_prefix = strndup(t->data, index);
    free(t->data);
    t->data = new_prefix;
    t->size = index;
}
|
|
|
|
/*
 * Read an entire file into a freshly allocated, NUL-terminated buffer.
 * Aborts via assert() on open/seek/alloc failure, matching this file's
 * existing error-handling style. Caller owns the buffer and must free() it.
 *
 * BUGFIX: the fread() result was ignored and the terminator written at the
 * ftell() size. In text mode a read may be shorter than the seek-derived
 * size (e.g. CRLF translation), which left an uninitialized gap before the
 * terminator; we now terminate at the count actually read.
 */
char *LoadFile(const char *filename) {
    FILE *file = fopen(filename, "r");
    assert(file);

    fseek(file, 0, SEEK_END);
    long size = ftell(file);
    assert(size >= 0);               // ftell() reports errors as -1
    fseek(file, 0, SEEK_SET);

    char *data = malloc((size_t)size + 1);
    assert(data);

    size_t nread = fread(data, 1, (size_t)size, file);
    data[nread] = '\0';

    fclose(file);
    return data;
}
|
|
|
|
|
|
/*
 * First lexing pass: walk the raw source and append tokens to `tkn_lst`:
 * runs of ordinary text become TOK_RAW, quoted literals become TOK_STRING,
 * and // and C-style comments are skipped entirely (no token emitted).
 */
void InitialScanner(char *data, list_t *tkn_lst) {
    char *curr = data, *start = data;

    while (*curr) {
        // Handle string / character literals.
        if (*curr == '\"' || *curr == '\'') {
            PushToken(tkn_lst, start, curr, TOK_RAW);   // flush pending raw text
            char *s_start = curr;    // opening quote position
            char q = *curr++;        // BUGFIX: remember the quote char itself.
                                     // The old `s_start = curr++, q = *curr`
                                     // read the char AFTER the quote, so the
                                     // scan stopped at the wrong character.
            while (*curr && *curr != q) {
                // Skip the escaped character, but never step past the NUL
                // (a trailing lone backslash previously walked off the end).
                if (*curr == '\\' && curr[1]) curr++;
                curr++;
            }
            if (*curr) curr++;       // consume the closing quote, if present
            PushToken(tkn_lst, s_start, curr, TOK_STRING);
            start = curr;
        }
        // Handle comments: flush pending raw text, then skip the comment.
        else if (*curr == '/' && (curr[1] == '/' || curr[1] == '*')) {
            PushToken(tkn_lst, start, curr, TOK_RAW);
            if (curr[1] == '/') {
                while (*curr && *curr != '\n') curr++;            // to end of line
            } else {
                curr += 2;
                while (*curr && !(*curr == '*' && curr[1] == '/')) curr++;
                if (*curr) curr += 2;                             // past "*/"
            }
            start = curr;
        }
        else curr++;
    }
    PushToken(tkn_lst, start, curr, TOK_RAW);   // trailing raw text, if any
}
|
|
|
|
/*
 * Split every TOK_RAW token around SYMBOLS characters so that each symbol
 * ends up in its own single-character token. Splitting happens in place via
 * ListSplitToken(); the cursor deliberately stays put after a split so the
 * newly created suffix is re-examined on the next iteration.
 */
void RefineSymbols(list_t *tkn_lst) {
    for (node_t *curr = tkn_lst->first; curr; ) {
        Token_t *t = curr->data;
        // Skip non-raw tokens, and raw tokens already reduced to one symbol
        // (otherwise a single-symbol token would be split forever).
        if (t->ctx != TOK_RAW || (t->size == 1 && strchr(SYMBOLS, t->data[0]))) {
            curr = curr->next;
            continue;
        }

        // Offset of the first symbol character, or t->size if none remain.
        size_t pos = strcspn(t->data, SYMBOLS);
        if (pos < t->size) {
            // Symbol at the front: cut after it (length-1 prefix).
            // Symbol further in: cut just before it.
            ListSplitToken(tkn_lst, curr, (pos == 0) ? 1 : pos);
            // Don't move curr yet, we might have more symbols in the suffix
        } else {
            curr = curr->next;
        }
    }
}
|
|
|
|
/*
 * Merge adjacent single-character tokens into multi-character operators
 * per MUNCH_TABLE, upgrading the merged token to TOK_OP. After a merge the
 * cursor stays put so a grown operator can munch again ("<<" + "=" → "<<=").
 *
 * BUGFIX: the old merge condition required t1->size == 1, so an operator
 * that had just been merged to size 2 could never absorb a third character
 * — the 3-char MUNCH_TABLE entries ("<<=", ">>=") were unreachable even
 * though the `continue` was written for exactly that case.
 */
void MunchTokens(list_t *lst) {
    for (node_t *n = lst->first; n && n->next; ) {
        Token_t *t1 = n->data, *t2 = n->next->data;

        // t2 must be a lone raw character; t1 may be a lone raw character
        // or an operator we already munched (at most 2 chars, so the
        // candidate never exceeds the table's 3-char maximum).
        bool candidate = t2->ctx == TOK_RAW && t2->size == 1 &&
                         (t1->ctx == TOK_OP || (t1->ctx == TOK_RAW && t1->size == 1)) &&
                         t1->size < 3;

        if (candidate) {
            char op[4] = {0};
            memcpy(op, t1->data, t1->size);
            op[t1->size] = t2->data[0];

            bool match = false;
            for (int i = 0; MUNCH_TABLE[i].op; i++) {
                if (strcmp(op, MUNCH_TABLE[i].op) == 0) { match = true; break; }
            }

            if (match) {
                size_t merged = t1->size + 1;
                free(t1->data);
                t1->data = strndup(op, merged);
                t1->size = merged;
                t1->ctx = TOK_OP;   // upgrade to operator context

                // Unlink and free the absorbed node.
                node_t *tmp = n->next;
                n->next = tmp->next;
                if (lst->last == tmp) lst->last = n;
                ClearTokens(tmp->data);
                free(tmp);
                lst->size--;
                continue; // the grown operator may munch again (e.g. ">>=")
            }
        }
        n = n->next;
    }
}
|
|
|
|
|
|
/*
 * Split each TOK_RAW token on whitespace (" \t\r\n" via strtok_r): the
 * first word replaces the token's payload in place, and every further word
 * is spliced in as a fresh node directly after it, preserving source order.
 * A raw token containing no word at all (pure whitespace) is left
 * untouched here — in main() PruneWhitespaceNodes runs first, so that case
 * is presumably already gone (TODO confirm if the pass order changes).
 */
void RefineRawNodes(list_t *tkn_lst) {
    node_t *curr = tkn_lst->first;

    while (curr) {
        Token_t *t = (Token_t *)curr->data;
        if (t->ctx == TOK_RAW) {
            char *span = NULL;
            // strtok_r mutates its input, so tokenize a scratch copy.
            char *to_split = strndup(t->data, t->size);
            char *tok = strtok_r(to_split, " \t\r\n", &span);

            if (tok) {
                // First word: swap it into the existing token.
                free(t->data);
                t->size = strlen(tok);
                t->data = strndup(tok, t->size);

                node_t *last_inserted = curr;
                tok = strtok_r(NULL, " \t\r\n", &span);

                // Remaining words: one new node each, spliced after the
                // previously inserted node.
                while (tok) {
                    Token_t *new_t = calloc(1, sizeof(Token_t));
                    new_t->size = strlen(tok);
                    new_t->data = strndup(tok, new_t->size);
                    new_t->ctx = TOK_RAW;

                    node_t *new_node = calloc(1, sizeof(node_t));
                    new_node->data = new_t;

                    new_node->next = last_inserted->next;
                    last_inserted->next = new_node;

                    // Keep the list's tail pointer valid.
                    if (tkn_lst->last == last_inserted) tkn_lst->last = new_node;

                    last_inserted = new_node;
                    tkn_lst->size++;
                    tok = strtok_r(NULL, " \t\r\n", &span);
                }
                // Resume the walk after the last node we inserted so the
                // new nodes are not re-split.
                curr = last_inserted;
            }
            free(to_split);
        }
        curr = curr->next;
    }
}
|
|
|
|
/*
 * Remove every TOK_RAW node whose text is entirely whitespace: unlink it,
 * free its token and node, and keep `first`, `last` and `size` consistent.
 */
void PruneWhitespaceNodes(list_t *lst) {
    node_t *curr = lst->first;
    node_t *prev = NULL;

    while (curr) {
        Token_t *t = (Token_t *)curr->data;
        if (t->ctx == TOK_RAW && IsWhitespace(t->data)) {
            // Unlink first, advance the cursor, and only then free the
            // node we were standing on — this ordering is load-bearing.
            node_t *temp = curr;
            if (prev) prev->next = curr->next;
            else lst->first = curr->next;

            if (lst->last == temp) lst->last = prev;   // removed the tail

            curr = curr->next;
            ClearTokens(temp->data);
            free(temp);
            lst->size--;
        } else {
            prev = curr;
            curr = curr->next;
        }
    }
}
|
|
|
|
/*
 * Classify the remaining TOK_RAW tokens: exact matches against
 * KEYWORD_TABLE take that entry's context (keyword / preprocessor), and
 * anything else starting with a letter, '_' or '@' becomes TOK_ID.
 * Tokens matching neither rule keep TOK_RAW.
 */
void IdentifyTokens(list_t *lst) {
    for (node_t *curr = lst->first; curr; curr = curr->next) {
        Token_t *t = (Token_t *)curr->data;

        if (t->ctx != TOK_RAW) continue;

        bool found = false;
        // 1. Check against the keyword registry.
        for (int i = 0; KEYWORD_TABLE[i].name != NULL; i++) {
            if (strcmp(t->data, KEYWORD_TABLE[i].name) == 0) {
                t->ctx = KEYWORD_TABLE[i].ctx;
                found = true;
                break;
            }
        }

        // 2. If not a keyword, is it a valid identifier? (e.g., my_var_1)
        // BUGFIX: cast to unsigned char before isalpha() — passing a
        // plain char that may be negative is undefined behavior
        // (CERT STR37-C), e.g. for bytes >= 0x80 when char is signed.
        if (!found && t->size > 0) {
            if (isalpha((unsigned char)t->data[0]) || t->data[0] == '_' || t->data[0] == '@') {
                t->ctx = TOK_ID;
            }
        }
    }
}
|
|
|
|
/*
|
|
// Modular function to register new identifiers
|
|
void RegisterIdentifier(const char *name, TKN_CTX type) {
|
|
// In a professional compiler, you'd insert this into a Hash Map.
|
|
// For now, it's enough to know this is where user-defined types go.
|
|
}
|
|
*/
|
|
|
|
int main(int ac, char **av) {
|
|
if (ac <= 1) return printf("No file specified\n"), -1;
|
|
char* data = LoadFile(av[1]);
|
|
list_t *tkn_lst = ListInit(NULL);
|
|
|
|
InitialScanner(data, tkn_lst);
|
|
PruneWhitespaceNodes(tkn_lst);
|
|
RefineRawNodes(tkn_lst);
|
|
RefineSymbols(tkn_lst);
|
|
MunchTokens(tkn_lst);
|
|
IdentifyTokens(tkn_lst);
|
|
|
|
list_iter_t iter = ListGetIter(tkn_lst);
|
|
while (iter.current) {
|
|
Token_t *t = (Token_t *)iter.current->data;
|
|
printf("[%02X] %-10s | %s\n", t->ctx,
|
|
(t->ctx == TOK_ID ? "IDENTIFIER" : "TOKEN"), t->data);
|
|
iter.current = iter.current->next;
|
|
}
|
|
//pass on ";(){}[]$%&*$#@!?:,.<>|_-+=~`"
|
|
//and give each token a context
|
|
//let's replace preprocessor (include, define, etc)
|
|
//let's do recursive parsing everywhere that need it
|
|
//compile time reflection (@comptime or @reflect)
|
|
//metaprogramming logic annotation if i do it lastly** may not be
|
|
ListFree(tkn_lst, ClearTokens);
|
|
free(data);
|
|
return(0);
|
|
}
|
|
//test
|