// SterlingCompiler tokenizer — extraction metadata: 587 lines, 17 KiB, C

#define LIST_IMPLEMENTATION
#include "list.h"
#include "../include/SterlingCompiler.h"
# ifndef strndup
// Fallback strndup for platforms that do not provide the POSIX function.
// Copies at most n bytes of s, stopping at the first NUL, and returns a
// freshly allocated NUL-terminated string (caller frees). Returns NULL on OOM.
char *strndup(const char *s, size_t n) {
    size_t len = 0;
    // Never read past a string shorter than n (the old memcpy(str, s, n) did).
    while (len < n && s[len] != '\0') len++;
    char *str = malloc(len + 1);
    if (!str) return NULL;
    memcpy(str, s, len);
    str[len] = '\0';
    return str;
}
# endif
// Read the whole file into a freshly allocated NUL-terminated buffer.
// Caller owns the returned memory and must free() it.
// Errors abort via assert(), matching the file's existing error style.
char *LoadFile(const char *filename) {
    FILE *file = fopen(filename, "rb"); // binary mode: ftell size == bytes readable
    assert(file);
    fseek(file, 0, SEEK_END);
    long size = ftell(file);
    assert(size >= 0); // ftell reports failure as -1
    fseek(file, 0, SEEK_SET);
    char *data = malloc((size_t)size + 1);
    assert(data);
    // fread may legitimately return fewer bytes; terminate at what we got
    // instead of leaving uninitialized garbage before the NUL.
    size_t got = fread(data, 1, (size_t)size, file);
    data[got] = '\0';
    fclose(file);
    return data;
}
// True when every character of s is whitespace (vacuously true for "").
bool IsWhitespace(const char *s) {
    for (const char *p = s; *p != '\0'; p++) {
        if (!isspace((unsigned char)*p))
            return false;
    }
    return true;
}
// True when s is non-NULL, non-empty, and consists only of decimal digits.
bool IsNumeric(const char *s) {
    if (s == NULL || *s == '\0')
        return false;
    while (*s != '\0') {
        if (!isdigit((unsigned char)*s))
            return false;
        s++;
    }
    return true;
}
// Heuristic: does this token look like the start of a numeric literal?
bool IsComplexNumeric(const char *s, size_t len) {
    if (len == 0)
        return false;
    if (isdigit((unsigned char)s[0]))
        return true; // most numbers begin with a digit
    // A leading dot only counts when a digit follows (e.g. ".5").
    return s[0] == '.' && len > 1 && isdigit((unsigned char)s[1]);
}
// Map the third character of a "??X" sequence to its trigraph replacement,
// or '\0' when X does not complete a trigraph.
static char TrigraphChar(char c) {
    switch (c) {
    case '=':  return '#';
    case '/':  return '\\';
    case '\'': return '^';
    case '(':  return '[';
    case ')':  return ']';
    case '!':  return '|';
    case '<':  return '{';
    case '>':  return '}';
    case '-':  return '~';
    default:   return '\0';
    }
}
// Collapse ISO trigraph sequences in place ("??=" -> '#', "??/" -> '\', ...).
// Non-trigraph text is copied through unchanged.
void ResolveTrigraphs(char *data) {
    char *read = data;
    char *write = data;
    while (*read != '\0') {
        if (read[0] == '?' && read[1] == '?' && read[2] != '\0') {
            char repl = TrigraphChar(read[2]);
            if (repl != '\0') {
                *write++ = repl;
                read += 3;
                continue;
            }
        }
        *write++ = *read++;
    }
    *write = '\0';
}
// Parser state: a cursor over the token stream plus a sticky error flag.
typedef struct {
    list_iter_t iter;  // cursor into 'tokens'; iter.current is the next unconsumed node
    list_t *tokens;    // the full token list produced by the lexer passes
    bool error;        // set by Expect() on a syntax error; checked by Parse()
} Parser_t;
// Build a parser positioned at the head of the given token list.
Parser_t ParserInit(list_t *lst) {
    Parser_t parser;
    parser.iter = ListGetIter(lst);
    parser.tokens = lst;
    parser.error = false;
    return parser;
}
// Return the token under the cursor without consuming it; NULL at end of stream.
Token_t* Peek(Parser_t *p) {
    node_t *node = p->iter.current;
    return node != NULL ? (Token_t*)node->data : NULL;
}
// Step the cursor to the next token node; no-op once the stream is exhausted.
void Advance(Parser_t *p) {
    node_t *node = p->iter.current;
    if (node != NULL)
        p->iter.current = node->next;
}
// Consume the current token when it matches 'ctx' (and 'val' when non-NULL).
// Returns true and advances on a match; otherwise leaves the cursor untouched.
bool Match(Parser_t *p, TKN_CTX ctx, const char *val) {
    Token_t *tok = Peek(p);
    if (tok == NULL || tok->ctx != ctx)
        return false;
    if (val != NULL && strcmp(tok->data, val) != 0)
        return false;
    Advance(p);
    return true;
}
// Required token: consume and return it when it matches 'ctx' (and 'val' when
// non-NULL); otherwise report a syntax error, set p->error, and return NULL.
Token_t* Expect(Parser_t *p, TKN_CTX ctx, const char *val) {
    Token_t *t = Peek(p);
    if (!t || (t->ctx != ctx) || (val && strcmp(t->data, val) != 0)) {
        // Diagnostics go to stderr so they are not lost when stdout is piped.
        fprintf(stderr, "Syntax Error: Expected '%s', but found '%s'\n",
                val ? val : "specific type", t ? t->data : "EOF");
        p->error = true;
        return NULL;
    }
    Advance(p);
    return t;
}
// list_t element destructor: frees a Token_t and the string it owns.
void ClearTokens(void *arg) {
    Token_t *tok = arg;
    if (!tok) return; // tolerate NULL elements, mirroring free(NULL) semantics
    free(tok->data);
    free(tok);
}
node_t* NewNode(void* data) {
node_t* n = calloc(1, sizeof(node_t));
if(n) n->data = data;
return n;
}
// Build a heap-allocated token from the first 'len' bytes at 'start'.
// Returns NULL if any allocation fails (the original dereferenced an
// unchecked malloc).
Token_t* NewToken(const char* start, size_t len, TKN_CTX ctx) {
    Token_t *t = malloc(sizeof *t);
    if (!t) return NULL;
    t->data = strndup(start, len);
    if (!t->data) { free(t); return NULL; }
    t->size = len;
    t->ctx = ctx;
    return t;
}
// Append the half-open byte span [start, end) to the list as a token of type
// 'ctx'. Empty or inverted spans are silently ignored.
void PushToken(list_t *lst, const char *start, const char *end, TKN_CTX ctx) {
    if (start < end) {
        ListPushBack(lst, NewToken(start, end - start, ctx));
    }
}
// Assign a final context to every still-RAW token, in priority order:
// keywords, numeric literals, identifiers, then single symbols.
void IdentifyTokens(list_t *lst) {
    for (node_t *curr = lst->first; curr; curr = curr->next) {
        Token_t *t = (Token_t *)curr->data;
        // Skip nodes that were already identified (like TOK_STRING or munched TOK_OP)
        if (t->ctx != TOK_RAW) continue;
        // 1. Check Keyword Registry (Highest Priority)
        bool found = false;
        for (int i = 0; KEYWORD_TABLE[i].name != NULL; i++) {
            if (strcmp(t->data, KEYWORD_TABLE[i].name) == 0) {
                t->ctx = KEYWORD_TABLE[i].ctx;
                found = true;
                break;
            }
        }
        if (found) continue;
        // 2. Numeric literals (0x..., 3.14, 100L). Cast to unsigned char:
        // isdigit() on a plain (possibly signed) char with a negative value is UB.
        if (isdigit((unsigned char)t->data[0]) ||
            (t->data[0] == '.' && t->size > 1 && isdigit((unsigned char)t->data[1]))) {
            t->ctx = TOK_NUM;
            continue;
        }
        // 3. Identifiers (my_var, @comptime)
        if (isalpha((unsigned char)t->data[0]) || t->data[0] == '_' || t->data[0] == '@') {
            t->ctx = TOK_ID;
            continue;
        }
        // 4. Operators/Symbols (;, +, -, #): anything listed in SYMBOLS.
        if (strchr(SYMBOLS, t->data[0])) {
            // Special case for '#' which triggers the preprocessor
            if (t->data[0] == '#') t->ctx = TOK_PREPROC;
            else t->ctx = TOK_OP;
            continue;
        }
    }
}
// Placeholder transformation pass: scans for identifier tokens spelled "int".
// Performs no rewriting yet; kept as the hook point for type aliasing
// (e.g. turning 'int' into the sequence ': 4' in a later pass).
void ApplyTypeAliases(list_t *lst) {
    node_t *node = lst->first;
    while (node != NULL) {
        Token_t *tok = node->data;
        if (tok->ctx == TOK_ID && strcmp(tok->data, "int") == 0) {
            // TODO: token transformation goes here.
        }
        node = node->next;
    }
}
// Split the token held by 'node' at byte 'index': the node keeps bytes
// [0, index) and a freshly inserted successor node receives bytes [index, size).
void ListSplitToken(list_t *lst, node_t *node, size_t index) {
    Token_t *tok = (Token_t *)node->data;
    // Build the suffix token and splice its node in directly after 'node'.
    Token_t *suffix_tok = NewToken(tok->data + index, tok->size - index, TOK_RAW);
    node_t *suffix_node = NewNode(suffix_tok);
    suffix_node->next = node->next;
    node->next = suffix_node;
    if (lst->last == node)
        lst->last = suffix_node;
    lst->size++;
    // Shrink the original token down to the prefix.
    char *prefix = strndup(tok->data, index);
    free(tok->data);
    tok->data = prefix;
    tok->size = index;
}
// First lexing pass: slice the raw source into TOK_STRING tokens (string/char
// literals, including wide L"..." / L'...') and TOK_RAW spans for everything
// else, while discarding comments entirely. Later passes refine the RAW spans.
void InitialScanner(char *data, list_t *tkn_lst) {
    char *curr = data, *start = data; // [start, curr) is the pending raw span
    while (*curr) {
        // 1. Handle Wide or Normal Strings/Chars
        // Check for 'L' followed immediately by a quote
        bool is_wide = (*curr == 'L' && (curr[1] == '\"' || curr[1] == '\''));
        if (*curr == '\"' || *curr == '\'' || is_wide) {
            PushToken(tkn_lst, start, curr, TOK_RAW); // flush text before the literal
            char *s_start = curr;
            if (is_wide) curr++; // Advance past 'L'
            char q = *curr; // remember which quote character closes this literal
            curr++; // Skip opening quote
            while (*curr && *curr != q) {
                if (*curr == '\\' && curr[1]) curr++; // Skip escaped char
                curr++;
            }
            if (*curr) curr++; // Skip closing quote (may be missing at EOF)
            PushToken(tkn_lst, s_start, curr, TOK_STRING);
            start = curr;
        }
        // 2. Handle Comments (dropped: no token is emitted for their text)
        else if (*curr == '/' && (curr[1] == '/' || curr[1] == '*')) {
            PushToken(tkn_lst, start, curr, TOK_RAW); // flush text before the comment
            if (curr[1] == '/') { while (*curr && *curr != '\n') curr++; } // line comment
            else {
                curr += 2; // skip "/*"
                while (*curr && !(*curr == '*' && curr[1] == '/')) curr++;
                if (*curr) curr += 2; // skip "*/" unless the comment is unterminated
            }
            start = curr;
        }
        else curr++;
    }
    PushToken(tkn_lst, start, curr, TOK_RAW); // flush the trailing raw span
}
// Break RAW tokens apart at symbol characters so every punctuation character
// from SYMBOLS ends up in its own one-character token. Splitting happens in
// place; the current node is re-examined after each split, so a token with
// several symbols is peeled apart over multiple iterations.
void RefineSymbols(list_t *tkn_lst) {
    for (node_t *curr = tkn_lst->first; curr; ) {
        Token_t *t = curr->data;
        //IsComplexNumeric(t->data, t->size) ||
        // Already-typed tokens and lone symbol characters are final: skip them.
        if (t->ctx != TOK_RAW || (t->size == 1 && strchr(SYMBOLS, t->data[0]))) {
            curr = curr->next;
            continue;
        }
        // Index of the first symbol character in this token (== size when none).
        size_t pos = strcspn(t->data, SYMBOLS);
        if (pos < t->size) {
            // Split before the symbol; when the token *starts* with a symbol,
            // split after one character so the symbol becomes its own token.
            ListSplitToken(tkn_lst, curr, (pos == 0) ? 1 : pos);
            // Don't move curr yet, we might have more symbols in the suffix
        } else {
            curr = curr->next;
        }
    }
}
// Merge adjacent [digits] [.] [digits] token triples into one TOK_NUM float
// token (e.g. "3" "." "14" -> "3.14"), unlinking and freeing the two absorbed
// nodes.
void MunchFloats(list_t *lst) {
    for (node_t *n = lst->first; n && n->next && n->next->next; ) {
        Token_t *t1 = n->data, *dot = n->next->data, *t2 = n->next->next->data;
        // Look for [Digit] [.] [Digit]. Cast to unsigned char: isdigit() on a
        // negative plain char is undefined behavior.
        if (isdigit((unsigned char)t1->data[0]) &&
            dot->data[0] == '.' && dot->size == 1 &&
            isdigit((unsigned char)t2->data[0])) {
            size_t new_size = t1->size + 1 + t2->size;
            char *buf = malloc(new_size + 1);
            if (!buf) { n = n->next; continue; } // on OOM, leave the triple unmerged
            sprintf(buf, "%s.%s", t1->data, t2->data);
            free(t1->data);
            t1->data = buf;
            t1->size = new_size;
            t1->ctx = TOK_NUM; // Mark it now so later passes skip it
            // Unlink and free the '.' node and the fraction node.
            for (int i = 0; i < 2; i++) {
                node_t *rem = n->next;
                n->next = rem->next;
                if (lst->last == rem) lst->last = n;
                ClearTokens(rem->data);
                free(rem);
                lst->size--;
            }
            continue; // re-examine n: it may now start another pattern
        }
        n = n->next;
    }
}
// Merge scientific-notation triples like "1e" "-" "5" into a single "1e-5"
// token. The head token must actually look numeric: without that guard the
// original fused any word ending in 'e' with a sign and digits (e.g.
// "base" "-" "5" became "base-5").
void MunchScientificNotation(list_t *lst) {
    for (node_t *n = lst->first; n && n->next && n->next->next; ) {
        Token_t *t1 = n->data;
        Token_t *op = n->next->data;
        Token_t *t2 = n->next->next->data;
        // A valid head is RAW, at least "1e" long, and starts like a number.
        if (t1->ctx == TOK_RAW && t1->size >= 2) {
            char last = tolower((unsigned char)t1->data[t1->size - 1]);
            bool numeric_head = isdigit((unsigned char)t1->data[0]) ||
                (t1->data[0] == '.' && isdigit((unsigned char)t1->data[1]));
            if (last == 'e' && numeric_head &&
                (op->data[0] == '+' || op->data[0] == '-') && op->size == 1 &&
                isdigit((unsigned char)t2->data[0])) {
                // Found a match (e.g. "1e" + "-" + "5"): fuse the three texts.
                size_t new_size = t1->size + op->size + t2->size;
                char *new_data = malloc(new_size + 1);
                if (!new_data) { n = n->next; continue; } // on OOM, skip the merge
                sprintf(new_data, "%s%s%s", t1->data, op->data, t2->data);
                free(t1->data);
                t1->data = new_data;
                t1->size = new_size;
                t1->ctx = TOK_NUM; // mark now, consistent with MunchFloats
                // Unlink and free the sign node and the exponent-digits node.
                for (size_t i = 0; i < 2; i++) {
                    node_t *to_remove = n->next;
                    n->next = to_remove->next;
                    if (lst->last == to_remove) lst->last = n;
                    ClearTokens(to_remove->data);
                    free(to_remove);
                    lst->size--;
                }
                // Re-examine this same node in case a new pattern formed.
                continue;
            }
        }
        n = n->next;
    }
}
// Greedy "maximal munch": collapse runs of single-character tokens into the
// longest multi-character operator found in MUNCH_TABLE (e.g. '+' '+' -> "++").
// After a successful munch the same node is revisited, since the consolidation
// may enable another table entry to match.
void MunchTokens(list_t *lst) {
    node_t *curr = lst->first;
    while (curr) {
        Token_t *t1 = curr->data;
        // Only raw text or existing operator tokens can start a munch.
        if (t1->ctx != TOK_RAW && t1->ctx != TOK_OP) {
            curr = curr->next;
            continue;
        }
        bool matched = false;
        for (size_t i = 0; MUNCH_TABLE[i].op; i++) {
            size_t len = MUNCH_TABLE[i].len;
            // 1. Peek ahead to see if we have enough nodes
            node_t *temp = curr;
            char buffer[5] = {0}; // Max munch is 4 — assumes every MUNCH_TABLE len <= 4; verify in SterlingCompiler.h
            size_t nodes_found = 0;
            for (size_t j = 0; j < len && temp; j++) {
                Token_t *tk = temp->data;
                if (tk->size != 1) break; // Multi-char tokens can't be part of a new munch
                buffer[j] = tk->data[0];
                temp = temp->next;
                nodes_found++;
            }
            // 2. Compare buffer to table entry
            if (nodes_found == len && strcmp(buffer, MUNCH_TABLE[i].op) == 0) {
                // SUCCESS: Consolidate 'len' nodes into 'curr'
                free(t1->data);
                t1->data = strndup(MUNCH_TABLE[i].op, len);
                t1->size = len;
                // NOTE(review): '%'-prefixed entries are treated as preprocessor
                // triggers (digraph-style?) — confirm against MUNCH_TABLE contents.
                t1->ctx = (MUNCH_TABLE[i].op[0] == '%') ? TOK_PREPROC : TOK_OP;
                // Remove the 'tail' nodes that were folded into curr.
                for (size_t j = 1; j < len; j++) {
                    node_t *to_remove = curr->next;
                    curr->next = to_remove->next;
                    if (lst->last == to_remove) lst->last = curr;
                    ClearTokens(to_remove->data);
                    free(to_remove);
                    lst->size--;
                }
                matched = true;
                break;
            }
        }
        // If we munched, stay on 'curr' to see if a new sequence formed
        if (!matched) curr = curr->next;
    }
}
// Split every RAW token on whitespace, turning one blob like "int x =" into
// separate RAW tokens "int", "x", "=". New nodes are spliced in directly after
// the current node so stream order is preserved.
// NOTE(review): relies on strtok_r, which is POSIX rather than ISO C.
void RefineRawNodes(list_t *tkn_lst) {
    node_t *curr = tkn_lst->first;
    //node_t *prev = NULL;
    while (curr) {
        Token_t *t = (Token_t *)curr->data;
        if (t->ctx == TOK_RAW) {
            char *span = NULL;
            char *to_split = strndup(t->data, t->size); // strtok_r mutates, so work on a copy
            char *tok = strtok_r(to_split, " \t\r\n", &span);
            if (tok) {
                // The first word replaces the current token's text in place.
                free(t->data);
                t->size = strlen(tok);
                t->data = strndup(tok, t->size);
                node_t *last_inserted = curr;
                tok = strtok_r(NULL, " \t\r\n", &span);
                // Each subsequent word becomes a fresh RAW node after last_inserted.
                while (tok) {
                    Token_t *new_t = calloc(1, sizeof(Token_t));
                    new_t->size = strlen(tok);
                    new_t->data = strndup(tok, new_t->size);
                    new_t->ctx = TOK_RAW;
                    node_t *new_node = calloc(1, sizeof(node_t));
                    new_node->data = new_t;
                    new_node->next = last_inserted->next;
                    last_inserted->next = new_node;
                    if (tkn_lst->last == last_inserted) tkn_lst->last = new_node;
                    last_inserted = new_node;
                    tkn_lst->size++;
                    tok = strtok_r(NULL, " \t\r\n", &span);
                }
                curr = last_inserted; // resume scanning after the inserted run
            }
            // All-whitespace tokens keep their text; PruneWhitespaceNodes drops them.
            free(to_split);
        }
        //prev = curr;
        curr = curr->next;
    }
}
// Remove and free every RAW token that consists entirely of whitespace
// (the gaps left between string/comment spans by the initial scan).
void PruneWhitespaceNodes(list_t *lst) {
    node_t *prev = NULL;
    node_t *node = lst->first;
    while (node != NULL) {
        node_t *next = node->next;
        Token_t *tok = (Token_t *)node->data;
        if (tok->ctx == TOK_RAW && IsWhitespace(tok->data)) {
            // Unlink from the list, then release the token and its node.
            if (prev == NULL)
                lst->first = next;
            else
                prev->next = next;
            if (lst->last == node)
                lst->last = prev;
            ClearTokens(node->data);
            free(node);
            lst->size--;
        } else {
            prev = node;
        }
        node = next;
    }
}
/*
// Modular function to register new identifiers
void RegisterIdentifier(const char *name, TKN_CTX type) {
//insert this into a Hash Map.
//this is where user-defined types go.
}
*/
// Grammar: var <identifier> ':' <number> ';'
// Called with the 'var' keyword already consumed; stops at the first
// syntax error (Expect sets p->error).
void ParseVarDeclaration(Parser_t *p) {
    Token_t *name = Expect(p, TOK_ID, NULL); // variable name
    if (p->error) return;
    if (Expect(p, TOK_OP, ":") == NULL) return; // name/size separator
    Token_t *size = Expect(p, TOK_NUM, NULL); // size in bytes
    if (p->error) return;
    printf("Defined variable '%s' with size %s bytes.\n", name->data, size->data);
    Expect(p, TOK_OP, ";"); // declaration terminator
}
// Top-level parse loop: dispatch on the current token until end of stream or
// the first recorded error.
void Parse(Parser_t *p) {
    Token_t *t;
    while (!p->error && (t = Peek(p)) != NULL) {
        if (t->ctx == TOK_KEY && strcmp(t->data, "var") == 0) {
            Advance(p); // consume the 'var' keyword
            ParseVarDeclaration(p);
        } else {
            printf("Unknown token: %s\n", t->data);
            Advance(p);
        }
    }
}
// Human-readable name of a token context, used by the debug dump in main().
// NOTE(review): matching uses bitwise '&', which treats TKN_CTX values as bit
// flags — assumes each TOK_* constant is a distinct bit; confirm in
// SterlingCompiler.h. If TOK_NONE == 0, 'ctx & TOK_NONE' is always false and
// the "NONE" branch is unreachable (such values fall through to "UNKNOWN").
const char* CtxToString(TKN_CTX ctx) {
    if (ctx & TOK_KEY) return "KEYWORD";
    if (ctx & TOK_ID) return "IDENTIFIER";
    if (ctx & TOK_NUM) return "NUMBER";
    if (ctx & TOK_OP) return "OPERATOR";
    if (ctx & TOK_STRING) return "STRING";
    if (ctx & TOK_PREPROC) return "PREPROCESS";
    if (ctx & TOK_COMMENT) return "COMMENT";
    if (ctx & TOK_RAW) return "RAW";
    if (ctx & TOK_LITERAL) return "LITERAL";
    if (ctx & TOK_NONE) return "NONE";
    return "UNKNOWN";
}
/*
TODO roadmap:
- pass on ";(){}[]$%&*$#@!?:,.<>|_-+=~`" and give each token a context
- replace preprocessor directives (include, define, etc.)
- do recursive parsing everywhere that needs it
- compile-time reflection (@comptime or @reflect)
- metaprogramming logic annotation comes last, if at all
*/
// Driver: load the file named on the command line, run every lexer pass in
// order, and dump the resulting token stream. Pass order matters: whitespace
// pruning and word splitting must precede symbol refinement and munching.
int main(int ac, char **av) {
    if (ac <= 1) return printf("No file specified\n"), -1;
    char* data = LoadFile(av[1]);
    assert(data);
    ResolveTrigraphs(data); // normalize "??x" sequences before scanning
    list_t *tkn_lst = ListInit(NULL);
    assert(tkn_lst);
    InitialScanner(data, tkn_lst);        // strings/comments vs raw spans
    PruneWhitespaceNodes(tkn_lst);        // drop all-whitespace raw tokens
    RefineRawNodes(tkn_lst);              // split raw spans on whitespace
    RefineSymbols(tkn_lst);               // isolate punctuation characters
    MunchFloats(tkn_lst);                 // "3" "." "14" -> "3.14"
    MunchScientificNotation(tkn_lst);     // "1e" "-" "5" -> "1e-5"
    MunchTokens(tkn_lst);                 // "+" "+" -> "++", etc.
    IdentifyTokens(tkn_lst);              // assign final contexts
    list_iter_t iter = ListGetIter(tkn_lst);
    printf("\n--- TOKEN STREAM ---\n");
    printf("%-6s | %-12s | %s\n", "HEX", "CONTEXT", "VALUE");
    printf("-------|--------------|----------\n");
    while (iter.current) {
        Token_t *t = (Token_t *)iter.current->data;
        // Use CtxToString for the middle column
        printf("[0x%04X] | %-12s | %s\n",
        t->ctx,
        CtxToString(t->ctx),
        t->data);
        iter.current = iter.current->next;
    }
    printf("--------------------\n");
    //Parser_t p = ParserInit(tkn_lst);
    //Parse(&p);
    // Future stages:
    //  - build the AST
    //  - print all errors and check whether a correction is available
    //  - symbol table
    //  - type check
    //  - meta-analyze to decide how to lower into IR (asm-level metaprogramming)
    //  - create IR by resolving the AST on multiple threads
    ListFree(tkn_lst, ClearTokens);
    free(data);
    return(0);
}
//test