#define LIST_IMPLEMENTATION #include "list.h" #include "../include/SterlingCompiler.h" # ifndef strndup char *strndup(const char *s, size_t n) { char *str = calloc(n + 1, sizeof(char)); memcpy(str, s, n); return (str); } # endif char *LoadFile(const char *filename) { FILE *file = NULL; char *data = NULL; file = fopen(filename, "r"); assert(file); fseek(file, 0, SEEK_END); long size = ftell(file); fseek(file, 0, SEEK_SET); data = (char *)malloc(size + 1); assert(data); fread(data, 1, size, file); data[size] = 0x00; fclose(file); return (data); } bool IsWhitespace(const char *s) { while (*s) { if (!isspace((unsigned char)*s)) return false; s++; } return true; } bool IsNumeric(const char *s) { if (!s || !*s) return false; for (int i = 0; s[i]; i++) { if (!isdigit((unsigned char)s[i])) return false; } return true; } bool IsComplexNumeric(const char *s, size_t len) { if (len == 0) return false; // Most numbers start with a digit if (isdigit((unsigned char)s[0])) return true; // Floats can start with a dot (e.g., .5) if (s[0] == '.' && len > 1 && isdigit((unsigned char)s[1])) return true; return false; } void ResolveTrigraphs(char *data) { char *src = data, *dst = data; while (*src) { if (src[0] == '?' && src[1] == '?' && src[2]) { char c = 0; switch (src[2]) { case '=': c = '#'; break; case '/': c = '\\'; break; case '\'': c = '^'; break; case '(': c = '['; break; case ')': c = ']'; break; case '!': c = '|'; break; case '<': c = '{'; break; case '>': c = '}'; break; case '-': c = '~'; break; } if (c) { *dst++ = c; src += 3; continue; } } *dst++ = *src++; } *dst = '\0'; } typedef struct { list_iter_t iter; list_t *tokens; bool error; } Parser_t; // Initialize the parser Parser_t ParserInit(list_t *lst) { return (Parser_t){ .iter = ListGetIter(lst), .tokens = lst, .error = false }; } // Look at current token Token_t* Peek(Parser_t *p) { if (!p->iter.current) return NULL; return (Token_t*)p->iter.current->data; } // Move to next void Advance(Parser_t *p) { if (p->iter.current) p->iter.current = p->iter.current->next; } // Check if current matches type (and optionally value) bool Match(Parser_t *p, TKN_CTX ctx, const char *val) { Token_t *t = Peek(p); if (!t || (t->ctx != ctx)) return false; if (val && strcmp(t->data, val) != 0) return false; Advance(p); return true; } // Required token; errors out if not found Token_t* Expect(Parser_t *p, TKN_CTX ctx, const char *val) { Token_t *t = Peek(p); if (!t || (t->ctx != ctx) || (val && strcmp(t->data, val) != 0)) { printf("Syntax Error: Expected '%s', but found '%s'\n", val ? val : "specific type", t ? t->data : "EOF"); p->error = true; return NULL; } Advance(p); return t; } void ClearTokens(void*arg) { Token_t *tok = arg; free(tok->data); free(tok); } node_t* NewNode(void* data) { node_t* n = calloc(1, sizeof(node_t)); if(n) n->data = data; return n; } Token_t* NewToken(const char* start, size_t len, TKN_CTX ctx) { Token_t* t = malloc(sizeof(Token_t)); t->data = strndup(start, len); t->size = len; t->ctx = ctx; return t; } void PushToken(list_t *lst, const char *start, const char *end, TKN_CTX ctx) { if (end <= start) return; ListPushBack(lst, NewToken(start, end - start, ctx)); } void IdentifyTokens(list_t *lst) { for (node_t *curr = lst->first; curr; curr = curr->next) { Token_t *t = (Token_t *)curr->data; // Skip nodes that were already identified (like TOK_STRING or munched TOK_OP) if (t->ctx != TOK_RAW) continue; // 1. Check Keyword Registry (Highest Priority) bool found = false; for (int i = 0; KEYWORD_TABLE[i].name != NULL; i++) { if (strcmp(t->data, KEYWORD_TABLE[i].name) == 0) { t->ctx = KEYWORD_TABLE[i].ctx; found = true; break; } } if (found) continue; // 2. Check for Numeric Literals (0x..., 3.14, 100L) if (isdigit((unsigned char)t->data[0]) || (t->data[0] == '.' && t->size > 1 && isdigit(t->data[1]))) { t->ctx = TOK_NUM; continue; } // 3. Check for Identifiers (my_var, @comptime) if (isalpha((unsigned char)t->data[0]) || t->data[0] == '_' || t->data[0] == '@') { t->ctx = TOK_ID; continue; } // 4. Check for Operators/Symbols (;, +, -, #) // If it's in our SYMBOLS string, it's an operator or preprocessor trigger if (strchr(SYMBOLS, t->data[0])) { // Special case for '#' which is often its own thing if (t->data[0] == '#') t->ctx = TOK_PREPROC; else t->ctx = TOK_OP; continue; } } } void ApplyTypeAliases(list_t *lst) { for (node_t *curr = lst->first; curr; curr = curr->next) { Token_t *t = curr->data; // If we see 'int', we could programmatically replace it // with the sequence ': 4' during a transformation pass. if (t->ctx == TOK_ID && strcmp(t->data, "int") == 0) { // Logic to transform token... } } } void ListSplitToken(list_t *lst, node_t *node, size_t index) { Token_t *t = (Token_t *)node->data; // Create the suffix node first Token_t *suffix = NewToken(t->data + index, t->size - index, TOK_RAW); node_t *new_node = NewNode(suffix); new_node->next = node->next; node->next = new_node; if (lst->last == node) lst->last = new_node; lst->size++; // Truncate the original (prefix) char *new_prefix = strndup(t->data, index); free(t->data); t->data = new_prefix; t->size = index; } void InitialScanner(char *data, list_t *tkn_lst) { char *curr = data, *start = data; while (*curr) { // 1. Handle Wide or Normal Strings/Chars // Check for 'L' followed immediately by a quote bool is_wide = (*curr == 'L' && (curr[1] == '\"' || curr[1] == '\'')); if (*curr == '\"' || *curr == '\'' || is_wide) { PushToken(tkn_lst, start, curr, TOK_RAW); char *s_start = curr; if (is_wide) curr++; // Advance past 'L' char q = *curr; curr++; // Skip opening quote while (*curr && *curr != q) { if (*curr == '\\' && curr[1]) curr++; // Skip escaped char curr++; } if (*curr) curr++; // Skip closing quote PushToken(tkn_lst, s_start, curr, TOK_STRING); start = curr; } // 2. Handle Comments (Same as before) else if (*curr == '/' && (curr[1] == '/' || curr[1] == '*')) { PushToken(tkn_lst, start, curr, TOK_RAW); if (curr[1] == '/') { while (*curr && *curr != '\n') curr++; } else { curr += 2; while (*curr && !(*curr == '*' && curr[1] == '/')) curr++; if (*curr) curr += 2; } start = curr; } else curr++; } PushToken(tkn_lst, start, curr, TOK_RAW); } void RefineSymbols(list_t *tkn_lst) { for (node_t *curr = tkn_lst->first; curr; ) { Token_t *t = curr->data; //IsComplexNumeric(t->data, t->size) || if (t->ctx != TOK_RAW || (t->size == 1 && strchr(SYMBOLS, t->data[0]))) { curr = curr->next; continue; } size_t pos = strcspn(t->data, SYMBOLS); if (pos < t->size) { ListSplitToken(tkn_lst, curr, (pos == 0) ? 1 : pos); // Don't move curr yet, we might have more symbols in the suffix } else { curr = curr->next; } } } void MunchFloats(list_t *lst) { for (node_t *n = lst->first; n && n->next && n->next->next; ) { Token_t *t1 = n->data, *dot = n->next->data, *t2 = n->next->next->data; // Look for [Digit] [.] [Digit] if (isdigit(t1->data[0]) && dot->data[0] == '.' && dot->size == 1 && isdigit(t2->data[0])) { size_t new_size = t1->size + 1 + t2->size; char *buf = malloc(new_size + 1); sprintf(buf, "%s.%s", t1->data, t2->data); free(t1->data); t1->data = buf; t1->size = new_size; t1->ctx = TOK_NUM; // Mark it now! // Remove '.' and '14' for(int i=0; i<2; i++) { node_t *rem = n->next; n->next = rem->next; if (lst->last == rem) lst->last = n; ClearTokens(rem->data); free(rem); lst->size--; } continue; } n = n->next; } } void MunchScientificNotation(list_t *lst) { for (node_t *n = lst->first; n && n->next && n->next->next; ) { Token_t *t1 = n->data; Token_t *op = n->next->data; Token_t *t2 = n->next->next->data; // Check if t1 ends with 'e' or 'E' (and t1 is currently RAW) if (t1->ctx == TOK_RAW && t1->size > 0) { char last = tolower((unsigned char)t1->data[t1->size - 1]); if (last == 'e' && (op->data[0] == '+' || op->data[0] == '-') && op->size == 1 && isdigit((unsigned char)t2->data[0])) { // We found a match! (e.g., "1e" + "-" + "5") size_t new_size = t1->size + op->size + t2->size; char *new_data = malloc(new_size + 1); sprintf(new_data, "%s%s%s", t1->data, op->data, t2->data); free(t1->data); t1->data = new_data; t1->size = new_size; // Remove the op and t2 nodes for (size_t i = 0; i < 2; i++) { node_t *to_remove = n->next; n->next = to_remove->next; if (lst->last == to_remove) lst->last = n; ClearTokens(to_remove->data); free(to_remove); lst->size--; } // Check this same node again (in case of weird nesting, though rare here) continue; } } n = n->next; } } void MunchTokens(list_t *lst) { node_t *curr = lst->first; while (curr) { Token_t *t1 = curr->data; if (t1->ctx != TOK_RAW && t1->ctx != TOK_OP) { curr = curr->next; continue; } bool matched = false; for (size_t i = 0; MUNCH_TABLE[i].op; i++) { size_t len = MUNCH_TABLE[i].len; // 1. Peek ahead to see if we have enough nodes node_t *temp = curr; char buffer[5] = {0}; // Max munch is 4 size_t nodes_found = 0; for (size_t j = 0; j < len && temp; j++) { Token_t *tk = temp->data; if (tk->size != 1) break; // Multi-char tokens can't be part of a new munch buffer[j] = tk->data[0]; temp = temp->next; nodes_found++; } // 2. Compare buffer to table entry if (nodes_found == len && strcmp(buffer, MUNCH_TABLE[i].op) == 0) { // SUCCESS: Consolidate 'len' nodes into 'curr' free(t1->data); t1->data = strndup(MUNCH_TABLE[i].op, len); t1->size = len; t1->ctx = (MUNCH_TABLE[i].op[0] == '%') ? TOK_PREPROC : TOK_OP; // Remove the 'tail' nodes for (size_t j = 1; j < len; j++) { node_t *to_remove = curr->next; curr->next = to_remove->next; if (lst->last == to_remove) lst->last = curr; ClearTokens(to_remove->data); free(to_remove); lst->size--; } matched = true; break; } } // If we munched, stay on 'curr' to see if a new sequence formed if (!matched) curr = curr->next; } } void RefineRawNodes(list_t *tkn_lst) { node_t *curr = tkn_lst->first; //node_t *prev = NULL; while (curr) { Token_t *t = (Token_t *)curr->data; if (t->ctx == TOK_RAW) { char *span = NULL; char *to_split = strndup(t->data, t->size); char *tok = strtok_r(to_split, " \t\r\n", &span); if (tok) { free(t->data); t->size = strlen(tok); t->data = strndup(tok, t->size); node_t *last_inserted = curr; tok = strtok_r(NULL, " \t\r\n", &span); while (tok) { Token_t *new_t = calloc(1, sizeof(Token_t)); new_t->size = strlen(tok); new_t->data = strndup(tok, new_t->size); new_t->ctx = TOK_RAW; node_t *new_node = calloc(1, sizeof(node_t)); new_node->data = new_t; new_node->next = last_inserted->next; last_inserted->next = new_node; if (tkn_lst->last == last_inserted) tkn_lst->last = new_node; last_inserted = new_node; tkn_lst->size++; tok = strtok_r(NULL, " \t\r\n", &span); } curr = last_inserted; } free(to_split); } //prev = curr; curr = curr->next; } } void PruneWhitespaceNodes(list_t *lst) { node_t *curr = lst->first; node_t *prev = NULL; while (curr) { Token_t *t = (Token_t *)curr->data; if (t->ctx == TOK_RAW && IsWhitespace(t->data)) { // Unlink and free node_t *temp = curr; if (prev) prev->next = curr->next; else lst->first = curr->next; if (lst->last == temp) lst->last = prev; curr = curr->next; ClearTokens(temp->data); free(temp); lst->size--; } else { prev = curr; curr = curr->next; } } } /* // Modular function to register new identifiers void RegisterIdentifier(const char *name, TKN_CTX type) { //insert this into a Hash Map. //this is where user-defined types go. } */ void ParseVarDeclaration(Parser_t *p) { // 1. We already saw 'var' (the trigger) // 2. Expect an Identifier (the name) Token_t *name = Expect(p, TOK_ID, NULL); if (p->error) return; // 3. Expect the separator ':' Expect(p, TOK_OP, ":"); if (p->error) return; // 4. Expect the size (numeric) Token_t *size = Expect(p, TOK_NUM, NULL); if (p->error) return; printf("Defined variable '%s' with size %s bytes.\n", name->data, size->data); // 5. Finalize with semicolon Expect(p, TOK_OP, ";"); } void Parse(Parser_t *p) { while (Peek(p) != NULL && !p->error) { Token_t *t = Peek(p); if (t->ctx == TOK_KEY && strcmp(t->data, "var") == 0) { Advance(p); // Consume 'var' ParseVarDeclaration(p); } else { printf("Unknown token: %s\n", t->data); Advance(p); } } } const char* CtxToString(TKN_CTX ctx) { if (ctx & TOK_KEY) return "KEYWORD"; if (ctx & TOK_ID) return "IDENTIFIER"; if (ctx & TOK_NUM) return "NUMBER"; if (ctx & TOK_OP) return "OPERATOR"; if (ctx & TOK_STRING) return "STRING"; if (ctx & TOK_PREPROC) return "PREPROCESS"; if (ctx & TOK_COMMENT) return "COMMENT"; if (ctx & TOK_RAW) return "RAW"; if (ctx & TOK_LITERAL) return "LITERAL"; if (ctx & TOK_NONE) return "NONE"; return "UNKNOWN"; } /* pass on ";(){}[]$%&*$#@!?:,.<>|_-+=~`" and give each token a context let's replace preprocessor (include, define, etc) let's do recursive parsing everywhere that need it compile time reflection (@comptime or @reflect) metaprogramming logic annotation if i do it lastly** may not be */ int main(int ac, char **av) { if (ac <= 1) return printf("No file specified\n"), -1; char* data = LoadFile(av[1]); assert(data); ResolveTrigraphs(data); list_t *tkn_lst = ListInit(NULL); assert(tkn_lst); InitialScanner(data, tkn_lst); PruneWhitespaceNodes(tkn_lst); RefineRawNodes(tkn_lst); RefineSymbols(tkn_lst); MunchFloats(tkn_lst); MunchScientificNotation(tkn_lst); MunchTokens(tkn_lst); IdentifyTokens(tkn_lst); list_iter_t iter = ListGetIter(tkn_lst); printf("\n--- TOKEN STREAM ---\n"); printf("%-6s | %-12s | %s\n", "HEX", "CONTEXT", "VALUE"); printf("-------|--------------|----------\n"); while (iter.current) { Token_t *t = (Token_t *)iter.current->data; // Use CtxToString for the middle column printf("[0x%04X] | %-12s | %s\n", t->ctx, CtxToString(t->ctx), t->data); iter.current = iter.current->next; } printf("--------------------\n"); //Parser_t p = ParserInit(tkn_lst); //Parse(&p); //Ast //print all error and check correction available //symbletable //type check //metaanalyze to know what how to put it into IR (as asm allow for metaprog) //create ir by resolving ast on multiple thread ListFree(tkn_lst, ClearTokens); free(data); return(0); } //test