diff --git a/Makefile b/Makefile index 27d23d1..1576baf 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ -NAME := Sterling +NAME := SterlingCompiler SRC := $(wildcard source/*.c) OBJ := $(SRC:source/%.c=obj/%.o) CC := gcc -CFLAG := -ggdb -Wall -Wextra -Werror -Wpedantic -I include -O0 -std=c99 -fsignaling-nans -LFLAG := +CFLAG := -ggdb -Wall -Wextra -Werror -Wpedantic -I include -Og -fsignaling-nans -fsanitize=address -fsanitize=undefined +LDFLAG := -lasan -lubsan -lpthread all: $(NAME) @@ -14,7 +14,7 @@ obj/%.o : source/%.c | makedir $(CC) $(CFLAG) -c $< -o $@ $(NAME): $(OBJ) - $(CC) $(OBJ) $(LFLAG) -o build/$(NAME) + $(CC) $(OBJ) $(LDFLAG) -o build/$(NAME) makedir: mkdir -p obj diff --git a/build/SterlingCompiler b/build/SterlingCompiler new file mode 100644 index 0000000..e7aef17 Binary files /dev/null and b/build/SterlingCompiler differ diff --git a/include/SterlingCompiler.h b/include/SterlingCompiler.h index 2301c91..cd3122e 100644 --- a/include/SterlingCompiler.h +++ b/include/SterlingCompiler.h @@ -1,81 +1,81 @@ -#ifndef STERLING_COMPILER_H -# define STERLING_COMPILER_H - -#include -#include -#include -#include -#include -#include -#include - -//simd -# ifdef __x86_64__ -# include -# endif - -typedef enum { - TOK_NONE = 1 << 0, - TOK_RAW = 1 << 1, - TOK_STRING = 1 << 2,//"asdfasf"; L"WideString" - TOK_LITERAL = 1 << 3,//INT: 42, 0xff, 0777, 100L; FLOAT:3.14, 1e-5, 2.0f; ENUM; char CONST 'a' '\n' L'x' - TOK_OP = 1 << 4, - TOK_PREPROC = 1 << 5, - TOK_COMMENT = 1 << 6, - TOK_KEY = 1 << 7, - TOK_ID = 1 << 8, - TOK_NUM = 1 << 9 -} TKN_CTX; - -typedef struct Token_s { - size_t size; - TKN_CTX ctx; - char *data; -} Token_t; - -typedef struct { - char *op; - size_t len; -} MultiOp; - -typedef struct { - const char *name; - TKN_CTX ctx; -} KeywordEntry; - - -const char *SYMBOLS = ";(){}[]$%&*#@!?:,.<>|-+=~`^"; - -// Common C operators (Order matters: put longer ones first if you add 3-char ops) -static const MultiOp MUNCH_TABLE[] = { - {"%:%:", 4}, - {"<<=", 3}, {">>=", 3}, {"...", 3}, - {"\?\?=", 3}, {"\?\?/", 3}, {"\?\?'", 3}, {"\?\?(", 3}, - {"\?\?)", 3}, {"\?\?!", 3}, {"\?\?<", 3}, - {"\?\?>", 3}, {"\?\?-", 3}, //trigraph - {"==", 2}, {"!=", 2}, {"<=", 2}, {">=", 2}, {"##", 2}, - {"++", 2}, {"--", 2}, {"->", 2}, {"+=", 2}, {"%=",2}, - {"-=", 2}, {"*=", 2}, {"/=", 2}, {"&&", 2}, {"||", 2}, - {"^=", 2}, {"<<", 2}, {">>", 2}, {"|=", 2}, {"&=", 2},// - {"<:", 2}, {":>", 2}, {"<%", 2}, {"%>", 2}, {"%:", 2},//digraphs - {NULL, 0} -}; - -// This can be expanded at runtime if you use a dynamic array instead of a static one -static const KeywordEntry KEYWORD_TABLE[] = { - {"if", TOK_KEY}, - {"else", TOK_KEY}, - {"while", TOK_KEY}, - {"return", TOK_KEY}, - {"var", TOK_KEY}, - {"int", TOK_KEY}, - {"float", TOK_KEY}, - {"void", TOK_KEY}, - {"include", TOK_PREPROC}, - {"define", TOK_PREPROC}, - {"comptime",TOK_KEY}, - {"reflect", TOK_KEY}, - {NULL, TOK_NONE} -}; - -#endif +#ifndef STERLING_COMPILER_H +# define STERLING_COMPILER_H + +#include +#include +#include +#include +#include +#include +#include + +//simd +# ifdef __x86_64__ +# include +# endif + +typedef enum { + TOK_NONE = 1 << 0, + TOK_RAW = 1 << 1, + TOK_STRING = 1 << 2,//"asdfasf"; L"WideString" + TOK_LITERAL = 1 << 3,//INT: 42, 0xff, 0777, 100L; FLOAT:3.14, 1e-5, 2.0f; ENUM; char CONST 'a' '\n' L'x' + TOK_OP = 1 << 4, + TOK_PREPROC = 1 << 5, + TOK_COMMENT = 1 << 6, + TOK_KEY = 1 << 7, + TOK_ID = 1 << 8, + TOK_NUM = 1 << 9 +} TKN_CTX; + +typedef struct Token_s { + size_t size; + TKN_CTX ctx; + char *data; +} Token_t; + +typedef struct { + char *op; + size_t len; +} MultiOp; + +typedef struct { + const char *name; + TKN_CTX ctx; +} KeywordEntry; + + +const char *SYMBOLS = ";(){}[]$%&*#@!?:,.<>|-+=~`^"; + +// Common C operators (Order matters: put longer ones first if you add 3-char ops) +static const MultiOp MUNCH_TABLE[] = { + {"%:%:", 4}, + {"<<=", 3}, {">>=", 3}, {"...", 3}, + {"\?\?=", 3}, {"\?\?/", 3}, {"\?\?'", 3}, {"\?\?(", 3}, + {"\?\?)", 3}, {"\?\?!", 3}, {"\?\?<", 3}, + {"\?\?>", 3}, {"\?\?-", 3}, //trigraph + {"==", 2}, {"!=", 2}, {"<=", 2}, {">=", 2}, {"##", 2}, + {"++", 2}, {"--", 2}, {"->", 2}, {"+=", 2}, {"%=",2}, + {"-=", 2}, {"*=", 2}, {"/=", 2}, {"&&", 2}, {"||", 2}, + {"^=", 2}, {"<<", 2}, {">>", 2}, {"|=", 2}, {"&=", 2},// + {"<:", 2}, {":>", 2}, {"<%", 2}, {"%>", 2}, {"%:", 2},//digraphs + {NULL, 0} +}; + +// This can be expanded at runtime if you use a dynamic array instead of a static one +static const KeywordEntry KEYWORD_TABLE[] = { + {"if", TOK_KEY}, + {"else", TOK_KEY}, + {"while", TOK_KEY}, + {"return", TOK_KEY}, + {"var", TOK_KEY}, + {"int", TOK_KEY}, + {"float", TOK_KEY}, + {"void", TOK_KEY}, + {"include", TOK_PREPROC}, + {"define", TOK_PREPROC}, + {"comptime",TOK_KEY}, + {"reflect", TOK_KEY}, + {NULL, TOK_NONE} +}; + +#endif diff --git a/obj/main.o b/obj/main.o index 4c0aba5..0864745 100644 Binary files a/obj/main.o and b/obj/main.o differ diff --git a/source/array.h b/source/array.h deleted file mode 100644 index bb021ed..0000000 --- a/source/array.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef ARRAY_H -# define ARRAY_H - -#include -#include -#include -#include - -typedef struct Header_s{ - int size; - int capacity; - int type; -} Header; - -#ifndef memset - -void *memset(void *src, char c, size_t size) { - for (int i = 0; i < size; i++) { - *(char *)(src + i) = c; - } - return (src); -} - -#endif - -typedef void *Array; - -#define ARRAY_BASE_CAPACITY 64 - -#define ArraySize(arr) ((Header*)(arr) - 1)->size - -#define ArrayFree(arr) free(((Header*)(arr) - 1)) - -#define ArrayPush(arr, x)\ - do {\ - Header *head = NULL;\ - if (!arr) { \ - head = malloc(sizeof(x) * ARRAY_BASE_CAPACITY + sizeof(Header));\ - head->size = 0;\ - head->type = sizeof(x);\ - head->capacity = ARRAY_BASE_CAPACITY;\ - arr = (void *)(head + 1);\ - }\ - head = (Header*)(arr)-1;\ - assert(sizeof(x) == head->type);\ - if (head->size >= head->capacity) {\ - head->capacity *= 2;\ - head = realloc(head, head->type *head->capacity + sizeof(Header));\ - arr = (void *)(head + 1);\ - }\ - (arr)[head->size++] = x;\ - } while(0) - -#define ArrayClear(arr)\ - do {\ - assert(arr);\ - Header * head = (Header*)(arr)-1;\ - memset(arr, 0, head->size * head->type);\ - } while(0)\ - - -# ifdef ARRAY_IMPL - -# endif - -#endif diff --git a/source/list.h b/source/list.h index 2c704f2..a87d7e9 100644 --- a/source/list.h +++ b/source/list.h @@ -124,8 +124,11 @@ void ListFree(list_t *lst, void (*free_func)(void*)) { free(current); current = next; } + //seems like i got a leak here, but may not be that bad since it is only called when closing lst->size = 0; lst->first = lst->last = NULL; + free(lst); + lst = NULL; } void *ListPeekK(const list_iter_t *it, size_t k) { diff --git a/source/main.c b/source/main.c index 771939c..7a808a6 100644 --- a/source/main.c +++ b/source/main.c @@ -1,578 +1,586 @@ -#define LIST_IMPLEMENTATION -#include "list.h" - -#include "../include/SterlingCompiler.h" - -# ifndef strndup -char *strndup(const char *s, size_t n) { - char *str = calloc(n + 1, sizeof(char)); - memcpy(str, s, n); - return (str); -} -# endif - -char *LoadFile(const char *filename) { - FILE *file = NULL; - char *data = NULL; - file = fopen(filename, "r"); - assert(file); - fseek(file, 0, SEEK_END); - long size = ftell(file); - fseek(file, 0, SEEK_SET); - data = (char *)malloc(size + 1); - assert(data); - fread(data, 1, size, file); - data[size] = 0x00; - fclose(file); - return (data); -} - -bool IsWhitespace(const char *s) { - while (*s) { - if (!isspace((unsigned char)*s)) return false; - s++; - } - return true; -} - -bool IsNumeric(const char *s) { - if (!s || !*s) return false; - for (int i = 0; s[i]; i++) { - if (!isdigit((unsigned char)s[i])) return false; - } - return true; -} - -bool IsComplexNumeric(const char *s, size_t len) { - if (len == 0) return false; - - // Most numbers start with a digit - if (isdigit((unsigned char)s[0])) return true; - - // Floats can start with a dot (e.g., .5) - if (s[0] == '.' && len > 1 && isdigit((unsigned char)s[1])) return true; - - return false; -} - -void ResolveTrigraphs(char *data) { - char *src = data, *dst = data; - while (*src) { - if (src[0] == '?' && src[1] == '?' && src[2]) { - char c = 0; - switch (src[2]) { - case '=': c = '#'; break; case '/': c = '\\'; break; - case '\'': c = '^'; break; case '(': c = '['; break; - case ')': c = ']'; break; case '!': c = '|'; break; - case '<': c = '{'; break; case '>': c = '}'; break; - case '-': c = '~'; break; - } - if (c) { *dst++ = c; src += 3; continue; } - } - *dst++ = *src++; - } - *dst = '\0'; -} - -typedef struct { - list_iter_t iter; - list_t *tokens; - bool error; -} Parser_t; - -// Initialize the parser -Parser_t ParserInit(list_t *lst) { - return (Parser_t){ .iter = ListGetIter(lst), .tokens = lst, .error = false }; -} - -// Look at current token -Token_t* Peek(Parser_t *p) { - if (!p->iter.current) return NULL; - return (Token_t*)p->iter.current->data; -} - -// Move to next -void Advance(Parser_t *p) { - if (p->iter.current) p->iter.current = p->iter.current->next; -} - -// Check if current matches type (and optionally value) -bool Match(Parser_t *p, TKN_CTX ctx, const char *val) { - Token_t *t = Peek(p); - if (!t || (t->ctx != ctx)) return false; - if (val && strcmp(t->data, val) != 0) return false; - - Advance(p); - return true; -} - -// Required token; errors out if not found -Token_t* Expect(Parser_t *p, TKN_CTX ctx, const char *val) { - Token_t *t = Peek(p); - if (!t || (t->ctx != ctx) || (val && strcmp(t->data, val) != 0)) { - printf("Syntax Error: Expected '%s', but found '%s'\n", - val ? val : "specific type", t ? t->data : "EOF"); - p->error = true; - return NULL; - } - Advance(p); - return t; -} - -void ClearTokens(void*arg) { - Token_t *tok = arg; - free(tok->data); - free(tok); -} - -node_t* NewNode(void* data) { - node_t* n = calloc(1, sizeof(node_t)); - if(n) n->data = data; - return n; -} - -Token_t* NewToken(const char* start, size_t len, TKN_CTX ctx) { - Token_t* t = malloc(sizeof(Token_t)); - t->data = strndup(start, len); - t->size = len; - t->ctx = ctx; - return t; -} - -void PushToken(list_t *lst, const char *start, const char *end, TKN_CTX ctx) { - if (end <= start) return; - ListPushBack(lst, NewToken(start, end - start, ctx)); -} - -void IdentifyTokens(list_t *lst) { - for (node_t *curr = lst->first; curr; curr = curr->next) { - Token_t *t = (Token_t *)curr->data; - - // Skip nodes that were already identified (like TOK_STRING or munched TOK_OP) - if (t->ctx != TOK_RAW) continue; - - // 1. Check Keyword Registry (Highest Priority) - bool found = false; - for (int i = 0; KEYWORD_TABLE[i].name != NULL; i++) { - if (strcmp(t->data, KEYWORD_TABLE[i].name) == 0) { - t->ctx = KEYWORD_TABLE[i].ctx; - found = true; - break; - } - } - if (found) continue; - - // 2. Check for Numeric Literals (0x..., 3.14, 100L) - if (isdigit((unsigned char)t->data[0]) || (t->data[0] == '.' && t->size > 1 && isdigit(t->data[1]))) { - t->ctx = TOK_NUM; - continue; - } - - // 3. Check for Identifiers (my_var, @comptime) - if (isalpha((unsigned char)t->data[0]) || t->data[0] == '_' || t->data[0] == '@') { - t->ctx = TOK_ID; - continue; - } - - // 4. Check for Operators/Symbols (;, +, -, #) - // If it's in our SYMBOLS string, it's an operator or preprocessor trigger - if (strchr(SYMBOLS, t->data[0])) { - // Special case for '#' which is often its own thing - if (t->data[0] == '#') t->ctx = TOK_PREPROC; - else t->ctx = TOK_OP; - continue; - } - } -} - -void ApplyTypeAliases(list_t *lst) { - for (node_t *curr = lst->first; curr; curr = curr->next) { - Token_t *t = curr->data; - // If we see 'int', we could programmatically replace it - // with the sequence ': 4' during a transformation pass. - if (t->ctx == TOK_ID && strcmp(t->data, "int") == 0) { - // Logic to transform token... - } - } -} - -void ListSplitToken(list_t *lst, node_t *node, size_t index) { - Token_t *t = (Token_t *)node->data; - - // Create the suffix node first - Token_t *suffix = NewToken(t->data + index, t->size - index, TOK_RAW); - node_t *new_node = NewNode(suffix); - new_node->next = node->next; - node->next = new_node; - if (lst->last == node) lst->last = new_node; - lst->size++; - - // Truncate the original (prefix) - char *new_prefix = strndup(t->data, index); - free(t->data); - t->data = new_prefix; - t->size = index; -} - -void InitialScanner(char *data, list_t *tkn_lst) { - char *curr = data, *start = data; - - while (*curr) { - // 1. Handle Wide or Normal Strings/Chars - // Check for 'L' followed immediately by a quote - bool is_wide = (*curr == 'L' && (curr[1] == '\"' || curr[1] == '\'')); - if (*curr == '\"' || *curr == '\'' || is_wide) { - PushToken(tkn_lst, start, curr, TOK_RAW); - - char *s_start = curr; - if (is_wide) curr++; // Advance past 'L' - - char q = *curr; - curr++; // Skip opening quote - while (*curr && *curr != q) { - if (*curr == '\\' && curr[1]) curr++; // Skip escaped char - curr++; - } - if (*curr) curr++; // Skip closing quote - - PushToken(tkn_lst, s_start, curr, TOK_STRING); - start = curr; - } - // 2. Handle Comments (Same as before) - else if (*curr == '/' && (curr[1] == '/' || curr[1] == '*')) { - PushToken(tkn_lst, start, curr, TOK_RAW); - if (curr[1] == '/') { while (*curr && *curr != '\n') curr++; } - else { - curr += 2; - while (*curr && !(*curr == '*' && curr[1] == '/')) curr++; - if (*curr) curr += 2; - } - start = curr; - } - else curr++; - } - PushToken(tkn_lst, start, curr, TOK_RAW); -} - -void RefineSymbols(list_t *tkn_lst) { - for (node_t *curr = tkn_lst->first; curr; ) { - Token_t *t = curr->data; - //IsComplexNumeric(t->data, t->size) || - if (t->ctx != TOK_RAW || (t->size == 1 && strchr(SYMBOLS, t->data[0]))) { - curr = curr->next; - continue; - } - - size_t pos = strcspn(t->data, SYMBOLS); - if (pos < t->size) { - ListSplitToken(tkn_lst, curr, (pos == 0) ? 1 : pos); - // Don't move curr yet, we might have more symbols in the suffix - } else { - curr = curr->next; - } - } -} - -void MunchFloats(list_t *lst) { - for (node_t *n = lst->first; n && n->next && n->next->next; ) { - Token_t *t1 = n->data, *dot = n->next->data, *t2 = n->next->next->data; - - // Look for [Digit] [.] [Digit] - if (isdigit(t1->data[0]) && dot->data[0] == '.' && dot->size == 1 && isdigit(t2->data[0])) { - size_t new_size = t1->size + 1 + t2->size; - char *buf = malloc(new_size + 1); - sprintf(buf, "%s.%s", t1->data, t2->data); - - free(t1->data); - t1->data = buf; - t1->size = new_size; - t1->ctx = TOK_NUM; // Mark it now! - - // Remove '.' and '14' - for(int i=0; i<2; i++) { - node_t *rem = n->next; - n->next = rem->next; - if (lst->last == rem) lst->last = n; - ClearTokens(rem->data); free(rem); - lst->size--; - } - continue; - } - n = n->next; - } -} - -void MunchScientificNotation(list_t *lst) { - for (node_t *n = lst->first; n && n->next && n->next->next; ) { - Token_t *t1 = n->data; - Token_t *op = n->next->data; - Token_t *t2 = n->next->next->data; - - // Check if t1 ends with 'e' or 'E' (and t1 is currently RAW) - if (t1->ctx == TOK_RAW && t1->size > 0) { - char last = tolower((unsigned char)t1->data[t1->size - 1]); - - if (last == 'e' && - (op->data[0] == '+' || op->data[0] == '-') && op->size == 1 && - isdigit((unsigned char)t2->data[0])) { - - // We found a match! (e.g., "1e" + "-" + "5") - size_t new_size = t1->size + op->size + t2->size; - char *new_data = malloc(new_size + 1); - - sprintf(new_data, "%s%s%s", t1->data, op->data, t2->data); - - free(t1->data); - t1->data = new_data; - t1->size = new_size; - - // Remove the op and t2 nodes - for (size_t i = 0; i < 2; i++) { - node_t *to_remove = n->next; - n->next = to_remove->next; - if (lst->last == to_remove) lst->last = n; - ClearTokens(to_remove->data); - free(to_remove); - lst->size--; - } - // Check this same node again (in case of weird nesting, though rare here) - continue; - } - } - n = n->next; - } -} - -void MunchTokens(list_t *lst) { - node_t *curr = lst->first; - - while (curr) { - Token_t *t1 = curr->data; - if (t1->ctx != TOK_RAW && t1->ctx != TOK_OP) { - curr = curr->next; - continue; - } - - bool matched = false; - for (size_t i = 0; MUNCH_TABLE[i].op; i++) { - size_t len = MUNCH_TABLE[i].len; - - // 1. Peek ahead to see if we have enough nodes - node_t *temp = curr; - char buffer[5] = {0}; // Max munch is 4 - size_t nodes_found = 0; - - for (size_t j = 0; j < len && temp; j++) { - Token_t *tk = temp->data; - if (tk->size != 1) break; // Multi-char tokens can't be part of a new munch - buffer[j] = tk->data[0]; - temp = temp->next; - nodes_found++; - } - - // 2. Compare buffer to table entry - if (nodes_found == len && strcmp(buffer, MUNCH_TABLE[i].op) == 0) { - // SUCCESS: Consolidate 'len' nodes into 'curr' - free(t1->data); - t1->data = strndup(MUNCH_TABLE[i].op, len); - t1->size = len; - t1->ctx = (MUNCH_TABLE[i].op[0] == '%') ? TOK_PREPROC : TOK_OP; - - // Remove the 'tail' nodes - for (size_t j = 1; j < len; j++) { - node_t *to_remove = curr->next; - curr->next = to_remove->next; - if (lst->last == to_remove) lst->last = curr; - ClearTokens(to_remove->data); - free(to_remove); - lst->size--; - } - matched = true; - break; - } - } - // If we munched, stay on 'curr' to see if a new sequence formed - if (!matched) curr = curr->next; - } -} - -void RefineRawNodes(list_t *tkn_lst) { - node_t *curr = tkn_lst->first; - //node_t *prev = NULL; - - while (curr) { - Token_t *t = (Token_t *)curr->data; - if (t->ctx == TOK_RAW) { - char *span = NULL; - char *to_split = strndup(t->data, t->size); - char *tok = strtok_r(to_split, " \t\r\n", &span); - - if (tok) { - free(t->data); - t->size = strlen(tok); - t->data = strndup(tok, t->size); - - node_t *last_inserted = curr; - tok = strtok_r(NULL, " \t\r\n", &span); - - while (tok) { - Token_t *new_t = calloc(1, sizeof(Token_t)); - new_t->size = strlen(tok); - new_t->data = strndup(tok, new_t->size); - new_t->ctx = TOK_RAW; - - node_t *new_node = calloc(1, sizeof(node_t)); - new_node->data = new_t; - - new_node->next = last_inserted->next; - last_inserted->next = new_node; - - if (tkn_lst->last == last_inserted) tkn_lst->last = new_node; - - last_inserted = new_node; - tkn_lst->size++; - tok = strtok_r(NULL, " \t\r\n", &span); - } - curr = last_inserted; - } - free(to_split); - } - //prev = curr; - curr = curr->next; - } -} - -void PruneWhitespaceNodes(list_t *lst) { - node_t *curr = lst->first; - node_t *prev = NULL; - - while (curr) { - Token_t *t = (Token_t *)curr->data; - if (t->ctx == TOK_RAW && IsWhitespace(t->data)) { - // Unlink and free - node_t *temp = curr; - if (prev) prev->next = curr->next; - else lst->first = curr->next; - - if (lst->last == temp) lst->last = prev; - - curr = curr->next; - ClearTokens(temp->data); - free(temp); - lst->size--; - } else { - prev = curr; - curr = curr->next; - } - } -} - -/* -// Modular function to register new identifiers -void RegisterIdentifier(const char *name, TKN_CTX type) { - //insert this into a Hash Map. - //this is where user-defined types go. -} -*/ - -void ParseVarDeclaration(Parser_t *p) { - // 1. We already saw 'var' (the trigger) - - // 2. Expect an Identifier (the name) - Token_t *name = Expect(p, TOK_ID, NULL); - if (p->error) return; - - // 3. Expect the separator ':' - Expect(p, TOK_OP, ":"); - if (p->error) return; - - // 4. Expect the size (numeric) - Token_t *size = Expect(p, TOK_NUM, NULL); - if (p->error) return; - - printf("Defined variable '%s' with size %s bytes.\n", name->data, size->data); - - // 5. Finalize with semicolon - Expect(p, TOK_OP, ";"); -} - -void Parse(Parser_t *p) { - while (Peek(p) != NULL && !p->error) { - Token_t *t = Peek(p); - - if (t->ctx == TOK_KEY && strcmp(t->data, "var") == 0) { - Advance(p); // Consume 'var' - ParseVarDeclaration(p); - } - else { - printf("Unknown token: %s\n", t->data); - Advance(p); - } - } -} - -const char* CtxToString(TKN_CTX ctx) { - if (ctx & TOK_KEY) return "KEYWORD"; - if (ctx & TOK_ID) return "IDENTIFIER"; - if (ctx & TOK_NUM) return "NUMBER"; - if (ctx & TOK_OP) return "OPERATOR"; - if (ctx & TOK_STRING) return "STRING"; - if (ctx & TOK_PREPROC) return "PREPROCESS"; - if (ctx & TOK_COMMENT) return "COMMENT"; - if (ctx & TOK_RAW) return "RAW"; - if (ctx & TOK_LITERAL) return "LITERAL"; - if (ctx & TOK_NONE) return "NONE"; - return "UNKNOWN"; -} - -/* -pass on ";(){}[]$%&*$#@!?:,.<>|_-+=~`" -and give each token a context -let's replace preprocessor (include, define, etc) -let's do recursive parsing everywhere that need it -compile time reflection (@comptime or @reflect) -metaprogramming logic annotation if i do it lastly** may not be -*/ -int main(int ac, char **av) { - if (ac <= 1) return printf("No file specified\n"), -1; - - char* data = LoadFile(av[1]); - assert(data); - ResolveTrigraphs(data); - - list_t *tkn_lst = ListInit(NULL); - assert(tkn_lst); - InitialScanner(data, tkn_lst); - PruneWhitespaceNodes(tkn_lst); - RefineRawNodes(tkn_lst); - RefineSymbols(tkn_lst); - MunchFloats(tkn_lst); - MunchScientificNotation(tkn_lst); - MunchTokens(tkn_lst); - IdentifyTokens(tkn_lst); - - list_iter_t iter = ListGetIter(tkn_lst); - printf("\n--- TOKEN STREAM ---\n"); - printf("%-6s | %-12s | %s\n", "HEX", "CONTEXT", "VALUE"); - printf("-------|--------------|----------\n"); - while (iter.current) { - Token_t *t = (Token_t *)iter.current->data; - - // Use CtxToString for the middle column - printf("[0x%04X] | %-12s | %s\n", - t->ctx, - CtxToString(t->ctx), - t->data); - - iter.current = iter.current->next; - } - printf("--------------------\n"); - - //Parser_t p = ParserInit(tkn_lst); - //Parse(&p); - - ListFree(tkn_lst, ClearTokens); - free(data); - return(0); -} -//test +#define LIST_IMPLEMENTATION +#include "list.h" + +#include "../include/SterlingCompiler.h" + +# ifndef strndup +char *strndup(const char *s, size_t n) { + char *str = calloc(n + 1, sizeof(char)); + memcpy(str, s, n); + return (str); +} +# endif + +char *LoadFile(const char *filename) { + FILE *file = NULL; + char *data = NULL; + file = fopen(filename, "r"); + assert(file); + fseek(file, 0, SEEK_END); + long size = ftell(file); + fseek(file, 0, SEEK_SET); + data = (char *)malloc(size + 1); + assert(data); + fread(data, 1, size, file); + data[size] = 0x00; + fclose(file); + return (data); +} + +bool IsWhitespace(const char *s) { + while (*s) { + if (!isspace((unsigned char)*s)) return false; + s++; + } + return true; +} + +bool IsNumeric(const char *s) { + if (!s || !*s) return false; + for (int i = 0; s[i]; i++) { + if (!isdigit((unsigned char)s[i])) return false; + } + return true; +} + +bool IsComplexNumeric(const char *s, size_t len) { + if (len == 0) return false; + + // Most numbers start with a digit + if (isdigit((unsigned char)s[0])) return true; + + // Floats can start with a dot (e.g., .5) + if (s[0] == '.' && len > 1 && isdigit((unsigned char)s[1])) return true; + + return false; +} + +void ResolveTrigraphs(char *data) { + char *src = data, *dst = data; + while (*src) { + if (src[0] == '?' && src[1] == '?' && src[2]) { + char c = 0; + switch (src[2]) { + case '=': c = '#'; break; case '/': c = '\\'; break; + case '\'': c = '^'; break; case '(': c = '['; break; + case ')': c = ']'; break; case '!': c = '|'; break; + case '<': c = '{'; break; case '>': c = '}'; break; + case '-': c = '~'; break; + } + if (c) { *dst++ = c; src += 3; continue; } + } + *dst++ = *src++; + } + *dst = '\0'; +} + +typedef struct { + list_iter_t iter; + list_t *tokens; + bool error; +} Parser_t; + +// Initialize the parser +Parser_t ParserInit(list_t *lst) { + return (Parser_t){ .iter = ListGetIter(lst), .tokens = lst, .error = false }; +} + +// Look at current token +Token_t* Peek(Parser_t *p) { + if (!p->iter.current) return NULL; + return (Token_t*)p->iter.current->data; +} + +// Move to next +void Advance(Parser_t *p) { + if (p->iter.current) p->iter.current = p->iter.current->next; +} + +// Check if current matches type (and optionally value) +bool Match(Parser_t *p, TKN_CTX ctx, const char *val) { + Token_t *t = Peek(p); + if (!t || (t->ctx != ctx)) return false; + if (val && strcmp(t->data, val) != 0) return false; + + Advance(p); + return true; +} + +// Required token; errors out if not found +Token_t* Expect(Parser_t *p, TKN_CTX ctx, const char *val) { + Token_t *t = Peek(p); + if (!t || (t->ctx != ctx) || (val && strcmp(t->data, val) != 0)) { + printf("Syntax Error: Expected '%s', but found '%s'\n", + val ? val : "specific type", t ? t->data : "EOF"); + p->error = true; + return NULL; + } + Advance(p); + return t; +} + +void ClearTokens(void*arg) { + Token_t *tok = arg; + free(tok->data); + free(tok); +} + +node_t* NewNode(void* data) { + node_t* n = calloc(1, sizeof(node_t)); + if(n) n->data = data; + return n; +} + +Token_t* NewToken(const char* start, size_t len, TKN_CTX ctx) { + Token_t* t = malloc(sizeof(Token_t)); + t->data = strndup(start, len); + t->size = len; + t->ctx = ctx; + return t; +} + +void PushToken(list_t *lst, const char *start, const char *end, TKN_CTX ctx) { + if (end <= start) return; + ListPushBack(lst, NewToken(start, end - start, ctx)); +} + +void IdentifyTokens(list_t *lst) { + for (node_t *curr = lst->first; curr; curr = curr->next) { + Token_t *t = (Token_t *)curr->data; + + // Skip nodes that were already identified (like TOK_STRING or munched TOK_OP) + if (t->ctx != TOK_RAW) continue; + + // 1. Check Keyword Registry (Highest Priority) + bool found = false; + for (int i = 0; KEYWORD_TABLE[i].name != NULL; i++) { + if (strcmp(t->data, KEYWORD_TABLE[i].name) == 0) { + t->ctx = KEYWORD_TABLE[i].ctx; + found = true; + break; + } + } + if (found) continue; + + // 2. Check for Numeric Literals (0x..., 3.14, 100L) + if (isdigit((unsigned char)t->data[0]) || (t->data[0] == '.' && t->size > 1 && isdigit(t->data[1]))) { + t->ctx = TOK_NUM; + continue; + } + + // 3. Check for Identifiers (my_var, @comptime) + if (isalpha((unsigned char)t->data[0]) || t->data[0] == '_' || t->data[0] == '@') { + t->ctx = TOK_ID; + continue; + } + + // 4. Check for Operators/Symbols (;, +, -, #) + // If it's in our SYMBOLS string, it's an operator or preprocessor trigger + if (strchr(SYMBOLS, t->data[0])) { + // Special case for '#' which is often its own thing + if (t->data[0] == '#') t->ctx = TOK_PREPROC; + else t->ctx = TOK_OP; + continue; + } + } +} + +void ApplyTypeAliases(list_t *lst) { + for (node_t *curr = lst->first; curr; curr = curr->next) { + Token_t *t = curr->data; + // If we see 'int', we could programmatically replace it + // with the sequence ': 4' during a transformation pass. + if (t->ctx == TOK_ID && strcmp(t->data, "int") == 0) { + // Logic to transform token... + } + } +} + +void ListSplitToken(list_t *lst, node_t *node, size_t index) { + Token_t *t = (Token_t *)node->data; + + // Create the suffix node first + Token_t *suffix = NewToken(t->data + index, t->size - index, TOK_RAW); + node_t *new_node = NewNode(suffix); + new_node->next = node->next; + node->next = new_node; + if (lst->last == node) lst->last = new_node; + lst->size++; + + // Truncate the original (prefix) + char *new_prefix = strndup(t->data, index); + free(t->data); + t->data = new_prefix; + t->size = index; +} + +void InitialScanner(char *data, list_t *tkn_lst) { + char *curr = data, *start = data; + + while (*curr) { + // 1. Handle Wide or Normal Strings/Chars + // Check for 'L' followed immediately by a quote + bool is_wide = (*curr == 'L' && (curr[1] == '\"' || curr[1] == '\'')); + if (*curr == '\"' || *curr == '\'' || is_wide) { + PushToken(tkn_lst, start, curr, TOK_RAW); + + char *s_start = curr; + if (is_wide) curr++; // Advance past 'L' + + char q = *curr; + curr++; // Skip opening quote + while (*curr && *curr != q) { + if (*curr == '\\' && curr[1]) curr++; // Skip escaped char + curr++; + } + if (*curr) curr++; // Skip closing quote + + PushToken(tkn_lst, s_start, curr, TOK_STRING); + start = curr; + } + // 2. Handle Comments (Same as before) + else if (*curr == '/' && (curr[1] == '/' || curr[1] == '*')) { + PushToken(tkn_lst, start, curr, TOK_RAW); + if (curr[1] == '/') { while (*curr && *curr != '\n') curr++; } + else { + curr += 2; + while (*curr && !(*curr == '*' && curr[1] == '/')) curr++; + if (*curr) curr += 2; + } + start = curr; + } + else curr++; + } + PushToken(tkn_lst, start, curr, TOK_RAW); +} + +void RefineSymbols(list_t *tkn_lst) { + for (node_t *curr = tkn_lst->first; curr; ) { + Token_t *t = curr->data; + //IsComplexNumeric(t->data, t->size) || + if (t->ctx != TOK_RAW || (t->size == 1 && strchr(SYMBOLS, t->data[0]))) { + curr = curr->next; + continue; + } + + size_t pos = strcspn(t->data, SYMBOLS); + if (pos < t->size) { + ListSplitToken(tkn_lst, curr, (pos == 0) ? 1 : pos); + // Don't move curr yet, we might have more symbols in the suffix + } else { + curr = curr->next; + } + } +} + +void MunchFloats(list_t *lst) { + for (node_t *n = lst->first; n && n->next && n->next->next; ) { + Token_t *t1 = n->data, *dot = n->next->data, *t2 = n->next->next->data; + + // Look for [Digit] [.] [Digit] + if (isdigit(t1->data[0]) && dot->data[0] == '.' && dot->size == 1 && isdigit(t2->data[0])) { + size_t new_size = t1->size + 1 + t2->size; + char *buf = malloc(new_size + 1); + sprintf(buf, "%s.%s", t1->data, t2->data); + + free(t1->data); + t1->data = buf; + t1->size = new_size; + t1->ctx = TOK_NUM; // Mark it now! + + // Remove '.' and '14' + for(int i=0; i<2; i++) { + node_t *rem = n->next; + n->next = rem->next; + if (lst->last == rem) lst->last = n; + ClearTokens(rem->data); free(rem); + lst->size--; + } + continue; + } + n = n->next; + } +} + +void MunchScientificNotation(list_t *lst) { + for (node_t *n = lst->first; n && n->next && n->next->next; ) { + Token_t *t1 = n->data; + Token_t *op = n->next->data; + Token_t *t2 = n->next->next->data; + + // Check if t1 ends with 'e' or 'E' (and t1 is currently RAW) + if (t1->ctx == TOK_RAW && t1->size > 0) { + char last = tolower((unsigned char)t1->data[t1->size - 1]); + + if (last == 'e' && + (op->data[0] == '+' || op->data[0] == '-') && op->size == 1 && + isdigit((unsigned char)t2->data[0])) { + + // We found a match! (e.g., "1e" + "-" + "5") + size_t new_size = t1->size + op->size + t2->size; + char *new_data = malloc(new_size + 1); + + sprintf(new_data, "%s%s%s", t1->data, op->data, t2->data); + + free(t1->data); + t1->data = new_data; + t1->size = new_size; + + // Remove the op and t2 nodes + for (size_t i = 0; i < 2; i++) { + node_t *to_remove = n->next; + n->next = to_remove->next; + if (lst->last == to_remove) lst->last = n; + ClearTokens(to_remove->data); + free(to_remove); + lst->size--; + } + // Check this same node again (in case of weird nesting, though rare here) + continue; + } + } + n = n->next; + } +} + +void MunchTokens(list_t *lst) { + node_t *curr = lst->first; + + while (curr) { + Token_t *t1 = curr->data; + if (t1->ctx != TOK_RAW && t1->ctx != TOK_OP) { + curr = curr->next; + continue; + } + + bool matched = false; + for (size_t i = 0; MUNCH_TABLE[i].op; i++) { + size_t len = MUNCH_TABLE[i].len; + + // 1. Peek ahead to see if we have enough nodes + node_t *temp = curr; + char buffer[5] = {0}; // Max munch is 4 + size_t nodes_found = 0; + + for (size_t j = 0; j < len && temp; j++) { + Token_t *tk = temp->data; + if (tk->size != 1) break; // Multi-char tokens can't be part of a new munch + buffer[j] = tk->data[0]; + temp = temp->next; + nodes_found++; + } + + // 2. Compare buffer to table entry + if (nodes_found == len && strcmp(buffer, MUNCH_TABLE[i].op) == 0) { + // SUCCESS: Consolidate 'len' nodes into 'curr' + free(t1->data); + t1->data = strndup(MUNCH_TABLE[i].op, len); + t1->size = len; + t1->ctx = (MUNCH_TABLE[i].op[0] == '%') ? TOK_PREPROC : TOK_OP; + + // Remove the 'tail' nodes + for (size_t j = 1; j < len; j++) { + node_t *to_remove = curr->next; + curr->next = to_remove->next; + if (lst->last == to_remove) lst->last = curr; + ClearTokens(to_remove->data); + free(to_remove); + lst->size--; + } + matched = true; + break; + } + } + // If we munched, stay on 'curr' to see if a new sequence formed + if (!matched) curr = curr->next; + } +} + +void RefineRawNodes(list_t *tkn_lst) { + node_t *curr = tkn_lst->first; + //node_t *prev = NULL; + + while (curr) { + Token_t *t = (Token_t *)curr->data; + if (t->ctx == TOK_RAW) { + char *span = NULL; + char *to_split = strndup(t->data, t->size); + char *tok = strtok_r(to_split, " \t\r\n", &span); + + if (tok) { + free(t->data); + t->size = strlen(tok); + t->data = strndup(tok, t->size); + + node_t *last_inserted = curr; + tok = strtok_r(NULL, " \t\r\n", &span); + + while (tok) { + Token_t *new_t = calloc(1, sizeof(Token_t)); + new_t->size = strlen(tok); + new_t->data = strndup(tok, new_t->size); + new_t->ctx = TOK_RAW; + + node_t *new_node = calloc(1, sizeof(node_t)); + new_node->data = new_t; + + new_node->next = last_inserted->next; + last_inserted->next = new_node; + + if (tkn_lst->last == last_inserted) tkn_lst->last = new_node; + + last_inserted = new_node; + tkn_lst->size++; + tok = strtok_r(NULL, " \t\r\n", &span); + } + curr = last_inserted; + } + free(to_split); + } + //prev = curr; + curr = curr->next; + } +} + +void PruneWhitespaceNodes(list_t *lst) { + node_t *curr = lst->first; + node_t *prev = NULL; + + while (curr) { + Token_t *t = (Token_t *)curr->data; + if (t->ctx == TOK_RAW && IsWhitespace(t->data)) { + // Unlink and free + node_t *temp = curr; + if (prev) prev->next = curr->next; + else lst->first = curr->next; + + if (lst->last == temp) lst->last = prev; + + curr = curr->next; + ClearTokens(temp->data); + free(temp); + lst->size--; + } else { + prev = curr; + curr = curr->next; + } + } +} + +/* +// Modular function to register new identifiers +void RegisterIdentifier(const char *name, TKN_CTX type) { + //insert this into a Hash Map. + //this is where user-defined types go. +} +*/ + +void ParseVarDeclaration(Parser_t *p) { + // 1. We already saw 'var' (the trigger) + + // 2. Expect an Identifier (the name) + Token_t *name = Expect(p, TOK_ID, NULL); + if (p->error) return; + + // 3. Expect the separator ':' + Expect(p, TOK_OP, ":"); + if (p->error) return; + + // 4. Expect the size (numeric) + Token_t *size = Expect(p, TOK_NUM, NULL); + if (p->error) return; + + printf("Defined variable '%s' with size %s bytes.\n", name->data, size->data); + + // 5. Finalize with semicolon + Expect(p, TOK_OP, ";"); +} + +void Parse(Parser_t *p) { + while (Peek(p) != NULL && !p->error) { + Token_t *t = Peek(p); + + if (t->ctx == TOK_KEY && strcmp(t->data, "var") == 0) { + Advance(p); // Consume 'var' + ParseVarDeclaration(p); + } + else { + printf("Unknown token: %s\n", t->data); + Advance(p); + } + } +} + +const char* CtxToString(TKN_CTX ctx) { + if (ctx & TOK_KEY) return "KEYWORD"; + if (ctx & TOK_ID) return "IDENTIFIER"; + if (ctx & TOK_NUM) return "NUMBER"; + if (ctx & TOK_OP) return "OPERATOR"; + if (ctx & TOK_STRING) return "STRING"; + if (ctx & TOK_PREPROC) return "PREPROCESS"; + if (ctx & TOK_COMMENT) return "COMMENT"; + if (ctx & TOK_RAW) return "RAW"; + if (ctx & TOK_LITERAL) return "LITERAL"; + if (ctx & TOK_NONE) return "NONE"; + return "UNKNOWN"; +} + +/* +pass on ";(){}[]$%&*$#@!?:,.<>|_-+=~`" +and give each token a context +let's replace preprocessor (include, define, etc) +let's do recursive parsing everywhere that need it +compile time reflection (@comptime or @reflect) +metaprogramming logic annotation if i do it lastly** may not be +*/ +int main(int ac, char **av) { + if (ac <= 1) return printf("No file specified\n"), -1; + + char* data = LoadFile(av[1]); + assert(data); + ResolveTrigraphs(data); + + list_t *tkn_lst = ListInit(NULL); + assert(tkn_lst); + InitialScanner(data, tkn_lst); + PruneWhitespaceNodes(tkn_lst); + RefineRawNodes(tkn_lst); + RefineSymbols(tkn_lst); + MunchFloats(tkn_lst); + MunchScientificNotation(tkn_lst); + MunchTokens(tkn_lst); + IdentifyTokens(tkn_lst); + + list_iter_t iter = ListGetIter(tkn_lst); + printf("\n--- TOKEN STREAM ---\n"); + printf("%-6s | %-12s | %s\n", "HEX", "CONTEXT", "VALUE"); + printf("-------|--------------|----------\n"); + while (iter.current) { + Token_t *t = (Token_t *)iter.current->data; + + // Use CtxToString for the middle column + printf("[0x%04X] | %-12s | %s\n", + t->ctx, + CtxToString(t->ctx), + t->data); + + iter.current = iter.current->next; + } + printf("--------------------\n"); + + //Parser_t p = ParserInit(tkn_lst); + //Parse(&p); + + //Ast + //print all error and check correction available + //symbletable + //type check + //metaanalyze to know what how to put it into IR (as asm allow for metaprog) + + //create ir by resolving ast on multiple thread + + ListFree(tkn_lst, ClearTokens); + free(data); + return(0); +} +//test