114 lines
3.2 KiB
C
114 lines
3.2 KiB
C
// Every kind of token the lexer can hand back.
// Enumerators take implicit values starting at TOK_EOF == 0, so their order
// here determines their numeric value; append new kinds rather than
// reordering if anything depends on those values.
enum TokenType {
    // End of input and value-carrying tokens.
    TOK_EOF,
    TOK_IDENTIFIER,
    TOK_INTEGER,
    TOK_STRING,
    TOK_CHAR,

    // Word tokens (presumably produced through lookup_keyword — confirm).
    TOK_FN,
    TOK_RETURN,
    TOK_STRUCT,
    TOK_TYPEDEF,

    // Type-name tokens.
    // ... add all types
    TOK_TYPE_U8,
    TOK_TYPE_U32,

    // Function-variant tokens.
    // ... all fn variants
    TOK_FN_STATIC,
    TOK_FN_EXPORT,

    // Brackets.
    TOK_LBRACE,
    TOK_RBRACE,
    TOK_LPAREN,
    TOK_RPAREN,

    // Separators and operators.
    TOK_COMMA,
    TOK_SEMICOLON,
    TOK_STAR,
    TOK_EQUAL,
    TOK_PLUS,
    TOK_MINUS,
    TOK_SLASH,
    TOK_BANG,
    TOK_ARROW,
    // ...
};
|
|
|
|
struct Token {
|
|
TokenType type;
|
|
char* lexeme;
|
|
int line;
|
|
};
|
|
|
|
// Scanner state over a '\0'-terminated source buffer.
struct Lexer {
    // Source text being scanned; the scanning code treats '\0' as the end.
    char *src;

    // Index into src of the next character to be read.
    int pos;

    // Line counter; advance() bumps it each time it consumes a '\n'.
    int line;
};
|
|
|
|
// Looks at the character under the cursor without consuming it.
// Returns src[pos]; the lexer state is left untouched.
char peek(Lexer* l) {
    const char* cursor = l->src + l->pos;
    return *cursor;
}
|
|
|
|
// Consumes and returns the character under the cursor.
//
// Consuming a '\n' increments the lexer's line counter. At the end of input
// the cursor is NOT moved: '\0' is returned and pos stays on the terminator,
// so repeated calls (or a later peek) can never read past the end of src.
char advance(Lexer* l) {
    char current = l->src[l->pos];

    // Never step over the terminator; everything after it is out of bounds.
    if (current == '\0') return current;

    if (current == '\n') l->line++;
    l->pos++;
    return current;
}
|
|
|
|
// Consumes the current character only if it equals `expected`.
//
// Returns true when the character matched, false otherwise (nothing is
// consumed on a mismatch). Line counting is kept consistent with advance():
// matching a '\n' increments the line counter. Matching '\0' reports true
// but leaves the cursor on the terminator so the lexer cannot run past the
// end of the buffer.
bool match(Lexer* l, char expected) {
    char current = l->src[l->pos];
    if (current != expected) return false;

    // End of input "matches" without being consumed.
    if (current == '\0') return true;

    if (current == '\n') l->line++;
    l->pos++;
    return true;
}
|
|
|
|
// Moves the cursor past any run of whitespace, `// ...` line comments and
// `/* ... */` block comments, stopping at the first character that starts a
// real token (or at the end of input).
//
// An unterminated block comment is reported through error("unterminated
// comment"); if error() returns, scanning stops with the cursor on the
// terminator instead of continuing past the end of the buffer.
void skip_whitespace_and_comments(Lexer* l) {
    for (;;) {
        char c = peek(l);

        if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
            advance(l);
            continue;
        }

        // Only '/' can start a comment; anything else begins a token.
        if (c != '/') return;

        // Safe lookahead: c is '/', so pos + 1 is at worst the terminator.
        char next = l->src[l->pos + 1];

        if (next == '/') {
            // Line comment: stop on the newline (handled as whitespace on
            // the next iteration) or at end of input.
            while (peek(l) != '\n' && peek(l) != '\0') advance(l);
        } else if (next == '*') {
            advance(l); advance(l); // skip /*
            for (;;) {
                if (peek(l) == '\0') {
                    error("unterminated comment");
                    return; // do not walk past the terminator
                }
                if (peek(l) == '*' && l->src[l->pos + 1] == '/') break;
                advance(l);
            }
            advance(l); advance(l); // skip */
        } else {
            // A lone '/' is an operator, not a comment.
            return;
        }
    }
}
|
|
|
|
// Scans the rest of an identifier whose first character sits at `start`.
//
// Consumes every following letter, digit or '_' and builds a token whose
// lexeme is slice(src + start, length) and whose type is whatever
// lookup_keyword() returns for that text (presumably TOK_IDENTIFIER for
// non-keywords — confirm against lookup_keyword). The token's line is the
// lexer's line after scanning.
Token identifier_or_keyword(Lexer* l, int start) {
    for (;;) {
        // <ctype.h> classifiers require a value representable as
        // unsigned char; a plain char may be negative for bytes >= 0x80.
        unsigned char c = (unsigned char)peek(l);
        if (!isalnum(c) && c != '_') break;
        advance(l);
    }

    char* text = slice(l->src + start, l->pos - start);

    // check keyword map
    TokenType type = lookup_keyword(text); // use hash or strcmp
    return Token { type, text, l->line };
}
|
|
|
|
// Scans the rest of an integer literal whose first digit sits at `start`.
//
// Consumes every following decimal digit and returns a TOK_INTEGER token
// whose lexeme is slice(src + start, length); the text is not converted to
// a numeric value here.
Token number(Lexer* l, int start) {
    // Cast before calling isdigit(): passing a negative char is undefined.
    while (isdigit((unsigned char)peek(l))) {
        advance(l);
    }

    char* text = slice(l->src + start, l->pos - start);
    return Token { TOK_INTEGER, text, l->line };
}
|
|
|
|
// Scans a double-quoted string literal.
//
// Precondition: the cursor is ON the opening quote. The returned TOK_STRING
// token's lexeme is the raw text between the quotes (escape sequences are
// kept as written, not decoded). A backslash causes the following character
// to be skipped so that \" does not end the literal.
//
// If the input ends before a closing quote, error("unterminated string") is
// called; should error() return, the cursor is left on the terminator and
// the partial text is still returned.
Token string(Lexer* l) {
    advance(l); // skip opening quote
    int start = l->pos;

    while (peek(l) != '"' && peek(l) != '\0') {
        // Escape: skip the backslash so the escaped character is consumed
        // by the advance below. A backslash that is the very last character
        // has nothing to escape, and must not drag the cursor over '\0'.
        if (peek(l) == '\\' && l->src[l->pos + 1] != '\0') advance(l);
        advance(l);
    }

    char* text = slice(l->src + start, l->pos - start);

    if (peek(l) == '"') {
        advance(l); // closing quote
    } else {
        // Stay on the terminator rather than stepping past it.
        error("unterminated string");
    }
    return Token { TOK_STRING, text, l->line };
}
|
|
|
|
// Produces the next token from the input.
//
// Leading whitespace and comments are skipped first. At end of input a
// TOK_EOF token is returned and the cursor stays on the terminator, so the
// function may be called again safely and keeps returning TOK_EOF.
//
// Punctuation and EOF tokens carry string-literal lexemes; identifier,
// number and string tokens carry text produced by slice().
//
// A character that starts no known token is reported with
// error("unexpected character"); if error() returns, that character is
// dropped and scanning continues, so the function always returns a token.
Token next_token(Lexer* l) {
    for (;;) {
        skip_whitespace_and_comments(l);
        int start = l->pos;

        // Check for the end before consuming anything: advancing over the
        // terminator would put every later read out of bounds.
        if (peek(l) == '\0') return Token { TOK_EOF, "", l->line };

        char c = advance(l);
        switch (c) {
            case '{': return Token { TOK_LBRACE, "{", l->line };
            case '}': return Token { TOK_RBRACE, "}", l->line };
            case '(': return Token { TOK_LPAREN, "(", l->line };
            case ')': return Token { TOK_RPAREN, ")", l->line };
            case ',': return Token { TOK_COMMA, ",", l->line };
            case ';': return Token { TOK_SEMICOLON, ";", l->line };
            case '*': return Token { TOK_STAR, "*", l->line };
            case '=': return Token { TOK_EQUAL, "=", l->line };
            case '+': return Token { TOK_PLUS, "+", l->line };
            case '-':
                if (match(l, '>')) return Token { TOK_ARROW, "->", l->line };
                return Token { TOK_MINUS, "-", l->line };
            case '/': return Token { TOK_SLASH, "/", l->line };
            case '!': return Token { TOK_BANG, "!", l->line };
            case '"':
                // string() skips the opening quote itself, so hand it the
                // cursor back on the quote; otherwise the first character
                // of the literal would be swallowed.
                l->pos = start;
                return string(l);
            default:
                break;
        }

        // <ctype.h> classifiers need an unsigned char value.
        unsigned char uc = (unsigned char)c;
        if (isalpha(uc) || c == '_') return identifier_or_keyword(l, start);
        if (isdigit(uc)) return number(l, start);

        // The offending character is already consumed; loop to resync on
        // the next one instead of falling off the end of the function.
        error("unexpected character");
    }
}
|