From cebb63912c72e67aa0ec48aac25bad8edec585c6 Mon Sep 17 00:00:00 2001 From: "Emilia(SleepeeSoftware)" Date: Thu, 5 Mar 2026 02:49:00 +0100 Subject: [PATCH] Yay git add .! parser seems to work fine, need to implemente an hash map --- include/SterlingCompiler.h | 45 ++-- obj/main.o | Bin 0 -> 21685 bytes source/{assembler.c => assembler.h} | 0 source/bckp_data.bsdok | 319 +++++++++++++++++++++++++ source/list.h | 2 +- source/main.c | 358 ++++++++++++++++------------ source/type.h | 23 ++ 7 files changed, 566 insertions(+), 181 deletions(-) create mode 100644 obj/main.o rename source/{assembler.c => assembler.h} (100%) create mode 100644 source/bckp_data.bsdok create mode 100644 source/type.h diff --git a/include/SterlingCompiler.h b/include/SterlingCompiler.h index 492f340..f409389 100644 --- a/include/SterlingCompiler.h +++ b/include/SterlingCompiler.h @@ -15,39 +15,30 @@ # endif typedef enum { - TOK_NONE, - TOK_STRING, - TOK_RAW, - TOK_PREPROCESSOR, -} TKN_CTX; + TOK_NONE = 1 << 0, + TOK_RAW = 1 << 1, + TOK_STRING = 1 << 2, + TOK_OP = 1 << 3, + TOK_PREPROC = 1 << 4, + TOK_COMMENT = 1 << 5, + TOK_KEY = 1 << 6, + TOK_ID = 1 << 7 // New: For variable/function names +} TKN_CTX; typedef struct Token_s { - int size; + size_t size; TKN_CTX ctx; char *data; } Token_t; -//builtin type: if x86_64, then since i use simd, should align them +typedef struct { + char *op; + size_t len; +} MultiOp; -typedef struct vec2 { - float x; - float y; - float _padding1; - float _padding2; -} __attribute__((aligned(16)));//losing 8 byte - -typedef struct vec3 { - float x; - float y; - float z; - float _padding; -} __attribute__((aligned(16)));//losing 4 byte - -typedef struct vec4 { - float x; - float y; - float z; - float w; -} __attribute__((aligned(16))); +typedef struct { + const char *name; + TKN_CTX ctx; +} KeywordEntry; #endif diff --git a/obj/main.o b/obj/main.o new file mode 100644 index 0000000000000000000000000000000000000000..91ad0431ff331628396526b1c002b5716b4f995e GIT binary patch literal 21685 zcmch9dw5jGm1o`CU9A?aN9&2ivjyUf9$<`l2{#WxO9G+r2*Spegw&Ec2uV?Q^RP`! zB*;Xc!Ww0~c6LI7v+=jdY`}5G`TX%p7A79Q#`7&s9Ggid6L!ZWvSBis!HLYQJlI(6#Q@)%W=XyfeL=;XjwC3WFO~0!MiXFnC69=POXUZX6)v-ch!rNo zgQ7B6-kXeT0PgfeA-=mtVYee;TwgM76#S2iQuQ?iE0!el(({D4SuDipdc|)*5)}Tm z6^oW80l$Ox{(fN+OX2RROkz^*WCi>eQfw4cxFz={;o7`Hya8OPgY#?Lz4s;IrXg3k zEQzJ`Elp;A2uOW0OW~F!H<;R2y*yPP{IWdRzJqQd{xfhdIrfq2pa`zr5gNQ68n_t< z4Q}>_1`hj0JnLK$WFw?9G+6Bq4dL@j5Y0ng(bP7kUm-$q2VM$1^b{>zgZI6YRNKJeA`zdaa1CD8 z0$D;i?~Ma07xbb5w?nT~kCK@;a=^VE8gjS;)uR>|MWKNaE3~|NG@5C|u;VS2Nz#w9 zY?rE&^Z=LvAPgNdct{*?AjSSc|B2AROF`q6O6`>-k3cCIwWCOR1lA_F+NmSUspB@B z;=SRxB}lb0Cqc@x6bmXH3#fzH)}YtedG2~G+L)r4_8nU0fl%N>l28H+-BTtxS8rQ2T zRRr9)4>`1rbE6zqYy()ZfjZ>!ou4vf@Un{Eq3Tg~$H3v+BL3H;D`5b<5;VgCHmJs+ zyC?>94-DK?{#K^4OaI;Y91X3kE{dkXJkPaG&;sJZ)S+6KiaFbQ zbIn;BZh_hn539$3;6ZVDK{NP++{#y~Z*vJ}F$f(mcv>moUSScnoQGKe(*Huz zc4l2Aj{H!(y)U!#Vp((vNI;ZYG!sP{A^ zYtFXgJF#9A4YgjuU_wi)M{5ROs~NnKG}bf#8m%iYI}XzIP*06_jGl~Czr!2VqwgI< z@RRizUaMxrvNbs;``9V6i0j`$U0<~390i0kr0b}gs7q)C9HbcMcpx=juKw+f0qbq8U%#=2T4ayIOwyS7 z@#{(E#NT!#P{fLD7NaGpIgxJrbRR z7#|Ta529z}hUDoLPNzWjJK@t#Qxpe*R?fpQ``8A|H7Amr*T921v3bXxm{oHa#fh?! zTYC-J5!Kr4V>=b{n$B-jc@$v}WFK4V01RW6L9koCT>W$CgH9*+_RH1ph?lEB6ndw* zhA6h2w9h*}8wA-XnkNxglIxjZ1={|NT``CjBV(Q2SGdnkr>&TxfXTws|$#cHx zsqc~KmFi1q3&k6HfUM(=Ad~ie$jijl(DBWeFopj>p{^!wI54z$uFXEWO;uwGJ32?l z+P33bXpjb^Qw>WdP!&TYVJE98?)VN=C#gEj$LR5Mq|40h@4b!OdD;*p?J2H6b7EyE zNR@$2ZDPqc{=k(%!j)Y?H&Ty|-=DmS68P)@k_sb6xI?keau(vD9h!!mJ9)%(GD)Zu zQl754N1Z#E(4TIaa^Kn1ToiK+$HCZKfth4x3iQi>4k;wO@TrrL(x@h*uV%kk{o%lrvTLV%b?zuyf}40dVY0zEttK=<9YMQjgF4=0w{*-T?CG3a86IRfwXhy9#t3|o zI^XG%vvBjB_dLpvMOg8@&K&j|>N^Cg8l<26j&tZJUjH$kFOU3+hn*UNjcQASV7NR| zi}eZSG5wU937L-kpE3S?UZoZ0OQWp&+Ktbm6ebba#sh3*69ugb<|=A-JRQcrO6^Yr z@iS@#=JBDe-q7*PzejHK;N_aZOKL)4O+2Ka5n9o$-@;rcZ;jcI_ODEHTToiiE(j|d zZHZ5uAo@>TQyihoNdqbQQYTLn5nXds!b=U2^HcI=Df!Bj{5>goeM+bGFHe!LO3B}! zl3$yWuTIH_Qu1{v`TCUnXHxQXK1r9hZ>P$dyp#RZiaZ&Uqadc2>sOr{9h%||4vzQ=4QvEl(xdb8BTf~H0=*4 z{SY4-W0VFFWD7x4+JVn9(3JiqK0(lwUc!fBn$my72i?MH2L^B>Xi8thrwufv|AbF3 zXiBTxLL3B5>7U_q6f~t*@HqjR(lngD(6K9}Yw`ImXiC41&ksRU`g?pXfu{7A9-M=N zru2g}A+Ca^^v!f3-UUtR^Am7B51P_LaKgu+DSZ@=GrVXwrH8YH$Old7_x*Tg0h-d^ z;d#bF&@1Q8d-U*w50%cEJ%47f4%U7*_tZYHm7n&ayt5#Ks)vK#SRh1~I5|pVD z3l@m7GEuf#ET|G?RbqaXsHhUNXN$)k6AuHk3I)Vt;8cl7y9jqh!{R_^q$@1?!m)T? zw}^DdM0;0HOHAzViL@cx+7)jLi?(okq&qBvtv&mCW08G;i)c^0uQgn;uO-r5-YVKj z2S3ROq-RbL^*sUN1fsp+)<}CK+$L()SJ!W^*-%rxRczlg#B&Xa5l3`jWBrc6y5-9Qi|#F7TwWO{+ZT&>hpRfm-Qm7SE3z$pt({c| zmn|(@x+GB6(b2XiP__lIXsoTOb@_6RJE0yOkvip6MWGVQ)7sh6Cki6qq(>uO$}Wwt>Mm| zuC{QWaL@D+Q-~68E)nS)YBp61cSSKhzAEhQX$v<+h2P5XpF(~jIp4};H6pE>p0in5 zIVAB4Pn1e?0dTWjl+7!$a(;_6jhrZ~+_aaernL4*Ulf?U2_%CLX7W;q{Av=xJE`|;XZ;Fy95vg1RHaQ*LcWwH*KI0#*;7! zx@k{4s2OQ)+G5C@tjj1i0WsKAzooufSWIqRvs2hiZQH)JrhcPvb7o6}@G{Y`wYp*J zmUY6nW0O#tecZncVHS5I@k-ojIzG>a|MjC} zwTQ9w^cNsWcLfwy6dyx&RD&moBDinomaXd@cwW{uh95yF(BW zAnYUbMT>&)kRX>N1aIBfE7I=gy8pY@!v-j6v=`Ov6#A7M2v7zJ#Y8P0uCL*WMRyX%p_C&HC>lNDG8EHA;PXsD!UQ7Vhd2?rPir zcgSuc9INuRN5kR0!o6PA(W3=6!IY~?n_8mLa9>P#i_4!NIr&z7(Qtw8o?dv;z0%Gh zm1hXTuA(pMf@m*h8R417UGNHtRTu0F?`!QnB+?f1eDgjDXyUP%I!3BJ{{rSLUu#QO zS5K=*f62-rK(+@UlJl3C4IhL>`rlaonIs^D@a=PoysU~;=^|ndtwzEwwImrq%P3v< zQz2Z|$-f11a!acywXz6c489<{wdM zWzocA4>!2OSflCstV%?7Of0r?uP_>W5)aL1o&q>(ZcSUbI~Hj_#N#+BY?tMs1)r|P z#=2o_A0~HUmULNDRh7a$=%A)KsO;8wAEt9FmnK!1=>7`_@CT)hY>>u=?Dn2`cbiDJ zvPic*+%ff>13%!vzX*PIMEI;+<&b8P-|V>Ur@4%gmff_rr>9GJ8~U*N?(BpYxYO&g zYM?3a(=LWCx|gTRI=LKfbWua>_h&i{vDj|=My6x8)!P@|FYKF6zS;eZqvds_L|7>O zFWQ&xpE=Sm8fh5g(*CysrLF;N?^fQUY;8H<7#-@zq)xRjY7aYx z{YV?;e#$9D$8&U%A(tIP8r(l{q;x{ZQn&&U_meytiqd8VWYhzQSn+77bO7xe^@JLEo69BkiJtdQqd$tZ$ky`#VQU zr_Xd*u|>i@rTfQ?H+0@{k=Tjg$PNvJkGG+hQT&mR`uD15I#9BLr{q0;Gl$#yA z9!G@$S`lGCTd0H5ea3-&OKC*?)c?;pjtViinAXcWl;w5E))BGo(7ruAU5L~n+LP1u zYn31%ehRb-h@mOk5x;U!f2&a{biZPJ@jD0beGNc|9CuvNZ1wh_sz3MdhtM9TI}>kB zcO({R>Dt!X(v6L)u)FC#fv#U^|D5bX^D=4jvAZWzBw=P!W_8a~W_g`B?cv291K*0q zS}<+fJ;e&&?7oM}|bG+i?;H-Z?GSl!t2AZuz167KkTJd0p|GG_$FuOY_@nSXtz$_m1cN%{=H6< zX${IhlaLYKP1po(qY=XaE9^SDy`t+kDnU2>Ckz-BuXTFdx@Zpy@O(8?ysk)hTlgRt z_bJc&Y20qgQXvU%#oOB>5N~lf&1&E@IK#e?2Nvce2JLj5jNM;DJgNqgus6^xAYJ-s z!#X)??AlP|gpbSd`G$GQQ(jjXGqAm3mCE{EwBNHg)X$hW!YqhJVJ00Mn4UWCKOtsdl)UIBk)N5iGM`&0rWe@RBR^mPb(vC$BtY40AN(+47} zCOWoy2z$PV#y@YlpryuWD#2Xu!Wf#bo0?A>FS`DjR-_zk%DUU)82vLiX6dG$u3DAA zl+QZ~W=z(;LQ`hUay*5BgE2T$ukjcJh`-G|C0S3fNpxgD*H}OQPtu4f-FjjwkrJv{9d=21f#!)*Ezp0`kD8r8iu9F6t#g~bpZ zThKL0C1}Y~tkzub&~#mIYiGE1Z*LFX6k0>~&gUrdiY!vpfX!Z=um;ko0y+<&YmZ8x z?7LdoKq-;bh`m5pIDX+(-#V&$+}-aeSf~|5gmo-Oftz7JRn22}1MSeP19T`u*Pu#J z;Vh`+QEMHTs7D8F=bGBFJfXzQN&xI(ud4YV-|Eqp=3_^b2`+UD`Uu}-VjqUP04}Ng zNVRA`!uL@+Afk(2K%~n$87Dia_oAlvXmeskD@XzZgy$o8jD5H!R*m6m^>852xf5Lz zRRUE%(5iZ7IzAk8eLdeO?^WJ+O0G2}Z|IVH=Zq^UtPU;}bV|)JrAn~FWYas6>s!;d zzoi@dZmXFK(jgjMV}mH73pNLw$l)2<+Qa7n@UpUdV>nhrn_6pkCIu=T%+aNGe&oAv zlOFcg?m5a^POsC%xVEJg5mZk{e024z1f_19QuULap6A)cs-MjxVAyN%^lfd?;if{V zhPOb%TDsX$Rp_X;PJSIWDMOYP>tKqs9RyYnY*oY3N>zj3so_2+tQy^KJ30?&GkFVY z1$}A->2&{ZN3c`_5b}W3W%qga_!J@~CEr8Pt6TmnnnTzZcl<}D|a=C9#aZTPT zhG!Tyj2vTU!osXX%d=8(s?JAPNjA5$#9j&2s-b>xlFH}Yy}YtH_da$@UY_Oiait~c z)Rg>?IBY!*qF^@53-c_O*FpFQFcFS1wUA?qv?1*A`-{ov#gEt9o zrk)b?B`TibwpROmaFOyXDW3@oMQO5g{>*|#63&_p;cO%1j#H93$;_I(#1NR9EKC&5 zE3l}ON%?#tiQsd4HElWDPw?oX^Xt1^&45W9P_;s6S=L8cnUv5l#|Ol8cLnGMe0Jh< z7$4ih8Hw)+P}WoQ^$q%ZmITky*Ngaay++iV^z{q+`X~Cji7z3ii*#ikGOh)dlow=P zTxMRC85b_d^i$HRlS@10WN($seCLczzaVA1oKz?C&daQRneiHdv-$_5@2bqcAYJ`3 zy+Yz%$L&5Rry;Q#u zlwJUL$&9PA7*!pW-U~8IgJus%Ye1G(!c6InIdyp&Wxl6E`eL#mE~mw0POZ#{Qw!#F z%IpDI7L!wzO=XnCJgbn{dyUQDq zadgaK9mO$eb;u=^G7skVODisyu9g#SN#6yTQ$gkyk__C3Q?lT^ z^aDL07w?yu5S@{-p9(Ah+BlOc_Iy*$@*E%?o*1_9y7D!haw=-5r3xEXOCLH6ekmnf zqpq@m;ND!BUMKyidcU-G;py94*YU^OW!5cedk18?w}Tuu<8x&1tO}VGlNqGrZHfC? zw_OVtPk@UvKKnch%PE}%n=l}4gn)HHPTfy!O7m9WDFrt&;|yuLzf-vDDsxbNw;Kv zzw|>1y1rk|ref_*aFqPHEb5oOTjadVfeX?)BWLf23=M;kUhKZzwkPKCUY9v&sJt$x zBFSl&_B&$|kN1|$d{x?26z#BnafB2i1bs(k<^wXriIG}4nQD%q9$A9W#E6(xEAvjt z3a~xpvYe8bTuY!@S&IY_l#j&SC?`>(qEXgzP9j=&Nq-HKRQ`vCdI_oFe#<_*hf19A>5@0Z2sdhhFU;u)%WGAg7B&dbShMS07Q z%83Kst8(@(ITNFkJMg@m8CR6ITm}Z@+~;H|WcMJ!bUh#g7ilb!cVOzwi`1YQ?Q-(4 zEOPaF-F)nlJ)bN?FqNtX~tOal;4K*m?Il5N$c}z~)B?Fj@ zS1Cnrb0Ne44Ro}HLd;$zS0i!NssQhoD=4>07S-X2)aPI22)C!IXe7%EPs^#U3H|R% zxl0xTinj9jd>8ZUBrHeSaKv?KU%hn!mn^tQGZ1Fu+*+9)mk6kI-2dxAaf1Cwb4@y! zm{z8EXUNi8Sy(Im8)QbUv|SmE=va(SYnSc8CV~#^aNe$-N7IQZ-)zM8b~n+TO!9;4 zbt`u7z`1L5clEwVSEOb4`fzk_tfzPPHry6CTM?MMInupxXW33XFL#WL zqna|DSw;?)MZ=v716&}m=TKlREa;2{wzc3~tuMMT(0A`V=Q%v>tFXbwhu%H2gmo~G zk>-8Ixm7D=>DZUPIa-tD|B}5f(_@ET23o!RK`1TPw|W}q%CQ)z^`+irYMgyISzqcs ztj29tFn;NT>P@|8);N28vcA-ta*f-QqK{r!p=+&5gvL4MnR-(X6*O*ZioVoy42|2B zf=fMj(K!2PaywEFgbePXWL)Ygl*a8ysW=sXkzLRsAi^~1w}{eZZQf<;XXFhG@Oj961zbu zedln}qj3Kh)Os%K!Es9qQ$?T@Z@Qr>K+)l`TZBOEVQM=ly61I^Zo~TmC{@P^P)f_w zhWZXDrR8-{ZCv&@pu$Ye#FHy}N7OAAfNE!|+VFOO>R?_NM+XWQ2i3{Erws2LDAl&* zc$A^2UxHG$ybmhECDJpsv;frSm{$u*;T`~`aDAYZ|N0H?$A&ixN8YN$y`WTyMo_BW zH8_-3)SIA`&DTLGn?E+ZnK(&R(g#32$hQ0sP`jCW1C*-cEl_)zH;A(c#rvwEo-x$- zKy@+h*PxUI$Z`ow1PoPasGy-54ApF?UPB!*)Cog9W2om1b;(dy40X*=qlTg*5^9Hq zSG6=1FjS?Xf`+1h9Z)5X80v(fo-x$(hPq^^D~7ses8K@+EO(AAhJtBg1wJlBrKYgb ziy-q5OperVDB45>@p18cp{nc@BMpEG;^X4QQ%UC{uc(DaT4{Jo47JQiDOOa8prOce z%DMQRQYGDFqzy*8-S8R>MR69y$Hg;>D%)nH6s<~1F{PxvhKd>KLBo5@P)CgPsNsFt zP+u|96NdMsp}uCMCk^izLp^Jx-!;6`hI-yee`t7T4Rz5-FB#rThI-XV-!Qx@hI-3L zuNvMz80wmlzH4|RhPr8_qlWi^p>7-L$A%{moyy~GP(kor{GP4iWf-a7Na@X6B`q@4 z6eA57Ua6tx8Y#VONRQu zP!q8>DhqZSie5fe`uYv;+lG4AP#+oG1gxB@#0pSK-(f?YHPUws?_EQ=GIWUuD5d4= zhG${DRQlQs^(ZKX`)`IfYN-4yU1E`;1`PF)p+Zt7+%v6b$pj3$m4DV4xoi-E#QEXuA zQLR&U2B-$+%{J70LzNq9nW3r-wboD@4MpQh>HCbKK5Ho2btq}Gp=e*Gc%L&=43uiy z6j0mo@ri)ptv9@o;dL2aui-sucwaNTi-vc}@MsoQb&MDu&3cMQv!1erW+}zfvy>%v z8(y=)(X66y^b0yw_O#(WZ+LGR-c`f<*zg4AAyu{%l+riX@HQDtTMcy;k6lFr{R6Y@J<-s4-N0E;r)Z*T{Ap4=2%s) z7nG`Zq2W~;-gd)lG`wyLM^o5yQ&{7VbP;%|ECgP%tuM>|#h2Bv~Qm2*){ zZN_IjhDMUQ=&3jzv5d!H`gN|ONa=WGJO2hQ5A<4;`6|#{@{s*ZcUUikbgTh9z!!VV#9eIk&Z^jV-^}c z+0aABBI7YM&#Q}Gn5H9<@t8^t;on!$amaYgVuPU>l8!>gW0n}qMqua|WISf6!O;9l zM~9zzn5^q+T7hxf$kDLbuWV^?{?-^n4QfvH3?eGC99@xSg6 ziohg^Ig|K5cL*V1$P>v|;(y$s>w#$ihGx96t32WV?P#`fSvSXo3DjOkJGvDZ`ggmA z1gRnqI2f9dT-JA}dE>5Q4hCHh4E6PR*pm*1W-XW126b?H%Jqzc!KMwEv2m9O(=UP3 z%;vH_OcWXVf};nUHDKxtKaqIZ!O-mIvgjY|R7_}${^1SfsWqxaKSQZN66e?89KmM{ z^UuiXOO>KG-Snk6WA8^$$)+Y9iG&J(5a#tI6W#gnD5rs;ju z1U6pUMxIAa=_8LkqDAd}^usN+;`0x7cp=p<;Za7v%SW_+KafFqEtKC)WC6e5NEBW> z#p{w9P{)*h6_vQ^mDr?Lb`#$L9(ysGzzOefr@Za1ewCr$!B6~QLdx$o?)vMJl;6$# i|8jHU57?4_pVrhA+1HEr_w;Lfl+!<<(lYg{wEqWgHRj3y literal 0 HcmV?d00001 diff --git a/source/assembler.c b/source/assembler.h similarity index 100% rename from source/assembler.c rename to source/assembler.h diff --git a/source/bckp_data.bsdok b/source/bckp_data.bsdok new file mode 100644 index 0000000..4cf08d7 --- /dev/null +++ b/source/bckp_data.bsdok @@ -0,0 +1,319 @@ +#ifdef noneafjodsjf +# define + +// Clean up the node creation to be more "C-Style" +node_t* NewNode(void* data) { + node_t* n = calloc(1, sizeof(node_t)); + if(n) n->data = data; + return n; +} + +// Optimization: Use a specialized Token creation function +Token_t* NewToken(const char* start, size_t len, TKN_CTX ctx) { + Token_t* t = malloc(sizeof(Token_t)); + t->data = malloc(len + 1); + memcpy(t->data, start, len); + ((char*)t->data)[len] = '\0'; + t->size = len; + t->ctx = ctx; + return t; +} + +void MunchTokens(list_t *lst) { + node_t *curr = lst->first; + + while (curr && curr->next) { + Token_t *t1 = (Token_t *)curr->data; + Token_t *t2 = (Token_t *)curr->next->data; + + // Only munch RAW tokens that are single characters + if (t1->ctx == TOK_RAW && t2->ctx == TOK_RAW && t1->size == 1 && t2->size == 1) { + char pair[3] = { ((char*)t1->data)[0], ((char*)t2->data)[0], '\0' }; + bool matched = false; + + for (int i = 0; MUNCH_TABLE[i].op != NULL; i++) { + if (strcmp(pair, MUNCH_TABLE[i].op) == 0) { + // 1. Update T1 to the new string + char *new_data = malloc(3); + memcpy(new_data, pair, 3); + free(t1->data); + t1->data = new_data; + t1->size = 2; + + // 2. Remove T2 from the list + node_t *node_to_remove = curr->next; + curr->next = node_to_remove->next; + + if (lst->last == node_to_remove) lst->last = curr; + + // 3. Free T2 memory + ClearTokens(node_to_remove->data); + free(node_to_remove); + lst->size--; + + matched = true; + break; + } + } + // If we matched "++", curr now contains "++". + // We DON'T move to next yet, in case there's a 3rd char (like ">>=") + if (matched) continue; + } + curr = curr->next; + } +} + +void RefineSymbols(list_t *tkn_lst) { + node_t *curr = tkn_lst->first; + + while (curr) { + Token_t *t = (Token_t *)curr->data; + + // Skip strings and skip nodes that are JUST a single symbol already + if (t->ctx != TOK_RAW || (t->size == 1 && strchr(SYMBOLS, ((char*)t->data)[0]))) { + curr = curr->next; + continue; + } + + // Find the first symbol in this string + size_t pos = strcspn(t->data, SYMBOLS); + + if (pos < t->size) { + // We found a symbol! Now we split. + // Case 1: Symbol is NOT at the very start (there is a prefix) + if (pos > 0) { + // Split the token into [prefix] and [symbol + suffix] + // We reuse the ListSplitToken logic we discussed earlier + ListSplitToken(tkn_lst, curr, pos); + // After splitting, curr is now just the prefix. + // We move to curr->next to handle the symbol. + curr = curr->next; + } + // Case 2: Symbol is at the start (pos == 0) + else { + // Split the token into [1-char symbol] and [suffix] + ListSplitToken(tkn_lst, curr, 1); + // The current node is now the 1-char symbol. + // We move to the next node to see if the suffix has more symbols. + curr = curr->next; + } + } else { + // No symbols found in this node, move to the next node in the list + curr = curr->next; + } + } +} + +void SeparateStrings(char *data, list_t *tkn_lst) { + char *curr = data; + char *start = data; + + while (*curr != '\0') { + if (*curr == '\"' || *curr == '\'') { + char quote_type = *curr; + + if (curr > start) { + size_t raw_len = curr - start; + Token_t *raw = calloc(1, sizeof(Token_t)); + raw->data = strndup(start, raw_len);//strndup is C99/POSIX + raw->size = raw_len; + raw->ctx = TOK_RAW; + ListPushBack(tkn_lst, raw); + } + + char *str_start = curr; + curr++; // Skip opening quote + while (*curr != '\0' && *curr != quote_type) { + if (*curr == '\\') curr++; // Skip escaped characters like \" + curr++; + } + if (*curr == quote_type) curr++; // Include closing quote + + size_t str_len = curr - str_start; + Token_t *str_tok = calloc(1, sizeof(Token_t)); + str_tok->data = strndup(str_start, str_len); + str_tok->size = str_len; + str_tok->ctx = TOK_STRING; + ListPushBack(tkn_lst, str_tok); + + start = curr; + } else { + curr++; + } + } + + if (curr > start) { + size_t last_len = curr - start; + Token_t *last = calloc(1, sizeof(Token_t)); + last->data = strndup(start, last_len); + last->size = last_len; + last->ctx = TOK_RAW; + ListPushBack(tkn_lst, last); + } +} + +void InitialScanner(char *data, list_t *tkn_lst) { + char *curr = data; + char *start = data; + + while (*curr != '\0') { + if (*curr == '\"' || *curr == '\'') { + PushRaw(start, curr, tkn_lst); + char quote = *curr; + char *str_start = curr++; + while (*curr && *curr != quote) { + if (*curr == '\\' && *(curr + 1)) curr++; + curr++; + } + if (*curr) curr++; + PushToken(str_start, curr, TOK_STRING, tkn_lst); + start = curr; + } + // 2. Handle Comments + else if (*curr == '/' && (*(curr + 1) == '/' || *(curr + 1) == '*')) { + PushRaw(start, curr, tkn_lst); + + if (*(curr + 1) == '/') { // Single line // + while (*curr && *curr != '\n') curr++; + curr++;//for skipping the \n + } else { // Multi-line /* + curr += 2; + while (*curr && !(*curr == '*' && *(curr + 1) == '/')) curr++; + if (*curr) curr += 2; // Move past */ + } + // We DON'T push a token here because we want to ignore comments. + // If you want to keep them (for a doc-generator), push a TOK_COMMENT. + //PushToken(start, curr, TOK_COMMENT, tkn_lst); + start = curr; + } else { + curr++; + } + } + PushRaw(start, curr, tkn_lst); +} + +void RefineRawNodes(list_t *tkn_lst) { + node_t *curr = tkn_lst->first; + //node_t *prev = NULL; + + while (curr) { + Token_t *t = (Token_t *)curr->data; + if (t->ctx == TOK_RAW) { + char *span = NULL; + char *to_split = strndup(t->data, t->size); + char *tok = strtok_r(to_split, " \t\r\n", &span); + + if (tok) { + free(t->data); + t->size = strlen(tok); + t->data = strndup(tok, t->size); + + node_t *last_inserted = curr; + tok = strtok_r(NULL, " \t\r\n", &span); + + while (tok) { + Token_t *new_t = calloc(1, sizeof(Token_t)); + new_t->size = strlen(tok); + new_t->data = strndup(tok, new_t->size); + new_t->ctx = TOK_RAW; + + node_t *new_node = calloc(1, sizeof(node_t)); + new_node->data = new_t; + + new_node->next = last_inserted->next; + last_inserted->next = new_node; + + if (tkn_lst->last == last_inserted) tkn_lst->last = new_node; + + last_inserted = new_node; + tkn_lst->size++; + tok = strtok_r(NULL, " \t\r\n", &span); + } + curr = last_inserted; + } + free(to_split); + } + //prev = curr; + curr = curr->next; + } +} + +void PruneWhitespaceNodes(list_t *lst) { + node_t *curr = lst->first; + node_t *prev = NULL; + + while (curr) { + Token_t *t = (Token_t *)curr->data; + if (t->ctx == TOK_RAW && IsWhitespace(t->data)) { + // Unlink and free + node_t *temp = curr; + if (prev) prev->next = curr->next; + else lst->first = curr->next; + + if (lst->last == temp) lst->last = prev; + + curr = curr->next; + ClearTokens(temp->data); + free(temp); + lst->size--; + } else { + prev = curr; + curr = curr->next; + } + } +} + +void ListSplitToken(list_t *lst, node_t *node, size_t index) { + Token_t *old_t = (Token_t *)node->data; + + // 1. Create Suffix Data + size_t suffix_len = old_t->size - index; + char *suffix_data = malloc(suffix_len + 1); + memcpy(suffix_data, (char*)old_t->data + index, suffix_len); + suffix_data[suffix_len] = '\0'; + + // 2. Truncate Prefix Data + char *prefix_data = malloc(index + 1); + memcpy(prefix_data, old_t->data, index); + prefix_data[index] = '\0'; + + free(old_t->data); + old_t->data = prefix_data; + old_t->size = index; + + // 3. Create New Node for Suffix + Token_t *new_t = calloc(1, sizeof(Token_t)); + new_t->data = suffix_data; + new_t->size = suffix_len; + new_t->ctx = TOK_RAW; + + node_t *new_node = calloc(1, sizeof(node_t)); + new_node->data = new_t; + new_node->next = node->next; + + // 4. Update List + node->next = new_node; + if (lst->last == node) lst->last = new_node; + lst->size++; +} + + +//// Helper to create and link a new token +void PushToken(char *start, char *end, TKN_CTX ctx, list_t *lst) { + size_t len = end - start; + Token_t *t = calloc(1, sizeof(Token_t)); + t->data = malloc(len + 1); + memcpy(t->data, start, len); + ((char*)t->data)[len] = '\0'; + t->size = len; + t->ctx = ctx; + ListPushBack(lst, t); +} + +//// Helper to push code that still needs to be refined +void PushRaw(char *start, char *end, list_t *lst) { + if (end <= start) return; + PushToken(start, end, TOK_RAW, lst); +} + +#endif diff --git a/source/list.h b/source/list.h index db2c70c..2c704f2 100644 --- a/source/list.h +++ b/source/list.h @@ -62,7 +62,7 @@ void ListInsert(list_t *lst, size_t idx, void *data) { lst->first = node; } else { node_t *prev = lst->first; - for (int i = 0; i < idx - 1 && prev->next; i++) { + for (size_t i = 0; i < idx - 1 && prev->next; i++) { prev = prev->next; } node->next = prev->next; diff --git a/source/main.c b/source/main.c index 59c46c9..e437b22 100644 --- a/source/main.c +++ b/source/main.c @@ -3,6 +3,93 @@ #include "../include/SterlingCompiler.h" +const char *SYMBOLS = ";(){}[]$%&*#@!?:,.<>|-+=~`^"; + +// Common C operators (Order matters: put longer ones first if you add 3-char ops) +MultiOp MUNCH_TABLE[] = { + {"<<=", 3}, {">>=", 3}, + {"==", 2}, {"!=", 2}, {"<=", 2}, {">=", 2}, + {"++", 2}, {"--", 2}, {"->", 2}, {"+=", 2}, + {"-=", 2}, {"*=", 2}, {"/=", 2}, {"&&", 2}, {"||", 2}, + {"^=", 2}, {"<<", 2}, {">>", 2}, {"|=", 2}, {"&=", 2}, + {NULL, 0} +}; + +// This can be expanded at runtime if you use a dynamic array instead of a static one +KeywordEntry KEYWORD_TABLE[] = { + {"if", TOK_KEY}, + {"else", TOK_KEY}, + {"while", TOK_KEY}, + {"return", TOK_KEY}, + {"int", TOK_KEY}, + {"float", TOK_KEY}, + {"void", TOK_KEY}, + {"include", TOK_PREPROC}, + {"define", TOK_PREPROC}, + {"@comptime",TOK_KEY}, // Your custom identifier + {NULL, TOK_NONE} +}; + +# ifndef strndup +char *strndup(const char *s, size_t n) { + char *str = calloc(n + 1, sizeof(char)); + memcpy(str, s, n); + return (str); +} +# endif + + +bool IsWhitespace(const char *s) { + while (*s) { + if (!isspace((unsigned char)*s)) return false; + s++; + } + return true; +} + +void ClearTokens(void*arg) { + Token_t *tok = arg; + free(tok->data); + free(tok); +} + +node_t* NewNode(void* data) { + node_t* n = calloc(1, sizeof(node_t)); + if(n) n->data = data; + return n; +} + +Token_t* NewToken(const char* start, size_t len, TKN_CTX ctx) { + Token_t* t = malloc(sizeof(Token_t)); + t->data = strndup(start, len); + t->size = len; + t->ctx = ctx; + return t; +} + +void PushToken(list_t *lst, const char *start, const char *end, TKN_CTX ctx) { + if (end <= start) return; + ListPushBack(lst, NewToken(start, end - start, ctx)); +} + +void ListSplitToken(list_t *lst, node_t *node, size_t index) { + Token_t *t = (Token_t *)node->data; + + // Create the suffix node first + Token_t *suffix = NewToken(t->data + index, t->size - index, TOK_RAW); + node_t *new_node = NewNode(suffix); + new_node->next = node->next; + node->next = new_node; + if (lst->last == node) lst->last = new_node; + lst->size++; + + // Truncate the original (prefix) + char *new_prefix = strndup(t->data, index); + free(t->data); + t->data = new_prefix; + t->size = index; +} + char *LoadFile(const char *filename) { FILE *file = NULL; char *data = NULL; @@ -19,175 +106,113 @@ char *LoadFile(const char *filename) { return (data); } -# ifndef strndup -char *strndup(const char *s, size_t n) { - char *str = calloc(n + 1, sizeof(char)); - memcpy(str, s, n); - return (str); -} -# endif - -bool IsWhitespace(const char *s) { - while (*s) { - if (!isspace((unsigned char)*s)) return false; - s++; - } - return true; -} - -void ClearTokens(void*arg) { - Token_t *tok = arg; - free(tok->data); - free(tok); -} - -// Helper to create and link a new token -void PushToken(char *start, char *end, TKN_CTX ctx, list_t *lst) { - size_t len = end - start; - Token_t *t = calloc(1, sizeof(Token_t)); - t->data = malloc(len + 1); - memcpy(t->data, start, len); - ((char*)t->data)[len] = '\0'; - t->size = len; - t->ctx = ctx; - ListPushBack(lst, t); -} - -// Helper to push code that still needs to be refined -void PushRaw(char *start, char *end, list_t *lst) { - if (end <= start) return; - PushToken(start, end, TOK_RAW, lst); -} - -void SeparateStrings(char *data, list_t *tkn_lst) { - char *curr = data; - char *start = data; - - while (*curr != '\0') { - // If we find a quote, we need to "package" the raw code before it, - // then package the string itself. - if (*curr == '\"' || *curr == '\'') { - char quote_type = *curr; - - // 1. Save the "Raw" chunk before the string - if (curr > start) { - size_t raw_len = curr - start; - Token_t *raw = calloc(1, sizeof(Token_t)); - raw->data = strndup(start, raw_len); // strndup is C99/POSIX - raw->size = raw_len; - raw->ctx = TOK_RAW; - ListPushBack(tkn_lst, raw); - } - - // 2. Find the end of the string - char *str_start = curr; - curr++; // Skip opening quote - while (*curr != '\0' && *curr != quote_type) { - if (*curr == '\\') curr++; // Skip escaped characters like \" - curr++; - } - if (*curr == quote_type) curr++; // Include closing quote - - // 3. Save the String Token - size_t str_len = curr - str_start; - Token_t *str_tok = calloc(1, sizeof(Token_t)); - str_tok->data = strndup(str_start, str_len); - str_tok->size = str_len; - str_tok->ctx = TOK_STRING; - ListPushBack(tkn_lst, str_tok); - - start = curr; // Reset start to the character after the string - } else { - curr++; - } - } - - // 4. Catch any remaining raw code after the last string - if (curr > start) { - size_t last_len = curr - start; - Token_t *last = calloc(1, sizeof(Token_t)); - last->data = strndup(start, last_len); - last->size = last_len; - last->ctx = TOK_RAW; - ListPushBack(tkn_lst, last); - } -} void InitialScanner(char *data, list_t *tkn_lst) { - char *curr = data; - char *start = data; + char *curr = data, *start = data; - while (*curr != '\0') { - // 1. Handle Strings + while (*curr) { + // Handle Strings if (*curr == '\"' || *curr == '\'') { - PushRaw(start, curr, tkn_lst); // Save code before string - char quote = *curr; - char *str_start = curr++; - while (*curr && *curr != quote) { - if (*curr == '\\' && *(curr + 1)) curr++; // Skip escaped char - curr++; - } - if (*curr) curr++; - PushToken(str_start, curr, TOK_STRING, tkn_lst); + PushToken(tkn_lst, start, curr, TOK_RAW); + char *s_start = curr++, q = *curr; + while (*curr && *curr != q) { if (*curr == '\\') curr++; curr++; } + if (*curr) curr++; + PushToken(tkn_lst, s_start, curr, TOK_STRING); start = curr; } - // 2. Handle Comments - else if (*curr == '/' && (*(curr + 1) == '/' || *(curr + 1) == '*')) { - PushRaw(start, curr, tkn_lst); // Save code before comment - - if (*(curr + 1) == '/') { // Single line // - while (*curr && *curr != '\n') curr++; - curr++;//for skipping the \n - } else { // Multi-line /* - curr += 2; - while (*curr && !(*curr == '*' && *(curr + 1) == '/')) curr++; - if (*curr) curr += 2; // Move past */ + // Handle Comments + else if (*curr == '/' && (curr[1] == '/' || curr[1] == '*')) { + PushToken(tkn_lst, start, curr, TOK_RAW); + if (curr[1] == '/') { while (*curr && *curr != '\n') curr++; } + else { + curr += 2; + while (*curr && !(*curr == '*' && curr[1] == '/')) curr++; + if (*curr) curr += 2; } - // We DON'T push a token here because we want to ignore comments. - // If you want to keep them (for a doc-generator), push a TOK_COMMENT. start = curr; + } + else curr++; + } + PushToken(tkn_lst, start, curr, TOK_RAW); +} + +void RefineSymbols(list_t *tkn_lst) { + for (node_t *curr = tkn_lst->first; curr; ) { + Token_t *t = curr->data; + if (t->ctx != TOK_RAW || (t->size == 1 && strchr(SYMBOLS, t->data[0]))) { + curr = curr->next; + continue; } - else { - curr++; + + size_t pos = strcspn(t->data, SYMBOLS); + if (pos < t->size) { + ListSplitToken(tkn_lst, curr, (pos == 0) ? 1 : pos); + // Don't move curr yet, we might have more symbols in the suffix + } else { + curr = curr->next; } } - PushRaw(start, curr, tkn_lst); // Catch the tail } +void MunchTokens(list_t *lst) { + for (node_t *n = lst->first; n && n->next; ) { + Token_t *t1 = n->data, *t2 = n->next->data; + + if (t1->ctx == TOK_RAW && t2->ctx == TOK_RAW && t1->size == 1 && t2->size == 1) { + char op[3] = { t1->data[0], t2->data[0], '\0' }; + bool match = false; + for (int i = 0; MUNCH_TABLE[i].op; i++) { + if (strcmp(op, MUNCH_TABLE[i].op) == 0) { match = true; break; } + } + + if (match) { + free(t1->data); + t1->data = strndup(op, 2); + t1->size = 2; + t1->ctx = TOK_OP; // Upgrade to Operator context + + node_t *tmp = n->next; + n->next = tmp->next; + if (lst->last == tmp) lst->last = n; + ClearTokens(tmp->data); + free(tmp); + lst->size--; + continue; // Check if the next char can be munched too (e.g. >>=) + } + } + n = n->next; + } +} + + void RefineRawNodes(list_t *tkn_lst) { node_t *curr = tkn_lst->first; - node_t *prev = NULL; + //node_t *prev = NULL; while (curr) { Token_t *t = (Token_t *)curr->data; - - // Only process RAW chunks; leave TOK_STRING nodes alone! if (t->ctx == TOK_RAW) { char *span = NULL; - // Note: We use a copy because strtok modifies the string - char *to_split = _strdup(t->data); + char *to_split = strndup(t->data, t->size); char *tok = strtok_r(to_split, " \t\r\n", &span); if (tok) { - // 1. Update the current node's data with the FIRST token found free(t->data); - t->data = _strdup(tok); t->size = strlen(tok); + t->data = strndup(tok, t->size); node_t *last_inserted = curr; tok = strtok_r(NULL, " \t\r\n", &span); - // 2. Insert NEW nodes for the rest of the tokens while (tok) { Token_t *new_t = calloc(1, sizeof(Token_t)); - new_t->data = _strdup(tok); new_t->size = strlen(tok); + new_t->data = strndup(tok, new_t->size); new_t->ctx = TOK_RAW; node_t *new_node = calloc(1, sizeof(node_t)); new_node->data = new_t; - // Insert into the list new_node->next = last_inserted->next; last_inserted->next = new_node; @@ -197,12 +222,11 @@ void RefineRawNodes(list_t *tkn_lst) { tkn_lst->size++; tok = strtok_r(NULL, " \t\r\n", &span); } - curr = last_inserted; // Move cursor to the end of the new chain + curr = last_inserted; } free(to_split); } - - prev = curr; + //prev = curr; curr = curr->next; } } @@ -232,32 +256,60 @@ void PruneWhitespaceNodes(list_t *lst) { } } +void IdentifyTokens(list_t *lst) { + for (node_t *curr = lst->first; curr; curr = curr->next) { + Token_t *t = (Token_t *)curr->data; + + if (t->ctx != TOK_RAW) continue; + + bool found = false; + // 1. Check against Keyword Registry + for (int i = 0; KEYWORD_TABLE[i].name != NULL; i++) { + if (strcmp(t->data, KEYWORD_TABLE[i].name) == 0) { + t->ctx = KEYWORD_TABLE[i].ctx; + found = true; + break; + } + } + + // 2. If not a keyword, is it a valid Identifier? (e.g., my_var_1) + if (!found && t->size > 0) { + if (isalpha(t->data[0]) || t->data[0] == '_' || t->data[0] == '@') { + t->ctx = TOK_ID; + } + } + } +} + +/* +// Modular function to register new identifiers +void RegisterIdentifier(const char *name, TKN_CTX type) { + // In a professional compiler, you'd insert this into a Hash Map. + // For now, it's enough to know this is where user-defined types go. +} +*/ int main(int ac, char **av) { - if (ac <= 1) { - printf("no file specified"); - return (-1); - } + if (ac <= 1) return printf("No file specified\n"), -1; char* data = LoadFile(av[1]); list_t *tkn_lst = ListInit(NULL); - //first pass on string, whitespace and comments + InitialScanner(data, tkn_lst); - //SeparateStrings(data, tkn_lst); - list_iter_t iter = ListGetIter(tkn_lst); - while (iter.current) { - printf("|%s|", ((Token_t *)iter.current->data)->data); - iter.current = iter.current->next; - } PruneWhitespaceNodes(tkn_lst); - printf("\n___\n"); RefineRawNodes(tkn_lst); - ListReset(&iter, tkn_lst); - while (iter.current) { - printf("|%s|\n", ((Token_t *)iter.current->data)->data); - iter.current = iter.current->next; - } + RefineSymbols(tkn_lst); + MunchTokens(tkn_lst); + IdentifyTokens(tkn_lst); + + list_iter_t iter = ListGetIter(tkn_lst); + while (iter.current) { + Token_t *t = (Token_t *)iter.current->data; + printf("[%02X] %-10s | %s\n", t->ctx, + (t->ctx == TOK_ID ? "IDENTIFIER" : "TOKEN"), t->data); + iter.current = iter.current->next; + } //pass on ";(){}[]$%&*$#@!?:,.<>|_-+=~`" - //give each token a context + //and give each token a context //let's replace preprocessor (include, define, etc) //let's do recursive parsing everywhere that need it //compile time reflection (@comptime or @reflect) diff --git a/source/type.h b/source/type.h new file mode 100644 index 0000000..ede78c3 --- /dev/null +++ b/source/type.h @@ -0,0 +1,23 @@ + +//builtin type: if x86_64, then since i use simd, should align them + +//typedef struct vec2 { +// float x; +// float y; +// float _padding1; +// float _padding2; +//} __attribute__((aligned(16)));//losing 8 byte + +//typedef struct vec3 { +// float x; +// float y; +// float z; +// float _padding; +//} __attribute__((aligned(16)));//losing 4 byte + +//typedef struct vec4 { +// float x; +// float y; +// float z; +// float w; +//} __attribute__((aligned(16)));