import dopp;


// helper - is the lexeme a keyword
export bool isKeyword(string lexeme){
  //static immutable keywords = ["if", "else", "while", "for", "return"];
  static immutable keywords = ["dlang", "ptn", "ptns"];
  return keywords.canFind(lexeme);
}
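
// illustrative check (a minimal sketch, not part of the lexer itself):
// exercises isKeyword against the keyword list defined above;
// run with `dub test` or `dmd -unittest` as appropriate
unittest{
  assert("ptn".isKeyword);
  assert("dlang".isKeyword);
  assert(!"foo".isKeyword);
}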

// lexer - the tokenizer makes tokens from the source text
//export Token[] tokenize(string source){
export Token[] tokenize(string source, ubyte indent_type, ubyte indent_matter, ubyte indent_out){
  Token[] tokens;
  ubyte indent_lvl = 0; // +1 for each opening curly brace, -1 for each closing one; 0 at the end
  auto indent_symbol = ' ';
  if(indent_type != 0){
    indent_symbol = '\t';
  }
  bool inside_string = false;
  bool inside_comment = false;
  //ubyte inside_string_type = 0; // 0 = ".."; 1 = `..`; 2 = todo
  string str_helper = "";
  int i = 0;

  while(i < source.length){

    if(source[i] == '\n'){ // new line
      i++;
      if( (i < source.length) && ( (source[i] == indent_symbol) || (indent_lvl > 0) ) ){
        auto start = i;
        while( (i < source.length) && (source[i] == indent_symbol) ){
          i++;
        }
        auto indent_symbols_count = i - start;
        //writeln("indent_symbols_count: ", indent_symbols_count);
        auto maybe_new_indent_lvl = indent_symbols_count / indent_matter;
        //writeln("maybe_new_indent_lvl: ", maybe_new_indent_lvl);
        if(maybe_new_indent_lvl > indent_lvl){
          indent_lvl++;
          string new_indent_out = indent_symbol.repeat(indent_lvl * indent_out).array.idup;
          tokens ~= Token(TokenType.Indent_Incr, "{" ~ "\n" ~ new_indent_out);
        }else if(maybe_new_indent_lvl < indent_lvl){
          indent_lvl--;
          string new_indent_out = indent_symbol.repeat(indent_lvl * indent_out).array.idup;
          string maybe_new_line = "\n";
          if( ( (i + 4) < source.length ) && (source[i .. i + 4] == "else") ){ // maybe the next token is else
            // maybe todo: add an else token
            maybe_new_line = "";
          }
          tokens ~= Token(TokenType.Indent_Decr, "\n" ~ new_indent_out ~ "}" ~ maybe_new_line);
        }
      }else if(i > 0){
        tokens ~= Token(TokenType.New_Line, "\n");
      }

    }else if(source[i].isWhite){ // skip whitespace
      i++;

    }else if(source[i].isAlpha || (source[i] == '_') ){ // unicode alphabetic character or underscore
      auto start = i;
      while( (i < source.length) && (source[i].isAlphaNum || (source[i] == '_') ) ){
        i++;
      }
      string lexeme = source[start .. i];
      tokens ~= Token(lexeme.isKeyword ? TokenType.Keyword : TokenType.Identifier, lexeme);

    }else if(source[i].isDigit){ // number
      auto start = i;
      while( (i < source.length) && (source[i].isDigit || (source[i] == '_') ) ){ // an underscore can appear inside a number, like 5_000
        i++;
      }
      if( (i < source.length) && (source[i] == '.') ){ // include the dot for a float
        i++;
        while( (i < source.length) && source[i].isDigit){
          i++;
        }
        tokens ~= Token(TokenType.Float, source[start .. i]);
      }else{
        tokens ~= Token(TokenType.Integer, source[start .. i]);
      }

    }else if( (source[i] == '/') && (inside_string == false) && (inside_comment == false) && ( (i + 1) < source.length ) && (source[i + 1] == '/') ){ // single-line comment "//" begins
      auto start = i;
      inside_comment = true;
      i++;
      while( inside_comment && (i < source.length) ){ // advance to the end position of the single-line comment
        if(source[i] == '\n'){ // line end means the single-line comment ends
          inside_comment = false;
        }else{ // single-line comment has not ended yet
          i++;
        }
      }
      if(inside_comment){
        inside_comment = false;
      }
      tokens ~= Token(TokenType.Comment_Line, source[start .. i]);

    }else if( (inside_string == false) && (source[i] == '"') ){ // string ".." begins
      auto start = i++; // string begin position
      inside_string = true;
      while( (i < source.length) && inside_string ){ // advance to the string end position
        if( (source[i] == '\\') && ( (i + 1) < source.length ) && (source[i + 1] == '"') ){ // an escaped " is not the string end
          i += 2; // skip 2 symbols
        }else if(source[i] == '"'){ // closing quote "
          inside_string = false;
        }else{ // string has not ended yet
          i++;
        }
      }
      if(i < source.length){ // include the closing quote
        i++;
      }
      tokens ~= Token(TokenType.String, source[start .. i]);

    }else if( (inside_string == false) && (source[i] == '`') ){ // string `..` begins
      auto start = i++; // string begin position
      inside_string = true;
      while( (i < source.length) && inside_string ){ // advance to the string end position
        // ` cannot be escaped in a `..` string - so we can add ~ "`" for it - because this is not the string end
        if( (source[i] == '`') && ( (i + 1) < source.length ) && (source[i + 1] != ';') ){ // todo: check for '\n' on the next line for a new lexer-compiler version with syntax without ; at the end of lines-commands
          str_helper ~= source[start .. i] ~ "`" ~ `~ "` ~ "`" ~ `" ~ ` ~ "`"; // ` ~ "`" ~ ` -> ` after dlang compiling
          i++;
          start = i;
        }else if(source[i] == '`'){ // closing quote `
          inside_string = false;
        }else{ // string has not ended yet
          i++;
        }
      }
      if(i < source.length){ // include the closing quote
        i++;
      }
      if(str_helper != ""){
        tokens ~= Token(TokenType.String, str_helper ~ source[start .. i]);
        str_helper = "";
      }else{
        tokens ~= Token(TokenType.String, source[start .. i]);
      }

    }else if( (inside_string == false) && (source[i] == '`') ){ // unreachable: the preceding branch already matches ` here
      auto start = i++; // string begin position
      while( (i < source.length) && (source[i] != '"') ){ // advance to the string end position
        i++;
      }
      if(i < source.length){
        i++;
      }
      tokens ~= Token(TokenType.String, source[start .. i]);

    }else if( (inside_string == false) && ( (source[i] == '(') || (source[i] == ')') ) ){ // round brackets
      tokens ~= Token(TokenType.Round_Bracket, source[i].to!string);
      i++;

    }else{ // common symbols as tokens
      tokens ~= Token(TokenType.Symbol, source[i].to!string);
      i++;
    }
  }
  return tokens;
}
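
// usage sketch (illustrative only, assuming Token and TokenType are provided by
// the dopp module imported above): tokenizes one flat line with space indentation
// settings (indent_type = 0 -> spaces, indent_matter = 2 spaces per level, indent_out = 2)
unittest{
  auto tokens = tokenize("x = 5\n", 0, 2, 2);
  // expected tokens: Identifier "x", Symbol "=", Integer "5", New_Line "\n"
  assert(tokens.length == 4);
}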