dopp_lexer.d 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. import dopp;
  // Helper: reports whether `lexeme` is one of the reserved words of this
  // lexer ("dlang", "ptn", "ptns"). The comparison is exact and case-sensitive.
  export bool isKeyword(string lexeme){
      switch(lexeme){
          case "dlang", "ptn", "ptns":
              return true;
          default:
              return false;
      }
  }
  /**
   * Lexer: splits `source` into a flat array of tokens.
   *
   * Indentation changes at the start of a line are turned into
   * Indent_Incr / Indent_Decr tokens whose lexemes already contain the
   * curly brace, a line break and the output indentation.
   *
   * Params:
   *   source        = text to tokenize
   *   indent_type   = 0 selects the space as indent symbol, any other value
   *                   selects the tab
   *   indent_matter = how many indent symbols in `source` make one indent
   *                   level (0 is treated as 1 to avoid a division by zero)
   *   indent_out    = how many indent symbols are written per indent level
   *                   into the Indent_Incr / Indent_Decr lexemes
   *
   * Returns: the tokens in source order.
   *
   * NOTE(review): the source is scanned one `char` code unit at a time, so
   * multi-byte UTF-8 sequences are classified byte by byte - confirm this is
   * acceptable for non-ASCII identifiers.
   */
  export Token[] tokenize(string source, ubyte indent_type, ubyte indent_matter, ubyte indent_out){
      Token[] tokens;
      ubyte indent_lvl = 0; // +1 for every emitted Indent_Incr, -1 for every emitted Indent_Decr
      char indent_symbol = (indent_type != 0) ? '\t' : ' ';
      // guard: a zero divisor below would abort the program
      immutable size_t indent_width = (indent_matter == 0) ? 1 : indent_matter;

      size_t i = 0;
      while(i < source.length){
          if(source[i] == '\n'){ // new line
              i++;
              if( (i < source.length) &&
                  ( (source[i] == indent_symbol) || (indent_lvl > 0) ) ){
                  // measure the leading indentation of the line that starts here
                  auto start = i;
                  while( (i < source.length) && (source[i] == indent_symbol) ){
                      i++;
                  }
                  auto indent_symbols_count = i - start;
                  auto maybe_new_indent_lvl = indent_symbols_count / indent_width;
                  // NOTE(review): the depth changes by at most one level per line, so a
                  // line that dedents several levels closes only one block here; blank
                  // lines inside a block also count as indentation 0. When the depth is
                  // unchanged no token is emitted for the line break. Left as is because
                  // changing it alters the token stream consumers rely on - confirm.
                  if(maybe_new_indent_lvl > indent_lvl){
                      indent_lvl++;
                      string new_indent_out = indent_symbol.repeat(indent_lvl * indent_out).array.idup;
                      tokens ~= Token(TokenType.Indent_Incr, "{" ~ "\n" ~ new_indent_out);
                  }else if(maybe_new_indent_lvl < indent_lvl){
                      indent_lvl--;
                      string new_indent_out = indent_symbol.repeat(indent_lvl * indent_out).array.idup;
                      string maybe_new_line = "\n";
                      // keep "} else" on one line; "<=" so that an "else" sitting at the
                      // very end of the source is recognised as well
                      if( (i + 4) <= source.length && source[i .. i + 4] == "else" ){ // maybe todo add token else
                          maybe_new_line = "";
                      }
                      tokens ~= Token(TokenType.Indent_Decr, "\n" ~ new_indent_out ~ "}" ~ maybe_new_line);
                  }
              }else{ // unindented line at depth 0 (or line break at end of source)
                  tokens ~= Token(TokenType.New_Line, "\n");
              }
          }else if(source[i].isWhite){ // skip whitespaces
              i++;
          }else if(source[i].isAlpha || (source[i] == '_') ){ // identifier or keyword: letter or underscore first
              auto start = i;
              while( (i < source.length) && (source[i].isAlphaNum || (source[i] == '_') ) ){
                  i++;
              }
              string lexeme = source[start .. i];
              tokens ~= Token(lexeme.isKeyword ? TokenType.Keyword : TokenType.Identifier, lexeme);
          }else if(source[i].isDigit){ // number
              auto start = i;
              while( (i < source.length) && (source[i].isDigit || (source[i] == '_') ) ){ // underscore can be inside number like 5_000 etc
                  i++;
              }
              if( (i < source.length) && (source[i] == '.') ){ // include dot for float
                  i++;
                  while( (i < source.length) && source[i].isDigit){
                      i++;
                  }
                  tokens ~= Token(TokenType.Float, source[start .. i]);
              }else{
                  tokens ~= Token(TokenType.Integer, source[start .. i]);
              }
          }else if( (source[i] == '/') && ( (i + 1) < source.length ) && (source[i + 1] == '/') ){ // single line comment "//" begins
              auto start = i;
              // the comment runs up to, but not including, the line break
              while( (i < source.length) && (source[i] != '\n') ){
                  i++;
              }
              tokens ~= Token(TokenType.Comment_Line, source[start .. i]);
          }else if(source[i] == '"'){ // Type string ".." begins
              auto start = i++; // string begin position
              while( (i < source.length) && (source[i] != '"') ){ // goto Type string end position
                  if( (source[i] == '\\') && ( (i + 1) < source.length ) ){
                      // a backslash escapes the next symbol whatever it is, so both \"
                      // and \\ are stepped over and "\\" ends at its real close quote
                      i += 2;
                  }else{ // string not ends yet
                      i++;
                  }
              }
              if(i < source.length){ // we count close quote
                  i++;
              }
              tokens ~= Token(TokenType.String, source[start .. i]);
          }else if(source[i] == '`'){ // Type string `..` begins
              auto start = i++; // string begin position
              string str_helper = ""; // already rewritten part of the literal, if any
              bool closed = false;
              while( (i < source.length) && !closed ){ // goto Type string end position
                  if(source[i] != '`'){ // string not ends yet
                      i++;
                  }else if( ( (i + 1) < source.length ) && (source[i + 1] != ';') ){ // todo check for '\n' next line for new version lexer-compiler for syntax without ; in lines-commands ends
                      // ` cannot be escaped in `..` string: a backtick that is not followed
                      // by ';' is taken as part of the text, so the literal is split and
                      // the backtick is spliced in as a ".." string
                      str_helper ~= source[start .. i] ~ "`" ~ `~ "` ~ "`" ~ `" ~ ` ~ "`"; // ` ~ "`" ~ ` -> ` after dlang compiling
                      i++;
                      start = i;
                  }else{ // close quote `
                      closed = true;
                  }
              }
              if(i < source.length){ // we count close quote
                  i++;
              }
              if(str_helper != ""){
                  tokens ~= Token(TokenType.String, str_helper ~ source[start .. i]);
              }else{
                  tokens ~= Token(TokenType.String, source[start .. i]);
              }
          }else if( (source[i] == '(') || (source[i] == ')') ){ // round brackets
              tokens ~= Token(TokenType.Round_Bracket, source[i].to!string);
              i++;
          }else{ // common symbols as tokens
              tokens ~= Token(TokenType.Symbol, source[i].to!string);
              i++;
          }
      }
      return tokens;
  }