module tokenizer;

private import std.array;
private import std.string;
private import std.conv;

/**
 * Splits an expression string into tokens.
 *
 * Quoted strings, parenthesised groups and square-bracket groups are kept
 * together as single tokens; every character of the operator set becomes a
 * token of its own, and `normalizeTokens` then regroups multi-character
 * operators and unary signs.
 */
class Tokenizer {
public:
    /// The raw text to tokenize.
    string expression;

    /**
     * Params:
     *   s = expression text to tokenize
     */
    this(string s) {
        this.expression = s;
        this.ops = "`!%^&*+-={}|',/<>;";
    }

    /**
     * Tokenizes `expression`.
     *
     * Params:
     *   incdot = when true, '.' is treated as an operator for this call only
     *            (the instance's operator set is not modified).
     *
     * Returns: the normalized token list.
     * Throws: Exception (from normalizeTokens) on an unexpected operator sequence.
     */
    string[] tokenize(bool incdot=false) {
        // Build the operator set per call. Appending to this.ops would leak
        // the '.' into every later call and grow the string on each call.
        string activeOps = incdot ? (this.ops ~ ".") : this.ops;

        string token = "", previous = "";
        string[] tokens = [];
        bool in_quotes = false;
        bool in_paren = false;
        bool in_sparen = false;
        int parenCount = 0;
        int sparenCount = 0;
        bool isFunctionCall = false;
        bool inComments = false;

        // Pushes the pending token (if any) and remembers it as `previous`.
        void flush() {
            if (token != "") {
                tokens ~= token;
                previous = token;
                token = "";
            }
        }

        for (size_t i = 0; i < this.expression.length; i++) {
            char element = this.expression[i];

            // A comment runs to the end of the line.
            if (inComments) {
                if (element == '\n')
                    inComments = false;
                continue;
            }

            switch (element) {
                case '#':
                    if (in_quotes) {
                        token ~= element;
                        continue;
                    }
                    // The comment also consumes its terminating newline, so
                    // end the pending token here; otherwise "5# c\nb" would
                    // yield the single token "5b".
                    if (!in_paren && !in_sparen) {
                        flush();
                    }
                    inComments = true;
                    break;

                case ' ', '\n', '\r', '\t':
                    // Whitespace is preserved inside strings and groups.
                    if (in_quotes || in_paren || in_sparen) {
                        token ~= element;
                        continue;
                    }
                    flush();
                    break;

                case '"':
                    // The quote character stays part of the token; the token
                    // itself ends at the next whitespace or operator.
                    in_quotes = !in_quotes;
                    token ~= element;
                    break;

                case '[':
                    if (in_quotes || in_paren) {
                        // Literal character inside a string or a (...) group.
                        token ~= element;
                        break;
                    }
                    in_sparen = true;
                    sparenCount++;
                    token ~= element;
                    break;

                case ']':
                    if (in_quotes || in_paren) {
                        token ~= element;
                        break;
                    }
                    if (sparenCount > 0) {
                        sparenCount--;
                    }
                    token ~= element;
                    // Only the outermost ']' closes the group.
                    if (sparenCount != 0) {
                        continue;
                    }
                    in_sparen = false;
                    flush();
                    break;

                case '(':
                    if (in_quotes || in_sparen) {
                        // Literal character inside a string or a [...] group.
                        token ~= element;
                        break;
                    }
                    in_paren = true;
                    // "name (args)": the name was already pushed as a token,
                    // so the argument list is glued back onto it when the
                    // group closes. Keywords and operators are excluded.
                    if (tokens.length > 0 && token == ""
                            && indexOf(activeOps, previous) < 0
                            && !isFound(["if", "for", "def", "while"], previous)) {
                        isFunctionCall = true;
                    }
                    parenCount++;
                    token ~= element;
                    break;

                case ')':
                    if (in_quotes || in_sparen) {
                        token ~= element;
                        break;
                    }
                    if (parenCount > 0) {
                        parenCount--;
                    }
                    token ~= element;
                    // Only the outermost ')' closes the group.
                    if (parenCount != 0) {
                        continue;
                    }
                    in_paren = false;
                    if (isFunctionCall) {
                        tokens[$-1] ~= token;
                        token = "";
                        isFunctionCall = false;
                        continue;
                    }
                    // Keep "(...).member" together unless '.' is an operator.
                    if (i+1 < this.expression.length && this.expression[i+1] == '.' && !incdot) {
                        continue;
                    }
                    flush();
                    break;

                default:
                    if (in_quotes || in_paren || in_sparen) {
                        token ~= element;
                        continue;
                    }
                    // ">= 0": index 0 (the first operator) is a valid match.
                    if (indexOf(activeOps, element) >= 0) {
                        flush();
                        token ~= element;
                        flush();
                        continue;
                    }
                    token ~= element;
                    break;
            }
        }
        flush();

        return normalizeTokens(tokens, activeOps);
    }

private:
    /// Characters that are split off as single-character operator tokens.
    string ops;

    /*
       Groups the tokens that are supposed to be together, using the
       instance's default operator set.
    */
    string[] normalizeTokens(string[] tokens) {
        return normalizeTokens(tokens, this.ops);
    }

    /*
       Groups the tokens that are supposed to be together.

       - Two-character operators (">=", "<=", "==", "!=", "&&", "||") are
         rebuilt from their single-character tokens.
       - A "!" and a "+" or "-" that follows an operator (or starts the list)
         is merged into the operand that follows it.

       A token counts as an operator when it occurs inside `opset`.
       Entries of `tokens` may be rewritten in place.

       NOTE(review): an operator still pending when the input ends is not
       emitted (e.g. a trailing ";"). This matches the previous behaviour;
       confirm callers rely on it before changing.

       Throws: Exception on an operator sequence that cannot be grouped.
    */
    string[] normalizeTokens(string[] tokens, string opset) {
        string[] norm;
        string current = "";  // single operator waiting to be paired or emitted
        string limit = "";    // last two-character operator emitted, if any

        for (size_t i = 0; i < tokens.length; i++) {
            string element = tokens[i];

            if (indexOf(opset, element) < 0) {
                // Operand: resolve the pending operator first.
                if (current != "") {
                    if ((current == "!") ||
                        ((current == "-" || current == "+") &&
                         (norm.length == 0 || inOperatorList(norm[$-1])))) {
                        // Unary operator: attach it to the operand.
                        element = current ~ element;
                    } else {
                        norm ~= current;
                    }
                    current = "";
                }
                limit = "";
                norm ~= element;
                continue;
            }

            // Sign or negation directly after a two-character operator:
            // push it onto the next token.
            switch (limit ~ element) {
                case ">=-", "<=-", "==-", "!=-", ">=+", "<=+", "==+", "!=+", "&&!", "||!":
                    if (i+1 >= tokens.length) {
                        throw new Exception("Tokenizer::ERROR : unexpected operator '"
                            ~ element ~ "' after '" ~ limit ~"'");
                    }
                    tokens[i+1] = element ~ tokens[i+1];
                    continue;
                default: break;
            }

            switch (current ~ element) {
                case ">=", "&&", "||", "<=", "==", "!=":
                    limit = (current ~ element);
                    norm ~= limit;
                    current = "";
                    break;
                case "=+", "=!", "=-", ">+", ">-", "<-", "<+", ",-", ",+":
                    // Binary operator followed by a unary one: emit the first,
                    // push the second onto the next token.
                    norm ~= current;
                    limit = "";
                    if (i+1 >= tokens.length) {
                        throw new Exception("Tokenizer::ERROR : unexpected operator '"
                            ~ element ~ "' after '" ~ current ~"'");
                    }
                    tokens[i+1] = element ~ tokens[i+1];
                    current = "";
                    break;
                default:
                    if (current != "") {
                        throw new Exception("Tokenizer::ERROR : unexpected operator '"
                            ~ element ~ "' after '" ~ current ~"'");
                    }
                    current = element;
                    break;
            }
        }
        return norm;
    }

}

/**
 * Returns: true when `en` is equal to one of the entries of `list`.
 */
bool isFound (string[] list, string en) {
    foreach (each; list) {
        if (en == each)
            return true;
    }
    return false;
}

/**
 * Returns: true when `token` is one of the known binary/assignment operators.
 */
bool inOperatorList(string token) {
    // TODO : improve the list
    string[] oplist = ["+","-","*","/","==","!=",">=","<=","&&","||","%",">","<","="];
    return isFound(oplist, token);
}