1 module tokenizer;
2 private import std.array;
3 private import std.string;
4 private import std.conv;
5 
/**
 * Splits an expression string into a list of string tokens.
 *
 * Scanning rules implemented by `tokenize`:
 *  - whitespace separates tokens, except inside quotes, parentheses or
 *    square brackets, where it is kept as part of the current token;
 *  - `#` outside a quoted string starts a comment that runs to end of line;
 *  - a balanced `( ... )` or `[ ... ]` group is kept as one token;
 *  - every character found in the operator set becomes a one-character
 *    token, and `normalizeTokens` later regroups operators that belong
 *    together.
 */
class Tokenizer {
public:
  /// The raw text scanned by `tokenize`.
  string expression;

  /**
   * Creates a tokenizer over `s`.
   *
   * Params:
   *   s = the expression text to tokenize
   */
  this(string s) {
    this.expression = s;
    this.ops = baseOps;
  }

  /**
   * Tokenizes `expression`.
   *
   * Params:
   *   incdot = when true, '.' is treated as an operator character for this
   *            call (both while scanning and while normalizing)
   *
   * Returns: the token list after it has been passed through
   *          `normalizeTokens`.
   *
   * Throws: Exception (from `normalizeTokens`) on an operator sequence that
   *         cannot be grouped.
   */
  string[] tokenize(bool incdot=false) {
    // Rebuild the operator set on every call. Appending "." to the member
    // would make a single tokenize(true) affect every later call.
    this.ops = incdot ? (baseOps ~ ".") : baseOps;

    string token = "", previous = "";
    string[] tokens = [];
    bool in_quotes = false;
    bool in_paren = false;
    bool in_sparen = false;
    int parenCount = 0;
    int sparenCount = 0;
    bool isFunctionCall = false;
    bool inComments = false;

    // Appends the token being built to the result and remembers it as the
    // most recently emitted token.
    void pushToken() {
      tokens ~= token;
      previous = token;
      token = "";
    }

    for (size_t i = 0; i < this.expression.length; i++) {
      char element = this.expression[i];
      if (inComments) {
        if (element != '\n')
          continue;
        inComments = false;
        // Not skipped: the newline that ends a comment must still separate
        // the token before the comment from the one on the next line.
      }
      switch (element) {
      case '#':
        if (in_quotes) {
          token ~= element;
          continue;
        }
        inComments = true;
        break;
      case ' ','\n','\r','\t':
        if (in_quotes || in_paren || in_sparen) {
          token ~= element;
          continue;
        }
        if (token != "") {
          pushToken();
        }
        break;
      case '"':
        // A closing quote does not end the token by itself; the token keeps
        // growing until whitespace, an operator or a closing bracket.
        in_quotes = !in_quotes;
        token ~= element;
        break;
      case '[':
        if (in_quotes || in_paren) {
          // Plain character of the enclosing string / parenthesised group.
          token ~= element;
          break;
        }
        in_sparen = true;
        sparenCount++;
        token ~= element;
        break;
      case ']':
        if (in_quotes || in_paren) {
          token ~= element;
          break;
        }
        sparenCount--;
        token ~= element;
        if (sparenCount > 0) {
          // Still inside an outer '[' ... ']'.
          continue;
        }
        // Clamp so that a stray ']' cannot leave the depth negative.
        sparenCount = 0;
        in_sparen = false;
        pushToken();
        break;
      case '(':
        if (in_quotes || in_sparen) {
          // Plain character of the enclosing string / bracketed group.
          token ~= element;
          break;
        }
        in_paren = true;
        // A '(' that starts a fresh token right after an emitted token that
        // is neither an operator nor one of these keywords is recorded as a
        // call; its group is glued onto that previous token when it closes.
        if (previous != "" && indexOf(ops, previous) < 0 && token == ""
            && !isFound(["if", "for", "def", "while"], previous)) {
          isFunctionCall = true;
        }
        parenCount++;
        token ~= element;
        break;
      case ')':
        if (in_quotes || in_sparen) {
          token ~= element;
          break;
        }
        parenCount--;
        token ~= element;
        if (parenCount != 0) {
          continue;
        }
        in_paren = false;
        if (isFunctionCall) {
          tokens[$-1] ~= token;
          token = "";
          isFunctionCall = false;
          continue;
        }
        // Keep "(...)" and a directly following ".member" in one token
        // unless '.' is being treated as an operator.
        if (i+1 < this.expression.length && this.expression[i+1] == '.' && !incdot) {
          continue;
        }
        pushToken();
        break;
      default:
        if (in_quotes || in_paren || in_sparen) {
          token ~= element;
          continue;
        }
        // indexOf returns -1 when absent; index 0 is a valid hit (the first
        // operator character), so the test must be >= 0.
        if (indexOf(ops, element) >= 0) {
          if (token != "") {
            pushToken();
          }
          token ~= element;
          pushToken();
          continue;
        }
        token ~= element;
      }
    }
    if (token != "") {
      pushToken();
    }

    return normalizeTokens(tokens);
  }

private:
  /// Operator characters that are always active.
  enum string baseOps = "`!%^&*+-={}|',/<>;";

  /// Operator characters in effect for the current/last `tokenize` call.
  string ops;

  /*
    This function is used to group the tokens that are supposed to be together.

    - Two consecutive operator tokens forming one of >= && || <= == != are
      merged into a single token.
    - A '!' is always glued onto the operand that follows it; a '+' or '-' is
      glued onto the following operand when it is the first output token or
      when the previous output token is listed by inOperatorList.
    - A '+', '-' or '!' that follows '=', '>', '<', ',' (or, for '+'/'-', a
      comparison operator; for '!', '&&' / '||') is glued onto the next token.

    Throws an Exception when two operators in a row match none of the known
    combinations, or when a sign operator has no token after it.

    NOTE(review): an operator that is still pending when the token list ends
    (e.g. a trailing ';') is not appended to the result. Confirm whether
    callers rely on that before changing it.
   */
  string[] normalizeTokens(string[] tokens) {
    string[] norm;
    string current = "";   // single operator waiting for its successor
    string limit = "";     // last merged two-character operator, if any
    for (size_t i = 0; i < tokens.length; i++) {
      string element = tokens[i];
      if (indexOf(ops, element) < 0) {
        // Operand: decide what happens to the pending operator.
        if (current != "") {
          if ((current == "!") ||
              ((current == "-" || current == "+") &&
               (norm.length == 0 || inOperatorList(norm[$-1])))) {
            element = current ~ element;
          } else {
            norm ~= current;
          }
          current = "";
        }
        limit = "";
        norm ~= element;
        continue;
      }

      // Sign / negation directly after a merged two-character operator.
      switch (limit ~ element) {
      case ">=-","<=-", "==-", "!=-", ">=+","<=+", "==+", "!=+", "&&!", "||!":
        if (i+1 >= tokens.length) {
          throw new Exception("Tokenizer::ERROR : unexpected operator '"
                              ~ element ~ "' after '" ~ limit ~"'");
        }
        tokens[i+1] = element ~ tokens[i+1];
        continue;
      default: break;
      }

      switch (current ~ element) {
      case ">=", "&&", "||", "<=", "==", "!=":
        limit = (current~element);
        norm ~= limit;
        current = "";
        break;
      case "=+", "=!", "=-", ">+",">-", "<-", "<+", ",-", ",+":
        norm ~= current;
        limit = "";
        if (i+1 >= tokens.length) {
          throw new Exception("Tokenizer::ERROR : unexpected operator '"
                              ~ element ~ "' after '" ~ current ~"'");
        }
        tokens[i+1] = element ~ tokens[i+1];
        current = "";
        break;
      default:
        if (current != "") {
          throw new Exception("Tokenizer::ERROR : unexpected operator '"
                              ~ element ~ "' after '" ~ current ~"'");
        }
        current = element;
      }
    }
    return norm;
  }

}
213 
/**
 * Linear membership test.
 *
 * Params:
 *   list = the strings to search
 *   en   = the string to look for
 *
 * Returns: true when some entry of `list` compares equal to `en`,
 *          false otherwise (including for an empty list).
 */
bool isFound (string[] list, string en) {
  for (size_t idx = 0; idx < list.length; ++idx) {
    if (list[idx] == en) {
      return true;
    }
  }
  return false;
}
221 
/**
 * Tells whether `token` is exactly one of the binary operators
 * + - * / == != >= <= && || % > < =
 *
 * Params:
 *   token = the token text to classify
 *
 * Returns: true for a listed operator, false for anything else.
 */
bool inOperatorList(string token) {
  // TODO : improve the list
  switch (token) {
  case "+", "-", "*", "/", "==", "!=", ">=", "<=", "&&", "||", "%", ">", "<", "=":
    return true;
  default:
    return false;
  }
}