1 | <?php |
---|
2 | |
---|
/**
 * Character-level tokenizer for JavaScript source text.
 *
 * Tokens are plain associative arrays with the keys 'type', 'value',
 * 'line_number', 'char_pos' and 'line' (see new_token()). The token types
 * produced by tokenize() are 'operator', 'string', 'comment', 'regex',
 * 'number' and 'name'. A token that was directly followed by a space or a
 * tab additionally carries 'terminated' => true.
 */
class JavaScriptLanguage {
    /**
     * Builds a single token record.
     *
     * @param string $type   Token type, e.g. 'operator' or 'name'.
     * @param string $value  Token text.
     * @param array  $extras Must contain 'exploded_lines' (the source split
     *                       on "\n"), 'line_number' (1-based) and 'char_pos'.
     *
     * @return array Token with keys 'type', 'value', 'line_number',
     *               'char_pos' and 'line' (full text of the token's line).
     */
    public static function new_token($type, $value, $extras) {
        $line_number = $extras['line_number'];

        return array(
            'type' => $type,
            'value' => $value,
            'line_number' => $line_number,
            'char_pos' => $extras['char_pos'],
            'line' => $extras['exploded_lines'][$line_number - 1],
        );
    }

    /**
     * Splits JavaScript source into a flat list of tokens.
     *
     * Carriage returns are removed first. Spaces and tabs produce no token
     * but mark the preceding token as terminated, so that e.g. "+ +" is not
     * merged into "++". A line break is emitted as an 'operator' token whose
     * value is "\n".
     *
     * @param string $lines Complete source text.
     *
     * @return array List of tokens as built by new_token(). 'line_number'
     *               and 'char_pos' are 1-based and stored as numeric strings.
     */
    public static function tokenize($lines) {
        $lines = str_replace("\r", '', $lines);
        $length = strlen($lines);
        $tokens = array();

        $exploded_lines = explode("\n", $lines);
        $line_number = 1;
        $char_pos = 0;
        // Names of the local variables that compact() bundles for new_token().
        $extras = array('exploded_lines', 'line_number', 'char_pos');

        // Record "line|column" for every offset up front: the string, comment
        // and regex scanners below move $i across line breaks, so the position
        // cannot be tracked incrementally in the main loop.
        $positions = array();
        for ($i = 0; $i < $length; $i++) {
            $positions[$i] = $line_number . '|' . ++$char_pos;

            if ($lines[$i] === "\n") {
                ++$line_number;
                $char_pos = 0;
            }
        }

        for ($i = 0; $i < $length; $i++) {
            $char = $lines[$i];

            list($line_number, $char_pos) = explode('|', $positions[$i], 2);

            // $pop aliases the most recent token so it can be extended in place.
            if (count($tokens)) {
                $pop = &$tokens[count($tokens) - 1];
            } else {
                $pop = array('type' => '', 'value' => '');
            }

            if ($char == ' ' || $char == "\t") {
                $pop['terminated'] = true;
                continue;
            }

            // True when no whitespace separates this character from the
            // previous token. Tokens only get a 'terminated' key once
            // whitespace follows them, hence empty() rather than a direct read.
            $adjacent = empty($pop['terminated']);

            switch ($char) {
                case '-':
                case '+':
                case '<':
                case '>':
                case '=':
                case '.':
                case '|':
                case '&':
                case ':':
                    if ($adjacent && $pop['type'] == 'operator') {
                        $previous = $pop['value'];

                        // "--", "++", "<<", "||", "&&"
                        if ($previous === $char && in_array($char, array('-', '+', '<', '|', '&'), true)) {
                            $pop['value'] .= $char;
                            break;
                        }
                        // ">>", ">>>"
                        if ($char == '>' && ($previous === '>' || $previous === '>>')) {
                            $pop['value'] .= $char;
                            break;
                        }
                        // Assignment and comparison forms such as "+=", "==", "!==", ">>>="
                        if ($char == '=' && in_array($previous, array('+', '-', '*', '/', '%', '<<', '>>', '>>>', '&', '^', '|', '!', '!=', '=', '==', '>', '<'), true)) {
                            $pop['value'] .= $char;
                            break;
                        }
                        if ($char == ':' && $previous === '?') {
                            $pop['value'] = '?:';
                            break;
                        }
                    } elseif ($char == '.' && $adjacent && $pop['type'] == 'number') {
                        // Decimal point inside a number
                        $pop['value'] .= $char;
                        break;
                    }
                    // Nothing to merge with: intentionally fall through and
                    // emit the character as a stand-alone operator.
                case "\n":
                case ';':
                case ',':
                case '(':
                case ')':
                case '[':
                case ']':
                case '{':
                case '}':
                case '!':
                case '?':
                case '*':
                case '%':
                case '~':
                case '^':
                    $tokens[] = self::new_token('operator', $char, compact($extras));
                    break;
                case "'":
                case '"':
                    // Collect everything up to the matching unescaped quote
                    // (or the end of the input).
                    $string = '';
                    $last = '';
                    for (++$i; $i < $length; $i++) {
                        $letter = $lines[$i];
                        if ($last == '\\') {
                            $string .= $letter;
                            if ($letter == '\\') {
                                // Double backslash: the second one must not
                                // escape the character after it.
                                $last = '';
                                continue;
                            }
                        } elseif ($letter != $char) {
                            $string .= $letter;
                        } else {
                            break;
                        }
                        $last = $letter;
                    }
                    // Escaped occurrences of the delimiter are stored unescaped.
                    $tokens[] = self::new_token('string', str_replace('\\' . $char, $char, $string), compact($extras));
                    break;
                case '/':
                    // Single-line comment, multi-line comment, regular expression, or just a division sign.
                    // Regex vs. division depends on the last token that is
                    // not a comment.
                    $last_expression = array('type' => '', 'value' => '');
                    for ($j = count($tokens) - 1; $j >= 0; $j--) {
                        if ($tokens[$j]['type'] != 'comment') {
                            $last_expression = $tokens[$j];
                            break;
                        }
                    }

                    $content = $char;
                    $instruction = null;
                    $single = false;
                    $multi = false;
                    $escaped = false;
                    $last = '';
                    for ($j = $i + 1; $j <= $length; $j++) {
                        // A virtual newline at the end of the input closes
                        // single-line comments and unterminated expressions.
                        $letter = ($j == $length) ? "\n" : $lines[$j];
                        if ($single) {
                            if ($letter == "\n") {
                                $instruction = 'comment';
                                // Leave the newline for the main loop
                                $i = $j - 1;
                                break;
                            }
                        } elseif ($multi) {
                            // The length check keeps "/*/" from closing the comment
                            if (strlen($content) > 2 && $letter == '/' && $last == '*') {
                                $content .= $letter;
                                $instruction = 'comment';
                                $i = $j;
                                break;
                            }
                        } elseif ($last === '' && $letter == '/') {
                            $single = true;
                        } elseif ($last === '' && $letter == '*') {
                            $multi = true;
                        } elseif ($escaped) {
                            $escaped = false;
                        } elseif ($letter == '\\') {
                            $escaped = true;
                        } elseif ($last === '') {
                            // If it's not a comment, it might be a regex
                            // which can only occur after certain operators
                            if (in_array($last_expression['value'], array(')', ']'), true) || ($last_expression['type'] != 'operator' && $last_expression['value'] != 'return')) {
                                $content = '/';
                                $instruction = 'operator';
                                break;
                            }
                        } elseif ($letter == '/') {
                            // When the regex ends, we need to look for modifiers
                            $content .= $letter;
                            $instruction = 'regex';
                            $i = $j;
                            for ($k = $j + 1; $k < $length; $k++) {
                                $modifier = $lines[$k];
                                if ($modifier == ' ' || $modifier == "\t") {
                                    continue;
                                }
                                if (!in_array($modifier, array('i', 'm', 'g'), true)) {
                                    break;
                                }
                                $i = $k;
                                $content .= $modifier;
                            }
                            break;
                        } elseif ($letter == "\n") {
                            // End of the expression with no regex terminator
                            // So it's a division sign
                            $content = '/';
                            $instruction = 'operator';
                            break;
                        }

                        $content .= $letter;
                        $last = $letter;
                    }

                    if ($instruction === null) {
                        // The input ended before the construct was classified.
                        if ($multi) {
                            // Unterminated block comment: it runs to the end
                            // of the input.
                            $content = substr($lines, $i);
                            $instruction = 'comment';
                            $i = $length - 1;
                        } else {
                            // No regex terminator was found, so this is a
                            // division sign, as in the newline case above.
                            $content = '/';
                            $instruction = 'operator';
                        }
                    }

                    $tokens[] = self::new_token($instruction, $content, compact($extras));
                    break;
                default:
                    // A character only extends a name when it touches it;
                    // "return 1" must not turn the digit into a name.
                    $continues_name = $adjacent && $pop['type'] == 'name';

                    if (is_numeric($char) && !$continues_name) {
                        if ($pop['value'] == '.' || $pop['type'] == 'number') {
                            // ".5" promotes the leading dot to a number
                            $pop['type'] = 'number';
                            $pop['value'] .= $char;
                        } else {
                            $tokens[] = self::new_token('number', $char, compact($extras));
                        }
                    } elseif ($continues_name) {
                        $pop['value'] .= $char;
                    } elseif ($pop['type'] == 'number' && strtolower(substr($pop['value'], 0, 2)) == '0x' && stripos('1234567890abcdef', $char) !== false) {
                        // Hex digit
                        $pop['value'] .= $char;
                    } elseif (strtolower($char) == 'x' && $pop['value'] == '0') {
                        // Hex prefix
                        $pop['value'] .= $char;
                    } elseif (strtolower($char) == 'e' && $pop['type'] == 'number') {
                        // e-notation
                        $pop['value'] .= $char;
                    } else {
                        $tokens[] = self::new_token('name', $char, compact($extras));
                    }
            }
        }

        // Drop the alias so the returned array holds no live reference.
        unset($pop);

        return $tokens;
    }
}
---|