source: other-projects/rsyntax-textarea/src/java/org/fife/ui/rsyntaxtextarea/modes/UnixShellTokenMaker.java@ 25584

Last change on this file since 25584 was 25584, checked in by davidb, 12 years ago

Initial cut at a text edit area for GLI that supports color syntax highlighting

File size: 38.5 KB
1/*
2 * 03/16/2004
3 *
4 * UnixShellTokenMaker.java - Scanner for UNIX shell scripts.
5 *
6 * This library is distributed under a modified BSD license. See the included
7 * RSyntaxTextArea.License.txt file for details.
8 */
9package org.fife.ui.rsyntaxtextarea.modes;
10
11import javax.swing.text.Segment;
12
13import org.fife.ui.rsyntaxtextarea.*;
14
15
16/**
17 * A token maker that turns text into a linked list of <code>Token</code>s
18 * for syntax highlighting UNIX shell scripts.
19 *
20 * @author Robert Futrell
21 * @version 0.1
22 */
23public class UnixShellTokenMaker extends AbstractTokenMaker {
24
25 protected final String operators = "=|><&";
26 protected final String separators = "()[]";
27 protected final String separators2 = ".,;"; // Characters you don't want syntax highlighted but separate identifiers.
28 protected final String shellVariables = "#-?$!*@_"; // Characters that are part of "$<char>" shell variables; e.g., "$_".
29
30
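// Scanner state shared with getTokenList() below: the offset at which the token
// currently being built starts, and the token type assigned to it so far.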
31 private int currentTokenStart;
32 private int currentTokenType;
33
34
35 /**
36 * Constructor.
37 */
38 public UnixShellTokenMaker() {
39 super(); // Initializes tokensToHighlight.
40 }
41
42
43 /**
44 * Checks the token to give it the exact ID it deserves before
45 * being passed up to the super method.
46 *
47 * @param segment <code>Segment</code> to get text from.
48 * @param start Start offset in <code>segment</code> of token.
49 * @param end End offset in <code>segment</code> of token.
50 * @param tokenType The token's type.
51 * @param startOffset The offset in the document at which the token occurs.
52 */
53 public void addToken(Segment segment, int start, int end, int tokenType, int startOffset) {
54
55 switch (tokenType) {
56 // Since reserved words, functions, and data types are all passed into here
57 // as "identifiers," we have to see what the token really is...
58 case Token.IDENTIFIER:
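// Keywords and command names from the TokenMap are promoted here; e.g. "while"
// becomes a RESERVED_WORD token and "grep" a FUNCTION token.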
59 int value = wordsToHighlight.get(segment, start,end);
60 if (value!=-1)
61 tokenType = value;
62 break;
63 case Token.WHITESPACE:
64 case Token.SEPARATOR:
65 case Token.OPERATOR:
66 case Token.LITERAL_NUMBER_DECIMAL_INT:
67 case Token.LITERAL_STRING_DOUBLE_QUOTE:
68 case Token.LITERAL_CHAR:
69 case Token.LITERAL_BACKQUOTE:
70 case Token.COMMENT_EOL:
71 case Token.PREPROCESSOR:
72 case Token.VARIABLE:
73 break;
74
75 default:
76 new Exception("Unknown tokenType: '" + tokenType + "'").
77 printStackTrace();
78 tokenType = Token.IDENTIFIER;
79 break;
80
81 }
82
83 super.addToken(segment, start, end, tokenType, startOffset);
84
85 }
86
87
88 /**
89 * Returns the text to place at the beginning and end of a
92 * line to "comment" it out in this programming language.
91 *
92 * @return The start and end strings to add to a line to "comment"
93 * it out.
94 */
95 public String[] getLineCommentStartAndEnd() {
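// Shell comments run to the end of the line, so only a start marker exists; the
// null second element means there is no closing comment string.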
96 return new String[] { "#", null };
97 }
98
99
100 /**
101 * Returns whether tokens of the specified type should have "mark
102 * occurrences" enabled for the current programming language.
103 *
104 * @param type The token type.
105 * @return Whether tokens of this type should have "mark occurrences"
106 * enabled.
107 */
108 public boolean getMarkOccurrencesOfTokenType(int type) {
109 return type==Token.IDENTIFIER || type==Token.VARIABLE;
110 }
111
112
113 /**
114 * Returns the words to highlight for UNIX shell scripts.
115 *
116 * @return A <code>TokenMap</code> containing the words to highlight for
117 * UNIX shell scripts.
118 * @see org.fife.ui.rsyntaxtextarea.AbstractTokenMaker#getWordsToHighlight
119 */
120 public TokenMap getWordsToHighlight() {
121
122 TokenMap tokenMap = new TokenMap();
123
124 int reservedWord = Token.RESERVED_WORD;
125 tokenMap.put("case", reservedWord);
126 tokenMap.put("do", reservedWord);
127 tokenMap.put("done", reservedWord);
128 tokenMap.put("elif", reservedWord);
129 tokenMap.put("else", reservedWord);
130 tokenMap.put("esac", reservedWord);
131 tokenMap.put("fi", reservedWord);
132 tokenMap.put("for", reservedWord);
133 tokenMap.put("if", reservedWord);
134 tokenMap.put("in", reservedWord);
135 tokenMap.put("select", reservedWord);
136 tokenMap.put("then", reservedWord);
137 tokenMap.put("until", reservedWord);
138 tokenMap.put("while", reservedWord);
139
140 int function = Token.FUNCTION;
141 tokenMap.put("addbib", function);
142 tokenMap.put("admin", function);
143 tokenMap.put("alias", function);
144 tokenMap.put("apropos", function);
145 tokenMap.put("ar", function);
146 tokenMap.put("at", function);
147 tokenMap.put("awk", function);
148 tokenMap.put("banner", function);
149 tokenMap.put("basename", function);
150 tokenMap.put("batch", function);
151 tokenMap.put("bg", function);
152 tokenMap.put("biff", function);
153 tokenMap.put("bin-mail", function);
154 tokenMap.put("binmail", function);
155 tokenMap.put("break", function);
156 tokenMap.put("cal", function);
157 tokenMap.put("calendar", function);
158 tokenMap.put("cancel", function);
159 tokenMap.put("cat", function);
160 tokenMap.put("cb", function);
161 tokenMap.put("cc", function);
162 tokenMap.put("cd", function);
163 tokenMap.put("cdc", function);
164 tokenMap.put("chdir", function);
165 tokenMap.put("checkeq", function);
166 tokenMap.put("checknr", function);
167 tokenMap.put("chfn", function);
168 tokenMap.put("chgrp", function);
169 tokenMap.put("chmod", function);
170 tokenMap.put("chown", function);
171 tokenMap.put("chsh", function);
172 tokenMap.put("clear", function);
173 tokenMap.put("cmp", function);
174 tokenMap.put("colcrt", function);
175 tokenMap.put("comb", function);
176 tokenMap.put("comm", function);
177 tokenMap.put("command", function);
178 tokenMap.put("compress", function);
179 tokenMap.put("continue", function);
180 tokenMap.put("cp", function);
181 tokenMap.put("cpio", function);
182 tokenMap.put("cpp", function);
183 tokenMap.put("crontab", function);
184 tokenMap.put("csh", function);
185 tokenMap.put("ctags", function);
186 tokenMap.put("cut", function);
187 tokenMap.put("cvs", function);
188 tokenMap.put("date", function);
189 tokenMap.put("dbx", function);
190 tokenMap.put("delta", function);
191 tokenMap.put("deroff", function);
192 tokenMap.put("df", function);
193 tokenMap.put("diff", function);
194 tokenMap.put("dtree", function);
195 tokenMap.put("du", function);
196 tokenMap.put("e", function);
197 tokenMap.put("echo", function);
198 tokenMap.put("ed", function);
199 tokenMap.put("edit", function);
200 tokenMap.put("enscript", function);
201 tokenMap.put("eqn", function);
202 tokenMap.put("error", function);
203 tokenMap.put("eval", function);
204 tokenMap.put("ex", function);
205 tokenMap.put("exec", function);
206 tokenMap.put("exit", function);
207 tokenMap.put("expand", function);
208 tokenMap.put("export", function);
209 tokenMap.put("expr", function);
210 tokenMap.put("false", function);
211 tokenMap.put("fc", function);
212 tokenMap.put("fg", function);
213 tokenMap.put("file", function);
214 tokenMap.put("find", function);
215 tokenMap.put("finger", function);
216 tokenMap.put("fmt", function);
217 tokenMap.put("fmt_mail", function);
218 tokenMap.put("fold", function);
219 tokenMap.put("ftp", function);
220 tokenMap.put("function", function);
221 tokenMap.put("gcore", function);
222 tokenMap.put("get", function);
223 tokenMap.put("getopts", function);
224 tokenMap.put("gprof", function);
225 tokenMap.put("grep", function);
226 tokenMap.put("groups", function);
227 tokenMap.put("gunzip", function);
228 tokenMap.put("gzip", function);
229 tokenMap.put("hashcheck", function);
230 tokenMap.put("hashmake", function);
231 tokenMap.put("head", function);
232 tokenMap.put("help", function);
233 tokenMap.put("history", function);
234 tokenMap.put("imake", function);
235 tokenMap.put("indent", function);
236 tokenMap.put("install", function);
237 tokenMap.put("jobs", function);
238 tokenMap.put("join", function);
239 tokenMap.put("kill", function);
240 tokenMap.put("last", function);
241 tokenMap.put("ld", function);
242 tokenMap.put("leave", function);
243 tokenMap.put("less", function);
244 tokenMap.put("let", function);
245 tokenMap.put("lex", function);
246 tokenMap.put("lint", function);
247 tokenMap.put("ln", function);
248 tokenMap.put("login", function);
249 tokenMap.put("look", function);
250 tokenMap.put("lookbib", function);
251 tokenMap.put("lorder", function);
252 tokenMap.put("lp", function);
253 tokenMap.put("lpq", function);
254 tokenMap.put("lpr", function);
255 tokenMap.put("lprm", function);
256 tokenMap.put("ls", function);
257 tokenMap.put("mail", function);
258 tokenMap.put("Mail", function);
259 tokenMap.put("make", function);
260 tokenMap.put("man", function);
261 tokenMap.put("md", function);
262 tokenMap.put("mesg", function);
263 tokenMap.put("mkdir", function);
264 tokenMap.put("mkstr", function);
265 tokenMap.put("more", function);
266 tokenMap.put("mount", function);
267 tokenMap.put("mv", function);
268 tokenMap.put("nawk", function);
269 tokenMap.put("neqn", function);
270 tokenMap.put("nice", function);
271 tokenMap.put("nm", function);
272 tokenMap.put("nroff", function);
273 tokenMap.put("od", function);
274 tokenMap.put("page", function);
275 tokenMap.put("passwd", function);
276 tokenMap.put("paste", function);
277 tokenMap.put("pr", function);
278 tokenMap.put("print", function);
279 tokenMap.put("printf", function);
280 tokenMap.put("printenv", function);
281 tokenMap.put("prof", function);
282 tokenMap.put("prs", function);
283 tokenMap.put("prt", function);
284 tokenMap.put("ps", function);
285 tokenMap.put("ptx", function);
286 tokenMap.put("pwd", function);
287 tokenMap.put("quota", function);
288 tokenMap.put("ranlib", function);
289 tokenMap.put("rcp", function);
290 tokenMap.put("rcs", function);
291 tokenMap.put("rcsdiff", function);
292 tokenMap.put("read", function);
293 tokenMap.put("readonly", function);
294 tokenMap.put("red", function);
295 tokenMap.put("return", function);
296 tokenMap.put("rev", function);
297 tokenMap.put("rlogin", function);
298 tokenMap.put("rm", function);
299 tokenMap.put("rmdel", function);
300 tokenMap.put("rmdir", function);
301 tokenMap.put("roffbib", function);
302 tokenMap.put("rsh", function);
303 tokenMap.put("rup", function);
304 tokenMap.put("ruptime", function);
305 tokenMap.put("rusers", function);
306 tokenMap.put("rwall", function);
307 tokenMap.put("rwho", function);
308 tokenMap.put("sact", function);
309 tokenMap.put("sccs", function);
310 tokenMap.put("sccsdiff", function);
311 tokenMap.put("script", function);
312 tokenMap.put("sed", function);
313 tokenMap.put("set", function);
314 tokenMap.put("setgroups", function);
315 tokenMap.put("setsenv", function);
316 tokenMap.put("sh", function);
317 tokenMap.put("shift", function);
318 tokenMap.put("size", function);
319 tokenMap.put("sleep", function);
320 tokenMap.put("sort", function);
321 tokenMap.put("sortbib", function);
322 tokenMap.put("spell", function);
323 tokenMap.put("split", function);
324 tokenMap.put("ssh", function);
325 tokenMap.put("strings", function);
326 tokenMap.put("strip", function);
327 tokenMap.put("stty", function);
328 tokenMap.put("su", function);
329 tokenMap.put("sudo", function);
330 tokenMap.put("symorder", function);
331 tokenMap.put("tabs", function);
332 tokenMap.put("tail", function);
333 tokenMap.put("talk", function);
334 tokenMap.put("tar", function);
335 tokenMap.put("tbl", function);
336 tokenMap.put("tee", function);
337 tokenMap.put("telnet", function);
338 tokenMap.put("test", function);
339 tokenMap.put("tftp", function);
340 tokenMap.put("time", function);
341 tokenMap.put("times", function);
342 tokenMap.put("touch", function);
343 tokenMap.put("trap", function);
344 tokenMap.put("troff", function);
345 tokenMap.put("true", function);
346 tokenMap.put("tsort", function);
347 tokenMap.put("tty", function);
348 tokenMap.put("type", function);
349 tokenMap.put("typeset", function);
350 tokenMap.put("ue", function);
351 tokenMap.put("ul", function);
352 tokenMap.put("ulimit", function);
353 tokenMap.put("umask", function);
354 tokenMap.put("unalias", function);
355 tokenMap.put("uncompress", function);
356 tokenMap.put("unexpand", function);
357 tokenMap.put("unget", function);
358 tokenMap.put("unifdef", function);
359 tokenMap.put("uniq", function);
360 tokenMap.put("units", function);
361 tokenMap.put("unset", function);
362 tokenMap.put("uptime", function);
363 tokenMap.put("users", function);
364 tokenMap.put("uucp", function);
365 tokenMap.put("uudecode", function);
366 tokenMap.put("uuencode", function);
367 tokenMap.put("uulog", function);
368 tokenMap.put("uuname", function);
369 tokenMap.put("uusend", function);
370 tokenMap.put("uux", function);
371 tokenMap.put("vacation", function);
372 tokenMap.put("val", function);
373 tokenMap.put("vedit", function);
374 tokenMap.put("vgrind", function);
375 tokenMap.put("vi", function);
376 tokenMap.put("view", function);
377 tokenMap.put("vtroff", function);
378 tokenMap.put("w", function);
379 tokenMap.put("wait", function);
380 tokenMap.put("wall", function);
381 tokenMap.put("wc", function);
382 tokenMap.put("wait", function);
383 tokenMap.put("what", function);
384 tokenMap.put("whatis", function);
385 tokenMap.put("whence", function);
386 tokenMap.put("whereis", function);
387 tokenMap.put("which", function);
388 tokenMap.put("who", function);
389 tokenMap.put("whoami", function);
390 tokenMap.put("write", function);
391 tokenMap.put("xargs", function);
392 tokenMap.put("xstr", function);
393 tokenMap.put("yacc", function);
394 tokenMap.put("yes", function);
395 tokenMap.put("zcat", function);
396
397 return tokenMap;
398
399 }
400
401
402 /**
403 * Returns a list of tokens representing the given text.
404 *
405 * @param text The text to break into tokens.
406 * @param startTokenType The token with which to start tokenizing.
407 * @param startOffset The offset at which the line of tokens begins.
408 * @return A linked list of tokens representing <code>text</code>.
409 */
410 public Token getTokenList(Segment text, int startTokenType, final int startOffset) {
411
412 resetTokenList();
413
414 char[] array = text.array;
415 int offset = text.offset;
416 int count = text.count;
417 int end = offset + count;
418
419 // See, when we find a token, its starting position is always of the form:
420 // 'startOffset + (currentTokenStart-offset)'; but since startOffset and
421 // offset are constant, tokens' starting positions become:
422 // 'newStartOffset+currentTokenStart' for one less subtraction operation.
423 int newStartOffset = startOffset - offset;
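// Illustrative numbers: with startOffset==100 and offset==10, newStartOffset==90,
// so a token starting at currentTokenStart==15 lands at document position
// 90+15 == 105, the same as 100+(15-10).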
424
425 currentTokenStart = offset;
426 currentTokenType = startTokenType;
427 boolean backslash = false;
428
429//beginning:
430 for (int i=offset; i<end; i++) {
431
432 char c = array[i];
433
434 switch (currentTokenType) {
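// Hand-written scanner: currentTokenType is the state carried over from the
// previous character (or from the previous line via startTokenType), and each
// case below decides whether this character extends the current token or ends it.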
435
436 case Token.NULL:
437
438 currentTokenStart = i; // Starting a new token here.
439
440 switch (c) {
441
442 case ' ':
443 case '\t':
444 currentTokenType = Token.WHITESPACE;
445 break;
446
447 case '`':
448 if (backslash) { // Escaped back quote => call '`' an identifier.
449 addToken(text, currentTokenStart,i, Token.IDENTIFIER, newStartOffset+currentTokenStart);
450 backslash = false;
451 }
452 else {
453 currentTokenType = Token.LITERAL_BACKQUOTE;
454 }
455 break;
456
457 case '"':
458 if (backslash) { // Escaped double quote => call '"' an identifier.
459 addToken(text, currentTokenStart,i, Token.IDENTIFIER, newStartOffset+currentTokenStart);
460 backslash = false;
461 }
462 else {
463 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
464 }
465 break;
466
467 case '\'':
468 if (backslash) { // Escaped single quote => call '\'' an identifier.
469 addToken(text, currentTokenStart,i, Token.IDENTIFIER, newStartOffset+currentTokenStart);
470 backslash = false;
471 }
472 else {
473 currentTokenType = Token.LITERAL_CHAR;
474 }
475 break;
476
477 case '\\':
478 addToken(text, currentTokenStart,i, Token.IDENTIFIER, newStartOffset+currentTokenStart);
479 currentTokenType = Token.NULL;
480 backslash = !backslash;
481 break;
482
483 case '$':
484 if (backslash) { // Escaped dollar sign => call '$' an identifier.
485 addToken(text, currentTokenStart,i, Token.IDENTIFIER, newStartOffset+currentTokenStart);
486 backslash = false;
487 }
488 else {
489 currentTokenType = Token.VARIABLE;
490 }
491 break;
492
493 case '#':
494 backslash = false;
495 currentTokenType = Token.COMMENT_EOL;
496 break;
497
498 default:
499 if (RSyntaxUtilities.isDigit(c)) {
500 currentTokenType = Token.LITERAL_NUMBER_DECIMAL_INT;
501 break;
502 }
503 else if (RSyntaxUtilities.isLetter(c) || c=='/' || c=='_') {
504 currentTokenType = Token.IDENTIFIER;
505 break;
506 }
507 int indexOf = operators.indexOf(c,0);
508 if (indexOf>-1) {
509 addToken(text, currentTokenStart,i, Token.OPERATOR, newStartOffset+currentTokenStart);
510 currentTokenType = Token.NULL;
511 break;
512 }
513 indexOf = separators.indexOf(c,0);
514 if (indexOf>-1) {
515 addToken(text, currentTokenStart,i, Token.SEPARATOR, newStartOffset+currentTokenStart);
516 currentTokenType = Token.NULL;
517 break;
518 }
519 indexOf = separators2.indexOf(c,0);
520 if (indexOf>-1) {
521 addToken(text, currentTokenStart,i, Token.IDENTIFIER, newStartOffset+currentTokenStart);
522 currentTokenType = Token.NULL;
523 break;
524 }
525 else {
526 currentTokenType = Token.IDENTIFIER;
527 break;
528 }
529
530 } // End of switch (c).
531
532 break;
533
534 case Token.WHITESPACE:
535
536 switch (c) {
537
538 case ' ':
539 case '\t':
540 break; // Still whitespace.
541
542 case '\\':
543 addToken(text, currentTokenStart,i-1, Token.WHITESPACE, newStartOffset+currentTokenStart);
544 addToken(text, i,i, Token.IDENTIFIER, newStartOffset+i);
545 currentTokenType = Token.NULL;
546 backslash = true; // Previous char whitespace => this must be first backslash.
547 break;
548
549 case '`': // Don't need to worry about backslashes as previous char is space.
550 addToken(text, currentTokenStart,i-1, Token.WHITESPACE, newStartOffset+currentTokenStart);
551 currentTokenStart = i;
552 currentTokenType = Token.LITERAL_BACKQUOTE;
553 backslash = false;
554 break;
555
556 case '"': // Don't need to worry about backslashes as previous char is space.
557 addToken(text, currentTokenStart,i-1, Token.WHITESPACE, newStartOffset+currentTokenStart);
558 currentTokenStart = i;
559 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
560 backslash = false;
561 break;
562
563 case '\'': // Don't need to worry about backslashes as previous char is space.
564 addToken(text, currentTokenStart,i-1, Token.WHITESPACE, newStartOffset+currentTokenStart);
565 currentTokenStart = i;
566 currentTokenType = Token.LITERAL_CHAR;
567 backslash = false;
568 break;
569
570 case '$': // Don't need to worry about backslashes as previous char is space.
571 addToken(text, currentTokenStart,i-1, Token.WHITESPACE, newStartOffset+currentTokenStart);
572 currentTokenStart = i;
573 currentTokenType = Token.VARIABLE;
574 backslash = false;
575 break;
576
577 case '#':
578 addToken(text, currentTokenStart,i-1, Token.WHITESPACE, newStartOffset+currentTokenStart);
579 currentTokenStart = i;
580 currentTokenType = Token.COMMENT_EOL;
581 break;
582
583 default: // Add the whitespace token and start anew.
584
585 addToken(text, currentTokenStart,i-1, Token.WHITESPACE, newStartOffset+currentTokenStart);
586 currentTokenStart = i;
587
588 if (RSyntaxUtilities.isDigit(c)) {
589 currentTokenType = Token.LITERAL_NUMBER_DECIMAL_INT;
590 break;
591 }
592 else if (RSyntaxUtilities.isLetter(c) || c=='/' || c=='_') {
593 currentTokenType = Token.IDENTIFIER;
594 break;
595 }
596 int indexOf = operators.indexOf(c,0);
597 if (indexOf>-1) {
598 addToken(text, i,i, Token.OPERATOR, newStartOffset+i);
599 currentTokenType = Token.NULL;
600 break;
601 }
602 indexOf = separators.indexOf(c,0);
603 if (indexOf>-1) {
604 addToken(text, i,i, Token.SEPARATOR, newStartOffset+i);
605 currentTokenType = Token.NULL;
606 break;
607 }
608 indexOf = separators2.indexOf(c,0);
609 if (indexOf>-1) {
610 addToken(text, i,i, Token.IDENTIFIER, newStartOffset+i);
611 currentTokenType = Token.NULL;
612 break;
613 }
614 else {
615 currentTokenType = Token.IDENTIFIER;
616 }
617
618 } // End of switch (c).
619
620 break;
621
622 default: // Should never happen
623 case Token.IDENTIFIER:
624
625 switch (c) {
626
627 case ' ':
628 case '\t':
629 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
630 currentTokenStart = i;
631 currentTokenType = Token.WHITESPACE;
632 break;
633
634 case '/': // Special-case to colorize commands like "echo" in "/bin/echo"
635 addToken(text, currentTokenStart,i, Token.IDENTIFIER, newStartOffset+currentTokenStart);
636 currentTokenStart = i+1;
637 currentTokenType = Token.NULL;
638 break;
639
640 case '`': // Don't need to worry about backslashes as previous char is space.
641 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
642 currentTokenStart = i;
643 currentTokenType = Token.LITERAL_BACKQUOTE;
644 backslash = false;
645 break;
646
647 case '"': // Don't need to worry about backslashes as previous char is non-backslash.
648 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
649 currentTokenStart = i;
650 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
651 backslash = false;
652 break;
653
654 case '\'': // Don't need to worry about backslashes as previous char is non-backslash.
655 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
656 currentTokenStart = i;
657 currentTokenType = Token.LITERAL_CHAR;
658 backslash = false;
659 break;
660
661 case '\\':
662 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
663 addToken(text, i,i, Token.IDENTIFIER, newStartOffset+i);
664 currentTokenType = Token.NULL;
665 backslash = true;
666 break;
667
668 case '$': // Don't need to worry about backslashes as previous char is non-backslash.
669 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
670 currentTokenStart = i;
671 currentTokenType = Token.VARIABLE;
672 backslash = false;
673 break;
674
675 case '=': // Special case here; when you have "identifier=<value>" in shell, "identifier" is a variable.
676 addToken(text, currentTokenStart,i-1, Token.VARIABLE, newStartOffset+currentTokenStart);
677 addToken(text, i,i, Token.OPERATOR, newStartOffset+i);
678 currentTokenType = Token.NULL;
679 break;
680
681 default:
682 if (RSyntaxUtilities.isLetterOrDigit(c) || c=='/' || c=='_') {
683 break; // Still an identifier of some type.
684 }
685 int indexOf = operators.indexOf(c);
686 if (indexOf>-1) {
687 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
688 addToken(text, i,i, Token.OPERATOR, newStartOffset+i);
689 currentTokenType = Token.NULL;
690 break;
691 }
692 indexOf = separators.indexOf(c,0);
693 if (indexOf>-1) {
694 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
695 addToken(text, i,i, Token.SEPARATOR, newStartOffset+i);
696 currentTokenType = Token.NULL;
697 break;
698 }
699 indexOf = separators2.indexOf(c,0);
700 if (indexOf>-1) {
701 addToken(text, currentTokenStart,i-1, Token.IDENTIFIER, newStartOffset+currentTokenStart);
702 addToken(text, i,i, Token.IDENTIFIER, newStartOffset+i);
703 currentTokenType = Token.NULL;
704 break;
705 }
706 // Otherwise, we're still an identifier (?).
707
708 } // End of switch (c).
709
710 break;
711
712 case Token.LITERAL_NUMBER_DECIMAL_INT:
713
714 switch (c) {
715
716 case ' ':
717 case '\t':
718 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
719 currentTokenStart = i;
720 currentTokenType = Token.WHITESPACE;
721 break;
722
723 case '`': // Don't need to worry about backslashes as previous char is space.
724 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
725 currentTokenStart = i;
726 currentTokenType = Token.LITERAL_BACKQUOTE;
727 backslash = false;
728 break;
729
730 case '"': // Don't need to worry about backslashes as previous char is non-backslash.
731 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
732 currentTokenStart = i;
733 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
734 backslash = false;
735 break;
736
737 case '\'': // Don't need to worry about backslashes as previous char is non-backslash.
738 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
739 currentTokenStart = i;
740 currentTokenType = Token.LITERAL_CHAR;
741 backslash = false;
742 break;
743
744 case '$': // Don't need to worry about backslashes as previous char is non-backslash.
745 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
746 currentTokenStart = i;
747 currentTokenType = Token.VARIABLE;
748 backslash = false;
749 break;
750
751 case '\\':
752 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
753 addToken(text, i,i, Token.IDENTIFIER, newStartOffset+i);
754 currentTokenType = Token.NULL;
755 backslash = true;
756 break;
757
758 default:
759
760 if (RSyntaxUtilities.isDigit(c)) {
761 break; // Still a literal number.
762 }
763 int indexOf = operators.indexOf(c);
764 if (indexOf>-1) {
765 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
766 addToken(text, i,i, Token.OPERATOR, newStartOffset+i);
767 currentTokenType = Token.NULL;
768 break;
769 }
770 indexOf = separators.indexOf(c);
771 if (indexOf>-1) {
772 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
773 addToken(text, i,i, Token.SEPARATOR, newStartOffset+i);
774 currentTokenType = Token.NULL;
775 break;
776 }
777 indexOf = separators2.indexOf(c);
778 if (indexOf>-1) {
779 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
780 addToken(text, i,i, Token.IDENTIFIER, newStartOffset+i);
781 currentTokenType = Token.NULL;
782 break;
783 }
784
785 // Otherwise, remember this was a number and start over.
786 addToken(text, currentTokenStart,i-1, Token.LITERAL_NUMBER_DECIMAL_INT, newStartOffset+currentTokenStart);
787 i--;
788 currentTokenType = Token.NULL;
789
790 } // End of switch (c).
791
792 break;
793
794 case Token.VARIABLE:
795
796 // Note that we first arrive here AFTER the '$' character.
797 // First check if the variable name is enclosed in '{' and '}' characters.
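// e.g. "${HOME}" becomes a single VARIABLE token here; bare forms such as
// "$HOME", "$1" or "$?" are handled by the loop further below.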
798 if (c=='{') {
799 while (++i<end) {
800 if (array[i]=='}') {
801 addToken(text, currentTokenStart,i, Token.VARIABLE, newStartOffset+currentTokenStart);
802 currentTokenType = Token.NULL;
803 break;
804 }
805 } // End of while (++i<end).
806 if (i==end) { // Happens when '}' wasn't found...
807 addToken(text, currentTokenStart,end-1, Token.VARIABLE, newStartOffset+currentTokenStart);
808 currentTokenType = Token.NULL;
809 }
810 break;
811 } // End of if (c=='{').
812
813 // If we didn't find the '{' character, find the end of the variable...
814 while (i<end) {
815 c = array[i]; // Not needed the first iteration, but can't think of a better way to do it...
816 if (!RSyntaxUtilities.isLetterOrDigit(c) && shellVariables.indexOf(c)==-1 && c!='_') {
817 addToken(text, currentTokenStart,i-1, Token.VARIABLE, newStartOffset+currentTokenStart);
818 i--;
819 currentTokenType = Token.NULL;
820 break;
821 }
822 i++;
823 }
824
825 // This only happens if we never found the end of the variable in the loop above.
826 if (i==end) {
827 addToken(text, currentTokenStart,i-1, Token.VARIABLE, newStartOffset+currentTokenStart);
828 currentTokenType = Token.NULL;
829 }
830
831 break;
832
833 case Token.COMMENT_EOL:
834 // If we got here, the line contains more than just "#", so check for "#!".
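// e.g. a "#!/bin/sh" line is then highlighted as a preprocessor directive
// rather than as an ordinary comment.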
835 if (c=='!')
836 currentTokenType = Token.PREPROCESSOR;
837 i = end - 1;
838 addToken(text, currentTokenStart,i, currentTokenType, newStartOffset+currentTokenStart);
839 // We need to set token type to null so at the bottom we don't add one more token.
840 currentTokenType = Token.NULL;
841
842 break;
843
844 case Token.LITERAL_CHAR:
845
846 if (c=='\\') {
847 backslash = !backslash; // Okay because if we got in here, backslash was initially false.
848 }
849 else {
850 if (c=='\'' && !backslash) {
851 addToken(text, currentTokenStart,i, Token.LITERAL_CHAR, newStartOffset+currentTokenStart);
852 currentTokenStart = i + 1;
853 currentTokenType = Token.NULL;
854 // backslash is definitely false when we leave.
855 }
856
857 backslash = false; // Need to set backslash to false here as a character was typed.
858
859 }
860 // Otherwise, we're still an unclosed char literal...
861
862 break;
863
864 case Token.LITERAL_BACKQUOTE:
865
866 switch (c) {
867
868 case '\\':
869 backslash = !backslash;
870 break;
871
872 case '`':
873 if (!backslash) {
874 addToken(text, currentTokenStart,i, Token.LITERAL_BACKQUOTE, newStartOffset+currentTokenStart);
875 currentTokenType = Token.NULL;
876 // backslash is definitely false when we leave.
877 break;
878 }
879 backslash = false;
880 break;
881
882 // Variable in the backquote string...
883 case '$':
884
885 if (backslash) {
886 backslash = false;
887 break;
888 }
889
890 // Add the string up-to the variable.
891 addToken(text, currentTokenStart,i-1, Token.LITERAL_BACKQUOTE, newStartOffset+currentTokenStart);
892 currentTokenType = Token.VARIABLE;
893 currentTokenStart = i;
894
895 // First check if the variable name is enclosed in '{' and '}' characters.
896 if (i<end-1 && array[i+1]=='{') {
897 i++; // Now we're on the '{' char.
898 while (++i<end) {
899 if (array[i]=='}') {
900 addToken(text, currentTokenStart,i, Token.VARIABLE, newStartOffset+currentTokenStart);
901 i++;
902 if (i<end) {
903 c = array[i];
904 if (c=='`') { // The only rub - back quote right after variable.
905 addToken(text, i,i, Token.LITERAL_BACKQUOTE, newStartOffset+i);
906 currentTokenType = Token.NULL;
907 break;
908 }
909 else { // Continue on with the string.
910 currentTokenStart = i;
911 currentTokenType = Token.LITERAL_BACKQUOTE;
912 i--;
913 break;
914 }
915 }
916 else { // i==end; "trick" this method so that the string is continued to the next line.
917 currentTokenStart = i;
918 currentTokenType = Token.LITERAL_BACKQUOTE;
919 break; // So we don't hit the condition below.
920 }
921 } // End of if (array[i]=='}').
922 } // End of while (++i<end).
923 if (i==end) { // Happens when '}' wasn't found...
924 addToken(text, currentTokenStart,end-1, Token.VARIABLE, newStartOffset+currentTokenStart);
925 currentTokenStart = end; // ???
926 currentTokenType = Token.LITERAL_BACKQUOTE;
927 break;
928 }
929 } // End of if (i<end-1 && array[i+1]=='{').
930
931 // If we reached the end of the variable, get out.
932 if (currentTokenType==Token.NULL || currentTokenType==Token.LITERAL_BACKQUOTE)
933 break;
934
935 // If we didn't find the '{' character, find the end of the variable...
936 // Increment first to skip the '$'.
937 while (++i<end) {
938 c = array[i];
939 if (!RSyntaxUtilities.isLetterOrDigit(c) && shellVariables.indexOf(c)==-1 && c!='_') {
940 addToken(text, currentTokenStart,i-1, Token.VARIABLE, newStartOffset+currentTokenStart);
941 if (c=='`') { // The only rub.
942 addToken(text, i,i, Token.LITERAL_BACKQUOTE, newStartOffset+i);
943 currentTokenType = Token.NULL;
944 break;
945 }
946 else {
947 currentTokenStart = i;
948 currentTokenType = Token.LITERAL_BACKQUOTE;
949 i--;
950 break;
951 }
952 }
953 }
954
955 // This only happens if we never found the end of the variable in the loop above.
956 // We "trick" this method so that the backquote string token is at the end.
957 if (i==end) {
958 addToken(text, currentTokenStart,i-1, Token.VARIABLE, newStartOffset+currentTokenStart);
959 currentTokenStart = i;
960 currentTokenType = Token.LITERAL_BACKQUOTE;
961 }
962
963 break;
964
965 // Otherwise, we're still in an unclosed string...
966 default:
967 backslash = false; // Need to set backslash to false here as a character was typed.
968
969 } // End of switch (c).
970
971 break;
972
973 case Token.LITERAL_STRING_DOUBLE_QUOTE:
974
975 switch (c) {
976
977 case '\\':
978 backslash = !backslash;
979 break;
980
981 case '"':
982 if (!backslash) {
983 addToken(text, currentTokenStart,i, Token.LITERAL_STRING_DOUBLE_QUOTE, newStartOffset+currentTokenStart);
984 currentTokenType = Token.NULL;
985 // backslash is definitely false when we leave.
986 break;
987 }
988 backslash = false;
989 break;
990
991 // Variable in the double-quoted string...
992 case '$':
993
994 if (backslash) {
995 backslash = false;
996 break;
997 }
998
999 // Add the string up-to the variable.
1000 addToken(text, currentTokenStart,i-1, Token.LITERAL_STRING_DOUBLE_QUOTE, newStartOffset+currentTokenStart);
1001 currentTokenType = Token.VARIABLE;
1002 currentTokenStart = i;
1003
1004 // First check if the variable name is enclosed in '{' and '}' characters.
1005 if (i<end-1 && array[i+1]=='{') {
1006 i++; // Now we're on the '{' char.
1007 while (++i<end) {
1008 if (array[i]=='}') {
1009 addToken(text, currentTokenStart,i, Token.VARIABLE, newStartOffset+currentTokenStart);
1010 i++;
1011 if (i<end) {
1012 c = array[i];
1013 if (c=='"') { // The only rub - double-quote right after variable.
1014 addToken(text, i,i, Token.LITERAL_STRING_DOUBLE_QUOTE, newStartOffset+i);
1015 currentTokenType = Token.NULL;
1016 break;
1017 }
1018 else { // Continue on with the string.
1019 currentTokenStart = i;
1020 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
1021 i--;
1022 break;
1023 }
1024 }
1025 else { // i==end; "trick" this method so that the string is continued to the next line.
1026 currentTokenStart = i;
1027 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
1028 break; // So we don't hit the condition below.
1029 }
1030 } // End of if (array[i]=='}').
1031 } // End of while (++i<end).
1032 if (i==end) { // Happens when '}' wasn't found...
1033 addToken(text, currentTokenStart,end-1, Token.VARIABLE, newStartOffset+currentTokenStart);
1034 currentTokenStart = end; // ???
1035 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
1036 break;
1037 }
1038 } // End of if (i<end-1 && array[i+1]=='{').
1039
1040 // If we reached the end of the variable, get out.
1041 if (currentTokenType==Token.NULL || currentTokenType==Token.LITERAL_STRING_DOUBLE_QUOTE)
1042 break;
1043
1044 // If we didn't find the '{' character, find the end of the variable...
1045 // Increment first to skip the '$'.
1046 while (++i<end) {
1047 c = array[i];
1048 if (!RSyntaxUtilities.isLetterOrDigit(c) && shellVariables.indexOf(c)==-1 && c!='_') {
1049 addToken(text, currentTokenStart,i-1, Token.VARIABLE, newStartOffset+currentTokenStart);
1050 if (c=='"') { // The only rub.
1051 addToken(text, i,i, Token.LITERAL_STRING_DOUBLE_QUOTE, newStartOffset+i);
1052 currentTokenType = Token.NULL;
1053 break;
1054 }
1055 else {
1056 currentTokenStart = i;
1057 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
1058 i--;
1059 break;
1060 }
1061 }
1062 }
1063
1064 // This only happens if we never found the end of the variable in the loop above.
1065 // We "trick" this method so that the double-quote string token is at the end.
1066 if (i==end) {
1067 addToken(text, currentTokenStart,i-1, Token.VARIABLE, newStartOffset+currentTokenStart);
1068 currentTokenStart = i;
1069 currentTokenType = Token.LITERAL_STRING_DOUBLE_QUOTE;
1070 }
1071
1072 break;
1073
1074 // Otherwise, we're still in an unclosed string...
1075 default:
1076 backslash = false; // Need to set backslash to false here as a character was typed.
1077
1078 } // End of switch (c).
1079
1080 break;
1081
1082 } // End of switch (currentTokenType).
1083
1084 } // End of for (int i=offset; i<end; i++).
1085
1086 switch (currentTokenType) {
1087
1088 // Remember what token type to begin the next line with.
1089 case Token.LITERAL_BACKQUOTE:
1090 case Token.LITERAL_STRING_DOUBLE_QUOTE:
1091 case Token.LITERAL_CHAR:
1092 addToken(text, currentTokenStart,end-1, currentTokenType, newStartOffset+currentTokenStart);
1093 break;
1094
1095 // Do nothing if everything was okay.
1096 case Token.NULL:
1097 addNullToken();
1098 break;
1099
1100 // All other token types don't continue to the next line...
1101 default:
1102 addToken(text, currentTokenStart,end-1, currentTokenType, newStartOffset+currentTokenStart);
1103 addNullToken();
1104
1105 }
1106
1107 // Return the first token in our linked list.
1108 return firstToken;
1109
1110 }
1111
1112
1113}
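// Usage sketch, assuming the standard RSyntaxTextArea registration mechanism
// (token makers are normally selected through a syntax style key rather than
// instantiated directly):
//
//   RSyntaxTextArea textArea = new RSyntaxTextArea();
//   textArea.setSyntaxEditingStyle(SyntaxConstants.SYNTAX_STYLE_UNIX_SHELL);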