Changeset 6139


Ignore:
Timestamp:
2003-12-08T14:14:08+13:00 (20 years ago)
Author:
jmt12
Message:

Fixed one possible NPE and fixed the tabbing, hence the large difference in lines

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gli/src/org/greenstone/gatherer/cdm/CommandTokenizer.java

    r6051 r6139  
    5454 */
    5555public class CommandTokenizer {
    56 
    57     static final public int BRACKET_ENCLOSED            = 0;
    58     static final public int DOUBLE_QUOTE_ENCLOSED   = 1;
    59     static final public int NORMAL                      = 2;
    60     static final public int QUOTE_ENCLOSED              = 3;
    61 
    62     private BufferedReader in_stream;
    63     private int count = -1;
    64     private StringTokenizer internal_tokenizer;
    65 
    66     /** Basic Constructor. Used to parse tokens from a string keeping tokens surrounded by speechmarks or square brackets intact. Thus something like:<br>
    67      * collectionmeta collectionextra [l = en] "Hello World"<br>
    68      * is tokenized thus<br>
    69      * {'collectionmeta', 'collectionextra', 'l = en', 'Hello World'}
    70      * @param command the command String you wish to tokenize
    71      */
    72     public CommandTokenizer(String command) {
    73         this.internal_tokenizer = new StringTokenizer(command);
    74         this.in_stream = null;
    75     }
    76 
    77     /** Advanced Constructor. As above but with one major difference. Since it is provided an input stream (presumably where the command string originated from), it is able to parse a quote enclosed command token that stretches over several lines. Each newline is preserved in the resulting token. There is an extra bitchslap here as something like a collection extra might have html code in them that contain escaped speechmarks, so extra care must be taken not to break at them. Thus something like:<br>
    78      * collectionmeta collectionextra [l = en] "<br>
    79      *     an example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"<br>
    80      *     >this</a> which you could easily see might be a problem if I parse this naively."<br>
    81      * is tokenized thus<br>
    82      * {'collectionmeta', 'collectionextra', 'l = en', '\nan example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"\n>this</a> which you could easily see might be a problem if I parse this naively.'}
    83      * @param command the command String you wish to tokenize
    84      * @param in_stream a BufferedReader from which the tokenizer can draw further lines as necessary
    85      */
    86     public CommandTokenizer(String command, BufferedReader in_stream) {
    87         ///atherer.println("***** CommandTokenizer *****\nparse:\t" + command + "\n****************************");
    88         this.internal_tokenizer = new StringTokenizer(command);
    89         this.in_stream = in_stream;
    90     }
    91 
    92     /** Returns the minimum number of remaining tokens before the tokenizer runs out of string. There may be more tokens than this count, but never less. The discrepancy is due to internal functionality and the fact we can't read ahead in the string or associated stream without risking the need for unpredictable push-back
    93      * @return the minimum number of tokens available as an int
    94      */
    95     public int countTokens() {
    96         if(count == 0 && internal_tokenizer.countTokens() > 1) {
    97              return 1;
    98         }
    99         if(count == -1) {
    100              count = internal_tokenizer.countTokens();
    101         }
    102         return count;
    103     }
    104 
    105     /** Determine if there are still tokens available.
    106      * @return true if there are more tokens, false otherwise
    107      */
    108     public boolean hasMoreTokens() {
    109          return internal_tokenizer.hasMoreTokens();
    110     }
    111 
    112     /** Method to retrieve the next token from the command, taking care to group tokens enclosed in speech marks.
    113      * @return a String containing the next token from the command
    114      */
    115     public String nextToken() {
    116         String result = null;
     56   
     57    static final public int BRACKET_ENCLOSED        = 0;
     58    static final public int DOUBLE_QUOTE_ENCLOSED   = 1;
     59    static final public int NORMAL          = 2;
     60    static final public int QUOTE_ENCLOSED      = 3;
     61   
     62    private BufferedReader in_stream;
     63    private int count = -1;
     64    private StringTokenizer internal_tokenizer;
     65   
     66    /** Basic Constructor. Used to parse tokens from a string keeping tokens surrounded by speechmarks or square brackets intact. Thus something like:<br>
     67     * collectionmeta collectionextra [l = en] "Hello World"<br>
     68     * is tokenized thus<br>
     69     * {'collectionmeta', 'collectionextra', 'l = en', 'Hello World'}
     70     * @param command the command String you wish to tokenize
     71     */
     72    public CommandTokenizer(String command) {
     73    this.internal_tokenizer = new StringTokenizer(command);
     74    this.in_stream = null;
     75    }
     76   
     77    /** Advanced Constructor. As above but with one major difference. Since it is provided an input stream (presumably where the command string originated from), it is able to parse a quote enclosed command token that stretches over several lines. Each newline is preserved in the resulting token. There is an extra bitchslap here as something like a collection extra might have html code in them that contain escaped speechmarks, so extra care must be taken not to break at them. Thus something like:<br>
     78     * collectionmeta collectionextra [l = en] "<br>
     79     *     an example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"<br>
     80     *     >this</a> which you could easily see might be a problem if I parse this naively."<br>
     81     * is tokenized thus<br>
     82     * {'collectionmeta', 'collectionextra', 'l = en', '\nan example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"\n>this</a> which you could easily see might be a problem if I parse this naively.'}
     83     * @param command the command String you wish to tokenize
     84     * @param in_stream a BufferedReader from which the tokenizer can draw further lines as necessary
     85     */
     86    public CommandTokenizer(String command, BufferedReader in_stream) {
     87    ///atherer.println("***** CommandTokenizer *****\nparse:\t" + command + "\n****************************");
     88    this.internal_tokenizer = new StringTokenizer(command);
     89    this.in_stream = in_stream;
     90    }
     91   
     92    /** Returns the minimum number of remaining tokens before the tokenizer runs out of string. There may be more tokens than this count, but never less. The discrepancy is due to internal functionality and the fact we can't read ahead in the string or associated stream without risking the need for unpredictable push-back
     93     * @return the minimum number of tokens available as an int
     94     */
     95    public int countTokens() {
     96    if(count == 0 && internal_tokenizer.countTokens() > 1) {
     97        return 1;
     98    }
     99    if(count == -1) {
     100        count = internal_tokenizer.countTokens();
     101    }
     102    return count;
     103    }
     104   
     105    /** Determine if there are still tokens available.
     106     * @return true if there are more tokens, false otherwise
     107     */
     108    public boolean hasMoreTokens() {
     109    return internal_tokenizer.hasMoreTokens();
     110    }
     111   
     112    /** Method to retrieve the next token from the command, taking care to group tokens enclosed in speech marks.
     113     * @return a String containing the next token from the command
     114     */
     115    public String nextToken() {
     116    String result = null;
     117    if(internal_tokenizer.hasMoreTokens()) {
     118        StringBuffer buffer = new StringBuffer(internal_tokenizer.nextToken());
     119        switch(buffer.charAt(0)) {
     120        case StaticStrings.DOUBLEQUOTE_CHAR:
     121        result = buildToken(buffer, StaticStrings.DOUBLEQUOTE_CHAR, true);
     122        break;
     123        case StaticStrings.SINGLEQUOTE_CHAR:
     124        result = buildToken(buffer, StaticStrings.SINGLEQUOTE_CHAR, true);
     125        break;
     126        case StaticStrings.OPENBRACKET_CHAR:
     127        result = buildToken(buffer, StaticStrings.CLOSEBRACKET_CHAR, false);
     128        break;
     129        default:
     130        result = buffer.toString();
     131        }
     132        buffer = null;
     133    }
     134    // Because of our tricky counting system we never want to have negative tokens remaining. In fact, unless the internal string buffer is empty, we will return a count of 1 anyway
     135    if(count > 0) {
     136        count = count - 1;
     137    }
     138    ///atherer.println("----- CommandTokenizer -----\ntoken:\t" + result + "\n----------------------------");
     139    return result;
     140    }
     141   
     142    /** Parse in the next token, paying heed to enclosing characters demands, escaped characters, newlines and empty buffers and consequential unexpected end of tokens
     143     * @param buffer the StringBuffer in which the partial token is stored (at the first bit that caused this method to be called)
     144     * @param end_char the sentinel char we are watching for as it encloses a token
     145     * @param strip_characters a boolean denoting whether the enclosing characters should be stripped off
     146     * @return the token, either in its entirety less the enclosing characters if required or, if an unexpected end occurred, whatever we parsed without its starting enclosing character, again only if required. In fact if we weren't asked to strip characters then we add the enclosing character back in
     147     */
     148    private String buildToken(StringBuffer buffer, char end_char, boolean strip_characters) {
     149    while(buffer.charAt(buffer.length() - 1) != end_char || (buffer.length() > 3 && buffer.charAt(buffer.length() - 2) == StaticStrings.BACKSLASH_CHAR)) {
     150        try {
     151        // The first version is for the basic tokenizer which has no idea of an input stream, so runs out tokens at the same time as the internal tokenizer does
    117152        if(internal_tokenizer.hasMoreTokens()) {
    118             StringBuffer buffer = new StringBuffer(internal_tokenizer.nextToken());
    119             switch(buffer.charAt(0)) {
    120                 case StaticStrings.DOUBLEQUOTE_CHAR:
    121                     result = buildToken(buffer, StaticStrings.DOUBLEQUOTE_CHAR, true);
    122                     break;
    123                 case StaticStrings.SINGLEQUOTE_CHAR:
    124                     result = buildToken(buffer, StaticStrings.SINGLEQUOTE_CHAR, true);
    125                     break;
    126                 case StaticStrings.OPENBRACKET_CHAR:
    127                     result = buildToken(buffer, StaticStrings.CLOSEBRACKET_CHAR, false);
    128                     break;
    129                 default:
    130                     result = buffer.toString();
     153            buffer.append(StaticStrings.SPACE_CHAR);
     154            buffer.append(internal_tokenizer.nextToken());
     155        }
     156        // While the second version can draw more lines from the stream until eof occurs
     157        else if(in_stream != null) {
     158            String line_str = null;
     159            while(!internal_tokenizer.hasMoreTokens() && (line_str = in_stream.readLine()) != null) {
     160            ///atherer.println("+++++ CommandTokenizer +++++\nappend:\t" + line_str + "\n+++++++++++++++++++++++++++++");
     161            // It's at this stage that our token count becomes completely patu
     162            internal_tokenizer = new StringTokenizer(line_str);
     163            buffer.append(StaticStrings.NEW_LINE_CHAR); // A new line in the final token
     164            }
     165            line_str = null;
     166            if(internal_tokenizer.hasMoreTokens()) {
     167            // Don't add a space if we just added a newline
     168            if(buffer.charAt(buffer.length() - 1) != StaticStrings.NEW_LINE_CHAR) {
     169                buffer.append(StaticStrings.SPACE_CHAR);
    131170            }
    132             buffer = null;
    133         }
    134         // Because of our tricky counting system we never want to have negative tokens remaining. In fact, unless the internal string buffer is empty, we will return a count of 1 anyway
    135         if(count > 0) {
    136             count = count - 1;
    137         }
    138         ///atherer.println("----- CommandTokenizer -----\ntoken:\t" + result + "\n----------------------------");
    139         return result;
    140     }
    141 
    142     /** Parse in the next token, paying heed to enclosing characters demands, escaped characters, newlines and empty buffers and consequential unexpected end of tokens
    143      * @param buffer the StringBuffer in which the partial token is stored (at the first bit that caused this method to be called)
    144      * @param end_char the sentinel char we are watching for as it encloses a token
    145      * @param strip_characters a boolean denoting whether the enclosing characters should be stripped off
    146      * @return the token, either in its entirety less the enclosing characters if required or, if an unexpected end occurred, whatever we parsed without its starting enclosing character, again only if required. In fact if we weren't asked to strip characters then we add the enclosing character back in
    147      */
    148     private String buildToken(StringBuffer buffer, char end_char, boolean strip_characters) {
    149         while(buffer.charAt(buffer.length() - 1) != end_char || (buffer.length() > 3 && buffer.charAt(buffer.length() - 2) == StaticStrings.BACKSLASH_CHAR)) {
    150             try {
    151                 // The first version is for the basic tokenizer which has no idea of an input stream, so runs out tokens at the same time as the internal tokenizer does
    152                 if(internal_tokenizer.hasMoreTokens()) {
    153                     buffer.append(StaticStrings.SPACE_CHAR);
    154                     buffer.append(internal_tokenizer.nextToken());
    155                 }
    156                 // While the second version can draw more lines from the stream until eof occurs
    157                 else if(in_stream != null) {
    158                     String line_str = null;
    159                     while(!internal_tokenizer.hasMoreTokens() && (line_str = in_stream.readLine()) != null) {
    160                         ///atherer.println("+++++ CommandTokenizer +++++\nappend:\t" + line_str + "\n+++++++++++++++++++++++++++++");
    161                         // It's at this stage that our token count becomes completely patu
    162                         internal_tokenizer = new StringTokenizer(line_str);
    163                         buffer.append(StaticStrings.NEW_LINE_CHAR); // A new line in the final token
    164                     }
    165                     line_str = null;
    166                     if(internal_tokenizer.hasMoreTokens()) {
    167                         // Don't add a space if we just added a newline
    168                         if(buffer.charAt(buffer.length() - 1) != StaticStrings.NEW_LINE_CHAR) {
    169                             buffer.append(StaticStrings.SPACE_CHAR);
    170                         }
    171                         buffer.append(internal_tokenizer.nextToken());
    172                     }
    173                     // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    174                     else {
    175                         if(strip_characters) {
    176                             return buffer.substring(1);
    177                         }
    178                         else {
    179                             buffer.append(end_char);
    180                             return buffer.toString();
    181                         }
    182                     }
    183                 }
    184                 // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    185                 else {
    186                     if(strip_characters) {
    187                         return buffer.substring(1);
    188                     }
    189                     else {
    190                         buffer.append(end_char);
    191                         return buffer.toString();
    192                     }
    193                 }
     171            buffer.append(internal_tokenizer.nextToken());
     172            }
     173            // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
     174            else {
     175            if(strip_characters) {
     176                return buffer.substring(1);
    194177            }
    195             // Exception thrown when we attempted reading from the input stream, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    196             catch(Exception exception) {
    197                 Gatherer.printStackTrace(exception);
    198                 if(strip_characters) {
    199                     return buffer.substring(1);
    200                 }
    201                 else {
    202                     buffer.append(end_char);
    203                     return buffer.toString();
    204                 }
     178            else {
     179                buffer.append(end_char);
     180                return buffer.toString();
    205181            }
    206         }
    207         // Return the string sans enclosing characters
    208         if(buffer.length() > 0 && strip_characters) {
    209             return buffer.substring(1, buffer.length() - 1);
    210         }
     182            }
     183        }
     184        // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    211185        else {
     186            if(strip_characters) {
     187            return buffer.substring(1);
     188            }
     189            else {
     190            buffer.append(end_char);
    212191            return buffer.toString();
    213         }
    214     }
     192            }
     193        }
     194        }
     195        // Exception thrown when we attempted reading from the input stream, so throw the dummy, or at least return whatever we managed to parse sans its opening character
     196        catch(Exception exception) {
     197        Gatherer.printStackTrace(exception);
     198        if(strip_characters) {
     199            return buffer.substring(1);
     200        }
     201        else {
     202            buffer.append(end_char);
     203            return buffer.toString();
     204        }
     205        }
     206    }
     207    // Return the string sans enclosing characters
     208    if(buffer.length() >= 2 && strip_characters) {
     209        return buffer.substring(1, buffer.length() - 1);
     210    }
     211    else {
     212        return buffer.toString();
     213    }
     214    }
    215215}
Note: See TracChangeset for help on using the changeset viewer.