Changeset 6139


Ignore:
Timestamp:
2003-12-08T14:14:08+13:00 (20 years ago)
Author:
jmt12
Message:

Fixed one possible NPE and fixed the tabbing, hence the large difference in lines

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gli/src/org/greenstone/gatherer/cdm/CommandTokenizer.java

    r6051 r6139  
    5454 */
    5555public class CommandTokenizer {
    56 
    57     static final public int BRACKET_ENCLOSED            = 0;
    58     static final public int DOUBLE_QUOTE_ENCLOSED   = 1;
    59     static final public int NORMAL                      = 2;
    60     static final public int QUOTE_ENCLOSED              = 3;
    61 
    62     private BufferedReader in_stream;
    63     private int count = -1;
    64     private StringTokenizer internal_tokenizer;
    65 
    66     /** Basic Constructor. Used to parse tokens from a string keeping tokens surrounded by speechmarks or square brackets intact. Thus something like:<br>
    67      * collectionmeta collectionextra [l = en] "Hello World"<br>
    68      * is tokenized thus<br>
    69      * {'collectionmeta', 'collectionextra', 'l = en', 'Hello World'}
    70      * @param command the command String you wish to tokenize
    71      */
    72     public CommandTokenizer(String command) {
    73         this.internal_tokenizer = new StringTokenizer(command);
    74         this.in_stream = null;
    75     }
    76 
    77     /** Advanced Constructor. As above but with one major difference. Since it is provided an input stream (presumably where the command string originated from), it is able to parse a quote enclosed command token that stretches over several lines. Each newline is preserved in the resulting token. There is an extra bitchslap here as something like a collection extra might have html code in them that contain escaped speechmarks, so extra care must be taken not to break at them. Thus something like:<br>
    78      * collectionmeta collectionextra [l = en] "<br>
    79      *     an example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"<br>
    80      *     >this</a> which you could easily see might be a problem if I parse this naively."<br>
    81      * is tokenized thus<br>
    82      * {'collectionmeta', 'collectionextra', 'l = en', '\nan example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"\n>this</a> which you could easily see might be a problem if I parse this naively.'}
    83      * @param command the command String you wish to tokenize
    84      * @param in_stream a BufferedReader from which the tokenizer can draw further lines as necessary
    85      */
    86     public CommandTokenizer(String command, BufferedReader in_stream) {
    87         ///atherer.println("***** CommandTokenizer *****\nparse:\t" + command + "\n****************************");
    88         this.internal_tokenizer = new StringTokenizer(command);
    89         this.in_stream = in_stream;
    90     }
    91 
    92     /** Returns the minimum number of remaining tokens before the tokenizer runs out of string. There may be more tokens than this count, but never less. The discrepancy is due to internal functionality and the fact we can't read ahead in the string or associated stream without risking the need for unpredictable push-back
    93      * @return the minimum number of tokens available as an int
    94      */
    95     public int countTokens() {
    96         if(count == 0 && internal_tokenizer.countTokens() > 1) {
    97              return 1;
    98         }
    99         if(count == -1) {
    100              count = internal_tokenizer.countTokens();
    101         }
    102         return count;
    103     }
    104 
    105     /** Determine if there are still tokens available.
    106      * @return true if there are more tokens, false otherwise
    107      */
    108     public boolean hasMoreTokens() {
    109          return internal_tokenizer.hasMoreTokens();
    110     }
    111 
    112     /** Method to retrieve the next token from the command, taking care to group tokens enclosed in speech marks.
    113      * @return a String containing the next token from the command
    114      */
    115     public String nextToken() {
    116         String result = null;
     56   
     57    static final public int BRACKET_ENCLOSED        = 0;
     58    static final public int DOUBLE_QUOTE_ENCLOSED   = 1;
     59    static final public int NORMAL          = 2;
     60    static final public int QUOTE_ENCLOSED      = 3;
     61   
     62    private BufferedReader in_stream;
     63    private int count = -1;
     64    private StringTokenizer internal_tokenizer;
     65   
     66    /** Basic Constructor. Used to parse tokens from a string keeping tokens surrounded by speechmarks or square brackets intact. Thus something like:<br>
     67     * collectionmeta collectionextra [l = en] "Hello World"<br>
     68     * is tokenized thus<br>
     69     * {'collectionmeta', 'collectionextra', 'l = en', 'Hello World'}
     70     * @param command the command String you wish to tokenize
     71     */
     72    public CommandTokenizer(String command) {
     73    this.internal_tokenizer = new StringTokenizer(command);
     74    this.in_stream = null;
     75    }
     76   
     77    /** Advanced Constructor. As above but with one major difference. Since it is provided an input stream (presumably where the command string originated from), it is able to parse a quote enclosed command token that stretches over several lines. Each newline is preserved in the resulting token. There is an extra bitchslap here as something like a collection extra might have html code in them that contain escaped speechmarks, so extra care must be taken not to break at them. Thus something like:<br>
     78     * collectionmeta collectionextra [l = en] "<br>
     79     *     an example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"<br>
     80     *     >this</a> which you could easily see might be a problem if I parse this naively."<br>
     81     * is tokenized thus<br>
     82     * {'collectionmeta', 'collectionextra', 'l = en', '\nan example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"\n>this</a> which you could easily see might be a problem if I parse this naively.'}
     83     * @param command the command String you wish to tokenize
     84     * @param in_stream a BufferedReader from which the tokenizer can draw further lines as necessary
     85     */
     86    public CommandTokenizer(String command, BufferedReader in_stream) {
     87    ///atherer.println("***** CommandTokenizer *****\nparse:\t" + command + "\n****************************");
     88    this.internal_tokenizer = new StringTokenizer(command);
     89    this.in_stream = in_stream;
     90    }
     91   
     92    /** Returns the minimum number of remaining tokens before the tokenizer runs out of string. There may be more tokens than this count, but never less. The discrepancy is due to internal functionality and the fact we can't read ahead in the string or associated stream without risking the need for unpredictable push-back
     93     * @return the minimum number of tokens available as an int
     94     */
     95    public int countTokens() {
     96    if(count == 0 && internal_tokenizer.countTokens() > 1) {
     97        return 1;
     98    }
     99    if(count == -1) {
     100        count = internal_tokenizer.countTokens();
     101    }
     102    return count;
     103    }
     104   
     105    /** Determine if there are still tokens available.
     106     * @return true if there are more tokens, false otherwise
     107     */
     108    public boolean hasMoreTokens() {
     109    return internal_tokenizer.hasMoreTokens();
     110    }
     111   
     112    /** Method to retrieve the next token from the command, taking care to group tokens enclosed in speech marks.
     113     * @return a String containing the next token from the command
     114     */
     115    public String nextToken() {
     116    String result = null;
     117    if(internal_tokenizer.hasMoreTokens()) {
     118        StringBuffer buffer = new StringBuffer(internal_tokenizer.nextToken());
     119        switch(buffer.charAt(0)) {
     120        case StaticStrings.DOUBLEQUOTE_CHAR:
     121        result = buildToken(buffer, StaticStrings.DOUBLEQUOTE_CHAR, true);
     122        break;
     123        case StaticStrings.SINGLEQUOTE_CHAR:
     124        result = buildToken(buffer, StaticStrings.SINGLEQUOTE_CHAR, true);
     125        break;
     126        case StaticStrings.OPENBRACKET_CHAR:
     127        result = buildToken(buffer, StaticStrings.CLOSEBRACKET_CHAR, false);
     128        break;
     129        default:
     130        result = buffer.toString();
     131        }
     132        buffer = null;
     133    }
     134    // Because of our tricky counting system we never want to have negative tokens remaining. In fact, unless the internal string buffer is empty, we will return a count of 1 anyway
     135    if(count > 0) {
     136        count = count - 1;
     137    }
     138    ///atherer.println("----- CommandTokenizer -----\ntoken:\t" + result + "\n----------------------------");
     139    return result;
     140    }
     141   
     142    /** Parse in the next token, paying heed to enclosing characters demands, escaped characters, newlines and empty buffers and consequential unexpected end of tokens
     143     * @param buffer the StringBuffer in which the partial token is stored (at the first bit that caused this method to be called)
     144     * @param end_char the sentinel char we are watching for as it encloses a token
     145     * @param strip_characters a boolean denoting whether the enclosing characters should be stripped off
     146     * @return the token, either in its entirety less the enclosing characters if required or, if an unexpected end occurred, whatever we parsed without its starting enclosing character, again only if required. In fact if we weren't asked to strip characters then we add the enclosing character back in
     147     */
     148    private String buildToken(StringBuffer buffer, char end_char, boolean strip_characters) {
     149    while(buffer.charAt(buffer.length() - 1) != end_char || (buffer.length() > 3 && buffer.charAt(buffer.length() - 2) == StaticStrings.BACKSLASH_CHAR)) {
     150        try {
     151        // The first version is for the basic tokenizer which has no idea of an input stream, so runs out tokens at the same time as the internal tokenizer does
    117152        if(internal_tokenizer.hasMoreTokens()) {
    118             StringBuffer buffer = new StringBuffer(internal_tokenizer.nextToken());
    119             switch(buffer.charAt(0)) {
    120                 case StaticStrings.DOUBLEQUOTE_CHAR:
    121                     result = buildToken(buffer, StaticStrings.DOUBLEQUOTE_CHAR, true);
    122                     break;
    123                 case StaticStrings.SINGLEQUOTE_CHAR:
    124                     result = buildToken(buffer, StaticStrings.SINGLEQUOTE_CHAR, true);
    125                     break;
    126                 case StaticStrings.OPENBRACKET_CHAR:
    127                     result = buildToken(buffer, StaticStrings.CLOSEBRACKET_CHAR, false);
    128                     break;
    129                 default:
    130                     result = buffer.toString();
     153            buffer.append(StaticStrings.SPACE_CHAR);
     154            buffer.append(internal_tokenizer.nextToken());
     155        }
     156        // While the second version can draw more lines from the stream until eof occurs
     157        else if(in_stream != null) {
     158            String line_str = null;
     159            while(!internal_tokenizer.hasMoreTokens() && (line_str = in_stream.readLine()) != null) {
     160            ///atherer.println("+++++ CommandTokenizer +++++\nappend:\t" + line_str + "\n+++++++++++++++++++++++++++++");
     161            // It's at this stage that our token count becomes completely patu
     162            internal_tokenizer = new StringTokenizer(line_str);
     163            buffer.append(StaticStrings.NEW_LINE_CHAR); // A new line in the final token
     164            }
     165            line_str = null;
     166            if(internal_tokenizer.hasMoreTokens()) {
     167            // Don't add a space if we just added a newline
     168            if(buffer.charAt(buffer.length() - 1) != StaticStrings.NEW_LINE_CHAR) {
     169                buffer.append(StaticStrings.SPACE_CHAR);
    131170            }
    132             buffer = null;
    133         }
    134         // Because of our tricky counting system we never want to have negative tokens remaining. In fact, unless the internal string buffer is empty, we will return a count of 1 anyway
    135         if(count > 0) {
    136             count = count - 1;
    137         }
    138         ///atherer.println("----- CommandTokenizer -----\ntoken:\t" + result + "\n----------------------------");
    139         return result;
    140     }
    141 
    142     /** Parse in the next token, paying heed to enclosing characters demands, escaped characters, newlines and empty buffers and consequential unexpected end of tokens
    143      * @param buffer the StringBuffer in which the partial token is stored (at the first bit that caused this method to be called)
    144      * @param end_char the sentinel char we are watching for as it encloses a token
    145      * @param strip_characters a boolean denoting whether the enclosing characters should be stripped off
    146      * @return the token, either in its entirety less the enclosing characters if required or, if an unexpected end occurred, whatever we parsed without its starting enclosing character, again only if required. In fact if we weren't asked to strip characters then we add the enclosing character back in
    147      */
    148     private String buildToken(StringBuffer buffer, char end_char, boolean strip_characters) {
    149         while(buffer.charAt(buffer.length() - 1) != end_char || (buffer.length() > 3 && buffer.charAt(buffer.length() - 2) == StaticStrings.BACKSLASH_CHAR)) {
    150             try {
    151                 // The first version is for the basic tokenizer which has no idea of an input stream, so runs out tokens at the same time as the internal tokenizer does
    152                 if(internal_tokenizer.hasMoreTokens()) {
    153                     buffer.append(StaticStrings.SPACE_CHAR);
    154                     buffer.append(internal_tokenizer.nextToken());
    155                 }
    156                 // While the second version can draw more lines from the stream until eof occurs
    157                 else if(in_stream != null) {
    158                     String line_str = null;
    159                     while(!internal_tokenizer.hasMoreTokens() && (line_str = in_stream.readLine()) != null) {
    160                         ///atherer.println("+++++ CommandTokenizer +++++\nappend:\t" + line_str + "\n+++++++++++++++++++++++++++++");
    161                         // It's at this stage that our token count becomes completely patu
    162                         internal_tokenizer = new StringTokenizer(line_str);
    163                         buffer.append(StaticStrings.NEW_LINE_CHAR); // A new line in the final token
    164                     }
    165                     line_str = null;
    166                     if(internal_tokenizer.hasMoreTokens()) {
    167                         // Don't add a space if we just added a newline
    168                         if(buffer.charAt(buffer.length() - 1) != StaticStrings.NEW_LINE_CHAR) {
    169                             buffer.append(StaticStrings.SPACE_CHAR);
    170                         }
    171                         buffer.append(internal_tokenizer.nextToken());
    172                     }
    173                     // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    174                     else {
    175                         if(strip_characters) {
    176                             return buffer.substring(1);
    177                         }
    178                         else {
    179                             buffer.append(end_char);
    180                             return buffer.toString();
    181                         }
    182                     }
    183                 }
    184                 // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    185                 else {
    186                     if(strip_characters) {
    187                         return buffer.substring(1);
    188                     }
    189                     else {
    190                         buffer.append(end_char);
    191                         return buffer.toString();
    192                     }
    193                 }
     171            buffer.append(internal_tokenizer.nextToken());
     172            }
     173            // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
     174            else {
     175            if(strip_characters) {
     176                return buffer.substring(1);
    194177            }
    195             // Exception thrown when we attempted reading from the input stream, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    196             catch(Exception exception) {
    197                 Gatherer.printStackTrace(exception);
    198                 if(strip_characters) {
    199                     return buffer.substring(1);
    200                 }
    201                 else {
    202                     buffer.append(end_char);
    203                     return buffer.toString();
    204                 }
     178            else {
     179                buffer.append(end_char);
     180                return buffer.toString();
    205181            }
    206         }
    207         // Return the string sans enclosing characters
    208         if(buffer.length() > 0 && strip_characters) {
    209             return buffer.substring(1, buffer.length() - 1);
    210         }
     182            }
     183        }
     184        // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
    211185        else {
     186            if(strip_characters) {
     187            return buffer.substring(1);
     188            }
     189            else {
     190            buffer.append(end_char);
    212191            return buffer.toString();
    213         }
    214     }
     192            }
     193        }
     194        }
     195        // Exception thrown when we attempted reading from the input stream, so throw the dummy, or at least return whatever we managed to parse sans its opening character
     196        catch(Exception exception) {
     197        Gatherer.printStackTrace(exception);
     198        if(strip_characters) {
     199            return buffer.substring(1);
     200        }
     201        else {
     202            buffer.append(end_char);
     203            return buffer.toString();
     204        }
     205        }
     206    }
     207    // Return the string sans enclosing characters
     208    if(buffer.length() >= 2 && strip_characters) {
     209        return buffer.substring(1, buffer.length() - 1);
     210    }
     211    else {
     212        return buffer.toString();
     213    }
     214    }
    215215}
Note: See TracChangeset for help on using the changeset viewer.