/** *######################################################################### * * A component of the Gatherer application, part of the Greenstone digital * library suite from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * *

* * Author: John Thompson, Greenstone Digital Library, University of Waikato * *

* * Copyright (C) 1999 New Zealand Digital Library Project * *

* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * *

* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * *

* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *######################################################################## */ package org.greenstone.gatherer.util; /** This class functions much like a StringTokenizer in that it tokenizes a long string into tokens, however this tokenizer cleverly notices HTML formatting tags. */ public class HTMLStringTokenizer { /** The current position in the source string. */ private int pos = 0; /** The current token, usually created by the last nextToken call. */ private String current = null; /** The previous token. */ private String previous = null; /** The string to be tokenized, including any HTML markup. */ private String source = null; /** Constructor. * @param source The source String to be tokenized. */ public HTMLStringTokenizer(String source) { this.source = source; // Parse the first token. parseToken(); } /** Determines if there are still tokens remaining unparsed in the source. * @return A boolean which is true if there are more tokens. */ public boolean hasMoreTokens() { if(current != null && current.length() > 0) { return true; } return false; } /** Determines if the tag currently being returned by sameToken is a tag. * @return A boolean indicating if the token is a tag. */ public boolean isTag() { if(previous.startsWith("<") && previous.endsWith(">")) { return true; } return false; } /** Retrieves the next token. * @return A String representing the token. */ public String nextToken() { previous = current; // Get the next token. parseToken(); // Return previous. return previous; } /** Parses the next token and stores it in current. */ private void parseToken() { boolean found = false; boolean tag = false; boolean text = false; // Reset current current = ""; // Parse away dumpWhiteSpace(); while(pos < source.length() && !found) { char c = (char)source.charAt(pos); if(!tag && !text) { if(c == '<') { tag = true; } else { text = true; } current = current + c; } // Reading a tag. Watch only for '>'. else if(tag) { if(c == '>') { found = true; } current = current + c; } // Reading text. Watch for ' ' and '<'. Rollback '<'. else if(text) { if(c == ' ') { found = true; } else if(c == '<') { found = true; pos--; } else { current = current + c; } } pos++; } } /** Method to ignore whitespace in the source. */ private void dumpWhiteSpace() { while(pos < source.length() && source.charAt(pos) == ' ') { pos++; } } }