/**
*#########################################################################
*
* A component of the Gatherer application, part of the Greenstone digital
* library suite from the New Zealand Digital Library Project at the
* University of Waikato, New Zealand.
*
*
*
* Author: John Thompson, Greenstone Digital Library, University of Waikato
*
*
*
* Copyright (C) 1999 New Zealand Digital Library Project
*
*
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
*
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*########################################################################
*/
package org.greenstone.gatherer.util;
/**
 * A tokenizer similar in spirit to {@code StringTokenizer} that splits a string into
 * tokens while keeping HTML-style tags intact.
 *
 * <p>Two kinds of token are produced:
 * <ul>
 * <li><b>Tag tokens</b> start at a {@code '<'} and run up to and including the next
 * {@code '>'} (or to the end of the source if the tag is never closed). Spaces inside
 * a tag are preserved.</li>
 * <li><b>Text tokens</b> are runs of characters delimited by a space character or by
 * the start of a tag.</li>
 * </ul>
 *
 * <p>Only the space character {@code ' '} is treated as whitespace; tabs and line breaks
 * are kept as part of text tokens. The tokenizer always parses one token ahead, so
 * {@link #hasMoreTokens()} can answer without consuming anything.
 */
public class HTMLStringTokenizer {
    /** The current position in the source string. */
    private int pos = 0;
    /** The look-ahead token that the next call to nextToken will return; empty when exhausted. */
    private String current = null;
    /** The token most recently returned by nextToken, or null if none has been returned yet. */
    private String previous = null;
    /** The string to be tokenized, including any HTML markup. Never null. */
    private final String source;

    /** Constructor.
     * @param source The source String to be tokenized. A null source is treated as an
     * empty string, producing a tokenizer with no tokens.
     */
    public HTMLStringTokenizer(String source) {
        this.source = (source == null) ? "" : source;
        // Parse the first token so hasMoreTokens() is meaningful straight away.
        parseToken();
    }

    /** Determines if there are still tokens remaining unparsed in the source.
     * @return A boolean which is true if there are more tokens.
     */
    public boolean hasMoreTokens() {
        return current != null && current.length() > 0;
    }

    /** Determines if the token most recently returned by nextToken is a tag, that is,
     * whether it starts with '&lt;' and ends with '&gt;'.
     * @return A boolean indicating if the token is a tag. Returns false if nextToken has
     * not been called yet.
     */
    public boolean isTag() {
        // previous is null until the first nextToken() call; there is no token to inspect.
        if (previous == null) {
            return false;
        }
        return previous.startsWith("<") && previous.endsWith(">");
    }

    /** Retrieves the next token and advances the tokenizer.
     * @return A String representing the token, or an empty string if no tokens remain.
     */
    public String nextToken() {
        previous = current;
        // Pre-parse the following token so it is ready for the next call.
        parseToken();
        return previous;
    }

    /** Parses the next token starting at the current position and stores it in current.
     * If the source is exhausted, current becomes the empty string.
     */
    private void parseToken() {
        dumpWhiteSpace();
        final int length = source.length();
        if (pos >= length) {
            current = "";
            return;
        }
        final int start = pos;
        if (source.charAt(start) == '<') {
            // Reading a tag. Watch only for '>', which is included in the token.
            final int close = source.indexOf('>', start + 1);
            // An unterminated tag swallows the remainder of the source.
            final int end = (close < 0) ? length : close + 1;
            current = source.substring(start, end);
            pos = end;
        }
        else {
            // Reading text. The first character is always kept; after that stop at ' ' or '<'.
            int end = start + 1;
            while (end < length) {
                final char c = source.charAt(end);
                if (c == ' ' || c == '<') {
                    break;
                }
                end++;
            }
            current = source.substring(start, end);
            // A delimiting space is consumed; a '<' is left in place to begin the next token.
            if (end < length && source.charAt(end) == ' ') {
                pos = end + 1;
            }
            else {
                pos = end;
            }
        }
    }

    /** Method to skip over any space characters at the current position in the source.
     */
    private void dumpWhiteSpace() {
        while (pos < source.length() && source.charAt(pos) == ' ') {
            pos++;
        }
    }
}