source: main/trunk/gli/src/org/greenstone/gatherer/util/HTMLStringTokenizer.java@ 31711

Last change on this file since 31711 was 9313, checked in by mdewsnip, 19 years ago

Removed main function, and changed to have Unix line endings.

  • Property svn:keywords set to Author Date Id Revision
File size: 3.9 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.util;
38
39
/**
 * This class functions much like a <strong>StringTokenizer</strong> in that it splits a long
 * string into tokens, however this tokenizer also notices HTML formatting tags.
 *
 * <p>A token is either a tag (everything from a <code>'&lt;'</code> up to and including the next
 * <code>'&gt;'</code>, spaces included) or a run of text that ends at the next space character or
 * at the start of the next tag. Only the space character <code>' '</code> separates text tokens;
 * tabs and line breaks are kept as part of the text token they occur in.
 *
 * <p>The tokenizer always parses one token ahead: <code>current</code> holds the token the next
 * call to {@link #nextToken()} will return, while <code>previous</code> holds the token most
 * recently returned.
 */
public class HTMLStringTokenizer {
    /** The current position in the source string. */
    private int pos = 0;
    /** The look-ahead token, i.e. the one the next nextToken call will return. Empty when exhausted. */
    private String current = null;
    /** The token most recently returned by nextToken, or null if nextToken has not been called yet. */
    private String previous = null;
    /** The string to be tokenized, including any HTML markup. Never null. */
    private final String source;

    /**
     * Constructor.
     *
     * @param source The source <strong>String</strong> to be tokenized. A <i>null</i> source is
     *     treated as an empty string, so the tokenizer simply reports no tokens.
     */
    public HTMLStringTokenizer(String source) {
        this.source = (source == null) ? "" : source;
        // Parse the first token so hasMoreTokens() is meaningful straight away.
        parseToken();
    }

    /**
     * Determines if there are still tokens remaining unparsed in the source.
     *
     * @return A <strong>boolean</strong> which is <i>true</i> if there are more tokens.
     */
    public boolean hasMoreTokens() {
        return current != null && current.length() > 0;
    }

    /**
     * Determines if the token most recently returned by nextToken is a tag, that is, it starts
     * with <code>'&lt;'</code> and ends with <code>'&gt;'</code>.
     *
     * @return A <strong>boolean</strong> indicating if the token is a tag. <i>false</i> if
     *     nextToken has not been called yet.
     */
    public boolean isTag() {
        // previous stays null until the first nextToken() call; guard against dereferencing it.
        return previous != null && previous.startsWith("<") && previous.endsWith(">");
    }

    /**
     * Retrieves the next token and parses ahead to the one after it.
     *
     * @return A <strong>String</strong> representing the token, or an empty string if no tokens
     *     remain.
     */
    public String nextToken() {
        previous = current;
        // Look ahead to the following token.
        parseToken();
        return previous;
    }

    /**
     * Parses the next token and stores it in current. If the source is exhausted current becomes
     * the empty string.
     */
    private void parseToken() {
        dumpWhiteSpace();
        final int length = source.length();
        if (pos >= length) {
            current = "";
            return;
        }
        final int start = pos;
        int end;
        if (source.charAt(start) == '<') {
            // Reading a tag: it runs up to and including the next '>'. An unterminated tag
            // swallows the remainder of the source.
            int close = source.indexOf('>', start + 1);
            end = (close < 0) ? length : close + 1;
            pos = end;
        } else {
            // Reading text: it runs until the next ' ' or '<'.
            end = start + 1;
            while (end < length && source.charAt(end) != ' ' && source.charAt(end) != '<') {
                end++;
            }
            // A terminating space is consumed; a terminating '<' is left for the next token.
            pos = (end < length && source.charAt(end) == ' ') ? end + 1 : end;
        }
        current = source.substring(start, end);
    }

    /**
     * Advances the position past any space characters. Only <code>' '</code> is skipped; other
     * whitespace characters are left in place.
     */
    private void dumpWhiteSpace() {
        while (pos < source.length() && source.charAt(pos) == ' ') {
            pos++;
        }
    }
}
Note: See TracBrowser for help on using the repository browser.