source: trunk/gli/src/org/greenstone/gatherer/cdm/CommandTokenizer.java@ 6540

Last change on this file since 6540 was 6539, checked in by jmt12, 20 years ago

Here's a bunch of other changed files. If it wasn't a Friday afternoon I might be bothered finding out what I actually changed in them. Such changes include: a new option or three on preferences, a bug fix for the GDM classes, several changes to CDM to allow for G2.39 configuration files, a fix to Codec to allow for quotes in format strings and more work on CommandTokenizer to allow for stupid, stupid, stupid collectionextra's starting with speech marks then a new line. Plus other stuff. And things. Peace Out.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * <BR><BR>
9 *
10 * Author: John Thompson, Greenstone Digital Library, University of Waikato
11 *
12 * <BR><BR>
13 *
14 * Copyright (C) 1999 New Zealand Digital Library Project
15 *
16 * <BR><BR>
17 *
18 * This program is free software; you can redistribute it and/or modify
19 * it under the terms of the GNU General Public License as published by
20 * the Free Software Foundation; either version 2 of the License, or
21 * (at your option) any later version.
22 *
23 * <BR><BR>
24 *
25 * This program is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU General Public License for more details.
29 *
30 * <BR><BR>
31 *
32 * You should have received a copy of the GNU General Public License
33 * along with this program; if not, write to the Free Software
34 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 *########################################################################
36 */
37package org.greenstone.gatherer.cdm;
38
39/**************************************************************************************
40 * Title: Gatherer
41 * Description: The Gatherer: a tool for gathering and enriching a digital collection.
42 * Company: The University of Waikato
43 * Written: 07/05/02
44 * Revised: 03/10/02 - Commented
45 **************************************************************************************/
46import java.io.BufferedReader;
47import java.util.StringTokenizer;
48import org.greenstone.gatherer.Gatherer;
49import org.greenstone.gatherer.util.StaticStrings;
50
51/** This class provides an extension to the standard StringTokenizer in that it recognizes quotes (or some form of bracketting) enclose a single token so in something like: <br>format Search '&lt;table&gt;&lt;img src=... &lt;/table&gt;'<br> the formatting string is parsed as a single token. Unfortunately this makes countTokens() unreliable for exact measurement of tokens remaining, and only useful for determining if there are tokens left to be processed (includes any that have already been read into command buffer).
52 * @author John Thompson, Greenstone Digital Library, University of Waikato
53 * @version 2.3
54 */
55public class CommandTokenizer {
56
57 static final public int BRACKET_ENCLOSED = 0;
58 static final public int DOUBLE_QUOTE_ENCLOSED = 1;
59 static final public int NORMAL = 2;
60 static final public int QUOTE_ENCLOSED = 3;
61
62 private BufferedReader in_stream;
63 private int count = -1;
64 private StringTokenizer internal_tokenizer;
65
66 /** Basic Constructor. Used to parse tokens from a string keeping tokens surrounded by speechmarks or square brackets intact. Thus something like:<br>
67 * collectionmeta collectionextra [l = en] "Hello World"<br>
68 * is tokenized thus<br>
69 * {'collectionmeta', 'collectionextra', 'l = en', 'Hello World'}
70 * @param command the command String you wish to tokenize
71 */
72 public CommandTokenizer(String command) {
73 this.internal_tokenizer = new StringTokenizer(command);
74 this.in_stream = null;
75 }
76
77 /** Advanced Constructor. As above but with one major difference. Since it is provided an input stream (presumably where the command string originated from), it is able to parse a quote enclosed command token that stretches over several lines. Each newline is preserved in the resulting token. There is an extra bitchslap here as comething like a collection extra might have html code in them that contain escaped speechmarks, so extra care must be taken not to break at them. Thus something like:<br>
78 * collectionmeta collectionextra [l = en] "<br>
79 * an example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"<br>
80 * >this</a> which you could easily see might be a problem if I parse this niavely."<br>
81 * is tokenized thus<br>
82 * {'collectionmeta', 'collectionextra', 'l = en', '\nan example of the crazy as description we sometimes get which includes of all things something like <a href=\"this.html\"\n>this</a> which you could easily see might be a problem if I parse this niavely.'}
83 * @param command the command String you wish to tokenize
84 * @param in_stream a BufferedReader from which the tokenizer can draw further lines as necessary
85 */
86 public CommandTokenizer(String command, BufferedReader in_stream) {
87 ///atherer.println("***** CommandTokenizer *****\nparse:\t" + command + "\n****************************");
88 this.internal_tokenizer = new StringTokenizer(command);
89 this.in_stream = in_stream;
90 }
91
92 /** Returns the minumum number of remaining tokens before the tokenizer runs out of string. There may be more tokens than this count, but never less. The discrepancy is due to internal functionality and the fact we can't read ahead in the string or associated stream without risking the need for unpredictable push-back
93 * @return the minumum number of tokens available as an int
94 */
95 public int countTokens() {
96 if(count == 0 && internal_tokenizer.countTokens() > 1) {
97 return 1;
98 }
99 if(count == -1) {
100 count = internal_tokenizer.countTokens();
101 }
102 return count;
103 }
104
105 /** Determine if there are still tokens available.
106 * @return true if there are more tokens, false otherwise
107 */
108 public boolean hasMoreTokens() {
109 return internal_tokenizer.hasMoreTokens();
110 }
111
112 /** Method to retrieve the next token from the command, taking care to group tokens enclosed in speech marks.
113 * @return a String containing the next token from the command
114 */
115 public String nextToken() {
116 String result = null;
117 if(internal_tokenizer.hasMoreTokens()) {
118 StringBuffer buffer = new StringBuffer(internal_tokenizer.nextToken());
119 switch(buffer.charAt(0)) {
120 case StaticStrings.DOUBLEQUOTE_CHAR:
121 ///ystem.err.println("Building token wrapped by double quotes.");
122 result = buildToken(buffer, StaticStrings.DOUBLEQUOTE_CHAR, true);
123 break;
124 case StaticStrings.SINGLEQUOTE_CHAR:
125 ///ystem.err.println("Building token wrapped by single quotes.");
126 result = buildToken(buffer, StaticStrings.SINGLEQUOTE_CHAR, true);
127 break;
128 case StaticStrings.OPENBRACKET_CHAR:
129 ///ystem.err.println("Building token wrapped by brackets.");
130 result = buildToken(buffer, StaticStrings.CLOSEBRACKET_CHAR, false);
131 break;
132 default:
133 ///ystem.err.println("Returning plain string.");
134 result = buffer.toString();
135 }
136 buffer = null;
137 }
138 // Because of our tricky counting system we never want to have negative tokens remaining. In fact, unless the internal string buffer is empty, we will return a count of 1 anyway
139 if(count > 0) {
140 count = count - 1;
141 }
142 ///ystem.err.println("----- CommandTokenizer -----\ntoken:\t" + result + "\n----------------------------");
143 return result;
144 }
145
146 /** Parse in the next token. paying heed to enclosing characters demands, escaped characters, newlines and empty buffers and consequential unexpected end of tokens
147 * @param buffer the StringBuffer in which the partial token is stored (at the first bit that caused this method to be called)
148 * @param end_char the sentinel char we are watching for as it encloses a token
149 * @param strip_characters a boolean denoting whether the enclosing characters should be stripped off
150 * @return the token, either in its entirity less the enclosing characters if required or, if an unexpected end occured, whatever we parsed without its starting enclosing character, again only if required. In fact if we weren't asked to strip characters then we add the enclosing character back in
151 */
152 private String buildToken(StringBuffer buffer, char end_char, boolean strip_characters) {
153 while(buffer.length() == 1 || buffer.charAt(buffer.length() - 1) != end_char || (buffer.length() > 3 && buffer.charAt(buffer.length() - 2) == StaticStrings.BACKSLASH_CHAR)) {
154 try {
155 // The first version is for the basic tokenizer which has no idea of an input stream, so runs out tokens at the same time as the internal tokenizer does
156 if(internal_tokenizer.hasMoreTokens()) {
157 buffer.append(StaticStrings.SPACE_CHAR);
158 buffer.append(internal_tokenizer.nextToken());
159 }
160 // While the second version can draw more lines from the stream until eof occurs
161 else if(in_stream != null) {
162 String line_str = null;
163 while(!internal_tokenizer.hasMoreTokens() && (line_str = in_stream.readLine()) != null) {
164 ///atherer.println("+++++ CommandTokenizer +++++\nappend:\t" + line_str + "\n+++++++++++++++++++++++++++++");
165 // Its at this stage the our token count becomes completely putu
166 internal_tokenizer = new StringTokenizer(line_str);
167 buffer.append(StaticStrings.NEW_LINE_CHAR); // A new line in the final token
168 }
169 line_str = null;
170 if(internal_tokenizer.hasMoreTokens()) {
171 // Don't add a space if we just added a newline
172 if(buffer.charAt(buffer.length() - 1) != StaticStrings.NEW_LINE_CHAR) {
173 buffer.append(StaticStrings.SPACE_CHAR);
174 }
175 buffer.append(internal_tokenizer.nextToken());
176 }
177 // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
178 else {
179 if(strip_characters) {
180 return buffer.substring(1);
181 }
182 else {
183 buffer.append(end_char);
184 return buffer.toString();
185 }
186 }
187 }
188 // We've prematurely run out of content, so throw the dummy, or at least return whatever we managed to parse sans its opening character
189 else {
190 if(strip_characters) {
191 return buffer.substring(1);
192 }
193 else {
194 buffer.append(end_char);
195 return buffer.toString();
196 }
197 }
198 }
199 // Exception thrown when we attempted reading from the input stream, so throw the dummy, or at least return whatever we managed to parse sans its opening character
200 catch(Exception exception) {
201 Gatherer.printStackTrace(exception);
202 if(strip_characters) {
203 return buffer.substring(1);
204 }
205 else {
206 buffer.append(end_char);
207 return buffer.toString();
208 }
209 }
210 }
211 // Return the string sans enclosing characters
212 if(buffer.length() >= 2 && strip_characters) {
213 return buffer.substring(1, buffer.length() - 1);
214 }
215 else {
216 return buffer.toString();
217 }
218 }
219}
Note: See TracBrowser for help on using the repository browser.