source: gsdl/trunk/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneEditor.java@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1/** @file GS2LuceneEditor.java
2 *
3 * Provides a wrapper to the index/document editing features of Lucene.
4 *
5 * This java application makes use of the existing Lucene class IndexModifier
6 * to access and make changes to the information stored about documents in a
7 * Lucene database. This is an essential component of the IncrementalBuilder
8 * Perl module, and endeavours to make editing the text and metadata of
9 * documents without having to rebuild the entire collection a reality (in
10 * other words, true incremental/dynamic building).
11 *
12 * A component of the Greenstone digital library software from the New Zealand
13 * Digital Library Project at the University of Waikato, New Zealand.
14 *
15 * This program is free software; you can redistribute it and/or modify it
16 * under the terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at your option)
18 * any later version.
19 *
20 * This program is distributed in the hope that it will be useful, but WITHOUT
21 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
23 * more details.
24 *
25 * You should have received a copy of the GNU General Public License along
26 * with this program; if not, write to the Free Software Foundation, Inc., 675
27 * Mass Ave, Cambridge, MA 02139, USA.
28 *
29 * Copyright (c) 2006 DL Consulting Ltd., New Zealand
30 */
31
32package org.greenstone.LuceneWrapper;
33
34import java.io.IOException;
35import java.util.Arrays;
36import java.util.Enumeration;
37import java.util.Vector;
38
39import org.apache.lucene.analysis.Analyzer;
40import org.apache.lucene.analysis.standard.StandardAnalyzer;
41import org.apache.lucene.document.Document;
42import org.apache.lucene.document.Field;
43
44
45/** Contains methods for modifying a document that has previously been indexed
46 * into a Lucene database.
47 * @author John Thompson, DL Consulting Ltd.
48 */
49public class GS2LuceneEditor
50{
51 /** This is the main entry point to the editor and is responsible for
52 * parsing the arguments and creating an instance of the editor class.
53 *
54 * @param args The arguments passed into the application as a string
55 * array
56 * @return An integer describing the exit state of the application
57 * @throws Exception on any fatal error state
58 *
59 * @author John Thompson, DL Consulting Ltd.
60 */
61 static public void main (String args[])
62 throws Exception
63 {
64 // Parse arguments
65 int node_id = -1;
66 String field = "";
67 String index_path = "";
68 String new_value = "";
69 String old_value = "";
70
71 for (int i = 0; i < args.length; i += 2)
72 {
73 if (args[i].equals("--index"))
74 {
75 index_path = args[i + 1];
76 }
77 else if (args[i].equals("--nodeid"))
78 {
79 String temp = args[i + 1];
80 node_id = Integer.parseInt(temp);
81 temp = null; // Off to the gc with you!
82 }
83 else if (args[i].equals("--field"))
84 {
85 field = args[i + 1];
86 }
87 else if (args[i].equals("--oldvalue"))
88 {
89 old_value = args[i + 1];
90 }
91 else if (args[i].equals("--newvalue"))
92 {
93 new_value = args[i + 1];
94 }
95 else
96 {
97 System.out.println("Error! Unknown argument: " + args[i]);
98 GS2LuceneEditor.printUsage();
99 }
100 }
101
102 // Check arguments
103 if(index_path.equals(""))
104 {
105 System.out.println("Error! Missing index path");
106 GS2LuceneEditor.printUsage();
107 }
108 if(field.equals(""))
109 {
110 System.out.println("Error! Missing field");
111 GS2LuceneEditor.printUsage();
112 }
113 if(node_id == -1)
114 {
115 System.out.println("Error! Missing or invalid Node ID");
116 GS2LuceneEditor.printUsage();
117 }
118 if(old_value.equals("") && new_value.equals(""))
119 {
120 System.out.println("Error! No modification requested");
121 GS2LuceneEditor.printUsage();
122 }
123
124
125 // Instantiate editor, and perform the edit
126 GS2LuceneEditor editor = new GS2LuceneEditor(index_path);
127 editor.editIndex(node_id, field, old_value, new_value);
128 editor.destroy();
129 editor = null;
130 }
131 /** main() **/
132
133 /** **/
134 private boolean debug = true;
135
136 /** **/
137 private GS2IndexModifier index_modifier;
138
139 /** Constructor which takes the path to the Lucene index to be edited.
140 *
141 * @param index_path The full path to the index directory as a String
142 *
143 * @author John Thompson, DL Consulting Ltd.
144 */
145 public GS2LuceneEditor(String index_path)
146 throws IOException
147 {
148 Analyzer analyzer = new StandardAnalyzer();
149 // create an index in /tmp/index, overwriting an existing one:
150 index_modifier = new GS2IndexModifier(index_path, analyzer);
151 }
152 /** GS2LuceneEditor **/
153
154 /**
155 */
156 public void debug(String message)
157 {
158 if(debug)
159 {
160 System.err.println(message);
161 }
162 }
163 /** debug() **/
164
165 /** Destructor which unallocates connection to Lucene.
166 */
167 public void destroy()
168 throws IOException
169 {
170 index_modifier.close();
171 index_modifier = null;
172 }
173
174 /** Make an edit to a Lucene index.
175 *
176 * @param oid The unique identifier of a Lucene document as an
177 * integer
178 * @param field The field to be modified as a String
179 * @param old_value The existing value to be changed or removed as a
180 * String
181 * @param old_value The replacement value to be changed or added as a
182 * String
183 *
184 * @author John Thompson, DL Consulting Ltd.
185 */
186 public void editIndex(int node_id, String field, String old_value, String new_value)
187 throws IOException
188 {
189 debug("GS2LuceneEditor.editIndex(" + node_id + ",'" + field + "','" + old_value + "','" + new_value + "')");
190 debug("- Initial number of documents in index: " + index_modifier.docCount());
191 // Retrieve the document requested
192 int doc_num = index_modifier.getDocNumByNodeID(node_id);
193 if (doc_num != -1)
194 {
195 debug("* Found document #" + doc_num);
196 // Retrieve the actual document
197 Document document = index_modifier.document(doc_num);
198 // Remove the document from the index before modifying
199 index_modifier.deleteDocument(doc_num);
200 debug("* Removed document from index prior to editing");
201 // Retrieve the requested fields values, and turn it into a
202 // vector
203 debug("* Modifying the value of the field: " + field);
204 doEdit(document, field, old_value, new_value);
205
206 // We have to do a similar modification to the ZZ field
207 // too
208 debug("* Modifying the value of the field: ZZ");
209 doEdit(document, "ZZ", old_value, new_value);
210
211 // Re-index document
212 index_modifier.addDocument(document);
213 debug("* Reindexing modified document");
214 }
215 else
216 {
217 debug("- No such document!");
218 Document document = new Document();
219
220 // Retrieve the requested fields values, and turn it into a
221 // vector
222 debug("* Adding the value to the field: " + field);
223 doEdit(document, field, old_value, new_value);
224
225 // We have to do a similar modification to the ZZ field
226 // too
227 debug("* Adding the value to the field: ZZ");
228 doEdit(document, "ZZ", old_value, new_value);
229
230 // We also have to initialize the nodeId value
231 document.add(new Field("nodeID", String.valueOf(node_id), Field.Store.YES, Field.Index.TOKENIZED));
232
233 // Re-index document
234 index_modifier.addDocument(document);
235 debug("* Indexing new document");
236 }
237
238
239 }
240 /** editIndex() **/
241
242 /**
243 */
244 protected void doEdit(Document document, String field, String old_value, String new_value)
245 {
246 if (debug)
247 {
248 debug("GS2LuceneEditor.doEdit(Document, \"" + field + "\", \"" + old_value + "\", \"" + new_value + "\")");
249 }
250
251 String values_raw[] = document.getValues(field);
252 if(values_raw != null)
253 {
254 Vector values = new Vector(Arrays.asList(values_raw));
255 // Remove all the values for this field (no other safe way to
256 // do this
257 document.removeFields(field);
258 // DEBUG
259 if (debug)
260 {
261 debug("- Before modification:");
262 for(int i = 0; i < values.size(); i++)
263 {
264 debug("\t" + field + "[" + i + "]: " + values.get(i));
265 }
266 }
267 // If old_value is set, remove it from the values array
268 if(!old_value.equals(""))
269 {
270 // Remove all occurances of this metadata - this means
271 // it becomes a bit dangerous to have multiple pieces
272 // of metadata with exactly the same metadata - but
273 // this is only for indexing purposes so its not so
274 // bad.
275 while(values.contains(old_value))
276 {
277 values.remove(old_value);
278 }
279 }
280 // If new_value is set, add it to the values array
281 if(!new_value.equals("") && !values.contains(new_value))
282 {
283 values.add(new_value);
284 }
285 // DEBUG
286 if(debug)
287 {
288 debug("- After modification:");
289 for(int i = 0; i < values.size(); i++)
290 {
291 debug("\t" + field + "[" + i + "]: " + values.get(i));
292 }
293 }
294 // Add all the values for this field
295 for(int i = 0; i < values.size(); i++)
296 {
297 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.TOKENIZED));
298 }
299 values.clear();
300 values = null;
301 }
302 // We may be adding a value to a field that current has no values
303 else if (!new_value.equals(""))
304 {
305 Vector values = new Vector();
306 values.add(new_value);
307 // DEBUG
308 if(debug)
309 {
310 debug("- Brand spanking new values:");
311 for(int i = 0; i < values.size(); i++)
312 {
313 debug("\t" + field + "[" + i + "]: " + values.get(i));
314 }
315 }
316 // Add all the values for this field
317 for(int i = 0; i < values.size(); i++)
318 {
319 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.TOKENIZED));
320 }
321 values.clear();
322 values = null;
323 }
324 // Can't do a removal unless something exists
325 else
326 {
327 debug("- No such field for this document: " + field);
328 }
329 values_raw = null;
330 }
331 /** doEdit() **/
332
333 /**
334 */
335 static public void printUsage()
336 {
337 System.out.println("usage: GS2LuceneEditor --index <path> --nodeid <int> --field <string>");
338 System.out.println(" [--oldvalue <string>] [--newvalue <string>]");
339 System.out.println("");
340 System.out.println("where:");
341 System.out.println(" index - is the full path to the directory containing the directory");
342 System.out.println(" to edit, including the level (ie didx, sidx)");
343 System.out.println(" nodeid - the unique identifier of the document to change. This is the");
344 System.out.println(" same as the docnum in the GDBM");
345 System.out.println(" field - the two letter code of the metadata field to edit. These can");
346 System.out.println(" found in the build.cfg file. ZZ is not a valid field as it");
347 System.out.println(" is handled as a special case");
348 System.out.println(" oldvalue - the current value of the metadata field if it is to be");
349 System.out.println(" replaced or removed");
350 System.out.println(" newvalue - the new value for the metadata field if it is to be replaced");
351 System.out.println(" or added");
352 System.out.println("");
353 System.exit(0);
354 }
355 /** printUsage() **/
356
357}
358/** class GS2LuceneEditor **/
Note: See TracBrowser for help on using the repository browser.