source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneEditor.java@ 12264

Last change on this file since 12264 was 12264, checked in by mdewsnip, 18 years ago

New classes to support incremental building with Lucene, many thanks to John Thompson and DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.7 KB
Line 
1/** @file GS2LuceneEditor.java
2 *
3 * Provides a wrapper to the index/document editing features of Lucene.
4 *
5 * This java application makes use of the existing Lucene class IndexModifier
6 * to access and make changes to the information stored about documents in a
7 * Lucene database. This is an essential component of the IncrementalBuilder
8 * PERL module, and endeavours to make editing the text and metadata of
9 * documents without having to rebuild the entire collection a reality (in
10 * other words, true incremental/dynamic building).
11 *
12 * A component of the Greenstone digital library software from the New Zealand
13 * Digital Library Project at the University of Waikato, New Zealand.
14 *
15 * This program is free software; you can redistribute it and/or modify it
16 * under the terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at your option)
18 * any later version.
19 *
20 * This program is distributed in the hope that it will be useful, but WITHOUT
21 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
23 * more details.
24 *
25 * You should have received a copy of the GNU General Public License along
26 * with this program; if not, write to the Free Software Foundation, Inc., 675
27 * Mass Ave, Cambridge, MA 02139, USA.
28 *
29 * Copyright (c) 2006 DL Consulting Ltd., New Zealand
30 */
31
32package org.nzdl.gsdl.LuceneWrap;
33
34import java.io.IOException;
35import java.util.Arrays;
36import java.util.Enumeration;
37import java.util.Vector;
38
39import org.apache.lucene.analysis.Analyzer;
40import org.apache.lucene.analysis.standard.StandardAnalyzer;
41import org.apache.lucene.document.Document;
42import org.apache.lucene.document.Field;
43
44import org.nzdl.gsdl.LuceneWrap.GS2IndexModifier;
45
46/** Contains methods for modifying a document that has previously been indexed
47 * into a Lucene database.
48 * @author John Thompson, DL Consulting Ltd.
49 */
50public class GS2LuceneEditor
51{
52 /** This is the main entry point to the editor and is responsible for
53 * parsing the arguments and creating an instance of the editor class.
54 *
55 * @param args The arguments passed into the application as a string
56 * array
57 * @return An integer describing the exit state of the application
58 * @throws Exception on any fatal error state
59 *
60 * @author John Thompson, DL Consulting Ltd.
61 */
62 static public void main (String args[])
63 throws Exception
64 {
65 // Parse arguments
66 int node_id = -1;
67 String field = "";
68 String index_path = "";
69 String new_value = "";
70 String old_value = "";
71
72 for (int i = 0; i < args.length; i += 2)
73 {
74 if (args[i].equals("--index"))
75 {
76 index_path = args[i + 1];
77 }
78 else if (args[i].equals("--nodeid"))
79 {
80 String temp = args[i + 1];
81 node_id = Integer.parseInt(temp);
82 temp = null; // Off to the gc with you!
83 }
84 else if (args[i].equals("--field"))
85 {
86 field = args[i + 1];
87 }
88 else if (args[i].equals("--oldvalue"))
89 {
90 old_value = args[i + 1];
91 }
92 else if (args[i].equals("--newvalue"))
93 {
94 new_value = args[i + 1];
95 }
96 else
97 {
98 System.out.println("Error! Unknown argument: " + args[i]);
99 GS2LuceneEditor.printUsage();
100 }
101 }
102
103 // Check arguments
104 if(index_path.equals(""))
105 {
106 System.out.println("Error! Missing index path");
107 GS2LuceneEditor.printUsage();
108 }
109 if(field.equals(""))
110 {
111 System.out.println("Error! Missing field");
112 GS2LuceneEditor.printUsage();
113 }
114 if(node_id == -1)
115 {
116 System.out.println("Error! Missing or invalid Node ID");
117 GS2LuceneEditor.printUsage();
118 }
119 if(old_value.equals("") && new_value.equals(""))
120 {
121 System.out.println("Error! No modification requested");
122 GS2LuceneEditor.printUsage();
123 }
124
125
126 // Instantiate editor, and perform the edit
127 GS2LuceneEditor editor = new GS2LuceneEditor(index_path);
128 editor.editIndex(node_id, field, old_value, new_value);
129 editor.destroy();
130 editor = null;
131 }
132 /** main() **/
133
134 /** **/
135 private boolean debug = true;
136
137 /** **/
138 private GS2IndexModifier index_modifier;
139
140 /** Constructor which takes the path to the Lucene index to be edited.
141 *
142 * @param index_path The full path to the index directory as a String
143 *
144 * @author John Thompson, DL Consulting Ltd.
145 */
146 public GS2LuceneEditor(String index_path)
147 throws IOException
148 {
149 Analyzer analyzer = new StandardAnalyzer();
150 // create an index in /tmp/index, overwriting an existing one:
151 index_modifier = new GS2IndexModifier(index_path, analyzer);
152 }
153 /** GS2LuceneEditor **/
154
155 /**
156 */
157 public void debug(String message)
158 {
159 if(debug)
160 {
161 System.err.println(message);
162 }
163 }
164 /** debug() **/
165
166 /** Destructor which unallocates connection to Lucene.
167 */
168 public void destroy()
169 throws IOException
170 {
171 index_modifier.close();
172 index_modifier = null;
173 }
174
175 /** Make an edit to a Lucene index.
176 *
177 * @param oid The unique identifier of a Lucene document as an
178 * integer
179 * @param field The field to be modified as a String
180 * @param old_value The existing value to be changed or removed as a
181 * String
182 * @param old_value The replacement value to be changed or added as a
183 * String
184 *
185 * @author John Thompson, DL Consulting Ltd.
186 */
187 public void editIndex(int node_id, String field, String old_value, String new_value)
188 throws IOException
189 {
190 debug("GS2LuceneEditor.editIndex(" + node_id + ",'" + field + "','" + old_value + "','" + new_value + "')");
191 debug("- Initial number of documents in index: " + index_modifier.docCount());
192 // Retrieve the document requested
193 int doc_num = index_modifier.getDocNumByNodeID(node_id);
194 if (doc_num != -1)
195 {
196 debug("* Found document #" + doc_num);
197 // Retrieve the actual document
198 Document document = index_modifier.document(doc_num);
199 // Remove the document from the index before modifying
200 index_modifier.deleteDocument(doc_num);
201 debug("* Removed document from index prior to editing");
202 // Retrieve the requested fields values, and turn it into a
203 // vector
204 debug("* Modifying the value of the field: " + field);
205 doEdit(document, field, old_value, new_value);
206
207 // We have to do a similar modification to the ZZ field
208 // too
209 debug("* Modifying the value of the field: ZZ");
210 doEdit(document, "ZZ", old_value, new_value);
211
212 // Re-index document
213 index_modifier.addDocument(document);
214 debug("* Reindexing modified document");
215 }
216 else
217 {
218 debug("- No such document!");
219 Document document = new Document();
220
221 // Retrieve the requested fields values, and turn it into a
222 // vector
223 debug("* Adding the value to the field: " + field);
224 doEdit(document, field, old_value, new_value);
225
226 // We have to do a similar modification to the ZZ field
227 // too
228 debug("* Adding the value to the field: ZZ");
229 doEdit(document, "ZZ", old_value, new_value);
230
231 // We also have to initialize the nodeId value
232 document.add(new Field("nodeID", String.valueOf(node_id), Field.Store.YES, Field.Index.TOKENIZED));
233
234 // Re-index document
235 index_modifier.addDocument(document);
236 debug("* Indexing new document");
237 }
238
239
240 }
241 /** editIndex() **/
242
243 /**
244 */
245 protected void doEdit(Document document, String field, String old_value, String new_value)
246 {
247 if (debug)
248 {
249 debug("GS2LuceneEditor.doEdit(Document, \"" + field + "\", \"" + old_value + "\", \"" + new_value + "\")");
250 }
251
252 String values_raw[] = document.getValues(field);
253 if(values_raw != null)
254 {
255 Vector values = new Vector(Arrays.asList(values_raw));
256 // Remove all the values for this field (no other safe way to
257 // do this
258 document.removeFields(field);
259 // DEBUG
260 if (debug)
261 {
262 debug("- Before modification:");
263 for(int i = 0; i < values.size(); i++)
264 {
265 debug("\t" + field + "[" + i + "]: " + values.get(i));
266 }
267 }
268 // If old_value is set, remove it from the values array
269 if(!old_value.equals(""))
270 {
271 // Remove all occurances of this metadata - this means
272 // it becomes a bit dangerous to have multiple pieces
273 // of metadata with exactly the same metadata - but
274 // this is only for indexing purposes so its not so
275 // bad.
276 while(values.contains(old_value))
277 {
278 values.remove(old_value);
279 }
280 }
281 // If new_value is set, add it to the values array
282 if(!new_value.equals("") && !values.contains(new_value))
283 {
284 values.add(new_value);
285 }
286 // DEBUG
287 if(debug)
288 {
289 debug("- After modification:");
290 for(int i = 0; i < values.size(); i++)
291 {
292 debug("\t" + field + "[" + i + "]: " + values.get(i));
293 }
294 }
295 // Add all the values for this field
296 for(int i = 0; i < values.size(); i++)
297 {
298 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.TOKENIZED));
299 }
300 values.clear();
301 values = null;
302 }
303 // We may be adding a value to a field that current has no values
304 else if (!new_value.equals(""))
305 {
306 Vector values = new Vector();
307 values.add(new_value);
308 // DEBUG
309 if(debug)
310 {
311 debug("- Brand spanking new values:");
312 for(int i = 0; i < values.size(); i++)
313 {
314 debug("\t" + field + "[" + i + "]: " + values.get(i));
315 }
316 }
317 // Add all the values for this field
318 for(int i = 0; i < values.size(); i++)
319 {
320 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.TOKENIZED));
321 }
322 values.clear();
323 values = null;
324 }
325 // Can't do a removal unless something exists
326 else
327 {
328 debug("- No such field for this document: " + field);
329 }
330 values_raw = null;
331 }
332 /** doEdit() **/
333
334 /**
335 */
336 static public void printUsage()
337 {
338 System.out.println("usage: GS2LuceneEditor --index <path> --nodeid <int> --field <string>");
339 System.out.println(" [--oldvalue <string>] [--newvalue <string>]");
340 System.out.println("");
341 System.out.println("where:");
342 System.out.println(" index - is the full path to the directory containing the directory");
343 System.out.println(" to edit, including the level (ie didx, sidx)");
344 System.out.println(" nodeid - the unique identifier of the document to change. This is the");
345 System.out.println(" same as the docnum in the GDBM");
346 System.out.println(" field - the two letter code of the metadata field to edit. These can");
347 System.out.println(" found in the build.cfg file. ZZ is not a valid field as it");
348 System.out.println(" is handled as a special case");
349 System.out.println(" oldvalue - the current value of the metadata field if it is to be");
350 System.out.println(" replaced or removed");
351 System.out.println(" newvalue - the new value for the metadata field if it is to be replaced");
352 System.out.println(" or added");
353 System.out.println("");
354 System.exit(0);
355 }
356 /** printUsage() **/
357
358}
359/** class GS2LuceneEditor **/
Note: See TracBrowser for help on using the repository browser.