source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneEditor.java@ 24725

Last change on this file since 24725 was 24725, checked in by davidb, 13 years ago

Restructuring of Lucene version 2.x and 3.x to make it easier to control which one is used

  • Property svn:keywords set to Author Date Id Revision
File size: 13.8 KB
Line 
/** @file GS2LuceneEditor.java
 *
 * Provides a wrapper to the index/document editing features of Lucene.
 *
 * This java application makes use of the existing Lucene class IndexModifier
 * to access and make changes to the information stored about documents in a
 * Lucene database. This is an essential component of the IncrementalBuilder
 * PERL module, and endeavours to make editing the text and metadata of
 * documents without having to rebuild the entire collection a reality (in
 * other words, true incremental/dynamic building).
 *
 * A component of the Greenstone digital library software from the New Zealand
 * Digital Library Project at the University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 675
 * Mass Ave, Cambridge, MA 02139, USA.
 *
 * Copyright (c) 2006 DL Consulting Ltd., New Zealand
 */
31
32package org.greenstone.LuceneWrapper;
33
34import java.io.IOException;
35import java.io.File;
36import java.util.Arrays;
37import java.util.Enumeration;
38import java.util.Vector;
39
40import org.apache.lucene.analysis.Analyzer;
41//import org.apache.lucene.analysis.standard.StandardAnalyzer;
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44
45import org.apache.lucene.store.SimpleFSDirectory;
46import org.apache.lucene.index.IndexWriter.MaxFieldLength;
47
48
49/** Contains methods for modifying a document that has previously been indexed
50 * into a Lucene database.
51 * @author John Thompson, DL Consulting Ltd.
52 */
53public class GS2LuceneEditor
54{
55 /** This is the main entry point to the editor and is responsible for
56 * parsing the arguments and creating an instance of the editor class.
57 *
58 * @param args The arguments passed into the application as a string
59 * array
60 * @return An integer describing the exit state of the application
61 * @throws Exception on any fatal error state
62 *
63 * @author John Thompson, DL Consulting Ltd.
64 */
65 static public void main (String args[])
66 throws Exception
67 {
68 // Parse arguments
69 int node_id = -1;
70 String field = "";
71 String index_path = "";
72 String new_value = "";
73 String old_value = "";
74
75 for (int i = 0; i < args.length; i += 2)
76 {
77 if (args[i].equals("--index"))
78 {
79 index_path = args[i + 1];
80 }
81 else if (args[i].equals("--nodeid"))
82 {
83 String temp = args[i + 1];
84 node_id = Integer.parseInt(temp);
85 temp = null; // Off to the gc with you!
86 }
87 else if (args[i].equals("--field"))
88 {
89 field = args[i + 1];
90 }
91 else if (args[i].equals("--oldvalue"))
92 {
93 old_value = args[i + 1];
94 }
95 else if (args[i].equals("--newvalue"))
96 {
97 new_value = args[i + 1];
98 }
99 else
100 {
101 System.out.println("Error! Unknown argument: " + args[i]);
102 GS2LuceneEditor.printUsage();
103 }
104 }
105
106 // Check arguments
107 if(index_path.equals(""))
108 {
109 System.out.println("Error! Missing index path");
110 GS2LuceneEditor.printUsage();
111 }
112 if(field.equals(""))
113 {
114 System.out.println("Error! Missing field");
115 GS2LuceneEditor.printUsage();
116 }
117 if(node_id == -1)
118 {
119 System.out.println("Error! Missing or invalid Node ID");
120 GS2LuceneEditor.printUsage();
121 }
122 if(old_value.equals("") && new_value.equals(""))
123 {
124 System.out.println("Error! No modification requested");
125 GS2LuceneEditor.printUsage();
126 }
127
128
129 // Instantiate editor, and perform the edit
130 GS2LuceneEditor editor = new GS2LuceneEditor(index_path);
131 editor.editIndex(node_id, field, old_value, new_value);
132 editor.destroy();
133 editor = null;
134 }
135 /** main() **/
136
137 /** **/
138 private boolean debug = true;
139
140 /** **/
141 private GS2IndexModifier index_modifier;
142
143 /** Constructor which takes the path to the Lucene index to be edited.
144 *
145 * @param index_path The full path to the index directory as a String
146 *
147 * @author John Thompson, DL Consulting Ltd.
148 */
149 public GS2LuceneEditor(String index_path)
150 throws IOException
151 {
152 Analyzer analyzer = new GS2Analyzer();
153 // create an index in /tmp/index, overwriting an existing one:
154 index_modifier = new GS2IndexModifier(index_path, analyzer);
155 }
156 /** GS2LuceneEditor **/
157
158 /**
159 */
160 public void debug(String message)
161 {
162 if(debug)
163 {
164 System.err.println(message);
165 }
166 }
167 /** debug() **/
168
169 /** Destructor which unallocates connection to Lucene.
170 */
171 public void destroy()
172 throws IOException
173 {
174 index_modifier.close();
175 index_modifier = null;
176 }
177
178 /** Make an edit to a Lucene index.
179 *
180 * @param oid The unique identifier of a Lucene document as an
181 * integer
182 * @param field The field to be modified as a String
183 * @param old_value The existing value to be changed or removed as a
184 * String
185 * @param old_value The replacement value to be changed or added as a
186 * String
187 *
188 * @author John Thompson, DL Consulting Ltd.
189 */
190 public void editIndex(int node_id, String field, String old_value, String new_value)
191 throws IOException
192 {
193 debug("GS2LuceneEditor.editIndex(" + node_id + ",'" + field + "','" + old_value + "','" + new_value + "')");
194 debug("- Initial number of documents in index: " + index_modifier.numDocs());
195 // Retrieve the document requested
196 int doc_num = index_modifier.getDocNumByNodeID(node_id);
197 if (doc_num != -1)
198 {
199 debug("* Found document #" + doc_num);
200 // Retrieve the actual document
201 Document document = index_modifier.document(doc_num);
202 // Remove the document from the index before modifying
203 index_modifier.deleteDocument(doc_num);
204 debug("* Removed document from index prior to editing");
205 // Retrieve the requested fields values, and turn it into a
206 // vector
207 debug("* Modifying the value of the field: " + field);
208 doEdit(document, field, old_value, new_value);
209
210 // We have to do a similar modification to the ZZ field
211 // too
212 debug("* Modifying the value of the field: ZZ");
213 doEdit(document, "ZZ", old_value, new_value);
214
215 // Re-index document
216 index_modifier.addDocument(document);
217 debug("* Reindexing modified document");
218 }
219 else
220 {
221 debug("- No such document!");
222 Document document = new Document();
223
224 // Retrieve the requested fields values, and turn it into a
225 // vector
226 debug("* Adding the value to the field: " + field);
227 doEdit(document, field, old_value, new_value);
228
229 // We have to do a similar modification to the ZZ field
230 // too
231 debug("* Adding the value to the field: ZZ");
232 doEdit(document, "ZZ", old_value, new_value);
233
234 // We also have to initialize the nodeId value
235 // changed to use docOID --kjdon
236 document.add(new Field("docOID", String.valueOf(node_id), Field.Store.YES, Field.Index.ANALYZED));
237
238 // Re-index document
239 index_modifier.addDocument(document);
240 debug("* Indexing new document");
241 }
242
243
244 }
245 /** editIndex() **/
246
247 /**
248 */
249 protected void doEdit(Document document, String field, String old_value, String new_value)
250 {
251 if (debug)
252 {
253 debug("GS2LuceneEditor.doEdit(Document, \"" + field + "\", \"" + old_value + "\", \"" + new_value + "\")");
254 }
255
256 String values_raw[] = document.getValues(field);
257 if(values_raw != null)
258 {
259 Vector values = new Vector(Arrays.asList(values_raw));
260 // Remove all the values for this field (no other safe way to
261 // do this
262 document.removeFields(field);
263 // DEBUG
264 if (debug)
265 {
266 debug("- Before modification:");
267 for(int i = 0; i < values.size(); i++)
268 {
269 debug("\t" + field + "[" + i + "]: " + values.get(i));
270 }
271 }
272 // If old_value is set, remove it from the values array
273 if(!old_value.equals(""))
274 {
275 // Remove all occurances of this metadata - this means
276 // it becomes a bit dangerous to have multiple pieces
277 // of metadata with exactly the same metadata - but
278 // this is only for indexing purposes so its not so
279 // bad.
280 while(values.contains(old_value))
281 {
282 values.remove(old_value);
283 }
284 }
285 // If new_value is set, add it to the values array
286 if(!new_value.equals("") && !values.contains(new_value))
287 {
288 values.add(new_value);
289 }
290 // DEBUG
291 if(debug)
292 {
293 debug("- After modification:");
294 for(int i = 0; i < values.size(); i++)
295 {
296 debug("\t" + field + "[" + i + "]: " + values.get(i));
297 }
298 }
299 // Add all the values for this field
300 for(int i = 0; i < values.size(); i++)
301 {
302 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.ANALYZED));
303 }
304 values.clear();
305 values = null;
306 }
307 // We may be adding a value to a field that current has no values
308 else if (!new_value.equals(""))
309 {
310 Vector values = new Vector();
311 values.add(new_value);
312 // DEBUG
313 if(debug)
314 {
315 debug("- Brand spanking new values:");
316 for(int i = 0; i < values.size(); i++)
317 {
318 debug("\t" + field + "[" + i + "]: " + values.get(i));
319 }
320 }
321 // Add all the values for this field
322 for(int i = 0; i < values.size(); i++)
323 {
324 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.ANALYZED));
325 }
326 values.clear();
327 values = null;
328 }
329 // Can't do a removal unless something exists
330 else
331 {
332 debug("- No such field for this document: " + field);
333 }
334 values_raw = null;
335 }
336 /** doEdit() **/
337
338 /**
339 */
340 static public void printUsage()
341 {
342 System.out.println("usage: GS2LuceneEditor --index <path> --nodeid <int> --field <string>");
343 System.out.println(" [--oldvalue <string>] [--newvalue <string>]");
344 System.out.println("");
345 System.out.println("where:");
346 System.out.println(" index - is the full path to the directory containing the directory");
347 System.out.println(" to edit, including the level (ie didx, sidx)");
348 System.out.println(" nodeid - the unique identifier of the document to change. This is the");
349 System.out.println(" same as the docnum in the GDBM");
350 System.out.println(" field - the two letter code of the metadata field to edit. These can");
351 System.out.println(" found in the build.cfg file. ZZ is not a valid field as it");
352 System.out.println(" is handled as a special case");
353 System.out.println(" oldvalue - the current value of the metadata field if it is to be");
354 System.out.println(" replaced or removed");
355 System.out.println(" newvalue - the new value for the metadata field if it is to be replaced");
356 System.out.println(" or added");
357 System.out.println("");
358 System.exit(0);
359 }
360 /** printUsage() **/
361
362}
363/** class GS2LuceneEditor **/
Note: See TracBrowser for help on using the repository browser.