source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/lucene-version-3.3/GS2LuceneEditor.java@ 24716

Last change on this file since 24716 was 24716, checked in by davidb, 13 years ago

A version of the LuceneWrapper code that works with Lucene version 3.3

  • Property svn:executable set to *
File size: 13.8 KB
Line 
1/** @file GS2LuceneEditor.java
2 *
3 * Provides a wrapper to the index/document editing features of Lucene.
4 *
5 * This java application makes use of the existing Lucene class IndexModifier
6 * to access and make changes to the information stored about documents in a
7 * Lucene database. This is an essential component of the IncrementalBuilder
8 * PERL module, and endeavours to make editing the text and metadata of
9 * documents without having to rebuild the entire collection a reality (in
10 * other words, true incremental/dynamic building).
11 *
12 * A component of the Greenstone digital library software from the New Zealand
13 * Digital Library Project at the University of Waikato, New Zealand.
14 *
15 * This program is free software; you can redistribute it and/or modify it
16 * under the terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at your option)
18 * any later version.
19 *
20 * This program is distributed in the hope that it will be useful, but WITHOUT
21 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
23 * more details.
24 *
25 * You should have received a copy of the GNU General Public License along
26 * with this program; if not, write to the Free Software Foundation, Inc., 675
27 * Mass Ave, Cambridge, MA 02139, USA.
28 *
29 * Copyright (c) 2006 DL Consulting Ltd., New Zealand
30 */
31
32package org.greenstone.LuceneWrapper;
33
34import java.io.IOException;
35import java.io.File;
36import java.util.Arrays;
37import java.util.Enumeration;
38import java.util.Vector;
39
40import org.apache.lucene.analysis.Analyzer;
41//import org.apache.lucene.analysis.standard.StandardAnalyzer;
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44
45import org.apache.lucene.store.SimpleFSDirectory;
46import org.apache.lucene.index.IndexWriter.MaxFieldLength;
47
48
49/** Contains methods for modifying a document that has previously been indexed
50 * into a Lucene database.
51 * @author John Thompson, DL Consulting Ltd.
52 */
53public class GS2LuceneEditor
54{
55 /** This is the main entry point to the editor and is responsible for
56 * parsing the arguments and creating an instance of the editor class.
57 *
58 * @param args The arguments passed into the application as a string
59 * array
60 * @return An integer describing the exit state of the application
61 * @throws Exception on any fatal error state
62 *
63 * @author John Thompson, DL Consulting Ltd.
64 */
65 static public void main (String args[])
66 throws Exception
67 {
68 // Parse arguments
69 int node_id = -1;
70 String field = "";
71 String index_path = "";
72 String new_value = "";
73 String old_value = "";
74
75 for (int i = 0; i < args.length; i += 2)
76 {
77 if (args[i].equals("--index"))
78 {
79 index_path = args[i + 1];
80 }
81 else if (args[i].equals("--nodeid"))
82 {
83 String temp = args[i + 1];
84 node_id = Integer.parseInt(temp);
85 temp = null; // Off to the gc with you!
86 }
87 else if (args[i].equals("--field"))
88 {
89 field = args[i + 1];
90 }
91 else if (args[i].equals("--oldvalue"))
92 {
93 old_value = args[i + 1];
94 }
95 else if (args[i].equals("--newvalue"))
96 {
97 new_value = args[i + 1];
98 }
99 else
100 {
101 System.out.println("Error! Unknown argument: " + args[i]);
102 GS2LuceneEditor.printUsage();
103 }
104 }
105
106 // Check arguments
107 if(index_path.equals(""))
108 {
109 System.out.println("Error! Missing index path");
110 GS2LuceneEditor.printUsage();
111 }
112 if(field.equals(""))
113 {
114 System.out.println("Error! Missing field");
115 GS2LuceneEditor.printUsage();
116 }
117 if(node_id == -1)
118 {
119 System.out.println("Error! Missing or invalid Node ID");
120 GS2LuceneEditor.printUsage();
121 }
122 if(old_value.equals("") && new_value.equals(""))
123 {
124 System.out.println("Error! No modification requested");
125 GS2LuceneEditor.printUsage();
126 }
127
128
129 // Instantiate editor, and perform the edit
130 GS2LuceneEditor editor = new GS2LuceneEditor(index_path);
131 editor.editIndex(node_id, field, old_value, new_value);
132 editor.destroy();
133 editor = null;
134 }
135 /** main() **/
136
137 /** **/
138 private boolean debug = true;
139
140 /** **/
141 private GS2IndexModifier index_modifier;
142
143 /** Constructor which takes the path to the Lucene index to be edited.
144 *
145 * @param index_path The full path to the index directory as a String
146 *
147 * @author John Thompson, DL Consulting Ltd.
148 */
149 public GS2LuceneEditor(String index_path)
150 throws IOException
151 {
152 Analyzer analyzer = new GS2Analyzer();
153 // create an index in /tmp/index, overwriting an existing one:
154 index_modifier = new GS2IndexModifier(index_path, analyzer);
155 }
156 /** GS2LuceneEditor **/
157
158 /**
159 */
160 public void debug(String message)
161 {
162 if(debug)
163 {
164 System.err.println(message);
165 }
166 }
167 /** debug() **/
168
169 /** Destructor which unallocates connection to Lucene.
170 */
171 public void destroy()
172 throws IOException
173 {
174 index_modifier.close();
175 index_modifier = null;
176 }
177
178 /** Make an edit to a Lucene index.
179 *
180 * @param oid The unique identifier of a Lucene document as an
181 * integer
182 * @param field The field to be modified as a String
183 * @param old_value The existing value to be changed or removed as a
184 * String
185 * @param old_value The replacement value to be changed or added as a
186 * String
187 *
188 * @author John Thompson, DL Consulting Ltd.
189 */
190 public void editIndex(int node_id, String field, String old_value, String new_value)
191 throws IOException
192 {
193 debug("GS2LuceneEditor.editIndex(" + node_id + ",'" + field + "','" + old_value + "','" + new_value + "')");
194 debug("- Initial number of documents in index: " + index_modifier.numDocs());
195 // Retrieve the document requested
196 int doc_num = index_modifier.getDocNumByNodeID(node_id);
197 if (doc_num != -1)
198 {
199 debug("* Found document #" + doc_num);
200 // Retrieve the actual document
201 Document document = index_modifier.document(doc_num);
202 // Remove the document from the index before modifying
203 index_modifier.deleteDocument(doc_num);
204 debug("* Removed document from index prior to editing");
205 // Retrieve the requested fields values, and turn it into a
206 // vector
207 debug("* Modifying the value of the field: " + field);
208 doEdit(document, field, old_value, new_value);
209
210 // We have to do a similar modification to the ZZ field
211 // too
212 debug("* Modifying the value of the field: ZZ");
213 doEdit(document, "ZZ", old_value, new_value);
214
215 // Re-index document
216 index_modifier.addDocument(document);
217 debug("* Reindexing modified document");
218 }
219 else
220 {
221 debug("- No such document!");
222 Document document = new Document();
223
224 // Retrieve the requested fields values, and turn it into a
225 // vector
226 debug("* Adding the value to the field: " + field);
227 doEdit(document, field, old_value, new_value);
228
229 // We have to do a similar modification to the ZZ field
230 // too
231 debug("* Adding the value to the field: ZZ");
232 doEdit(document, "ZZ", old_value, new_value);
233
234 // We also have to initialize the nodeId value
235 // changed to use docOID --kjdon
236 document.add(new Field("docOID", String.valueOf(node_id), Field.Store.YES, Field.Index.ANALYZED));
237
238 // Re-index document
239 index_modifier.addDocument(document);
240 debug("* Indexing new document");
241 }
242
243
244 }
245 /** editIndex() **/
246
247 /**
248 */
249 protected void doEdit(Document document, String field, String old_value, String new_value)
250 {
251 if (debug)
252 {
253 debug("GS2LuceneEditor.doEdit(Document, \"" + field + "\", \"" + old_value + "\", \"" + new_value + "\")");
254 }
255
256 String values_raw[] = document.getValues(field);
257 if(values_raw != null)
258 {
259 Vector values = new Vector(Arrays.asList(values_raw));
260 // Remove all the values for this field (no other safe way to
261 // do this
262 document.removeFields(field);
263 // DEBUG
264 if (debug)
265 {
266 debug("- Before modification:");
267 for(int i = 0; i < values.size(); i++)
268 {
269 debug("\t" + field + "[" + i + "]: " + values.get(i));
270 }
271 }
272 // If old_value is set, remove it from the values array
273 if(!old_value.equals(""))
274 {
275 // Remove all occurances of this metadata - this means
276 // it becomes a bit dangerous to have multiple pieces
277 // of metadata with exactly the same metadata - but
278 // this is only for indexing purposes so its not so
279 // bad.
280 while(values.contains(old_value))
281 {
282 values.remove(old_value);
283 }
284 }
285 // If new_value is set, add it to the values array
286 if(!new_value.equals("") && !values.contains(new_value))
287 {
288 values.add(new_value);
289 }
290 // DEBUG
291 if(debug)
292 {
293 debug("- After modification:");
294 for(int i = 0; i < values.size(); i++)
295 {
296 debug("\t" + field + "[" + i + "]: " + values.get(i));
297 }
298 }
299 // Add all the values for this field
300 for(int i = 0; i < values.size(); i++)
301 {
302 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.ANALYZED));
303 }
304 values.clear();
305 values = null;
306 }
307 // We may be adding a value to a field that current has no values
308 else if (!new_value.equals(""))
309 {
310 Vector values = new Vector();
311 values.add(new_value);
312 // DEBUG
313 if(debug)
314 {
315 debug("- Brand spanking new values:");
316 for(int i = 0; i < values.size(); i++)
317 {
318 debug("\t" + field + "[" + i + "]: " + values.get(i));
319 }
320 }
321 // Add all the values for this field
322 for(int i = 0; i < values.size(); i++)
323 {
324 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.ANALYZED));
325 }
326 values.clear();
327 values = null;
328 }
329 // Can't do a removal unless something exists
330 else
331 {
332 debug("- No such field for this document: " + field);
333 }
334 values_raw = null;
335 }
336 /** doEdit() **/
337
338 /**
339 */
340 static public void printUsage()
341 {
342 System.out.println("usage: GS2LuceneEditor --index <path> --nodeid <int> --field <string>");
343 System.out.println(" [--oldvalue <string>] [--newvalue <string>]");
344 System.out.println("");
345 System.out.println("where:");
346 System.out.println(" index - is the full path to the directory containing the directory");
347 System.out.println(" to edit, including the level (ie didx, sidx)");
348 System.out.println(" nodeid - the unique identifier of the document to change. This is the");
349 System.out.println(" same as the docnum in the GDBM");
350 System.out.println(" field - the two letter code of the metadata field to edit. These can");
351 System.out.println(" found in the build.cfg file. ZZ is not a valid field as it");
352 System.out.println(" is handled as a special case");
353 System.out.println(" oldvalue - the current value of the metadata field if it is to be");
354 System.out.println(" replaced or removed");
355 System.out.println(" newvalue - the new value for the metadata field if it is to be replaced");
356 System.out.println(" or added");
357 System.out.println("");
358 System.exit(0);
359 }
360 /** printUsage() **/
361
362}
363/** class GS2LuceneEditor **/
Note: See TracBrowser for help on using the repository browser.