source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneEditor.java@ 29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago

Part of port from lucene3.3.0 to lucene4.7.2. Related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling up Lucene4Wrapper.jar or Lucene3Wrapper.jar. Only the Linux Makefile.in has been tested so far. 2. Adding in the jar files necessary for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene src code to use lucene4.7.2 instead of lucene3.3.0.

  • Property svn:executable set to *
File size: 15.4 KB
Line 
1/** @file GS2LuceneEditor.java
2 *
3 * Provides a wrapper to the index/document editing features of Lucene.
4 *
 * This java application uses Lucene's IndexReader and IndexWriter classes
 * (in place of the IndexModifier class, which this version of the wrapper no
 * longer uses) to access and make changes to the information stored about
 * documents in a Lucene database. This is an essential component of the
 * IncrementalBuilder PERL module, and endeavours to make editing the text and
 * metadata of documents without having to rebuild the entire collection a
 * reality (in other words, true incremental/dynamic building).
11 *
12 * A component of the Greenstone digital library software from the New Zealand
13 * Digital Library Project at the University of Waikato, New Zealand.
14 *
15 * This program is free software; you can redistribute it and/or modify it
16 * under the terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at your option)
18 * any later version.
19 *
20 * This program is distributed in the hope that it will be useful, but WITHOUT
21 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
23 * more details.
24 *
25 * You should have received a copy of the GNU General Public License along
26 * with this program; if not, write to the Free Software Foundation, Inc., 675
27 * Mass Ave, Cambridge, MA 02139, USA.
28 *
29 * Copyright (c) 2006 DL Consulting Ltd., New Zealand
30 */
31
32package org.greenstone.LuceneWrapper4;
33
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;

import org.apache.lucene.analysis.Analyzer;
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
59
60
61/** Contains methods for modifying a document that has previously been indexed
62 * into a Lucene database.
63 * @author John Thompson, DL Consulting Ltd.
64 */
65public class GS2LuceneEditor
66{
67 /** This is the main entry point to the editor and is responsible for
68 * parsing the arguments and creating an instance of the editor class.
69 *
70 * @param args The arguments passed into the application as a string
71 * array
72 * @return An integer describing the exit state of the application
73 * @throws Exception on any fatal error state
74 *
75 * @author John Thompson, DL Consulting Ltd.
76 */
77 static public void main (String args[])
78 throws Exception
79 {
80 // Parse arguments
81 int node_id = -1;
82 String field = "";
83 String index_path = "";
84 String new_value = "";
85 String old_value = "";
86
87 for (int i = 0; i < args.length; i += 2)
88 {
89 if (args[i].equals("--index"))
90 {
91 index_path = args[i + 1];
92 }
93 else if (args[i].equals("--nodeid"))
94 {
95 String temp = args[i + 1];
96 node_id = Integer.parseInt(temp);
97 temp = null; // Off to the gc with you!
98 }
99 else if (args[i].equals("--field"))
100 {
101 field = args[i + 1];
102 }
103 else if (args[i].equals("--oldvalue"))
104 {
105 old_value = args[i + 1];
106 }
107 else if (args[i].equals("--newvalue"))
108 {
109 new_value = args[i + 1];
110 }
111 else
112 {
113 System.out.println("Error! Unknown argument: " + args[i]);
114 GS2LuceneEditor.printUsage();
115 }
116 }
117
118 // Check arguments
119 if(index_path.equals(""))
120 {
121 System.out.println("Error! Missing index path");
122 GS2LuceneEditor.printUsage();
123 }
124 if(field.equals(""))
125 {
126 System.out.println("Error! Missing field");
127 GS2LuceneEditor.printUsage();
128 }
129 if(node_id == -1)
130 {
131 System.out.println("Error! Missing or invalid Node ID");
132 GS2LuceneEditor.printUsage();
133 }
134 if(old_value.equals("") && new_value.equals(""))
135 {
136 System.out.println("Error! No modification requested");
137 GS2LuceneEditor.printUsage();
138 }
139
140
141 // Instantiate editor, and perform the edit
142 GS2LuceneEditor editor = new GS2LuceneEditor(index_path);
143 editor.editIndex(node_id, field, old_value, new_value);
144 editor.destroy();
145 editor = null;
146 }
147 /** main() **/
148
149 /** **/
150 private boolean debug = true;
151
152 /** **/
153 //private IndexWriter index_writer;
154 private String index_path;
155
156 /** Constructor which takes the path to the Lucene index to be edited.
157 *
158 * @param index_path The full path to the index directory as a String
159 *
160 * @author John Thompson, DL Consulting Ltd.
161 */
162 public GS2LuceneEditor(String index_path)
163 throws IOException
164 {
165 Analyzer analyzer = new GS2Analyzer();
166
167 // in place of IndexModifier (encapsulating an IndexReader and IndexWriter),
168 // we use an IndexWriter in LuceneWrapper4
169 //index_writer = GSLuceneUtil.getIndexWriter(index_path, false);
170 this.index_path = index_path;
171
172 }
173 /** GS2LuceneEditor **/
174
175 /**
176 */
177 public void debug(String message)
178 {
179 if(debug)
180 {
181 System.err.println(message);
182 }
183 }
184 /** debug() **/
185
186 /** Destructor which unallocates connection to Lucene.
187 */
188 public void destroy()
189 throws IOException
190 {
191 //index_writer.close();
192 //index_writer = null;
193 }
194
195 /** Make an edit to a Lucene index.
196 *
197 * @param oid The unique identifier of a Lucene document as an
198 * integer
199 * @param field The field to be modified as a String
200 * @param old_value The existing value to be changed or removed as a
201 * String
202 * @param old_value The replacement value to be changed or added as a
203 * String
204 *
205 * @author John Thompson, DL Consulting Ltd.
206 */
207 public void editIndex(int node_id, String field, String old_value, String new_value)
208 throws IOException
209 {
210 debug("GS2LuceneEditor.editIndex(" + node_id + ",'" + field + "','" + old_value + "','" + new_value + "')");
211 //debug("- Initial number of documents in index: " + index_writer.numDocs());
212 // Retrieve the document requested
213
214 // Create a new term to encapsulate this node id
215 // "was nodeID, now using docOID --kjdon"
216 //int doc_num = index_modifier.getDocNumByNodeID(node_id);
217
218 Term docOID_term = new Term("docOID", String.valueOf(node_id));
219 // http://stackoverflow.com/questions/14589799/lucene-get-document-ids-from-term
220 // http://stackoverflow.com/questions/16847857/how-do-you-read-the-index-in-lucene-to-do-a-search
221 Directory index = FSDirectory.open(new File(index_path));
222 IndexReader reader = DirectoryReader.open(index);
223 IndexSearcher searcher = new IndexSearcher(reader);
224 TermQuery query = new TermQuery(docOID_term);
225 TopDocs topdocs = searcher.search(query, 1);
226
227 int doc_num = -1;
228 if(topdocs != null && topdocs.scoreDocs.length > 0) {
229 doc_num = topdocs.scoreDocs[0].doc;
230 }
231
232 // we can now at last get the document
233
234 if (doc_num != -1)
235 {
236 debug("* Found document #" + doc_num);
237
238 // Retrieve the actual document
239 Document document = reader.document(doc_num);
240 reader.close(); // Closes files associated with this index. Also saves any new deletions to disk. No other methods should be called after this has been called.
241
242 // Remove the document from the index before modifying
243 IndexWriter indexWriter = GSLuceneUtil.getIndexWriter(index_path, false);
244 debug("- Initial number of documents in index: " + indexWriter.numDocs());
245 indexWriter.deleteDocuments(docOID_term);
246
247 debug("* Removed document from index prior to editing");
248 // Retrieve the requested fields values, and turn it into a
249 // vector
250 debug("* Modifying the value of the field: " + field);
251 doEdit(document, field, old_value, new_value);
252
253 // We have to do a similar modification to the ZZ field
254 // too
255 debug("* Modifying the value of the field: ZZ");
256 doEdit(document, "ZZ", old_value, new_value);
257
258 // Re-index document
259 indexWriter.addDocument(document);
260 debug("* Reindexing modified document");
261
262 indexWriter.close();
263 indexWriter = null;
264 }
265 else
266 {
267 debug("- No such document!");
268 Document document = new Document();
269
270 // Retrieve the requested fields values, and turn it into a
271 // vector
272 debug("* Adding the value to the field: " + field);
273 doEdit(document, field, old_value, new_value);
274
275 // We have to do a similar modification to the ZZ field
276 // too
277 debug("* Adding the value to the field: ZZ");
278 doEdit(document, "ZZ", old_value, new_value);
279
280 // We also have to initialize the nodeId value
281 // changed to use docOID --kjdon
282 document.add(new Field("docOID", String.valueOf(node_id), Field.Store.YES, Field.Index.ANALYZED));
283
284 // Re-index document
285 IndexWriter indexWriter = GSLuceneUtil.getIndexWriter(index_path, false);
286 indexWriter.addDocument(document);
287 debug("* Indexing new document");
288
289 indexWriter.close();
290 indexWriter = null;
291 }
292
293
294 }
295 /** editIndex() **/
296
297 /**
298 */
299 protected void doEdit(Document document, String field, String old_value, String new_value)
300 {
301 if (debug)
302 {
303 debug("GS2LuceneEditor.doEdit(Document, \"" + field + "\", \"" + old_value + "\", \"" + new_value + "\")");
304 }
305
306 String values_raw[] = document.getValues(field);
307 if(values_raw != null)
308 {
309 Vector values = new Vector(Arrays.asList(values_raw));
310 // Remove all the values for this field (no other safe way to
311 // do this
312 document.removeFields(field);
313 // DEBUG
314 if (debug)
315 {
316 debug("- Before modification:");
317 for(int i = 0; i < values.size(); i++)
318 {
319 debug("\t" + field + "[" + i + "]: " + values.get(i));
320 }
321 }
322 // If old_value is set, remove it from the values array
323 if(!old_value.equals(""))
324 {
325 // Remove all occurances of this metadata - this means
326 // it becomes a bit dangerous to have multiple pieces
327 // of metadata with exactly the same metadata - but
328 // this is only for indexing purposes so its not so
329 // bad.
330 while(values.contains(old_value))
331 {
332 values.remove(old_value);
333 }
334 }
335 // If new_value is set, add it to the values array
336 if(!new_value.equals("") && !values.contains(new_value))
337 {
338 values.add(new_value);
339 }
340 // DEBUG
341 if(debug)
342 {
343 debug("- After modification:");
344 for(int i = 0; i < values.size(); i++)
345 {
346 debug("\t" + field + "[" + i + "]: " + values.get(i));
347 }
348 }
349 // Add all the values for this field
350 for(int i = 0; i < values.size(); i++)
351 {
352 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.ANALYZED));
353 }
354 values.clear();
355 values = null;
356 }
357 // We may be adding a value to a field that current has no values
358 else if (!new_value.equals(""))
359 {
360 Vector values = new Vector();
361 values.add(new_value);
362 // DEBUG
363 if(debug)
364 {
365 debug("- Brand spanking new values:");
366 for(int i = 0; i < values.size(); i++)
367 {
368 debug("\t" + field + "[" + i + "]: " + values.get(i));
369 }
370 }
371 // Add all the values for this field
372 for(int i = 0; i < values.size(); i++)
373 {
374 document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.ANALYZED));
375 }
376 values.clear();
377 values = null;
378 }
379 // Can't do a removal unless something exists
380 else
381 {
382 debug("- No such field for this document: " + field);
383 }
384 values_raw = null;
385 }
386 /** doEdit() **/
387
388 /**
389 */
390 static public void printUsage()
391 {
392 System.out.println("usage: GS2LuceneEditor --index <path> --nodeid <int> --field <string>");
393 System.out.println(" [--oldvalue <string>] [--newvalue <string>]");
394 System.out.println("");
395 System.out.println("where:");
396 System.out.println(" index - is the full path to the directory containing the directory");
397 System.out.println(" to edit, including the level (ie didx, sidx)");
398 System.out.println(" nodeid - the unique identifier of the document to change. This is the");
399 System.out.println(" same as the docnum in the GDBM");
400 System.out.println(" field - the two letter code of the metadata field to edit. These can");
401 System.out.println(" found in the build.cfg file. ZZ is not a valid field as it");
402 System.out.println(" is handled as a special case");
403 System.out.println(" oldvalue - the current value of the metadata field if it is to be");
404 System.out.println(" replaced or removed");
405 System.out.println(" newvalue - the new value for the metadata field if it is to be replaced");
406 System.out.println(" or added");
407 System.out.println("");
408 System.exit(0);
409 }
410 /** printUsage() **/
411
412}
413/** class GS2LuceneEditor **/
Note: See TracBrowser for help on using the repository browser.