[12264] | 1 | /** @file GS2LuceneEditor.java
|
---|
| 2 | *
|
---|
| 3 | * Provides a wrapper to the index/document editing features of Lucene.
|
---|
| 4 | *
|
---|
| 5 | * This java application makes use of the existing Lucene class IndexModifier
|
---|
| 6 | * to access and make changes to the information stored about documents in a
|
---|
| 7 | * Lucene database. This is an essential component of the IncrementalBuilder
|
---|
| 8 | * PERL module, and endeavours to make editing the text and metadata of
|
---|
| 9 | * documents without having to rebuild the entire collection a reality (in
|
---|
| 10 | * other words, true incremental/dynamic building).
|
---|
| 11 | *
|
---|
| 12 | * A component of the Greenstone digital library software from the New Zealand
|
---|
| 13 | * Digital Library Project at the University of Waikato, New Zealand.
|
---|
| 14 | *
|
---|
| 15 | * This program is free software; you can redistribute it and/or modify it
|
---|
| 16 | * under the terms of the GNU General Public License as published by the Free
|
---|
| 17 | * Software Foundation; either version 2 of the License, or (at your option)
|
---|
| 18 | * any later version.
|
---|
| 19 | *
|
---|
| 20 | * This program is distributed in the hope that it will be useful, but WITHOUT
|
---|
| 21 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
---|
| 22 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
---|
| 23 | * more details.
|
---|
| 24 | *
|
---|
| 25 | * You should have received a copy of the GNU General Public License along
|
---|
| 26 | * with this program; if not, write to the Free Software Foundation, Inc., 675
|
---|
| 27 | * Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 28 | *
|
---|
| 29 | * Copyright (c) 2006 DL Consulting Ltd., New Zealand
|
---|
| 30 | */
|
---|
| 31 |
|
---|
[13686] | 32 | package org.greenstone.LuceneWrapper;
|
---|
[12264] | 33 |
|
---|
| 34 | import java.io.IOException;
|
---|
| 35 | import java.util.Arrays;
|
---|
| 36 | import java.util.Enumeration;
|
---|
| 37 | import java.util.Vector;
|
---|
| 38 |
|
---|
| 39 | import org.apache.lucene.analysis.Analyzer;
|
---|
| 40 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
---|
| 41 | import org.apache.lucene.document.Document;
|
---|
| 42 | import org.apache.lucene.document.Field;
|
---|
| 43 |
|
---|
| 44 |
|
---|
| 45 | /** Contains methods for modifying a document that has previously been indexed
|
---|
| 46 | * into a Lucene database.
|
---|
| 47 | * @author John Thompson, DL Consulting Ltd.
|
---|
| 48 | */
|
---|
| 49 | public class GS2LuceneEditor
|
---|
| 50 | {
|
---|
| 51 | /** This is the main entry point to the editor and is responsible for
|
---|
| 52 | * parsing the arguments and creating an instance of the editor class.
|
---|
| 53 | *
|
---|
| 54 | * @param args The arguments passed into the application as a string
|
---|
| 55 | * array
|
---|
| 56 | * @return An integer describing the exit state of the application
|
---|
| 57 | * @throws Exception on any fatal error state
|
---|
| 58 | *
|
---|
| 59 | * @author John Thompson, DL Consulting Ltd.
|
---|
| 60 | */
|
---|
| 61 | static public void main (String args[])
|
---|
| 62 | throws Exception
|
---|
| 63 | {
|
---|
| 64 | // Parse arguments
|
---|
| 65 | int node_id = -1;
|
---|
| 66 | String field = "";
|
---|
| 67 | String index_path = "";
|
---|
| 68 | String new_value = "";
|
---|
| 69 | String old_value = "";
|
---|
| 70 |
|
---|
| 71 | for (int i = 0; i < args.length; i += 2)
|
---|
| 72 | {
|
---|
| 73 | if (args[i].equals("--index"))
|
---|
| 74 | {
|
---|
| 75 | index_path = args[i + 1];
|
---|
| 76 | }
|
---|
| 77 | else if (args[i].equals("--nodeid"))
|
---|
| 78 | {
|
---|
| 79 | String temp = args[i + 1];
|
---|
| 80 | node_id = Integer.parseInt(temp);
|
---|
| 81 | temp = null; // Off to the gc with you!
|
---|
| 82 | }
|
---|
| 83 | else if (args[i].equals("--field"))
|
---|
| 84 | {
|
---|
| 85 | field = args[i + 1];
|
---|
| 86 | }
|
---|
| 87 | else if (args[i].equals("--oldvalue"))
|
---|
| 88 | {
|
---|
| 89 | old_value = args[i + 1];
|
---|
| 90 | }
|
---|
| 91 | else if (args[i].equals("--newvalue"))
|
---|
| 92 | {
|
---|
| 93 | new_value = args[i + 1];
|
---|
| 94 | }
|
---|
| 95 | else
|
---|
| 96 | {
|
---|
| 97 | System.out.println("Error! Unknown argument: " + args[i]);
|
---|
| 98 | GS2LuceneEditor.printUsage();
|
---|
| 99 | }
|
---|
| 100 | }
|
---|
| 101 |
|
---|
| 102 | // Check arguments
|
---|
| 103 | if(index_path.equals(""))
|
---|
| 104 | {
|
---|
| 105 | System.out.println("Error! Missing index path");
|
---|
| 106 | GS2LuceneEditor.printUsage();
|
---|
| 107 | }
|
---|
| 108 | if(field.equals(""))
|
---|
| 109 | {
|
---|
| 110 | System.out.println("Error! Missing field");
|
---|
| 111 | GS2LuceneEditor.printUsage();
|
---|
| 112 | }
|
---|
| 113 | if(node_id == -1)
|
---|
| 114 | {
|
---|
| 115 | System.out.println("Error! Missing or invalid Node ID");
|
---|
| 116 | GS2LuceneEditor.printUsage();
|
---|
| 117 | }
|
---|
| 118 | if(old_value.equals("") && new_value.equals(""))
|
---|
| 119 | {
|
---|
| 120 | System.out.println("Error! No modification requested");
|
---|
| 121 | GS2LuceneEditor.printUsage();
|
---|
| 122 | }
|
---|
| 123 |
|
---|
| 124 |
|
---|
| 125 | // Instantiate editor, and perform the edit
|
---|
| 126 | GS2LuceneEditor editor = new GS2LuceneEditor(index_path);
|
---|
| 127 | editor.editIndex(node_id, field, old_value, new_value);
|
---|
| 128 | editor.destroy();
|
---|
| 129 | editor = null;
|
---|
| 130 | }
|
---|
| 131 | /** main() **/
|
---|
| 132 |
|
---|
| 133 | /** **/
|
---|
| 134 | private boolean debug = true;
|
---|
| 135 |
|
---|
| 136 | /** **/
|
---|
| 137 | private GS2IndexModifier index_modifier;
|
---|
| 138 |
|
---|
| 139 | /** Constructor which takes the path to the Lucene index to be edited.
|
---|
| 140 | *
|
---|
| 141 | * @param index_path The full path to the index directory as a String
|
---|
| 142 | *
|
---|
| 143 | * @author John Thompson, DL Consulting Ltd.
|
---|
| 144 | */
|
---|
| 145 | public GS2LuceneEditor(String index_path)
|
---|
| 146 | throws IOException
|
---|
| 147 | {
|
---|
| 148 | Analyzer analyzer = new StandardAnalyzer();
|
---|
| 149 | // create an index in /tmp/index, overwriting an existing one:
|
---|
| 150 | index_modifier = new GS2IndexModifier(index_path, analyzer);
|
---|
| 151 | }
|
---|
| 152 | /** GS2LuceneEditor **/
|
---|
| 153 |
|
---|
| 154 | /**
|
---|
| 155 | */
|
---|
| 156 | public void debug(String message)
|
---|
| 157 | {
|
---|
| 158 | if(debug)
|
---|
| 159 | {
|
---|
| 160 | System.err.println(message);
|
---|
| 161 | }
|
---|
| 162 | }
|
---|
| 163 | /** debug() **/
|
---|
| 164 |
|
---|
| 165 | /** Destructor which unallocates connection to Lucene.
|
---|
| 166 | */
|
---|
| 167 | public void destroy()
|
---|
| 168 | throws IOException
|
---|
| 169 | {
|
---|
| 170 | index_modifier.close();
|
---|
| 171 | index_modifier = null;
|
---|
| 172 | }
|
---|
| 173 |
|
---|
| 174 | /** Make an edit to a Lucene index.
|
---|
| 175 | *
|
---|
| 176 | * @param oid The unique identifier of a Lucene document as an
|
---|
| 177 | * integer
|
---|
| 178 | * @param field The field to be modified as a String
|
---|
| 179 | * @param old_value The existing value to be changed or removed as a
|
---|
| 180 | * String
|
---|
| 181 | * @param old_value The replacement value to be changed or added as a
|
---|
| 182 | * String
|
---|
| 183 | *
|
---|
| 184 | * @author John Thompson, DL Consulting Ltd.
|
---|
| 185 | */
|
---|
| 186 | public void editIndex(int node_id, String field, String old_value, String new_value)
|
---|
| 187 | throws IOException
|
---|
| 188 | {
|
---|
| 189 | debug("GS2LuceneEditor.editIndex(" + node_id + ",'" + field + "','" + old_value + "','" + new_value + "')");
|
---|
| 190 | debug("- Initial number of documents in index: " + index_modifier.docCount());
|
---|
| 191 | // Retrieve the document requested
|
---|
| 192 | int doc_num = index_modifier.getDocNumByNodeID(node_id);
|
---|
| 193 | if (doc_num != -1)
|
---|
| 194 | {
|
---|
| 195 | debug("* Found document #" + doc_num);
|
---|
| 196 | // Retrieve the actual document
|
---|
| 197 | Document document = index_modifier.document(doc_num);
|
---|
| 198 | // Remove the document from the index before modifying
|
---|
| 199 | index_modifier.deleteDocument(doc_num);
|
---|
| 200 | debug("* Removed document from index prior to editing");
|
---|
| 201 | // Retrieve the requested fields values, and turn it into a
|
---|
| 202 | // vector
|
---|
| 203 | debug("* Modifying the value of the field: " + field);
|
---|
| 204 | doEdit(document, field, old_value, new_value);
|
---|
| 205 |
|
---|
| 206 | // We have to do a similar modification to the ZZ field
|
---|
| 207 | // too
|
---|
| 208 | debug("* Modifying the value of the field: ZZ");
|
---|
| 209 | doEdit(document, "ZZ", old_value, new_value);
|
---|
| 210 |
|
---|
| 211 | // Re-index document
|
---|
| 212 | index_modifier.addDocument(document);
|
---|
| 213 | debug("* Reindexing modified document");
|
---|
| 214 | }
|
---|
| 215 | else
|
---|
| 216 | {
|
---|
| 217 | debug("- No such document!");
|
---|
| 218 | Document document = new Document();
|
---|
| 219 |
|
---|
| 220 | // Retrieve the requested fields values, and turn it into a
|
---|
| 221 | // vector
|
---|
| 222 | debug("* Adding the value to the field: " + field);
|
---|
| 223 | doEdit(document, field, old_value, new_value);
|
---|
| 224 |
|
---|
| 225 | // We have to do a similar modification to the ZZ field
|
---|
| 226 | // too
|
---|
| 227 | debug("* Adding the value to the field: ZZ");
|
---|
| 228 | doEdit(document, "ZZ", old_value, new_value);
|
---|
| 229 |
|
---|
| 230 | // We also have to initialize the nodeId value
|
---|
| 231 | document.add(new Field("nodeID", String.valueOf(node_id), Field.Store.YES, Field.Index.TOKENIZED));
|
---|
| 232 |
|
---|
| 233 | // Re-index document
|
---|
| 234 | index_modifier.addDocument(document);
|
---|
| 235 | debug("* Indexing new document");
|
---|
| 236 | }
|
---|
| 237 |
|
---|
| 238 |
|
---|
| 239 | }
|
---|
| 240 | /** editIndex() **/
|
---|
| 241 |
|
---|
| 242 | /**
|
---|
| 243 | */
|
---|
| 244 | protected void doEdit(Document document, String field, String old_value, String new_value)
|
---|
| 245 | {
|
---|
| 246 | if (debug)
|
---|
| 247 | {
|
---|
| 248 | debug("GS2LuceneEditor.doEdit(Document, \"" + field + "\", \"" + old_value + "\", \"" + new_value + "\")");
|
---|
| 249 | }
|
---|
| 250 |
|
---|
| 251 | String values_raw[] = document.getValues(field);
|
---|
| 252 | if(values_raw != null)
|
---|
| 253 | {
|
---|
| 254 | Vector values = new Vector(Arrays.asList(values_raw));
|
---|
| 255 | // Remove all the values for this field (no other safe way to
|
---|
| 256 | // do this
|
---|
| 257 | document.removeFields(field);
|
---|
| 258 | // DEBUG
|
---|
| 259 | if (debug)
|
---|
| 260 | {
|
---|
| 261 | debug("- Before modification:");
|
---|
| 262 | for(int i = 0; i < values.size(); i++)
|
---|
| 263 | {
|
---|
| 264 | debug("\t" + field + "[" + i + "]: " + values.get(i));
|
---|
| 265 | }
|
---|
| 266 | }
|
---|
| 267 | // If old_value is set, remove it from the values array
|
---|
| 268 | if(!old_value.equals(""))
|
---|
| 269 | {
|
---|
| 270 | // Remove all occurances of this metadata - this means
|
---|
| 271 | // it becomes a bit dangerous to have multiple pieces
|
---|
| 272 | // of metadata with exactly the same metadata - but
|
---|
| 273 | // this is only for indexing purposes so its not so
|
---|
| 274 | // bad.
|
---|
| 275 | while(values.contains(old_value))
|
---|
| 276 | {
|
---|
| 277 | values.remove(old_value);
|
---|
| 278 | }
|
---|
| 279 | }
|
---|
| 280 | // If new_value is set, add it to the values array
|
---|
| 281 | if(!new_value.equals("") && !values.contains(new_value))
|
---|
| 282 | {
|
---|
| 283 | values.add(new_value);
|
---|
| 284 | }
|
---|
| 285 | // DEBUG
|
---|
| 286 | if(debug)
|
---|
| 287 | {
|
---|
| 288 | debug("- After modification:");
|
---|
| 289 | for(int i = 0; i < values.size(); i++)
|
---|
| 290 | {
|
---|
| 291 | debug("\t" + field + "[" + i + "]: " + values.get(i));
|
---|
| 292 | }
|
---|
| 293 | }
|
---|
| 294 | // Add all the values for this field
|
---|
| 295 | for(int i = 0; i < values.size(); i++)
|
---|
| 296 | {
|
---|
| 297 | document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.TOKENIZED));
|
---|
| 298 | }
|
---|
| 299 | values.clear();
|
---|
| 300 | values = null;
|
---|
| 301 | }
|
---|
| 302 | // We may be adding a value to a field that current has no values
|
---|
| 303 | else if (!new_value.equals(""))
|
---|
| 304 | {
|
---|
| 305 | Vector values = new Vector();
|
---|
| 306 | values.add(new_value);
|
---|
| 307 | // DEBUG
|
---|
| 308 | if(debug)
|
---|
| 309 | {
|
---|
| 310 | debug("- Brand spanking new values:");
|
---|
| 311 | for(int i = 0; i < values.size(); i++)
|
---|
| 312 | {
|
---|
| 313 | debug("\t" + field + "[" + i + "]: " + values.get(i));
|
---|
| 314 | }
|
---|
| 315 | }
|
---|
| 316 | // Add all the values for this field
|
---|
| 317 | for(int i = 0; i < values.size(); i++)
|
---|
| 318 | {
|
---|
| 319 | document.add(new Field(field, (String)values.get(i), Field.Store.YES, Field.Index.TOKENIZED));
|
---|
| 320 | }
|
---|
| 321 | values.clear();
|
---|
| 322 | values = null;
|
---|
| 323 | }
|
---|
| 324 | // Can't do a removal unless something exists
|
---|
| 325 | else
|
---|
| 326 | {
|
---|
| 327 | debug("- No such field for this document: " + field);
|
---|
| 328 | }
|
---|
| 329 | values_raw = null;
|
---|
| 330 | }
|
---|
| 331 | /** doEdit() **/
|
---|
| 332 |
|
---|
| 333 | /**
|
---|
| 334 | */
|
---|
| 335 | static public void printUsage()
|
---|
| 336 | {
|
---|
| 337 | System.out.println("usage: GS2LuceneEditor --index <path> --nodeid <int> --field <string>");
|
---|
| 338 | System.out.println(" [--oldvalue <string>] [--newvalue <string>]");
|
---|
| 339 | System.out.println("");
|
---|
| 340 | System.out.println("where:");
|
---|
| 341 | System.out.println(" index - is the full path to the directory containing the directory");
|
---|
| 342 | System.out.println(" to edit, including the level (ie didx, sidx)");
|
---|
| 343 | System.out.println(" nodeid - the unique identifier of the document to change. This is the");
|
---|
| 344 | System.out.println(" same as the docnum in the GDBM");
|
---|
| 345 | System.out.println(" field - the two letter code of the metadata field to edit. These can");
|
---|
| 346 | System.out.println(" found in the build.cfg file. ZZ is not a valid field as it");
|
---|
| 347 | System.out.println(" is handled as a special case");
|
---|
| 348 | System.out.println(" oldvalue - the current value of the metadata field if it is to be");
|
---|
| 349 | System.out.println(" replaced or removed");
|
---|
| 350 | System.out.println(" newvalue - the new value for the metadata field if it is to be replaced");
|
---|
| 351 | System.out.println(" or added");
|
---|
| 352 | System.out.println("");
|
---|
| 353 | System.exit(0);
|
---|
| 354 | }
|
---|
| 355 | /** printUsage() **/
|
---|
| 356 |
|
---|
| 357 | }
|
---|
| 358 | /** class GS2LuceneEditor **/
|
---|