1 | package org.terrier.applications;
|
---|
2 |
|
---|
3 | import java.io.BufferedReader;
|
---|
4 | import java.io.BufferedOutputStream;
|
---|
5 | import java.io.File;
|
---|
6 | import java.io.FileInputStream;
|
---|
7 | import java.io.FileOutputStream;
|
---|
8 | import java.io.InputStreamReader;
|
---|
9 | import java.io.IOException;
|
---|
10 | import java.io.PrintWriter;
|
---|
11 | import java.nio.file.Files;
|
---|
12 | import java.nio.file.Path;
|
---|
13 | import java.nio.file.Paths;
|
---|
14 | import java.util.ArrayDeque;
|
---|
15 | import java.util.ArrayList;
|
---|
16 | import java.util.Arrays;
|
---|
17 | import java.util.Collections;
|
---|
18 | import java.util.List;
|
---|
19 | import java.util.HashSet;
|
---|
20 | import java.util.Iterator;
|
---|
21 |
|
---|
22 | import org.apache.log4j.Level;
|
---|
23 | import org.apache.log4j.Logger;
|
---|
24 |
|
---|
25 | import org.terrier.indexing.BlockIndexer;
|
---|
26 | import org.terrier.indexing.BlockSinglePassIndexer;
|
---|
27 | import org.terrier.indexing.Collection;
|
---|
28 | import org.terrier.indexing.Indexer;
|
---|
29 | import org.terrier.indexing.SimpleFileCollection;
|
---|
30 | import org.terrier.structures.CollectionStatistics;
|
---|
31 | import org.terrier.structures.Index;
|
---|
32 | import org.terrier.structures.merging.StructureMerger;
|
---|
33 | import org.terrier.utility.ApplicationSetup;
|
---|
34 |
|
---|
35 | /** @class FileIndexer
|
---|
36 | *
|
---|
37 | * A simple indexer, based on DesktopTerrier, that is intended to build a
|
---|
38 | * searchable index from the contents of a directory in the file system. This
|
---|
39 | * indexer has been created with parallel processing in mind, in that you can
|
---|
40 | * split a large or computationally complex directory into several batches
|
---|
41 | * (the contents of which are recorded by manifest files) and then process
|
---|
42 | * each batch independently (and in parallel on a multicore machine or a
|
---|
43 | * cluster). Once this is complete you can then merge each pair of indexes
|
---|
44 | * together - eventually generating a final index encompassing all of the
|
---|
45 | * batches parts.
|
---|
46 | *
|
---|
47 | * Start by preparing the collection for indexing by locating indexable files
|
---|
48 | * and recording them in one or more manifest files, i.e.:
|
---|
49 | *
|
---|
50 | * ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
|
---|
51 | * -path /data/mycollection/ -batchsize 10
|
---|
52 | *
|
---|
53 | * You then build the uniquely named index (in this case with a prefix of
|
---|
54 | * '000') for each manifest, i.e.:
|
---|
55 | *
|
---|
56 | * ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
|
---|
57 | * -path /terrier/var/manifest-000.spec -prefix 000
|
---|
58 | *
|
---|
59 | * If there is more than one index (for example, with prefixes '000' and '001'), you
|
---|
60 | * can then merge them (into a single index with the prefix 'data') using this
|
---|
61 | * command:
|
---|
62 | *
|
---|
63 | * ./bin/anyclass.sh \
|
---|
64 | * org.terrier.applications.structures.merging.BlockStructureMerger \
|
---|
65 | * /terrier/var/index/ 000 /terrier/var/index/ 001 \
|
---|
66 | * /terrier/var/index/ data
|
---|
67 | *
|
---|
68 | */
|
---|
69 | public class FileIndexer
|
---|
70 | {
|
---|
71 | private String index_prefix;
|
---|
72 |
|
---|
73 | private HashSet<String> supported_extensions;
|
---|
74 |
|
---|
75 | private List<String> file_list;
|
---|
76 |
|
---|
77 | protected static final Logger logger = Logger.getLogger(FileIndexer.class);
|
---|
78 |
|
---|
79 | /** @function FileIndexer
|
---|
80 | * Default constructor
|
---|
81 | */
|
---|
82 | public FileIndexer(String index_prefix)
|
---|
83 | {
|
---|
84 | ///ogger.info("FileIndexer::FileIndexer(" + index_prefix + ")");
|
---|
85 | this.index_prefix = index_prefix;
|
---|
86 | this.supported_extensions = new HashSet<String>();
|
---|
87 | this.file_list = new ArrayList<String>();
|
---|
88 |
|
---|
89 | //setting properties for the application
|
---|
90 | if ((ApplicationSetup.getProperty("indexer.meta.forward.keys", null)) == null)
|
---|
91 | {
|
---|
92 | ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
|
---|
93 | ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
|
---|
94 | }
|
---|
95 | ApplicationSetup.setProperty("indexing.max.tokens", "10000");
|
---|
96 | ApplicationSetup.setProperty("invertedfile.processterms","25000");
|
---|
97 | ApplicationSetup.setProperty("ignore.low.idf.terms","false");
|
---|
98 | ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");
|
---|
99 |
|
---|
100 | // we need to know what extensions are supported by the indexer
|
---|
101 | String extension_parsers_raw = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument");
|
---|
102 | // parse up the string, and store the extensions (only) in the HashSet
|
---|
103 | if (extension_parsers_raw.length() > 0)
|
---|
104 | {
|
---|
105 | String[] extension_parsers = extension_parsers_raw.split("\\s*,\\s*");
|
---|
106 | // for each parser, of the form: <ext>:<documentclass>
|
---|
107 | for (int i = 0; i < extension_parsers.length; i++)
|
---|
108 | {
|
---|
109 | String extension_parser = extension_parsers[i];
|
---|
110 | String[] mapping = extension_parser.split(":");
|
---|
111 | // a well-formed mapping has an extension and a document class
|
---|
112 | if (mapping.length == 2)
|
---|
113 | {
|
---|
114 | // we store just the extension, leaving the actual mapping up to
|
---|
115 | // SimpleFileCollection
|
---|
116 | supported_extensions.add(mapping[0]);
|
---|
117 | }
|
---|
118 | else
|
---|
119 | {
|
---|
120 | logger.warn("Malformed extension parser mapping: " + extension_parser);
|
---|
121 | }
|
---|
122 | }
|
---|
123 | }
|
---|
124 | else
|
---|
125 | {
|
---|
126 | logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
|
---|
127 | }
|
---|
128 | }
|
---|
129 | /** FileIndexer() **/
|
---|
130 |
|
---|
131 | /** @function canIndex
|
---|
132 | * Given a path to a file, determine if Terrier's current configuration
|
---|
133 | * allows that file to be indexed
|
---|
134 | */
|
---|
135 | public boolean canIndex(String file_path)
|
---|
136 | {
|
---|
137 | // we may have specified a default Document class for all files encountered
|
---|
138 | if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
|
---|
139 | {
|
---|
140 | return true;
|
---|
141 | }
|
---|
142 | // otherwise, check through the list of supported file extensions to see
|
---|
143 | // if one matches this file
|
---|
144 | // - get the file's extension
|
---|
145 | String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1);
|
---|
146 | // - see if it exists in the array of supported extensions
|
---|
147 | boolean result = this.supported_extensions.contains(file_extension);
|
---|
148 | logger.info("[P1] Can index \"" + file_path.toString() + "\"? => " + result);
|
---|
149 | return result;
|
---|
150 | }
|
---|
151 | /** canIndex(String) **/
|
---|
152 |
|
---|
153 | /** @function close
|
---|
154 | */
|
---|
155 | public void close()
|
---|
156 | {
|
---|
157 | ///ogger.info("FileIndexer::close()");
|
---|
158 | this.file_list.clear();
|
---|
159 | this.file_list = null;
|
---|
160 | this.supported_extensions.clear();
|
---|
161 | this.supported_extensions = null;
|
---|
162 | }
|
---|
163 | /** close() **/
|
---|
164 |
|
---|
165 | /** @function deleteIndex
|
---|
166 | */
|
---|
167 | public void deleteIndex(String prefix, String logging_name)
|
---|
168 | {
|
---|
169 | logger.info("[" + logging_name + "] Delete index " + prefix);
|
---|
170 | // and that all old files for this index are removed
|
---|
171 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
172 | File files[] = index_path.listFiles();
|
---|
173 | for (int i = 0; i < files.length; i++)
|
---|
174 | {
|
---|
175 | String file_name = files[i].getName();
|
---|
176 | if (file_name.startsWith(prefix))
|
---|
177 | {
|
---|
178 | files[i].delete();
|
---|
179 | }
|
---|
180 | }
|
---|
181 | }
|
---|
182 | /** deleteIndex(String, String) **/
|
---|
183 |
|
---|
184 | /** @function epochTime
|
---|
185 | * Returns the current time in seconds since 1970JAN01
|
---|
186 | */
|
---|
187 | public long epochTime()
|
---|
188 | {
|
---|
189 | return System.currentTimeMillis()/1000;
|
---|
190 | }
|
---|
191 | /** epochTime() **/
|
---|
192 |
|
---|
193 | /** @function listIndexes
|
---|
194 | */
|
---|
195 | public ArrayDeque<String> listIndexes(boolean include_default)
|
---|
196 | {
|
---|
197 | ///ogger.info("FileIndexer::listIndexes(" + include_default + ")");
|
---|
198 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
199 | String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
200 | // we start by populating a set structure to ensure each prefix only occurs
|
---|
201 | // once
|
---|
202 | HashSet<String> index_parts_set = new HashSet<String>();
|
---|
203 | File files[] = index_path.listFiles();
|
---|
204 | if (files != null)
|
---|
205 | {
|
---|
206 | for (int i = 0; i < files.length; i++)
|
---|
207 | {
|
---|
208 | String file_name = files[i].getName();
|
---|
209 | if (include_default || !file_name.startsWith(default_index_prefix))
|
---|
210 | {
|
---|
211 | String prefix = file_name.substring(0, file_name.indexOf("."));
|
---|
212 | index_parts_set.add(prefix);
|
---|
213 | }
|
---|
214 | }
|
---|
215 | }
|
---|
216 | else
|
---|
217 | {
|
---|
218 | logger.error("[P3] Error! No indexes found - did indexing fail?");
|
---|
219 | }
|
---|
220 | // we then turn the hashset into a nice list (in this case a deque)
|
---|
221 | ArrayDeque<String> index_parts = new ArrayDeque<String>();
|
---|
222 | Iterator<String> index_parts_iterator = index_parts_set.iterator();
|
---|
223 | while (index_parts_iterator.hasNext())
|
---|
224 | {
|
---|
225 | index_parts.add(index_parts_iterator.next());
|
---|
226 | }
|
---|
227 | return index_parts;
|
---|
228 | }
|
---|
229 | /** listIndexes(boolean) **/
|
---|
230 |
|
---|
231 | /**
|
---|
232 | */
|
---|
233 | public void loadManifest(String index_prefix, Path manifest_path)
|
---|
234 | {
|
---|
235 | logger.info("[B" + index_prefix + "] Load manifest " + manifest_path.toString());
|
---|
236 | try
|
---|
237 | {
|
---|
238 | BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile())));
|
---|
239 | String line = "";
|
---|
240 | while ((line = manifest_reader.readLine()) != null)
|
---|
241 | {
|
---|
242 | this.file_list.add(line);
|
---|
243 | }
|
---|
244 | manifest_reader.close();
|
---|
245 | manifest_reader = null;
|
---|
246 | }
|
---|
247 | catch (IOException e)
|
---|
248 | {
|
---|
249 | logger.error("Exception when reading manifest! " + e);
|
---|
250 | }
|
---|
251 | }
|
---|
252 | /** loadManifest(Path) **/
|
---|
253 |
|
---|
254 | /** @function mergeIndexes
|
---|
255 | */
|
---|
256 | public boolean mergeIndexes(String prefix_one, String prefix_two)
|
---|
257 | {
|
---|
258 | ///ogger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");
|
---|
259 | String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
260 | return this.mergeIndexes(prefix_one, prefix_two, prefix_default);
|
---|
261 | }
|
---|
262 | /** mergeIndexes(String, String) **/
|
---|
263 |
|
---|
264 | /** @function mergeIndexes
|
---|
265 | */
|
---|
266 | public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
|
---|
267 | {
|
---|
268 | logger.info("[P3] Merge indexes " + prefix_one + " and " + prefix_two + " => " + prefix_out);
|
---|
269 | // init
|
---|
270 | String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
|
---|
271 | // use StructureMerger class
|
---|
272 | Index.setIndexLoadingProfileAsRetrieval(false);
|
---|
273 | Index index_one = Index.createIndex(index_path, prefix_one);
|
---|
274 | Index index_two = Index.createIndex(index_path, prefix_two);
|
---|
275 | Index index_out = Index.createNewIndex(index_path, prefix_out);
|
---|
276 | StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);
|
---|
277 |
|
---|
278 | // quiet logger
|
---|
279 | Logger root_logger = Logger.getRootLogger();
|
---|
280 | Level log_level = root_logger.getLevel();
|
---|
281 | root_logger.setLevel((Level) Level.OFF);
|
---|
282 | structure_merger.mergeStructures();
|
---|
283 | structure_merger = null;
|
---|
284 | root_logger.setLevel(log_level);
|
---|
285 |
|
---|
286 | // Only print out statistics for 'data'
|
---|
287 | if (prefix_out.equals("data"))
|
---|
288 | {
|
---|
289 | CollectionStatistics collection_statistics = index_out.getCollectionStatistics();
|
---|
290 | logger.info("[P3] Number of Documents: " + collection_statistics.getNumberOfDocuments());
|
---|
291 | logger.info("[P3] Number of Tokens: " + collection_statistics.getNumberOfTokens());
|
---|
292 | logger.info("[P3] Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
|
---|
293 | logger.info("[P3] Number of Pointers: " + collection_statistics.getNumberOfPointers());
|
---|
294 | collection_statistics = null;
|
---|
295 | }
|
---|
296 |
|
---|
297 | // Cleanup and closedown
|
---|
298 | try
|
---|
299 | {
|
---|
300 | index_one.close();
|
---|
301 | index_two.close();
|
---|
302 | index_out.close();
|
---|
303 | }
|
---|
304 | catch (IOException e)
|
---|
305 | {
|
---|
306 | logger.error("Exception while closing indexes: ", e);
|
---|
307 | return false;
|
---|
308 | }
|
---|
309 | index_one = null;
|
---|
310 | index_two = null;
|
---|
311 | index_out = null;
|
---|
312 | this.deleteIndex(prefix_one, "P3");
|
---|
313 | this.deleteIndex(prefix_two, "P3");
|
---|
314 | prefix_one = null;
|
---|
315 | prefix_two = null;
|
---|
316 | return true;
|
---|
317 | }
|
---|
318 | /** mergeIndexes(String, String, String) **/
|
---|
319 |
|
---|
320 | /** @function renameIndex
|
---|
321 | */
|
---|
322 | public void renameIndex(String prefix_in)
|
---|
323 | {
|
---|
324 | ///ogger.info("FileIndexer::renameIndex(" + prefix_in + ")");
|
---|
325 | String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
326 | this.renameIndex(prefix_in, default_prefix);
|
---|
327 | }
|
---|
328 | /** renameIndex(String) **/
|
---|
329 |
|
---|
330 | /** @function renameIndex
|
---|
331 | */
|
---|
332 | public void renameIndex(String prefix_in, String prefix_out)
|
---|
333 | {
|
---|
334 | logger.info("[P3] Rename index " + prefix_in + " => " + prefix_out);
|
---|
335 | prefix_in = prefix_in + ".";
|
---|
336 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
337 | File files[] = index_path.listFiles();
|
---|
338 | for (int i = 0; i < files.length; i++)
|
---|
339 | {
|
---|
340 | File a_file = files[i];
|
---|
341 | String file_name = a_file.getName();
|
---|
342 | if (file_name.startsWith(prefix_in))
|
---|
343 | {
|
---|
344 | String suffix = file_name.substring(file_name.indexOf("."));
|
---|
345 | Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
|
---|
346 | try
|
---|
347 | {
|
---|
348 | Files.move(a_file.toPath(), target_path);
|
---|
349 | }
|
---|
350 | catch (IOException e)
|
---|
351 | {
|
---|
352 | System.err.println("Error! Failed to rename file: " + e);
|
---|
353 | }
|
---|
354 | target_path = null;
|
---|
355 | suffix = null;
|
---|
356 | }
|
---|
357 | file_name = null;
|
---|
358 | a_file = null;
|
---|
359 | }
|
---|
360 | files = null;
|
---|
361 | index_path = null;
|
---|
362 | }
|
---|
363 | /** renameIndex(String, String) **/
|
---|
364 |
|
---|
365 | /** @function runIndex
|
---|
366 | */
|
---|
367 | public void runIndex(String batch_number)
|
---|
368 | {
|
---|
369 | logger.info("[B" + batch_number + "] create index");
|
---|
370 | if (this.file_list == null || this.file_list.size() == 0)
|
---|
371 | {
|
---|
372 | logger.error("No files specified to index. Aborting indexing process.");
|
---|
373 | return;
|
---|
374 | }
|
---|
375 |
|
---|
376 | try
|
---|
377 | {
|
---|
378 | // ensure the index directory exists
|
---|
379 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
380 | if (!index_path.exists() && !index_path.mkdirs())
|
---|
381 | {
|
---|
382 | logger.error("Could not create the index folders at: "+ index_path);
|
---|
383 | logger.error("Aborting indexing process");
|
---|
384 | return;
|
---|
385 | }
|
---|
386 | this.deleteIndex(this.index_prefix, "F");
|
---|
387 | // create the appropriate indexer
|
---|
388 | Indexer indexer;
|
---|
389 | final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
|
---|
390 | logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
|
---|
391 | if (use_single_pass)
|
---|
392 | {
|
---|
393 | indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
394 | }
|
---|
395 | else
|
---|
396 | {
|
---|
397 | indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
398 | }
|
---|
399 | // create the simple file collection object and hand it to the indexer
|
---|
400 | // for indexing
|
---|
401 | SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
|
---|
402 | indexer.index(new Collection[] { sfc });
|
---|
403 | Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
404 | // output some indexing statistics
|
---|
405 | if (the_index != null)
|
---|
406 | {
|
---|
407 | CollectionStatistics collection_statistics = the_index.getCollectionStatistics();
|
---|
408 | logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
|
---|
409 | logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
|
---|
410 | logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
|
---|
411 | logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
|
---|
412 | collection_statistics = null;
|
---|
413 | }
|
---|
414 | else
|
---|
415 | {
|
---|
416 | logger.warn("Nothing indexed!");
|
---|
417 | }
|
---|
418 | }
|
---|
419 | catch(Exception e)
|
---|
420 | {
|
---|
421 | logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
|
---|
422 | }
|
---|
423 | logger.info("FileIndexer::runIndex() - Complete!");
|
---|
424 | }
|
---|
425 | /** runIndex(List<String>) **/
|
---|
426 |
|
---|
427 | /** @function main
|
---|
428 | */
|
---|
429 | public static void main(String[] args)
|
---|
430 | {
|
---|
431 | System.out.println("================================= FileIndexer =================================");
|
---|
432 |
|
---|
433 | // 0. Initialization
|
---|
434 | final int PREPARE = 0;
|
---|
435 | final int INDEX = 1;
|
---|
436 | final int MERGE = 2;
|
---|
437 | int index_mode = PREPARE;
|
---|
438 | Path a_path = null;
|
---|
439 | String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
440 | int batch_size = 0; // No limit
|
---|
441 | int max_files = 0; // No limit
|
---|
442 | String divider = "===============================================================================";
|
---|
443 |
|
---|
444 | // 1. Parse arguments
|
---|
445 | // - first argument is the mode (prepare|index|merge)
|
---|
446 | if (args.length < 1)
|
---|
447 | {
|
---|
448 | System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>] [-maxfiles <int>]");
|
---|
449 | System.out.println(" <FileIndexer> -index -path <path> [-prefix <str>]");
|
---|
450 | System.out.println(" <FileIndexer> -merge");
|
---|
451 | System.out.println("where: paths *must* be absolute");
|
---|
452 | System.out.println(" when preparing 'path' is to the collection directory");
|
---|
453 | System.out.println(" when indexing 'path' is to a manifest file (generated by prepare)");
|
---|
454 | System.out.println(" prefix is 'data' by default");
|
---|
455 | System.out.println(" batch_size is unlimited by default");
|
---|
456 | System.exit(0);
|
---|
457 | }
|
---|
458 | if (args[0].equals("-index"))
|
---|
459 | {
|
---|
460 | index_mode = INDEX;
|
---|
461 | }
|
---|
462 | else if (args[0].equals("-merge"))
|
---|
463 | {
|
---|
464 | index_mode = MERGE;
|
---|
465 | }
|
---|
466 | // - all other arguments
|
---|
467 | for (int argc = 1; (argc + 1) < args.length; argc += 2)
|
---|
468 | {
|
---|
469 | String key = args[argc];
|
---|
470 | String value = args[argc + 1];
|
---|
471 | if (key.equals("-batchsize"))
|
---|
472 | {
|
---|
473 | batch_size = Integer.parseInt(value);
|
---|
474 | }
|
---|
475 | else if (key.equals("-maxfiles"))
|
---|
476 | {
|
---|
477 | max_files = Integer.parseInt(value);
|
---|
478 | }
|
---|
479 | else if (key.equals("-path"))
|
---|
480 | {
|
---|
481 | a_path = Paths.get(value);
|
---|
482 | }
|
---|
483 | else if (key.equals("-prefix"))
|
---|
484 | {
|
---|
485 | index_prefix = value;
|
---|
486 | }
|
---|
487 | else
|
---|
488 | {
|
---|
489 | System.err.println("Warning! Unknown argument: " + key);
|
---|
490 | }
|
---|
491 | }
|
---|
492 | // - check arguments
|
---|
493 | if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
|
---|
494 | {
|
---|
495 | System.err.println("Error! Required argument -path not set or invalid.");
|
---|
496 | System.err.println("");
|
---|
497 | System.exit(0);
|
---|
498 | }
|
---|
499 |
|
---|
500 | // 2. Create the file indexer
|
---|
501 | FileIndexer file_indexer = new FileIndexer(index_prefix);
|
---|
502 |
|
---|
503 | long epoch_time;
|
---|
504 |
|
---|
505 | switch(index_mode)
|
---|
506 | {
|
---|
507 | case PREPARE:
|
---|
508 | logger.info("[P1:" + file_indexer.epochTime() + "] Starting manifest preparation");
|
---|
509 | // 3a. If we are in preparation mode, this is where we go through the
|
---|
510 | // files in the search path, adding those that are indexable by
|
---|
511 | // Terrier's current configuration into one of more manifest files
|
---|
512 | // based upon the value of batch_size.
|
---|
513 | System.out.println("Mode: Preparation");
|
---|
514 | System.out.println("Collection Path: " + a_path.toString());
|
---|
515 | if (batch_size != 0)
|
---|
516 | {
|
---|
517 | System.out.println("Batch Size: " + batch_size);
|
---|
518 | }
|
---|
519 | if (max_files != 0)
|
---|
520 | {
|
---|
521 | System.out.println("Max Files: " + max_files);
|
---|
522 | }
|
---|
523 | System.out.println(divider);
|
---|
524 | // - we're going to perform a breadth-first recursive search for files
|
---|
525 | // using the canIndex() function of the indexer to determine if this
|
---|
526 | // is (or would be) a file to index
|
---|
527 | ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
|
---|
528 | search_paths.add(a_path);
|
---|
529 | int file_count = 0;
|
---|
530 | int total_file_count = 0; // Used in conjunction with max_files
|
---|
531 | int manifest_count = 0;
|
---|
532 | boolean have_max_files = false;
|
---|
533 | PrintWriter manifest_writer = null;
|
---|
534 | while (!search_paths.isEmpty() && !have_max_files)
|
---|
535 | {
|
---|
536 | Path search_path = search_paths.remove();
|
---|
537 | File files_raw[] = search_path.toFile().listFiles();
|
---|
538 | // randomize file order
|
---|
539 | List<File> files = Arrays.asList(files_raw);
|
---|
540 | Collections.shuffle(files);
|
---|
541 | // iterate through files filling manifests
|
---|
542 | for (int i = 0; i < files.size() && !have_max_files; i++)
|
---|
543 | {
|
---|
544 | Path file_path = files.get(i).toPath();
|
---|
545 | String file_name = file_path.getFileName().toString();
|
---|
546 | // skip . and ..
|
---|
547 | if (file_name.equals(".") || file_name.equals(".."))
|
---|
548 | {
|
---|
549 | // do nothing
|
---|
550 | }
|
---|
551 | // if this is a directory, add it to the queue of paths to search
|
---|
552 | else if (Files.isDirectory(file_path))
|
---|
553 | {
|
---|
554 | search_paths.add(file_path);
|
---|
555 | }
|
---|
556 | else
|
---|
557 | {
|
---|
558 | // would we expect to index this file?
|
---|
559 | if (file_indexer.canIndex(file_name))
|
---|
560 | {
|
---|
561 | // Ensure we opened a file handle for writing this manifest
|
---|
562 | if (manifest_writer == null)
|
---|
563 | {
|
---|
564 | Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
|
---|
565 | try
|
---|
566 | {
|
---|
567 | manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
|
---|
568 | }
|
---|
569 | catch (IOException e)
|
---|
570 | {
|
---|
571 | System.err.println("Error when opening manifest for writing! " + e);
|
---|
572 | }
|
---|
573 | System.out.println("* Writing to manifest: " + manifest_path.toString());
|
---|
574 | }
|
---|
575 | manifest_writer.println(file_path.toString());
|
---|
576 | file_count++;
|
---|
577 | total_file_count++;
|
---|
578 | // if we have a batch size, and we've found enough files...
|
---|
579 | if (manifest_writer != null && batch_size > 0 && file_count >= batch_size)
|
---|
580 | {
|
---|
581 | manifest_writer.close();
|
---|
582 | manifest_writer = null;
|
---|
583 | manifest_count++;
|
---|
584 | file_count = 0;
|
---|
585 | }
|
---|
586 | // if we have a max files limit and we've done enough files, then
|
---|
587 | // break out of for and while loops
|
---|
588 | if (max_files > 0 && file_count >= max_files)
|
---|
589 | {
|
---|
590 | have_max_files = true;
|
---|
591 | }
|
---|
592 | }
|
---|
593 | }
|
---|
594 | }
|
---|
595 | }
|
---|
596 | if (manifest_writer != null)
|
---|
597 | {
|
---|
598 | manifest_writer.close();
|
---|
599 | manifest_writer = null;
|
---|
600 | }
|
---|
601 | search_paths.clear();
|
---|
602 | search_paths = null;
|
---|
603 | logger.info("[P1:" + file_indexer.epochTime() + "] Complete");
|
---|
604 | break;
|
---|
605 |
|
---|
606 | case INDEX:
|
---|
607 | // 3b. Load the manifest specified by the search path and index files
|
---|
608 | String manifest_path = a_path.toString();
|
---|
609 | System.out.println("Mode: Index");
|
---|
610 | System.out.println("Prefix: " + index_prefix);
|
---|
611 | System.out.println("Manifest: " + manifest_path);
|
---|
612 | System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
|
---|
613 | System.out.println(divider);
|
---|
614 |
|
---|
615 | logger.info("[B" + index_prefix + ":" + file_indexer.epochTime() + "] Starting");
|
---|
616 | file_indexer.loadManifest(index_prefix, a_path);
|
---|
617 | file_indexer.runIndex(index_prefix);
|
---|
618 | logger.info("[B" + index_prefix + ":" + file_indexer.epochTime() + "] Complete");
|
---|
619 | break;
|
---|
620 |
|
---|
621 | case MERGE:
|
---|
622 | logger.info("[P3:" + file_indexer.epochTime() + "] Starting index merging");
|
---|
623 | System.out.println("Mode: Merge");
|
---|
624 | System.out.println(divider);
|
---|
625 | // 3c. User has requested a merging process.
|
---|
626 | // Look in the default index path and locate all of the indexes to merge
|
---|
627 | // (all those not prefixed 'data')
|
---|
628 | ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
|
---|
629 | // We merge the indexes two at a time, creating new intermediate indexes,
|
---|
630 | // while there are still more than two indexes left in the queue. We
|
---|
631 | // delete indexes that we have already merged.
|
---|
632 | int intermediate_file_counter = 0;
|
---|
633 | while (index_parts.size() > 2)
|
---|
634 | {
|
---|
635 | String index_part_one_prefix = index_parts.remove();
|
---|
636 | String index_part_two_prefix = index_parts.remove();
|
---|
637 | String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
|
---|
638 | ///ogger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");
|
---|
639 | file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix);
|
---|
640 | index_part_one_prefix = null;
|
---|
641 | index_part_two_prefix = null;
|
---|
642 | // - add the intermediate index to the list of indexes to be merged
|
---|
643 | index_parts.add(intermediate_index_prefix);
|
---|
644 | // - and increment the intermediate file count for the next iteration
|
---|
645 | intermediate_file_counter++;
|
---|
646 | }
|
---|
647 | // If there are two left we merge them into the final index
|
---|
648 | if (index_parts.size() == 2)
|
---|
649 | {
|
---|
650 | String index_part_one_prefix = index_parts.remove();
|
---|
651 | String index_part_two_prefix = index_parts.remove();
|
---|
652 | ///ogger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");
|
---|
653 | file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix);
|
---|
654 | index_part_one_prefix = null;
|
---|
655 | index_part_two_prefix = null;
|
---|
656 | }
|
---|
657 | // Otherwise we just rename the files that are there into the final
|
---|
658 | // index
|
---|
659 | else if (index_parts.size() == 1)
|
---|
660 | {
|
---|
661 | String index_part_prefix = index_parts.remove();
|
---|
662 | ///ogger.info("Renaming index \"" + index_part_prefix + "\" into final index");
|
---|
663 | file_indexer.renameIndex(index_part_prefix);
|
---|
664 | index_part_prefix = null;
|
---|
665 | }
|
---|
666 | // no index parts detected - was merge run by mistake?
|
---|
667 | else
|
---|
668 | {
|
---|
669 | logger.warn("[P3] Warning! No index parts detected - merge has no effect");
|
---|
670 | }
|
---|
671 | // clean up
|
---|
672 | index_parts.clear();
|
---|
673 | index_parts = null;
|
---|
674 | logger.info("[P3:" + file_indexer.epochTime() + "] Complete");
|
---|
675 | break;
|
---|
676 |
|
---|
677 | default:
|
---|
678 | logger.warn("Unknown FileIndexer mode requested");
|
---|
679 | }
|
---|
680 |
|
---|
681 | // 4. Finish indexing
|
---|
682 | file_indexer.close();
|
---|
683 | file_indexer = null;
|
---|
684 |
|
---|
685 | // 5. Complete!
|
---|
686 | System.out.println(divider);
|
---|
687 | System.out.println("Complete!");
|
---|
688 | System.out.println(divider);
|
---|
689 |
|
---|
690 | System.out.println("");
|
---|
691 | }
|
---|
692 | /** main() **/
|
---|
693 | } |
---|