1 | package org.terrier.applications;
|
---|
2 |
|
---|
3 | import java.io.BufferedReader;
|
---|
4 | import java.io.BufferedOutputStream;
|
---|
5 | import java.io.File;
|
---|
6 | import java.io.FileInputStream;
|
---|
7 | import java.io.FileOutputStream;
|
---|
8 | import java.io.InputStreamReader;
|
---|
9 | import java.io.IOException;
|
---|
10 | import java.io.PrintWriter;
|
---|
11 | import java.nio.file.Files;
|
---|
12 | import java.nio.file.Path;
|
---|
13 | import java.nio.file.Paths;
|
---|
14 | import java.util.ArrayDeque;
|
---|
15 | import java.util.ArrayList;
|
---|
16 | import java.util.List;
|
---|
17 | import java.util.HashSet;
|
---|
18 | import java.util.Iterator;
|
---|
19 |
|
---|
20 | import org.apache.log4j.Logger;
|
---|
21 |
|
---|
22 | import org.terrier.indexing.BlockIndexer;
|
---|
23 | import org.terrier.indexing.BlockSinglePassIndexer;
|
---|
24 | import org.terrier.indexing.Collection;
|
---|
25 | import org.terrier.indexing.Indexer;
|
---|
26 | import org.terrier.indexing.SimpleFileCollection;
|
---|
27 | import org.terrier.structures.CollectionStatistics;
|
---|
28 | import org.terrier.structures.Index;
|
---|
29 | import org.terrier.structures.merging.StructureMerger;
|
---|
30 | import org.terrier.utility.ApplicationSetup;
|
---|
31 |
|
---|
32 | /** @class FileIndexer
|
---|
33 | *
|
---|
34 | * A simple indexer, based on DesktopTerrier, that is intended to build a
|
---|
35 | * searchable index from the contents of a directory in the file system. This
|
---|
36 | * indexer has been created with parallel processing in mind, in that you can
|
---|
37 | * split a large or computationally complex directory into several batches
|
---|
38 | * (the contents of which are recorded by manifest files) and then process
|
---|
39 | * each batch independently (and in parallel on a multicore machine or a
|
---|
40 | * cluster). Once this is complete you can then merge each pair of indexes
|
---|
41 | * together - eventually generating a final index encompassing all of the
|
---|
42 | * batches.
|
---|
43 | *
|
---|
44 | * Start by preparing the collection for indexing by locating indexable files
|
---|
45 | * and recording them in one or more manifest files, i.e.:
|
---|
46 | *
|
---|
47 | * ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
|
---|
48 | * -path /data/mycollection/ -batchsize 10
|
---|
49 | *
|
---|
50 | * You then build the uniquely named index (in this case with a prefix of
|
---|
51 | * '000') for each manifest, i.e.:
|
---|
52 | *
|
---|
53 | * ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
|
---|
54 | * -path /terrier/var/manifest-000.spec -prefix 000
|
---|
55 | *
|
---|
56 | * If there is more than one index (for example, with prefixes '000' and '001'), you
|
---|
57 | * can then merge them (into a single index with the prefix 'data') using this
|
---|
58 | * command:
|
---|
59 | *
|
---|
60 | * ./bin/anyclass.sh \
|
---|
61 | * org.terrier.structures.merging.BlockStructureMerger \
|
---|
62 | * /terrier/var/index/ 000 /terrier/var/index/ 001 \
|
---|
63 | * /terrier/var/index/ data
|
---|
64 | *
|
---|
65 | */
|
---|
66 | public class FileIndexer
|
---|
67 | {
|
---|
68 | private String index_prefix;
|
---|
69 |
|
---|
70 | private HashSet<String> supported_extensions;
|
---|
71 |
|
---|
72 | private List<String> file_list;
|
---|
73 |
|
---|
74 | protected static final Logger logger = Logger.getLogger(FileIndexer.class);
|
---|
75 |
|
---|
76 | /** @function FileIndexer
|
---|
77 | * Default constructor
|
---|
78 | */
|
---|
79 | public FileIndexer(String index_prefix)
|
---|
80 | {
|
---|
81 | logger.info("FileIndexer::FileIndexer(" + index_prefix + ")");
|
---|
82 | this.index_prefix = index_prefix;
|
---|
83 | this.supported_extensions = new HashSet<String>();
|
---|
84 | this.file_list = new ArrayList<String>();
|
---|
85 |
|
---|
86 | //setting properties for the application
|
---|
87 | if ((ApplicationSetup.getProperty("indexer.meta.forward.keys", null)) == null)
|
---|
88 | {
|
---|
89 | ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
|
---|
90 | ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
|
---|
91 | }
|
---|
92 | ApplicationSetup.setProperty("indexing.max.tokens", "10000");
|
---|
93 | ApplicationSetup.setProperty("invertedfile.processterms","25000");
|
---|
94 | ApplicationSetup.setProperty("ignore.low.idf.terms","false");
|
---|
95 | ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");
|
---|
96 |
|
---|
97 | // we need to know what extensions are supported by the indexer
|
---|
98 | String extension_parsers_raw = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument");
|
---|
99 | // parse up the string, and store the extensions (only) in the HashSet
|
---|
100 | if (extension_parsers_raw.length() > 0)
|
---|
101 | {
|
---|
102 | String[] extension_parsers = extension_parsers_raw.split("\\s*,\\s*");
|
---|
103 | // for each parser, of the form: <ext>:<documentclass>
|
---|
104 | for (int i = 0; i < extension_parsers.length; i++)
|
---|
105 | {
|
---|
106 | String extension_parser = extension_parsers[i];
|
---|
107 | String[] mapping = extension_parser.split(":");
|
---|
108 | // a well-formed mapping has an extension and a document class
|
---|
109 | if (mapping.length == 2)
|
---|
110 | {
|
---|
111 | // we store just the extension, leaving the actual mapping up to
|
---|
112 | // SimpleFileCollection
|
---|
113 | supported_extensions.add(mapping[0]);
|
---|
114 | }
|
---|
115 | else
|
---|
116 | {
|
---|
117 | logger.warn("Malformed extension parser mapping: " + extension_parser);
|
---|
118 | }
|
---|
119 | }
|
---|
120 | }
|
---|
121 | else
|
---|
122 | {
|
---|
123 | logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
|
---|
124 | }
|
---|
125 | }
|
---|
126 | /** FileIndexer() **/
|
---|
127 |
|
---|
128 | /** @function canIndex
|
---|
129 | * Given a path to a file, determine if Terrier's current configuration
|
---|
130 | * allows that file to be indexed
|
---|
131 | */
|
---|
132 | public boolean canIndex(String file_path)
|
---|
133 | {
|
---|
134 | logger.info("FileIndexer::canIndex(" + file_path.toString() + ")");
|
---|
135 | // we may have specified a default Document class for all files encountered
|
---|
136 | if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
|
---|
137 | {
|
---|
138 | return true;
|
---|
139 | }
|
---|
140 | // otherwise, check through the list of supported file extensions to see
|
---|
141 | // if one matches this file
|
---|
142 | // - get the file's extension
|
---|
143 | String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1);
|
---|
144 | // - see if it exists in the array of supported extensions
|
---|
145 | return this.supported_extensions.contains(file_extension);
|
---|
146 | }
|
---|
147 | /** canIndex(String) **/
|
---|
148 |
|
---|
149 | /** @function close
|
---|
150 | */
|
---|
151 | public void close()
|
---|
152 | {
|
---|
153 | logger.info("FileIndexer::close()");
|
---|
154 | this.file_list.clear();
|
---|
155 | this.file_list = null;
|
---|
156 | this.supported_extensions.clear();
|
---|
157 | this.supported_extensions = null;
|
---|
158 | }
|
---|
159 | /** close() **/
|
---|
160 |
|
---|
161 | /** @function deleteIndex
|
---|
162 | */
|
---|
163 | public void deleteIndex(String prefix)
|
---|
164 | {
|
---|
165 | logger.info("FileIndexer::deleteIndex(" + prefix + ")");
|
---|
166 | // and that all old files for this index are removed
|
---|
167 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
168 | File files[] = index_path.listFiles();
|
---|
169 | for (int i = 0; i < files.length; i++)
|
---|
170 | {
|
---|
171 | String file_name = files[i].getName();
|
---|
172 | if (file_name.startsWith(prefix))
|
---|
173 | {
|
---|
174 | files[i].delete();
|
---|
175 | }
|
---|
176 | }
|
---|
177 | }
|
---|
178 | /** deleteIndex(String) **/
|
---|
179 |
|
---|
180 | /** @function listIndexes
|
---|
181 | */
|
---|
182 | public ArrayDeque<String> listIndexes(boolean include_default)
|
---|
183 | {
|
---|
184 | logger.info("FileIndexer::listIndexes(" + include_default + ")");
|
---|
185 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
186 | String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
187 | // we start by populating a set structure to ensure each prefix only occurs
|
---|
188 | // once
|
---|
189 | HashSet<String> index_parts_set = new HashSet<String>();
|
---|
190 | File files[] = index_path.listFiles();
|
---|
191 | for (int i = 0; i < files.length; i++)
|
---|
192 | {
|
---|
193 | String file_name = files[i].getName();
|
---|
194 | if (include_default || !file_name.startsWith(default_index_prefix))
|
---|
195 | {
|
---|
196 | String prefix = file_name.substring(0, file_name.indexOf("."));
|
---|
197 | index_parts_set.add(prefix);
|
---|
198 | }
|
---|
199 | }
|
---|
200 | // we then turn the hashset into a nice list (in this case a deque)
|
---|
201 | ArrayDeque<String> index_parts = new ArrayDeque<String>();
|
---|
202 | Iterator<String> index_parts_iterator = index_parts_set.iterator();
|
---|
203 | while (index_parts_iterator.hasNext())
|
---|
204 | {
|
---|
205 | index_parts.add(index_parts_iterator.next());
|
---|
206 | }
|
---|
207 | return index_parts;
|
---|
208 | }
|
---|
209 | /** listIndexes(boolean) **/
|
---|
210 |
|
---|
211 | /**
|
---|
212 | */
|
---|
213 | public void loadManifest(Path manifest_path)
|
---|
214 | {
|
---|
215 | logger.info("FileIndexer::loadManifest(" + manifest_path.toString() + ")");
|
---|
216 | try
|
---|
217 | {
|
---|
218 | BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile())));
|
---|
219 | String line = "";
|
---|
220 | while ((line = manifest_reader.readLine()) != null)
|
---|
221 | {
|
---|
222 | this.file_list.add(line);
|
---|
223 | }
|
---|
224 | manifest_reader.close();
|
---|
225 | manifest_reader = null;
|
---|
226 | }
|
---|
227 | catch (IOException e)
|
---|
228 | {
|
---|
229 | logger.error("Exception when reading manifest! " + e);
|
---|
230 | }
|
---|
231 | }
|
---|
232 | /** loadManifest(Path) **/
|
---|
233 |
|
---|
234 | /** @function mergeIndexes
|
---|
235 | */
|
---|
236 | public boolean mergeIndexes(String prefix_one, String prefix_two)
|
---|
237 | {
|
---|
238 | logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");
|
---|
239 | String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
240 | return this.mergeIndexes(prefix_one, prefix_two, prefix_default);
|
---|
241 | }
|
---|
242 | /** mergeIndexes(String, String) **/
|
---|
243 |
|
---|
244 | /** @function mergeIndexes
|
---|
245 | */
|
---|
246 | public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
|
---|
247 | {
|
---|
248 | logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ", " + prefix_out + ")");
|
---|
249 | // init
|
---|
250 | String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
|
---|
251 | // use StructureMerger class
|
---|
252 | Index.setIndexLoadingProfileAsRetrieval(false);
|
---|
253 | Index index_one = Index.createIndex(index_path, prefix_one);
|
---|
254 | Index index_two = Index.createIndex(index_path, prefix_two);
|
---|
255 | Index index_out = Index.createNewIndex(index_path, prefix_out);
|
---|
256 | StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);
|
---|
257 | structure_merger.mergeStructures();
|
---|
258 | structure_merger = null;
|
---|
259 | CollectionStatistics collection_statistics = index_out.getCollectionStatistics();
|
---|
260 | logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
|
---|
261 | logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
|
---|
262 | logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
|
---|
263 | logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
|
---|
264 | collection_statistics = null;
|
---|
265 | try
|
---|
266 | {
|
---|
267 | index_one.close();
|
---|
268 | index_two.close();
|
---|
269 | index_out.close();
|
---|
270 | }
|
---|
271 | catch (IOException e)
|
---|
272 | {
|
---|
273 | logger.error("Exception while closing indexes: ", e);
|
---|
274 | return false;
|
---|
275 | }
|
---|
276 | index_one = null;
|
---|
277 | index_two = null;
|
---|
278 | index_out = null;
|
---|
279 | this.deleteIndex(prefix_one);
|
---|
280 | this.deleteIndex(prefix_two);
|
---|
281 | prefix_one = null;
|
---|
282 | prefix_two = null;
|
---|
283 | return true;
|
---|
284 | }
|
---|
285 | /** mergeIndexes(String, String, String) **/
|
---|
286 |
|
---|
287 | /** @function renameIndex
|
---|
288 | */
|
---|
289 | public void renameIndex(String prefix_in)
|
---|
290 | {
|
---|
291 | logger.info("FileIndexer::renameIndex(" + prefix_in + ")");
|
---|
292 | String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
293 | this.renameIndex(prefix_in, default_prefix);
|
---|
294 | }
|
---|
295 | /** renameIndex(String) **/
|
---|
296 |
|
---|
297 | /** @function renameIndex
|
---|
298 | */
|
---|
299 | public void renameIndex(String prefix_in, String prefix_out)
|
---|
300 | {
|
---|
301 | logger.info("FileIndexer::renameIndex(" + prefix_in + ", " + prefix_out + ")");
|
---|
302 | prefix_in = prefix_in + ".";
|
---|
303 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
304 | File files[] = index_path.listFiles();
|
---|
305 | for (int i = 0; i < files.length; i++)
|
---|
306 | {
|
---|
307 | File a_file = files[i];
|
---|
308 | String file_name = a_file.getName();
|
---|
309 | if (file_name.startsWith(prefix_in))
|
---|
310 | {
|
---|
311 | String suffix = file_name.substring(file_name.indexOf("."));
|
---|
312 | Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
|
---|
313 | try
|
---|
314 | {
|
---|
315 | Files.move(a_file.toPath(), target_path);
|
---|
316 | }
|
---|
317 | catch (IOException e)
|
---|
318 | {
|
---|
319 | System.err.println("Error! Failed to rename file: " + e);
|
---|
320 | }
|
---|
321 | target_path = null;
|
---|
322 | suffix = null;
|
---|
323 | }
|
---|
324 | file_name = null;
|
---|
325 | a_file = null;
|
---|
326 | }
|
---|
327 | files = null;
|
---|
328 | index_path = null;
|
---|
329 | }
|
---|
330 | /** renameIndex(String, String) **/
|
---|
331 |
|
---|
332 | /** @function runIndex
|
---|
333 | */
|
---|
334 | public void runIndex()
|
---|
335 | {
|
---|
336 | logger.info("FileIndexer::runIndex()");
|
---|
337 | if (this.file_list == null || this.file_list.size() == 0)
|
---|
338 | {
|
---|
339 | logger.error("No files specified to index. Aborting indexing process.");
|
---|
340 | return;
|
---|
341 | }
|
---|
342 |
|
---|
343 | try
|
---|
344 | {
|
---|
345 | // ensure the index directory exists
|
---|
346 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
347 | if (!index_path.exists() && !index_path.mkdirs())
|
---|
348 | {
|
---|
349 | logger.error("Could not create the index folders at: "+ index_path);
|
---|
350 | logger.error("Aborting indexing process");
|
---|
351 | return;
|
---|
352 | }
|
---|
353 | this.deleteIndex(this.index_prefix);
|
---|
354 | // create the appropriate indexer
|
---|
355 | Indexer indexer;
|
---|
356 | final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
|
---|
357 | logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
|
---|
358 | if (use_single_pass)
|
---|
359 | {
|
---|
360 | indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
361 | }
|
---|
362 | else
|
---|
363 | {
|
---|
364 | indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
365 | }
|
---|
366 | // create the simple file collection object and hand it to the indexer
|
---|
367 | // for indexing
|
---|
368 | SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
|
---|
369 | indexer.index(new Collection[] { sfc });
|
---|
370 | Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
371 | // output some indexing statistics
|
---|
372 | if (the_index != null)
|
---|
373 | {
|
---|
374 | CollectionStatistics collection_statistics = the_index.getCollectionStatistics();
|
---|
375 | logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
|
---|
376 | logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
|
---|
377 | logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
|
---|
378 | logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
|
---|
379 | collection_statistics = null;
|
---|
380 | }
|
---|
381 | else
|
---|
382 | {
|
---|
383 | logger.warn("Nothing indexed!");
|
---|
384 | }
|
---|
385 | }
|
---|
386 | catch(Exception e)
|
---|
387 | {
|
---|
388 | logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
|
---|
389 | }
|
---|
390 | logger.info("FileIndexer::runIndex() - Complete!");
|
---|
391 | }
|
---|
392 | /** runIndex(List<String>) **/
|
---|
393 |
|
---|
394 | /** @function main
|
---|
395 | */
|
---|
396 | public static void main(String[] args)
|
---|
397 | {
|
---|
398 | System.out.println("================================= FileIndexer =================================");
|
---|
399 |
|
---|
400 | // 0. Initialization
|
---|
401 | final int PREPARE = 0;
|
---|
402 | final int INDEX = 1;
|
---|
403 | final int MERGE = 2;
|
---|
404 | int index_mode = PREPARE;
|
---|
405 | Path a_path = null;
|
---|
406 | String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
407 | int batch_size = 0; // No limit
|
---|
408 | String divider = "===============================================================================";
|
---|
409 |
|
---|
410 | // 1. Parse arguments
|
---|
411 | // - first argument is the mode (prepare|index|merge)
|
---|
412 | if (args.length < 1)
|
---|
413 | {
|
---|
414 | System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>]");
|
---|
415 | System.out.println(" <FileIndexer> -index -path <path> [-prefix <str>]");
|
---|
416 | System.out.println(" <FileIndexer> -merge");
|
---|
417 | System.out.println("where: paths *must* be absolute");
|
---|
418 | System.out.println(" when preparing 'path' is to the collection directory");
|
---|
419 | System.out.println(" when indexing 'path' is to a manifest file (generated by prepare)");
|
---|
420 | System.out.println(" prefix is 'data' by default");
|
---|
421 | System.out.println(" batch_size is unlimited by default");
|
---|
422 | System.exit(0);
|
---|
423 | }
|
---|
424 | if (args[0].equals("-index"))
|
---|
425 | {
|
---|
426 | index_mode = INDEX;
|
---|
427 | }
|
---|
428 | else if (args[0].equals("-merge"))
|
---|
429 | {
|
---|
430 | index_mode = MERGE;
|
---|
431 | }
|
---|
432 | // - all other arguments
|
---|
433 | for (int argc = 1; (argc + 1) < args.length; argc += 2)
|
---|
434 | {
|
---|
435 | String key = args[argc];
|
---|
436 | String value = args[argc + 1];
|
---|
437 | if (key.equals("-batchsize"))
|
---|
438 | {
|
---|
439 | batch_size = Integer.parseInt(value);
|
---|
440 | }
|
---|
441 | else if (key.equals("-path"))
|
---|
442 | {
|
---|
443 | a_path = Paths.get(value);
|
---|
444 | }
|
---|
445 | else if (key.equals("-prefix"))
|
---|
446 | {
|
---|
447 | index_prefix = value;
|
---|
448 | }
|
---|
449 | else
|
---|
450 | {
|
---|
451 | System.err.println("Warning! Unknown argument: " + key);
|
---|
452 | }
|
---|
453 | }
|
---|
454 | // - check arguments
|
---|
455 | if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
|
---|
456 | {
|
---|
457 | System.err.println("Error! Required argument -path not set or invalid.");
|
---|
458 | System.err.println("");
|
---|
459 | System.exit(0);
|
---|
460 | }
|
---|
461 |
|
---|
462 | // 2. Create the file indexer
|
---|
463 | FileIndexer file_indexer = new FileIndexer(index_prefix);
|
---|
464 |
|
---|
465 | switch(index_mode)
|
---|
466 | {
|
---|
467 | case PREPARE:
|
---|
468 | // 3a. If we are in preparation mode, this is where we go through the
|
---|
469 | // files in the search path, adding those that are indexable by
|
---|
470 | // Terrier's current configuration into one of more manifest files
|
---|
471 | // based upon the value of batch_size.
|
---|
472 | System.out.println("Mode: Preparation");
|
---|
473 | System.out.println("Collection Path: " + a_path.toString());
|
---|
474 | if (batch_size != 0)
|
---|
475 | {
|
---|
476 | System.out.println("Batch Size: " + batch_size);
|
---|
477 | }
|
---|
478 | System.out.println(divider);
|
---|
479 | // - we're going to perform a breadth-first recursive search for files
|
---|
480 | // using the canIndex() function of the indexer to determine if this
|
---|
481 | // is (or would be) a file to index
|
---|
482 | ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
|
---|
483 | search_paths.add(a_path);
|
---|
484 | int file_count = 0;
|
---|
485 | int manifest_count = 0;
|
---|
486 | PrintWriter manifest_writer = null;
|
---|
487 | while (!search_paths.isEmpty())
|
---|
488 | {
|
---|
489 | Path search_path = search_paths.remove();
|
---|
490 | File files[] = search_path.toFile().listFiles();
|
---|
491 | for (int i = 0; i < files.length; i++)
|
---|
492 | {
|
---|
493 | Path file_path = files[i].toPath();
|
---|
494 | String file_name = file_path.getFileName().toString();
|
---|
495 | // skip . and ..
|
---|
496 | if (file_name.equals(".") || file_name.equals(".."))
|
---|
497 | {
|
---|
498 | // do nothing
|
---|
499 | }
|
---|
500 | // if this is a directory, add it to the queue of paths to search
|
---|
501 | else if (Files.isDirectory(file_path))
|
---|
502 | {
|
---|
503 | search_paths.add(file_path);
|
---|
504 | }
|
---|
505 | else
|
---|
506 | {
|
---|
507 | // would we expect to index this file?
|
---|
508 | if (file_indexer.canIndex(file_name))
|
---|
509 | {
|
---|
510 | // Ensure we opened a file handle for writing this manifest
|
---|
511 | if (manifest_writer == null)
|
---|
512 | {
|
---|
513 | Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
|
---|
514 | try
|
---|
515 | {
|
---|
516 | manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
|
---|
517 | }
|
---|
518 | catch (IOException e)
|
---|
519 | {
|
---|
520 | System.err.println("Error when opening manifest for writing! " + e);
|
---|
521 | }
|
---|
522 | System.out.println("* Writing to manifest: " + manifest_path.toString());
|
---|
523 | }
|
---|
524 | manifest_writer.println(file_path.toString());
|
---|
525 | file_count++;
|
---|
526 | // if we have a file limit, and we've found enough files...
|
---|
527 | if (manifest_writer != null && batch_size > 0 && file_count >= batch_size)
|
---|
528 | {
|
---|
529 | manifest_writer.close();
|
---|
530 | manifest_writer = null;
|
---|
531 | manifest_count++;
|
---|
532 | file_count = 0;
|
---|
533 | }
|
---|
534 | }
|
---|
535 | }
|
---|
536 | }
|
---|
537 | }
|
---|
538 | if (manifest_writer != null)
|
---|
539 | {
|
---|
540 | manifest_writer.close();
|
---|
541 | manifest_writer = null;
|
---|
542 | }
|
---|
543 | search_paths.clear();
|
---|
544 | search_paths = null;
|
---|
545 | break;
|
---|
546 |
|
---|
547 | case INDEX:
|
---|
548 | // 3b. Load the manifest specified by the search path and index files
|
---|
549 | System.out.println("Mode: Index");
|
---|
550 | System.out.println("Manifest Path: " + a_path.toString());
|
---|
551 | System.out.println("Prefix: " + index_prefix);
|
---|
552 | System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
|
---|
553 | System.out.println(divider);
|
---|
554 | file_indexer.loadManifest(a_path);
|
---|
555 | file_indexer.runIndex();
|
---|
556 | break;
|
---|
557 |
|
---|
558 | case MERGE:
|
---|
559 | System.out.println("Mode: Merge");
|
---|
560 | System.out.println(divider);
|
---|
561 | // 3c. User has requested a merging process.
|
---|
562 | logger.info("[" + (System.currentTimeMillis()/1000) + "] Merging started");
|
---|
563 | // Look in the default index path and locate all of the indexes to merge
|
---|
564 | // (all those not prefixed 'data')
|
---|
565 | ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
|
---|
566 | // We merge the indexes two at a time, creating new intermediate indexes,
|
---|
567 | // while there are still more than two indexes left in the queue. We
|
---|
568 | // delete indexes that we have already merged.
|
---|
569 | int intermediate_file_counter = 0;
|
---|
570 | while (index_parts.size() > 2)
|
---|
571 | {
|
---|
572 | String index_part_one_prefix = index_parts.remove();
|
---|
573 | String index_part_two_prefix = index_parts.remove();
|
---|
574 | String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
|
---|
575 | logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");
|
---|
576 | file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix);
|
---|
577 | index_part_one_prefix = null;
|
---|
578 | index_part_two_prefix = null;
|
---|
579 | // - add the intermediate index to the list of indexes to be merged
|
---|
580 | index_parts.add(intermediate_index_prefix);
|
---|
581 | // - and increment the intermediate file count for the next iteration
|
---|
582 | intermediate_file_counter++;
|
---|
583 | }
|
---|
584 | // If there are two left we merge them into the final index
|
---|
585 | if (index_parts.size() == 2)
|
---|
586 | {
|
---|
587 | String index_part_one_prefix = index_parts.remove();
|
---|
588 | String index_part_two_prefix = index_parts.remove();
|
---|
589 | logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");
|
---|
590 | file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix);
|
---|
591 | index_part_one_prefix = null;
|
---|
592 | index_part_two_prefix = null;
|
---|
593 | }
|
---|
594 | // Otherwise we just rename the files that are there into the final
|
---|
595 | // index
|
---|
596 | else if (index_parts.size() == 1)
|
---|
597 | {
|
---|
598 | String index_part_prefix = index_parts.remove();
|
---|
599 | logger.info("Renaming index \"" + index_part_prefix + "\" into final index");
|
---|
600 | file_indexer.renameIndex(index_part_prefix);
|
---|
601 | index_part_prefix = null;
|
---|
602 | }
|
---|
603 | // no index parts detected - was merge run by mistake?
|
---|
604 | else
|
---|
605 | {
|
---|
606 | logger.warn("No index parts detected - merge has no effect");
|
---|
607 | }
|
---|
608 | // clean up
|
---|
609 | index_parts.clear();
|
---|
610 | index_parts = null;
|
---|
611 | logger.info("[" + (System.currentTimeMillis()/1000) + "] The merging is complete!");
|
---|
612 | break;
|
---|
613 |
|
---|
614 | default:
|
---|
615 | logger.warn("Unknown FileIndexer mode requested");
|
---|
616 | }
|
---|
617 |
|
---|
618 | // 4. Finish indexing
|
---|
619 | file_indexer.close();
|
---|
620 | file_indexer = null;
|
---|
621 |
|
---|
622 | // 5. Complete!
|
---|
623 | System.out.println(divider);
|
---|
624 | System.out.println("Complete!");
|
---|
625 | System.out.println(divider);
|
---|
626 |
|
---|
627 | System.out.println("");
|
---|
628 | }
|
---|
629 | /** main() **/
|
---|
630 | } |
---|