Changeset 26240 for gs2-extensions
- Timestamp: 2012-09-25T12:40:56+12:00 (12 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/opt/Terrier/FileIndexer.java
r26237 r26240 20 20 import java.util.Iterator; 21 21 22 import org.apache.log4j.Level; 22 23 import org.apache.log4j.Logger; 23 24 … … 81 82 public FileIndexer(String index_prefix) 82 83 { 83 logger.info("FileIndexer::FileIndexer(" + index_prefix + ")");84 ///ogger.info("FileIndexer::FileIndexer(" + index_prefix + ")"); 84 85 this.index_prefix = index_prefix; 85 86 this.supported_extensions = new HashSet<String>(); … … 134 135 public boolean canIndex(String file_path) 135 136 { 136 logger.info("FileIndexer::canIndex(" + file_path.toString() + ")");137 137 // we may have specified a default Document class for all files encountered 138 138 if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals("")) … … 145 145 String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1); 146 146 // - see if it exists in the array of supported extensions 147 return this.supported_extensions.contains(file_extension); 147 boolean result = this.supported_extensions.contains(file_extension); 148 logger.info("[P1] Can index \"" + file_path.toString() + "\"? 
=> " + result); 149 return result; 148 150 } 149 151 /** canIndex(String) **/ … … 153 155 public void close() 154 156 { 155 logger.info("FileIndexer::close()");157 ///ogger.info("FileIndexer::close()"); 156 158 this.file_list.clear(); 157 159 this.file_list = null; … … 163 165 /** @function deleteIndex 164 166 */ 165 public void deleteIndex(String prefix )166 { 167 logger.info(" FileIndexer::deleteIndex(" + prefix + ")");167 public void deleteIndex(String prefix, String logging_name) 168 { 169 logger.info("[" + logging_name + "] Delete index " + prefix); 168 170 // and that all old files for this index are removed 169 171 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH); … … 178 180 } 179 181 } 180 /** deleteIndex(String) **/ 182 /** deleteIndex(String, String) **/ 183 184 /** @function epochTime 185 * Returns the current time in seconds since 1970JAN01 186 */ 187 public long epochTime() 188 { 189 return System.currentTimeMillis()/1000; 190 } 191 /** epochTime() **/ 181 192 182 193 /** @function listIndexes … … 184 195 public ArrayDeque<String> listIndexes(boolean include_default) 185 196 { 186 logger.info("FileIndexer::listIndexes(" + include_default + ")");197 ///ogger.info("FileIndexer::listIndexes(" + include_default + ")"); 187 198 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH); 188 199 String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data"); … … 191 202 HashSet<String> index_parts_set = new HashSet<String>(); 192 203 File files[] = index_path.listFiles(); 193 for (int i = 0; i < files.length; i++) 194 { 195 String file_name = files[i].getName(); 196 if (include_default || !file_name.startsWith(default_index_prefix)) 197 { 198 String prefix = file_name.substring(0, file_name.indexOf(".")); 199 index_parts_set.add(prefix); 200 } 204 if (files != null) 205 { 206 for (int i = 0; i < files.length; i++) 207 { 208 String file_name = files[i].getName(); 209 if (include_default || 
!file_name.startsWith(default_index_prefix)) 210 { 211 String prefix = file_name.substring(0, file_name.indexOf(".")); 212 index_parts_set.add(prefix); 213 } 214 } 215 } 216 else 217 { 218 logger.error("[P3] Error! No indexes found - did indexing fail?"); 201 219 } 202 220 // we then turn the hashset into a nice list (in this case a deque) … … 213 231 /** 214 232 */ 215 public void loadManifest( Path manifest_path)216 { 217 logger.info(" FileIndexer::loadManifest(" + manifest_path.toString() + ")");233 public void loadManifest(String index_prefix, Path manifest_path) 234 { 235 logger.info("[B" + index_prefix + "] Load manifest " + manifest_path.toString()); 218 236 try 219 237 { … … 238 256 public boolean mergeIndexes(String prefix_one, String prefix_two) 239 257 { 240 logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");258 ///ogger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")"); 241 259 String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data"); 242 260 return this.mergeIndexes(prefix_one, prefix_two, prefix_default); … … 248 266 public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out) 249 267 { 250 logger.info(" FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ", " + prefix_out + ")");268 logger.info("[P3] Merge indexes " + prefix_one + " and " + prefix_two + " => " + prefix_out); 251 269 // init 252 270 String index_path = ApplicationSetup.TERRIER_INDEX_PATH; … … 257 275 Index index_out = Index.createNewIndex(index_path, prefix_out); 258 276 StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out); 277 278 // quiet logger 279 Logger root_logger = Logger.getRootLogger(); 280 Level log_level = root_logger.getLevel(); 281 root_logger.setLevel((Level) Level.OFF); 259 282 structure_merger.mergeStructures(); 260 283 structure_merger = null; 261 CollectionStatistics collection_statistics = 
index_out.getCollectionStatistics(); 262 logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments()); 263 logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens()); 264 logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms()); 265 logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers()); 266 collection_statistics = null; 284 root_logger.setLevel(log_level); 285 286 // Only print out statistics for 'data' 287 if (prefix_out.equals("data")) 288 { 289 CollectionStatistics collection_statistics = index_out.getCollectionStatistics(); 290 logger.info("[P3] Number of Documents: " + collection_statistics.getNumberOfDocuments()); 291 logger.info("[P3] Number of Tokens: " + collection_statistics.getNumberOfTokens()); 292 logger.info("[P3] Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms()); 293 logger.info("[P3] Number of Pointers: " + collection_statistics.getNumberOfPointers()); 294 collection_statistics = null; 295 } 296 297 // Cleanup and closedown 267 298 try 268 299 { … … 279 310 index_two = null; 280 311 index_out = null; 281 this.deleteIndex(prefix_one );282 this.deleteIndex(prefix_two );312 this.deleteIndex(prefix_one, "P3"); 313 this.deleteIndex(prefix_two, "P3"); 283 314 prefix_one = null; 284 315 prefix_two = null; … … 291 322 public void renameIndex(String prefix_in) 292 323 { 293 logger.info("FileIndexer::renameIndex(" + prefix_in + ")");324 ///ogger.info("FileIndexer::renameIndex(" + prefix_in + ")"); 294 325 String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data"); 295 326 this.renameIndex(prefix_in, default_prefix); … … 301 332 public void renameIndex(String prefix_in, String prefix_out) 302 333 { 303 logger.info(" FileIndexer::renameIndex(" + prefix_in + ", " + prefix_out + ")");334 logger.info("[P3] Rename index " + prefix_in + " => " + prefix_out); 304 335 prefix_in = prefix_in + "."; 305 336 File 
index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH); … … 334 365 /** @function runIndex 335 366 */ 336 public void runIndex( )337 { 338 logger.info(" FileIndexer::runIndex()");367 public void runIndex(String batch_number) 368 { 369 logger.info("[B" + batch_number + "] create index"); 339 370 if (this.file_list == null || this.file_list.size() == 0) 340 371 { … … 353 384 return; 354 385 } 355 this.deleteIndex(this.index_prefix );386 this.deleteIndex(this.index_prefix, "F"); 356 387 // create the appropriate indexer 357 388 Indexer indexer; … … 470 501 FileIndexer file_indexer = new FileIndexer(index_prefix); 471 502 503 long epoch_time; 504 472 505 switch(index_mode) 473 506 { 474 507 case PREPARE: 508 logger.info("[P1:" + file_indexer.epochTime() + "] Starting manifest preparation"); 475 509 // 3a. If we are in preparation mode, this is where we go through the 476 510 // files in the search path, adding those that are indexable by … … 567 601 search_paths.clear(); 568 602 search_paths = null; 603 logger.info("[P1:" + file_indexer.epochTime() + "] Complete"); 569 604 break; 570 605 571 606 case INDEX: 572 607 // 3b. 
Load the manifest specified by the search path and index files 573 System.out.println("Mode: Index"); 574 System.out.println("Manifest Path: " + a_path.toString()); 575 System.out.println("Prefix: " + index_prefix); 608 String manifest_path = a_path.toString(); 609 System.out.println("Mode: Index"); 610 System.out.println("Prefix: " + index_prefix); 611 System.out.println("Manifest: " + manifest_path); 576 612 System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename")); 577 613 System.out.println(divider); 578 file_indexer.loadManifest(a_path); 579 file_indexer.runIndex(); 614 615 logger.info("[B" + index_prefix + ":" + file_indexer.epochTime() + "] Starting"); 616 file_indexer.loadManifest(index_prefix, a_path); 617 file_indexer.runIndex(index_prefix); 618 logger.info("[B" + index_prefix + ":" + file_indexer.epochTime() + "] Complete"); 580 619 break; 581 620 582 621 case MERGE: 622 logger.info("[P3:" + file_indexer.epochTime() + "] Starting index merging"); 583 623 System.out.println("Mode: Merge"); 584 624 System.out.println(divider); 585 625 // 3c. User has requested a merging process. 
586 logger.info("[" + (System.currentTimeMillis()/1000) + "] Merging started");587 626 // Look in the default index path and locate all of the indexes to merge 588 627 // (all those not prefixed 'data') … … 597 636 String index_part_two_prefix = index_parts.remove(); 598 637 String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter); 599 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");638 ///ogger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\""); 600 639 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix); 601 640 index_part_one_prefix = null; … … 611 650 String index_part_one_prefix = index_parts.remove(); 612 651 String index_part_two_prefix = index_parts.remove(); 613 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");652 ///ogger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index"); 614 653 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix); 615 654 index_part_one_prefix = null; … … 621 660 { 622 661 String index_part_prefix = index_parts.remove(); 623 logger.info("Renaming index \"" + index_part_prefix + "\" into final index");662 ///ogger.info("Renaming index \"" + index_part_prefix + "\" into final index"); 624 663 file_indexer.renameIndex(index_part_prefix); 625 664 index_part_prefix = null; … … 628 667 else 629 668 { 630 logger.warn(" No index parts detected - merge has no effect");669 logger.warn("[P3] Warning! 
No index parts detected - merge has no effect"); 631 670 } 632 671 // clean up 633 672 index_parts.clear(); 634 673 index_parts = null; 635 logger.info("[ " + (System.currentTimeMillis()/1000) + "] The merging is complete!");674 logger.info("[P3:" + file_indexer.epochTime() + "] Complete"); 636 675 break; 637 676
Note: See TracChangeset for help on using the changeset viewer.