Changeset 6349
- Timestamp:
- 2004-01-06T11:46:04+13:00 (20 years ago)
- Location:
- trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/IndexerInterface.java
r6283 r6349 6 6 public interface IndexerInterface 7 7 { 8 public boolean configure(String label, String value); 9 public boolean startPass(int passNumber); 10 public boolean indexDocument(DocumentID id, DocumentInterface document); 11 public boolean endPass(int passNumber); 12 public void tidyup(); 13 public int getNumberOfPasses(); 8 public static final String DOCUMENT_LEVEL = "document"; 9 public static final String SECTION_LEVEL = "section"; 10 public static final String PARAGRAPH_LEVEL = "paragraph"; 11 12 public static final String GS2_INDEX_LABEL = "Index"; 13 14 public boolean configure(String label, String value); 15 public boolean addIndex(String level, String field); 16 public boolean startPass(int passNumber); 17 public boolean indexDocument(DocumentID id, DocumentInterface document); 18 public boolean endPass(int passNumber); 19 public void tidyup(); 20 public int getNumberOfPasses(); 14 21 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/IndexerManager.java
r6283 r6349 15 15 16 16 public static final String outputDir = "outputDir"; 17 public static final String DEFAULT_LEVEL = "document"; 18 public static final String DEFAULT_FIELD = "text"; 17 19 18 20 public IndexerManager(DocumentList documentList) -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java
r6283 r6349 35 35 List indexes; 36 36 String level; 37 38 public static final String MG_INDEX = "Index"; 37 String field; 39 38 40 39 class MGIndex … … 42 41 String field; 43 42 43 public MGIndex(String level, String field) 44 { this.level = level; 45 this.field = field; 46 } 47 44 48 public MGIndex(String indexLabel) 45 49 { int colonAt = indexLabel.indexOf(':'); 50 46 51 if (colonAt >= 0) 47 52 { field = indexLabel.substring(colonAt+1); 48 53 level = indexLabel.substring(0, colonAt); 49 }50 else51 {52 54 } 53 55 } … … 91 93 /** 92 94 * The output directory should be (collection)/building/text/ for 93 * normal Greenstone builds 95 * normal Greenstone builds. 96 * 97 * @param <code>String</code> the label to configure 98 * @param <code>String</code> the value... 94 99 */ 95 100 public boolean configure(String label, String value) … … 114 119 System.out.println("Output MG directory is " + this.textStem); 115 120 } 116 else if (label.equals( MG_INDEX)) {121 else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) { 117 122 this.indexes.add(new MGIndex(value)); 118 123 } … … 121 126 } 122 127 128 public boolean addIndex(String level, String field) 129 { 130 MGIndex index = new MGIndex(level, field); 131 this.indexes.add(index); 132 return true; 133 } 134 123 135 private Node recurseDOM(DocumentInterface metsDoc, Node node, 124 AbstractStructure structure, StringBuffer buffer) 136 AbstractStructure structure, StringBuffer buffer, 137 String namespace, String field) 125 138 { 126 139 // send out the ctrl-c...if this is … … 146 159 buffer.append((char) 3); 147 160 if (this.level != null && 148 this.level.equals( "section")) {161 this.level.equals(IndexerInterface.SECTION_LEVEL)) { 149 162 buffer.append((char) 2); 150 163 } … … 152 165 } 153 166 154 // go through our children asrequired...167 // go through our children if required... 155 168 Iterator children = structure.getChildIterator(); 156 169 while (children.hasNext()) { 157 170 AbstractStructure child = (AbstractStructure) children.next(); 158 171 159 172 // get xpointer for child 160 173 // get start position node 161 174 Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); 162 163 // while this node isn't the child's start node, produce the node 164 while (node != startNode) { 165 XPointer.printNode(node, buffer, false); 166 // print buffer to node 167 node = XPointer.getNextNode(node, buffer); 168 } 169 175 176 // while this node isn't the child's start node, produce the node text 177 if (field.equals("text")) { 178 while (node != startNode) { 179 XPointer.printNode(node, buffer, false); 180 181 // print buffer to node 182 node = XPointer.getNextNode(node, (field.equals("text") ? buffer : null)); 183 } 184 } 185 170 186 // recurse to child 171 this.recurseDOM(metsDoc, node, child, buffer );187 this.recurseDOM(metsDoc, node, child, buffer, namespace, field); 172 188 } 173 189 … … 176 192 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) { 177 193 while (node != null) { 178 XPointer.printNode(node, buffer, false); 179 node = XPointer.getNextNode(node, buffer); 194 if (field.equals("text")) { 195 XPointer.printNode(node, buffer, false); 196 } 197 else { 198 METSDescriptive descriptive; 199 200 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { 201 METSDivision division = (METSDivision) structure; 202 203 String metadataId = division.getDefaultMetadataReference(); 204 205 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); 206 if (descriptive != null) { 207 List values = descriptive.getMetadata(namespace, field); 208 209 Iterator valueIter = values.iterator(); 210 while (valueIter.hasNext()) { 211 String value = valueIter.next().toString(); 212 213 buffer.append(value); 214 if (valueIter.hasNext()) { 215 buffer.append((char) 3); 216 } 217 } 218 } 219 } 220 } 221 node = XPointer.getNextNode(node, (field.equals("text") ? buffer : null)); 180 222 } 181 223 buffer.append((char) 3); … … 185 227 } 186 228 187 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure )229 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace, String field) 188 230 { Node node = document.getDocumentElement(); 189 231 StringBuffer textBuffer = new StringBuffer(); 190 232 191 this.recurseDOM(metsDoc, node, structure, textBuffer );233 this.recurseDOM(metsDoc, node, structure, textBuffer, namespace, field); 192 234 return textBuffer.toString(); 193 235 } … … 217 259 METSStructure sections = document.getDocumentStructure().getStructure("Section"); 218 260 if (sections != null) { 219 docText = this.prepareDOM(document, domDocument, sections );261 docText = this.prepareDOM(document, domDocument, sections, "gsdl3", this.field); 220 262 // System.out.println(docText); 221 263 } … … 316 358 317 359 this.indexStem = this.outputDirectory + File.separatorChar + 318 this.getIndexDirectory( "document", "text") +360 this.getIndexDirectory(index.getLevel(), index.getField()) + 319 361 File.separatorChar + "index"; // TODO: modify for index 320 362 this.level = index.getLevel(); 363 this.field = index.getField(); 364 } 365 else { 366 this.field = "text"; 321 367 } 322 368 … … 343 389 Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString()); 344 390 p.waitFor(); 345 System.out.println(p.exitValue()); 391 if (p.exitValue() == 0) { 392 System.out.println("Perfect hashes completed"); 393 } 346 394 347 395 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2"); … … 402 450 try { 403 451 switch (mgPass) 404 { 452 { 405 453 case 0: 406 454 System.out.println("Compressing dictionary"); … … 413 461 414 462 case 3: 463 System.out.println("Writing weights file"); 415 464 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /"); 416 465 p.waitFor(); 417 System.out.println(p.exitValue()); 466 if (p.exitValue() == 0) { 467 System.out.println("Weights file successfully written"); 468 } 469 else { 470 System.out.println("Unable to create weights file"); 471 } 418 472 419 473 p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString()); 420 474 p.waitFor(); 421 System.out.println(p.exitValue()); 475 if (p.exitValue() == 0) { 476 System.out.println("Inverted dictionary file successfully written"); 477 } 478 else { 479 System.out.println("Unable to create inverted dictionary file"); 480 } 422 481 423 482 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString()); 424 483 p.waitFor(); 425 System.out.println(p.exitValue()); 484 if (p.exitValue() == 0) { 485 System.out.println("Stemmed index successfully written"); 486 } 487 else { 488 System.out.println("Unable to create stemmed index"); 489 } 490 426 491 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString()); 427 492 p.waitFor(); 428 System.out.println(p.exitValue()); 493 if (p.exitValue() == 0) { 494 System.out.println("Stemmed index successfully written"); 495 } 496 else { 497 System.out.println("Unable to create stemmed index"); 498 } 499 429 500 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString()); 430 501 p.waitFor(); 431 System.out.println(p.exitValue()); 502 if (p.exitValue() == 0) { 503 System.out.println("Stemmed index successfully written"); 504 } 505 else { 506 System.out.println("Unable to create stemmed index"); 507 } 432 508 break; 433 509 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGPPIndexer.java
r6283 r6349 10 10 public class MGPPIndexer implements IndexerInterface 11 11 { 12 13 14 15 16 17 18 19 20 21 22 23 24 12 int pass; 13 boolean firstDocument; 14 String outputDirectory; 15 String outputStem; 16 InputStream indexerFeedback; 17 InputStream indexerErrors; 18 OutputStream indexerTextfeed; 19 Process mgpp_passes; 20 static final String documentSeparator = "<Document>"; 21 22 public MGPPIndexer() 23 { 24 } 25 25 26 26 /** … … 38 38 } 39 39 40 /** 41 * Index a single document; the document interface can be used to extract individual 42 * metadata items etc. as required or desired and index those instead or as well as 43 * the body text of the document. 44 */ 45 public boolean indexDocument(DocumentID docID, DocumentInterface document) 46 { 47 if (!this.firstDocument) 48 { // Send a '<document>' before the document itself 49 try { 50 this.indexerTextfeed.write(documentSeparator.getBytes(), 0, documentSeparator.getBytes().length); 51 } 52 catch (IOException ex) 53 { System.out.println("Bad output on end of document" + ex); 54 ex.printStackTrace(); 55 return false; 56 } 57 } 58 String docText = document.getDocumentText(); 59 60 byte [] bytes = docText.getBytes(); 61 int pos = 0, end = bytes.length; 62 63 try { 64 while (pos < end) { 65 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos)); 66 pos = pos + 512; 67 68 try { 69 while (this.indexerFeedback.available() > 0) 70 { byte b[] = new byte[this.indexerFeedback.available()]; 71 System.out.println("Feedback of " + this.indexerFeedback.available()); 72 this.indexerFeedback.read(b); 73 System.out.println(b); 74 } 75 } 76 catch (IOException ex) 77 { 78 } 79 80 81 try { 82 while (this.indexerErrors.available() > 0) 83 { byte b[] = new byte[this.indexerErrors.available()]; 84 System.out.println("Feedback of " + this.indexerErrors.available()); 85 this.indexerErrors.read(b); 86 System.out.println(new String(b)); 87 } 88 } 89 catch (IOException ex) 90 { 91 } 92 } 93 } 94 catch (IOException ex) 95 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end); 96 ex.printStackTrace(); 97 return false; 98 } 99 this.firstDocument = false; 100 101 try { 102 while (this.indexerErrors.available() > 0) 103 { char c = (char) this.indexerErrors.read(); 104 System.out.println(c); 105 } 106 while (this.indexerFeedback.available() > 0) 107 { byte b[] = new byte[this.indexerFeedback.available()]; 108 System.out.println("Feedback of " + this.indexerFeedback.available()); 109 this.indexerFeedback.read(b); 110 } 111 } 112 catch (IOException ex) 113 { 114 } 115 return true; 40 public boolean addIndex(String level, String field) 41 { return true; 42 } 43 44 /** 45 * Index a single document; the document interface can be used to extract individual 46 * metadata items etc. as required or desired and index those instead or as well as 47 * the body text of the document. 48 */ 49 public boolean indexDocument(DocumentID docID, DocumentInterface document) 50 { 51 if (!this.firstDocument) 52 { // Send a '<document>' before the document itself 53 try { 54 this.indexerTextfeed.write(documentSeparator.getBytes(), 0, documentSeparator.getBytes().length); 55 } 56 catch (IOException ex) 57 { System.out.println("Bad output on end of document" + ex); 58 ex.printStackTrace(); 59 return false; 60 } 61 } 62 String docText = document.getDocumentText(); 63 64 byte [] bytes = docText.getBytes(); 65 int pos = 0, end = bytes.length; 66 67 try { 68 while (pos < end) { 69 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos)); 70 pos = pos + 512; 71 72 try { 73 while (this.indexerFeedback.available() > 0) 74 { byte b[] = new byte[this.indexerFeedback.available()]; 75 System.out.println("Feedback of " + this.indexerFeedback.available()); 76 this.indexerFeedback.read(b); 77 System.out.println(b); 78 } 116 79 } 117 118 /** 119 * Initialise the pass: open required files, check status 120 */ 121 public boolean startPass(int passNumber) 122 { this.pass = passNumber; 123 this.firstDocument = true; 124 125 try { 126 switch (this.pass) { 127 case 0: 128 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem + " -T1"); 129 break; 130 131 case 1: 132 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -T2"); 133 break; 134 135 case 2: 136 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I1"); 137 break; 138 139 case 3: 140 Process p = Runtime.getRuntime().exec("mgpp_perf_hash_build -f " + this.outputStem); 141 p.waitFor(); 142 143 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I2"); 144 break; 145 } 80 catch (IOException ex) 81 { 82 } 83 84 85 try { 86 while (this.indexerErrors.available() > 0) 87 { byte b[] = new byte[this.indexerErrors.available()]; 88 System.out.println("Feedback of " + this.indexerErrors.available()); 89 this.indexerErrors.read(b); 90 System.out.println(new String(b)); 91 } 92 } 93 catch (IOException ex) 94 { 95 } 96 } 97 } 98 catch (IOException ex) 99 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end); 100 ex.printStackTrace(); 101 return false; 102 } 103 this.firstDocument = false; 104 105 try { 106 while (this.indexerErrors.available() > 0) 107 { char c = (char) this.indexerErrors.read(); 108 System.out.println(c); 109 } 110 while (this.indexerFeedback.available() > 0) 111 { byte b[] = new byte[this.indexerFeedback.available()]; 112 System.out.println("Feedback of " + this.indexerFeedback.available()); 113 this.indexerFeedback.read(b); 114 } 115 } 116 catch (IOException ex) 117 { 118 } 119 return true; 120 } 121 122 /** 123 * Initialise the pass: open required files, check status 124 */ 125 public boolean startPass(int passNumber) 126 { this.pass = passNumber; 127 this.firstDocument = true; 128 129 try { 130 switch (this.pass) { 131 case 0: 132 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem + " -T1"); 133 break; 134 135 case 1: 136 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -T2"); 137 break; 138 139 case 2: 140 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I1"); 141 break; 142 143 case 3: 144 Process p = Runtime.getRuntime().exec("mgpp_perf_hash_build -f " + this.outputStem); 145 p.waitFor(); 146 147 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I2"); 148 break; 149 } 146 150 147 148 149 150 151 152 {System.out.println(ex);153 154 155 156 157 {System.out.println(ex);158 159 160 161 162 163 164 165 166 167 168 169 {// TODO: end pass170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 {System.out.println(ex);230 231 232 233 234 {System.out.println(ex);235 236 237 238 239 240 241 /** 242 243 244 151 this.indexerFeedback = mgpp_passes.getInputStream(); 152 this.indexerErrors = mgpp_passes.getErrorStream(); 153 this.indexerTextfeed = mgpp_passes.getOutputStream(); 154 } 155 catch (IOException ex) 156 { System.out.println(ex); 157 ex.printStackTrace(); 158 return false; 159 } 160 catch (InterruptedException ex) 161 { System.out.println(ex); 162 ex.printStackTrace(); 163 return false; 164 } 165 System.out.println("Pass " + this.pass); 166 return true; 167 } 168 169 /** 170 * Complete a pass - reset file counters, close files, etc. 171 */ 172 public boolean endPass(int passNumber) 173 { // TODO: end pass 174 Process p; 175 176 try { 177 this.indexerTextfeed.write((char) 2); 178 this.indexerTextfeed.write(4); 179 while (this.indexerErrors.available() > 0) 180 { char c = (char) this.indexerErrors.read(); 181 System.out.print(c); 182 } 183 while (this.indexerFeedback.available() > 0) 184 { byte b[] = new byte[this.indexerFeedback.available()]; 185 System.out.print("Feedback of " + this.indexerFeedback.available()); 186 this.indexerFeedback.read(b); 187 } 188 189 this.indexerTextfeed.close(); 190 Thread.sleep(1000); 191 this.mgpp_passes.waitFor(); 192 } 193 catch (IOException ex) 194 { System.out.println(ex); 195 } 196 catch (InterruptedException ex) 197 { System.out.println(ex); 198 } 199 System.out.println("Completed with " + this.mgpp_passes.exitValue()); 200 201 try { 202 switch (this.pass) 203 { 204 case 0: 205 System.out.println("Compressing dictionary"); 206 p = Runtime.getRuntime().exec("mgpp_compression_dict -f " + this.outputStem + " -S -H -2 -k 5120"); 207 p.waitFor(); 208 System.out.println(p.exitValue()); 209 break; 210 211 case 3: 212 p = Runtime.getRuntime().exec("mgpp_weights_build -f " + this.outputStem); 213 p.waitFor(); 214 System.out.println(p.exitValue()); 215 216 p = Runtime.getRuntime().exec("mgpp_invf_dict -f " + this.outputStem); 217 p.waitFor(); 218 System.out.println(p.exitValue()); 219 220 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s1 -f " + this.outputStem + " -d " + this.outputDirectory); 221 p.waitFor(); 222 System.out.println(p.exitValue()); 223 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s2 -f " + this.outputStem + " -d " + this.outputDirectory); 224 p.waitFor(); 225 System.out.println(p.exitValue()); 226 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s3 -f " + this.outputStem + " -d " + this.outputDirectory); 227 p.waitFor(); 228 System.out.println(p.exitValue()); 229 break; 230 } 231 } 232 catch (IOException ex) 233 { System.out.println(ex); 234 ex.printStackTrace(); 235 return false; 236 } 237 catch (InterruptedException ex) 238 { System.out.println(ex); 239 ex.printStackTrace(); 240 return false; 241 } 242 return true; 243 } 244 245 /** 246 * Do any tidying up 247 */ 248 public void tidyup() 245 249 { 246 247 248 249 250 251 252 {return 4;253 250 } 251 252 /** 253 * Return the number of passes required for this index. 254 */ 255 public int getNumberOfPasses() 256 { return 4; 257 } 254 258 }
Note:
See TracChangeset
for help on using the changeset viewer.