Changeset 9940
- Timestamp:
- 2005-05-24T16:06:20+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java
r9006 r9940 28 28 public class MGIndexer extends AbstractIndexer 29 29 { 30 int pass;31 int documentSeqNo;32 int sectionSeqNo;33 boolean firstDocument;34 String outputDirectory;35 // InputStream indexerFeedback;36 // InputStream indexerErrors;30 int pass; 31 int documentSeqNo; 32 int sectionSeqNo; 33 boolean firstDocument; 34 String outputDirectory; 35 // InputStream indexerFeedback; 36 // InputStream indexerErrors; 37 37 //OutputStream indexerTextfeed; 38 38 StringBuffer indexBuffer; 39 39 //Process mg_passes; 40 File textDirectory;41 File indexDirectory;42 String indexStem;43 String textStem;44 List indexes;40 File textDirectory; 41 File indexDirectory; 42 String indexStem; 43 String textStem; 44 List indexes; 45 45 String overallName; 46 46 … … 51 51 MGPassesWrapper mgPasses; 52 52 53 static final char END_OF_DOCUMENT = (char) 2;53 static final char END_OF_DOCUMENT = (char) 2; 54 54 static final char END_OF_SECTION = (char) 3; // actually this is end of para for mg 55 static final char END_OF_STREAM = (char) 4;55 static final char END_OF_STREAM = (char) 4; 56 56 57 57 public static final String MG_INDEX_TYPE = "mg"; 58 58 public static final String INDEX_FILE_STEM = "index"; 59 59 60 class MGIndex 61 { String name=null; 62 String level=null; 63 String field=null; 64 boolean error = false;// assume built until we get an error 65 66 public MGIndex(String name, String level, String field) 67 { this.name = name; 68 this.level = level; 69 this.field = field; 70 } 71 72 public MGIndex(String indexLabel) 73 { int colonAt = indexLabel.indexOf(':'); 74 75 if (colonAt >= 0) 76 { this.field = indexLabel.substring(colonAt+1); 77 this.level = indexLabel.substring(0, colonAt); 78 createIndexName(); 79 } 80 } 81 82 public String getLevel() 83 { return this.level; 84 } 85 86 public String getField() 87 { return this.field; 88 } 60 class MGIndex 61 { 62 String name=null; 63 String level=null; 64 String field=null; 65 boolean error = false;// assume built until we get an error 66 67 public MGIndex(String name, String level, String field) 68 { 69 this.name = name; 70 this.level = level; 71 this.field = field; 72 } 73 74 public MGIndex(String indexLabel) 75 { 76 int colonAt = indexLabel.indexOf(':'); 77 78 if (colonAt >= 0) { 79 this.field = indexLabel.substring(colonAt+1); 80 this.level = indexLabel.substring(0, colonAt); 81 createIndexName(); 82 } 83 } 84 85 public String getLevel() 86 { 87 return this.level; 88 } 89 90 public String getField() 91 { 92 return this.field; 93 } 94 95 public String getName() 96 { 97 if (this.name==null || this.name.equals("")) { 98 createIndexName(); 99 } 100 return this.name; 101 } 102 103 public boolean hasError() { 104 return this.error; 105 } 106 public void setError(boolean b) { 107 this.error = b; 108 } 109 110 private void createIndexName() { 111 StringBuffer new_name = new StringBuffer(); 112 new_name.append(Character.toLowerCase((char) this.level.charAt(0))); 113 114 int c, w; 115 w = 0; 116 c = 0; 117 while (c < this.field.length() && w < 2) { 118 char ch = this.field.charAt(c); 119 120 ch = Character.toLowerCase(ch); 121 if (Character.isLetter(ch)) { 122 if (ch != 'a' && ch != 'e' && ch != 'i' && 123 ch != 'o' && ch != 'u') { 124 new_name.append(ch); 125 w++; 126 } 127 } 128 c ++; 129 } 130 this.name = new_name.toString(); 131 } 132 } // MGIndex 133 134 public MGIndexer(String name) 135 { 136 this.indexes = new ArrayList(); 137 this.overallName = name; 138 } 139 140 public String getIndexType() 141 { 142 return MG_INDEX_TYPE; 143 } 89 144 90 145 public String getName() 91 146 { 92 if (this.name==null || this.name.equals("")) { 93 createIndexName(); 94 } 95 return this.name; 96 } 97 98 public boolean hasError() { 99 return this.error; 100 } 101 public void setError(boolean b) { 102 this.error = b; 103 } 104 105 private void createIndexName() { 106 StringBuffer new_name = new StringBuffer(); 107 new_name.append(Character.toLowerCase((char) this.level.charAt(0))); 108 109 int c, w; 110 w = 0; 111 c = 0; 112 while (c < this.field.length() && w < 2) { 113 char ch = this.field.charAt(c); 114 115 ch = Character.toLowerCase(ch); 116 if (Character.isLetter(ch)) { 117 if (ch != 'a' && ch != 'e' && ch != 'i' && 118 ch != 'o' && ch != 'u') { 119 new_name.append(ch); 120 w++; 121 } 122 } 123 c ++; 124 } 125 this.name = new_name.toString(); 126 127 } 128 } // MGIndex 129 130 public MGIndexer(String name) 131 { this.indexes = new ArrayList(); 132 this.overallName = name; 147 return this.overallName; 148 } 149 150 // private String getIndexDirectory(String level, String field) 151 // { StringBuffer directory = new StringBuffer(); 152 // directory.append(Character.toLowerCase((char) level.charAt(0))); 153 154 // int c, w; 155 // w = 0; 156 // c = 0; 157 // while (c < field.length() && w < 2) { 158 // char ch = field.charAt(c); 159 160 // ch = Character.toLowerCase(ch); 161 // if (Character.isLetter(ch)) { 162 // if (ch != 'a' && ch != 'e' && ch != 'i' && 163 // ch != 'o' && ch != 'u') { 164 // directory.append(ch); 165 // w++; 166 // } 167 // } 168 // c ++; 169 // } 170 // return directory.toString(); 171 // } 172 173 /** 174 * The output directory should be (collection)/building/text/ for 175 * normal Greenstone builds. 176 * 177 * @param <code>String</code> the label to configure 178 * @param <code>String</code> the value... 179 */ 180 public boolean configure(String label, String value) 181 { 182 if (label.equals(IndexerManager.outputDir)) { 183 this.outputDirectory = value; 184 this.pass = 0; 133 185 134 } 135 136 public String getIndexType() 137 { return MG_INDEX_TYPE; 138 } 139 140 public String getName() 141 { return this.overallName; 142 } 143 144 // private String getIndexDirectory(String level, String field) 145 // { StringBuffer directory = new StringBuffer(); 146 // directory.append(Character.toLowerCase((char) level.charAt(0))); 147 148 // int c, w; 149 // w = 0; 150 // c = 0; 151 // while (c < field.length() && w < 2) { 152 // char ch = field.charAt(c); 153 154 // ch = Character.toLowerCase(ch); 155 // if (Character.isLetter(ch)) { 156 // if (ch != 'a' && ch != 'e' && ch != 'i' && 157 // ch != 'o' && ch != 'u') { 158 // directory.append(ch); 159 // w++; 160 // } 161 // } 162 // c ++; 163 // } 164 // return directory.toString(); 165 // } 166 167 /** 168 * The output directory should be (collection)/building/text/ for 169 * normal Greenstone builds. 170 * 171 * @param <code>String</code> the label to configure 172 * @param <code>String</code> the value... 173 */ 174 public boolean configure(String label, String value) 175 { 176 if (label.equals(IndexerManager.outputDir)) { 177 this.outputDirectory = value; 178 this.pass = 0; 179 180 // attempt to ensure that the text subdirectory exists 181 this.textDirectory = new File(outputDirectory, "text"); 182 if (!textDirectory.exists()) { 183 if (!textDirectory.mkdir()) { 184 return false; 185 } 186 } 187 else if (!textDirectory.isDirectory()) { 188 return false; 189 } 190 this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM; 191 192 // Sign to the user which mg directory is being used... 193 System.out.println("Output MG directory is " + this.textStem); 194 } 195 else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) { 196 this.indexes.add(new MGIndex(value)); 197 } 198 199 return true; 200 } 201 202 public boolean addIndex(String name, String level, String field) 203 { 204 MGIndex index = new MGIndex(name, level, field); 205 this.indexes.add(index); 206 return true; 207 } 208 209 private Node recurseDOM(DocumentInterface metsDoc, Node node, 210 AbstractStructure structure, StringBuffer textBuffer, 211 StringBuffer extraBuffer, String namespace) 212 //String name, String namespace, String field) 213 { 214 // send out the ctrl-c...if this is 215 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { 216 // try doing this for all index types 217 if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) { 218 METSDivision division = (METSDivision) structure; 219 220 // get the division metadata block 221 METSDescriptive descriptive; 222 String metadataId = division.getDefaultMetadataReference(); 223 if (metadataId == null) { 224 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel()); 225 division.addMetadataReference(descriptive.getID()); 186 // attempt to ensure that the text subdirectory exists 187 this.textDirectory = new File(outputDirectory, "text"); 188 if (!textDirectory.exists()) { 189 if (!textDirectory.mkdir()) { 190 return false; 191 } 192 } 193 else if (!textDirectory.isDirectory()) { 194 return false; 195 } 196 this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM; 197 198 // Sign to the user which mg directory is being used... 199 System.out.println("Output MG directory is " + this.textStem); 200 } 201 else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) { 202 this.indexes.add(new MGIndex(value)); 203 } 204 205 return true; 206 } 207 208 public boolean addIndex(String name, String level, String field) 209 { 210 MGIndex index = new MGIndex(name, level, field); 211 this.indexes.add(index); 212 return true; 213 } 214 215 private Node recurseDOM(DocumentInterface metsDoc, Node node, 216 AbstractStructure structure, StringBuffer textBuffer, 217 StringBuffer extraBuffer, String namespace) 218 //String name, String namespace, String field) 219 { 220 // send out the ctrl-c...if this is 221 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { 222 // try doing this for all index types 223 if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) { 224 METSDivision division = (METSDivision) structure; 225 226 // get the division metadata block 227 METSDescriptive descriptive; 228 String metadataId = division.getDefaultMetadataReference(); 229 if (metadataId == null) { 230 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel()); 231 division.addMetadataReference(descriptive.getID()); 232 } 233 else { 234 // Get the descriptive item... 235 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); 236 } 237 238 descriptive.addMetadata("gsdl3", "mgseqno", this.overallName + "." + Integer.toString(this.sectionSeqNo)); 239 240 metsDoc.setChanged(true); 241 //metsDoc.setModified(true); 242 // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel()); 243 } // section level 244 245 // append an 'end of section' marker 246 //textBuffer.append(END_OF_SECTION); 247 this.sectionSeqNo ++; 248 249 // for document-level indexes, always append an 'end of document' tag at the 250 // end of the document for each section. Otherwise, each section is followed 251 // by an end of document character. This ensures that all indexes use the 252 // same document numbering... 253 if (this.currentIndexLevel == null || 254 this.currentIndexLevel.equals(IndexerInterface.DOCUMENT_LEVEL)) { 255 extraBuffer.append(END_OF_DOCUMENT); 226 256 } 227 257 else { 228 // Get the descriptive item... 258 textBuffer.append(END_OF_DOCUMENT); 259 this.documentSeqNo ++; 260 } 261 262 // produce the body here for metadata output of divisions - in the case of 263 // text output, that will happen below... 264 if (!this.currentIndexField.equals("text")) { 265 METSDescriptive descriptive; 266 267 METSDivision division = (METSDivision) structure; 268 269 String metadataId = division.getDefaultMetadataReference(); 270 229 271 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); 230 } 231 232 descriptive.addMetadata("gsdl3", "mgseqno", this.overallName + "." + Integer.toString(this.sectionSeqNo)); 233 234 metsDoc.setChanged(true); 235 //metsDoc.setModified(true); 236 // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel()); 237 } // section level 238 239 // append an 'end of section' marker 240 //textBuffer.append(END_OF_SECTION); 241 this.sectionSeqNo ++; 242 243 // for document-level indexes, always append an 'end of document' tag at the 244 // end of the document for each section. Otherwise, each section is followed 245 // by an end of document character. This ensures that all indexes use the 246 // same document numbering... 247 if (this.currentIndexLevel == null || 248 this.currentIndexLevel.equals(IndexerInterface.DOCUMENT_LEVEL)) { 249 extraBuffer.append(END_OF_DOCUMENT); 250 } 251 else { 252 textBuffer.append(END_OF_DOCUMENT); 253 this.documentSeqNo ++; 254 } 255 256 // produce the body here for metadata output of divisions - in the case of 257 // text output, that will happen below... 258 if (!this.currentIndexField.equals("text")) 259 { METSDescriptive descriptive; 260 261 METSDivision division = (METSDivision) structure; 262 263 String metadataId = division.getDefaultMetadataReference(); 264 265 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); 266 if (descriptive != null) { 267 List values = descriptive.getMetadata(namespace, this.currentIndexField); 272 if (descriptive != null) { 273 List values = descriptive.getMetadata(namespace, this.currentIndexField); 268 274 269 if (values != null) {270 271 272 275 if (values != null) { 276 Iterator valueIter = values.iterator(); 277 while (valueIter.hasNext()) { 278 String value = valueIter.next().toString(); 273 279 274 275 276 277 278 279 }280 }281 282 283 284 285 286 287 288 280 textBuffer.append(value); 281 if (valueIter.hasNext()) { 282 //textBuffer.append(END_OF_SECTION); 283 } 284 } 285 } 286 } 287 } 288 } 289 290 // go through our children as required... 291 Iterator children = structure.getChildIterator(); 292 Node startNode; 293 while (children.hasNext()) { 294 AbstractStructure child = (AbstractStructure) children.next(); 289 295 290 291 292 293 294 295 296 296 // get xpointer for child 297 // get start position node 298 if (metsDoc.getDocumentType() == "METS"){ 299 startNode = ((METSDocument) metsDoc).getSectionStartNode((METSDivision) child); 300 } else { 301 startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); 302 } 297 303 //Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); 298 304 299 300 301 302 while (node != startNode) {303 XPointer.printNode(node, textBuffer, false);304 305 // print buffer to node306 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));307 }308 305 // while this node isn't the child's start node, produce the HTML node text, if 306 // in text field mode... 307 if (this.currentIndexField.equals("text")) { 308 while (node != startNode) { 309 XPointer.printNode(node, textBuffer, false); 310 311 // print buffer to node 312 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null)); 313 } 314 } 309 315 310 311 312 313 314 315 316 317 318 if (this.currentIndexField.equals("text")) {319 XPointer.printNode(node, textBuffer, false);320 }321 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));322 316 // recurse to child 317 node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, namespace); // name, namespace, field); 318 } // while next child 319 320 // close a document - the actual closing \B will be done by the main 321 // loop, so only a required \C is printed here... 322 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) { 323 while (node != null) { 324 if (this.currentIndexField.equals("text")) { 325 XPointer.printNode(node, textBuffer, false); 326 } 327 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null)); 328 } 323 329 324 325 330 //textBuffer.append(END_OF_SECTION); 331 this.sectionSeqNo ++; 326 332 327 328 333 } 334 return node; 329 335 } 330 336 331 337 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace) 332 // String name, String namespace, String field) 333 { StringBuffer extraBuffer = new StringBuffer(); 334 Node node = document.getDocumentElement(); 335 StringBuffer textBuffer = new StringBuffer(); 336 337 this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); //name, namespace, field); 338 textBuffer.append(extraBuffer.toString()); 339 return textBuffer.toString(); 340 } 341 342 /** 343 * Index a single document; the document interface can be used to extract individual 344 * metadata items etc. as required or desired and index those instead or as well as 345 * the body text of the document. 346 */ 338 { 339 // String name, String namespace, String field) 340 StringBuffer extraBuffer = new StringBuffer(); 341 Node node = document.getDocumentElement(); 342 StringBuffer textBuffer = new StringBuffer(); 343 344 this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); //name, namespace, field); 345 textBuffer.append(extraBuffer.toString()); 346 return textBuffer.toString(); 347 } 348 349 /** 350 * Index a single document; the document interface can be used to extract individual 351 * metadata items etc. as required or desired and index those instead or as well as 352 * the body text of the document. 353 */ 347 354 public boolean indexDocument(DocumentID docID, DocumentInterface document) 348 355 { … … 405 412 406 413 this.indexBuffer.append(docText); 407 408 409 410 411 document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(startSeqNo));412 413 414 415 416 }414 // remember that we're not on the first document, 415 this.firstDocument = false; 416 // assign the sequence number on the first pass only, and increment the sequence number. 417 if (this.pass == 0) { 418 document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(startSeqNo)); 419 } 420 this.documentSeqNo += 1; 421 422 return true; 423 } 417 424 418 425 /** … … 646 653 */ 647 654 public int getNumberOfPasses() 648 { return 2 + this.indexes.size() * 2; 649 } 650 651 public boolean addServiceDescriptions(org.w3c.dom.Element service_rack_list) { 655 { 656 return 2 + this.indexes.size() * 2; 657 } 658 659 public boolean addServiceDescriptions(org.w3c.dom.Element service_rack_list) 660 { 652 661 Document doc = service_rack_list.getOwnerDocument(); 653 662
Note:
See TracChangeset
for help on using the changeset viewer.