- Timestamp:
- 2006-07-13T10:29:55+12:00 (17 years ago)
- Location:
- trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build
- Files:
-
- 1 added
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/CollectionManager.java
r12188 r12191 62 62 CollectionMetadata metadata; // collection-level metadata 63 63 GS3SQLConnection database; // the database to store everything in 64 String collectionHome;65 String siteHome;66 String collectionName;67 String qualifiedCollectionName; // used as the database name64 public String collectionHome; 65 public String siteHome; 66 public String collectionName; 67 public String qualifiedCollectionName; // used as the database name 68 68 String notifyHost; 69 69 … … 341 341 RecogniserInterface ri = this.buildManager.getRecogniserManager().addRecogniser(type); 342 342 if (ri != null) { 343 ri.setCollectionManager(this); 343 344 ri.configure(doc_type); 344 345 } -
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/AbstractDocument.java
r12188 r12191 419 419 } 420 420 421 public int getNumSections() { 422 return 3; 423 /* String query = "SELECT count(*) FROM divisions WHERE DocID="+this.id+" AND ParentType='Division'"; 424 try { 425 Statement statement = connection.createStatement(); 426 ResultSet results = statement.executeQuery(query); 427 428 if (results.first()) { 429 int count = results.getInt(0); 430 System.err.println("count = "+count); 431 return count; 432 } 433 } catch (Exception e) { 434 System.err.println("AbstractDocument.getNumSections(): "+e); 435 } 436 return -1;*/ 437 } 421 438 /** 422 439 * @see DocumentInterface:isMETSCompatible -
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/AbstractRecogniser.java
r12188 r12191 5 5 import org.w3c.dom.Element; 6 6 7 import org.greenstone.gsdl3.gs3build.CollectionManager; 7 8 import org.greenstone.gsdl3.gs3build.metadata.*; 8 9 import org.greenstone.gsdl3.gs3build.util.HTTPTools; … … 22 23 ArrayList filename_extensions = null; 23 24 String document_type = "SET THIS IN THE CONCRETE CLASS"; 24 25 CollectionManager coll_manager = null; 26 25 27 /** The constructor should set the variables 26 28 * preferredMimeType, filename_extensions and documentType … … 30 32 } 31 33 34 /** set the collection manager */ 35 public void setCollectionManager(CollectionManager coll_man) { 36 this.coll_manager = coll_man; 37 } 32 38 /** configure by default does nothing */ 33 39 public boolean configure(Element config_elem){ -
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/METSDocument.java
r12188 r12191 26 26 public static final String METS_DOCUMENT_TYPE = "METS"; 27 27 Document domDocument; 28 String original_location; 29 28 30 public METSDocument(DocumentID id) { 29 31 super(id); … … 50 52 domDocument = builder.parse(file); 51 53 52 int filePosition = file.getPath().indexOf("import/")+7; 53 parseFilePath = file.getPath().substring(0, filePosition); 54 //int filePosition = file.getPath().indexOf("import/")+7; 55 //parseFilePath = file.getPath().substring(0, filePosition); 56 parseFilePath = file.getParent(); // all refs should be relative to the current doc, or absolute 57 parseFilePath += File.separator; 58 this.original_location = parseFilePath; 54 59 // TODO: get all the types in the tree 55 60 … … 139 144 URL url = (URL) this.fileSet.getFile(0).getLocation(); 140 145 141 this.getSectionText("1");146 //this.getSectionText("1"); 142 147 143 148 if (url.getProtocol().equals("file")) { 144 metsDoc = new HTMLDoc(url,url.getPath()); 149 String path = url.getPath(); 150 File f = new File(path); 151 if (!f.isAbsolute() && this.original_location != null) { 152 path = this.original_location + path; 153 System.err.println("new path ="+path); 154 } 155 metsDoc = new HTMLDoc(url,path); 145 156 } else { 146 157 metsDoc = new HTMLDoc(url); -
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/RecogniserInterface.java
r12188 r12191 4 4 import org.w3c.dom.Element; 5 5 6 import org.greenstone.gsdl3.gs3build.CollectionManager; 6 7 import org.greenstone.gsdl3.gs3build.metadata.*; 7 8 … … 13 14 public interface RecogniserInterface 14 15 { 16 public void setCollectionManager(CollectionManager coll_man); 15 17 public boolean configure(Element config_elem); 16 18 public void setListRepository(DocumentList docList); -
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java
r12188 r12191 18 18 import org.greenstone.gsdl3.gs3build.doctypes.DocumentID; 19 19 import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface; 20 import org.greenstone.gsdl3.gs3build.doctypes.AbstractDocument; 20 21 import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument; 21 22 import org.greenstone.gsdl3.gs3build.doctypes.METSDocument; … … 24 25 import org.greenstone.gsdl3.gs3build.util.DOMUtils; 25 26 import org.greenstone.gsdl3.util.GSXML; 27 import org.greenstone.gsdl3.util.GSFile; 26 28 import org.greenstone.gsdl3.util.Misc; 27 29 import org.greenstone.gsdl3.util.Processing; … … 293 295 //metsDoc.setModified(true); 294 296 // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel()); 295 } // section level297 } // first pass 296 298 297 299 // append an 'end of section' marker … … 408 410 public boolean indexDocument(DocumentID docID, DocumentInterface document) 409 411 { 410 411 if (!this.firstDocument) { 412 this.indexBuffer.append(END_OF_DOCUMENT); 413 mgPasses.processDocument(indexBuffer.toString()); 414 this.indexBuffer.delete(0, this.indexBuffer.length()); 415 416 } 417 412 int count = ((AbstractDocument)document).getNumSections(); 418 413 String docText = null; 419 414 // set the mgseqno if first pass … … 426 421 427 422 //long start = System.currentTimeMillis(); 428 Document domDocument = document.getDOMDocument(); 429 if (domDocument != null) { 430 System.err.println("dom doc is not null"); 431 METSStructure sections = document.getDocumentStructure().getStructure("Section"); 432 if (sections != null) { 433 System.err.println("sections are not null"); 434 docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field); 435 // System.out.println(docText); 436 } 437 } 423 if (this.current_index.getLevel().equals("section")) { 424 425 Document domDocument = document.getDOMDocument(); 426 if (domDocument != null) { 427 System.err.println("dom doc is not null"); 428 METSStructure sections = document.getDocumentStructure().getStructure("Section"); 429 if (sections != null) { 430 System.err.println("sections are not null"); 431 docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field); 432 // System.out.println(docText); 433 } 434 } 435 } 436 437 438 438 //long finish = System.currentTimeMillis(); 439 439 //System.err.println("dom doc = "+ Long.toString(finish-start)); … … 447 447 if (field.equals("text")) { 448 448 doc_text_buffer.append(document.getDocumentText()); 449 doc_text_buffer.append(" "); 449 450 } else { 450 451 // its a metadata - do namespace properly!! … … 455 456 String value = valueIter.next().toString(); 456 457 doc_text_buffer.append(value); 458 doc_text_buffer.append(" "); 457 459 } 458 460 } … … 461 463 docText = doc_text_buffer.toString(); 462 464 sectionSeqNo ++; 465 int num_secs = 0; 463 466 } 464 467 //finish = System.currentTimeMillis(); … … 467 470 this.indexBuffer.append(docText); 468 471 // remember that we're not on the first document, 469 this.firstDocument = false;472 //this.firstDocument = false; 470 473 this.documentSeqNo ++; 474 //if (!this.firstDocument) { 475 this.indexBuffer.append(END_OF_DOCUMENT); 476 mgPasses.processDocument(indexBuffer.toString()); 477 String filename=""; 478 try { 479 filename = "pass"+this.pass+"doc"+this.documentSeqNo+".txt"; 480 System.err.println("trying to write to "+filename); 481 GSFile.writeFile(indexBuffer.toString().getBytes(), filename); 482 } catch (Exception e) { 483 System.err.println("COUldn't write to file, "+filename); 484 } 485 this.indexBuffer.delete(0, this.indexBuffer.length()); 486 487 471 488 472 489 return true; … … 487 504 this.indexBuffer = new StringBuffer(); 488 505 int indexNo = this.pass/2; 489 this.current_index = null;490 506 491 507 this.current_index = (MGIndex) this.indexes.get(indexNo); … … 512 528 this.textStem = this.indexStem; 513 529 } 530 514 531 mgPasses.setFileName(this.indexStem); 515 532 if (!Misc.isWindows()) { … … 573 590 } catch (Exception e) {} 574 591 575 int exit_value = 0;592 int exit_value = mgPasses.exitValue(); 576 593 System.out.println("Pass " + this.pass + " completed with " + exit_value); 577 594 if (exit_value !=0) { -
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/indexers/MGPPIndexer.java
r12188 r12191 8 8 import org.greenstone.gsdl3.gs3build.doctypes.METSDocument; 9 9 import org.greenstone.gsdl3.util.Misc; 10 import org.greenstone.gsdl3.util.GSFile; 10 11 import org.greenstone.gsdl3.util.GSXML; 11 12 import org.greenstone.gsdl3.util.Processing; 12 13 import org.greenstone.gsdl3.gs3build.xpointer.XPointer; 13 14 import org.greenstone.gsdl3.gs3build.metadata.*; 15 import org.greenstone.gsdl3.gs3build.util.DOMUtils; 16 14 17 import java.io.InputStream; 15 18 import java.io.OutputStream; … … 22 25 import org.w3c.dom.Element; 23 26 import org.w3c.dom.Node; 27 import org.w3c.dom.NodeList; 24 28 import org.w3c.dom.Document; 25 26 public class MGPPIndexer extends AbstractIndexer 29 import org.w3c.dom.NamedNodeMap; 30 31 import org.greenstone.gsdl3.util.XMLConverter; 32 33 public class MGPPIndexer //extends AbstractIndexer 34 implements IndexerInterface 27 35 { 28 36 int pass; 29 37 int documentSeqNo; 30 38 int sectionSeqNo; 31 String name;32 39 boolean firstDocument; 33 40 File indexDirectory; 34 File textDirectory;41 // File textDirectory; 35 42 String indexStem; 36 String textStem;43 // String textStem; 37 44 StringBuffer indexBuffer; 38 45 String outputDirectory; 39 //String outputStem; 40 // String passExtra; 41 // InputStream indexerFeedback; 42 // InputStream indexerErrors; 43 // OutputStream indexerTextfeed; 44 // Process mgpp_passes; 45 //String overallName; 46 String currentIndexName; 47 String currentIndexLevel; 48 String currentIndexField; 46 String overallName; 47 48 List indexes; 49 MGPPIndex current_index = null; 49 50 MGPPPassesWrapper mgppPasses; 50 51 51 52 52 static final String documentSeparator = "<Document>"; 53 static final String sectionSeparator = "<Section>"; 54 55 static final String START_OF_DOCUMENT = "<Document>"; 56 static final String END_OF_DOCUMENT = "</Document>"; 57 static final String START_OF_SECTION = "<Section>"; 58 static final String END_OF_SECTION = "</Section>"; 53 static final String DOCUMENT = "Doc"; 54 static final String SECTION = "Sec"; 55 static final String START_OF_DOCUMENT = "<"+DOCUMENT+">"; 56 static final String END_OF_DOCUMENT = "</"+DOCUMENT+">"; 57 static final String START_OF_SECTION = "<"+SECTION+">"; 58 static final String END_OF_SECTION = "</"+SECTION+">"; 59 59 60 60 … … 67 67 public String name = null; 68 68 public String doc_level = null; 69 public ArrayList levels = null;70 public ArrayList fields = null;69 public List levels = null; 70 public List fields = null; 71 71 boolean error = false;// assume built until we get an error 72 73 public MGPPIndex(Element index_element) { 74 75 this.fields = new ArrayList(); 76 this.levels = new ArrayList(); 77 this.name = index_element.getAttribute(GSXML.NAME_ATT); 78 if (this.name.equals("")) { 79 // TODO make this dynamic 80 this.name = "xx"; 81 } 82 NodeList children = index_element.getChildNodes(); 83 for (int c = 0; c < children.getLength(); c ++) { 84 Node child = children.item(c); 85 86 if (child.getNodeType() == Node.ELEMENT_NODE) { 87 String name = child.getNodeName(); 88 89 if (name.equals(GSXML.LEVEL_ELEM)) { 90 String level = DOMUtils.getNodeChildText(children.item(c)); 91 this.levels.add(level); 92 } 93 else if (name.equals(GSXML.FIELD_ELEM)) { 94 String fieldName = DOMUtils.getNodeChildText(children.item(c)); 95 this.fields.add(fieldName); 96 } 97 } 98 } 99 } 72 100 73 101 public MGPPIndex(String name) { 74 102 this.name = name; 75 doc_level = "Document"; 76 } 77 103 this.doc_level = DOCUMENT; 104 this.fields = new ArrayList(); 105 this.levels = new ArrayList(); 106 } 107 78 108 public void setDocLevel(String doc_level) { 79 109 this.doc_level = doc_level; … … 88 118 this.fields.add(field); 89 119 } 90 120 public List getLevels() { 121 return this.levels; 122 } 123 public List getFields() { 124 return this.fields; 125 } 126 public String getName() { 127 return this.name; 128 } 91 129 public boolean hasError() { 92 130 return this.error; … … 101 139 public MGPPIndexer(String name) 102 140 { 103 this.name = name; 141 this.overallName = name; 142 this.indexes = new ArrayList(); 104 143 //this.passExtra = ""; 105 144 } … … 107 146 public String getName() 108 147 { 109 return this.name; 148 return this.overallName; 149 } 150 151 public String getIndexType() 152 { 153 return MGPP_INDEX_TYPE; 154 } 155 156 // for now make all indexes use document and section levels. 157 // then when writing the buildconfig, only display the levels 158 // that the user has specified (likely to be both doc and sec). 159 public boolean configure(Node search_node) 160 { 161 NodeList index_children = GSXML.getChildrenByTagName(search_node, GSXML.INDEX_ELEM); 162 163 // add a text 'index' - we should be able to turn this off in the config file? 164 MGPPIndex text_index = new MGPPIndex("text"); 165 text_index.addField("text"); 166 // always do eveything at doc and sec level at the moment 167 text_index.addLevel(DOCUMENT); 168 text_index.addLevel(SECTION); 169 indexes.add(text_index); 170 171 for (int i = 0; i < index_children.getLength(); i ++) { 172 Element index_elem = (Element)index_children.item(i); 173 MGPPIndex index = new MGPPIndex(index_elem); 174 if (index.getName() != null && index.getLevels() != null && index.getFields()!= null) { 175 indexes.add(index); 176 } else { 177 System.err.println("invalid index spec, not including"+new XMLConverter().getPrettyString(index_elem)); 178 } 179 } 180 // TODO make sure all index names are unique 181 return true; 110 182 } 111 183 … … 120 192 121 193 // attempt to ensure that the text subdirectory exists 122 this.textDirectory = new File(outputDirectory, "text");123 if (!textDirectory.exists()) {124 if (!textDirectory.mkdir()) {125 return false;126 }127 }128 else if (!textDirectory.isDirectory()) {129 return false;130 }131 this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM;132 133 // attempt to ensure that the index subdir exists134 this.indexDirectory = new File(outputDirectory, "idx");135 if (!indexDirectory.exists()) {136 if (!indexDirectory.mkdir()) {137 return false;138 }139 }140 else if (!indexDirectory.isDirectory()) {141 return false;142 }143 this.indexStem = this.indexDirectory.getPath() + File.separator + INDEX_FILE_STEM;194 // this.textDirectory = new File(outputDirectory, "text"); 195 // if (!textDirectory.exists()) { 196 // if (!textDirectory.mkdir()) { 197 // return false; 198 // } 199 // } 200 // else if (!textDirectory.isDirectory()) { 201 // return false; 202 // } 203 // this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM; 204 205 // // attempt to ensure that the index subdir exists 206 // this.indexDirectory = new File(outputDirectory, "idx"); 207 // if (!indexDirectory.exists()) { 208 // if (!indexDirectory.mkdir()) { 209 // return false; 210 // } 211 // } 212 // else if (!indexDirectory.isDirectory()) { 213 // return false; 214 // } 215 // this.indexStem = this.indexDirectory.getPath() + File.separator + INDEX_FILE_STEM; 144 216 145 217 // Sign to the user which mg directory is being used... 146 System.out.println("Output MGPP text directory is " + this.textStem);147 System.out.println("Output MGPP index directory is " + this.indexStem);218 // System.out.println("Output MGPP text directory is " + this.textStem); 219 // System.out.println("Output MGPP index directory is " + this.indexStem); 148 220 } 149 221 this.pass = 0; … … 151 223 } 152 224 153 public String getIndexType() 154 { 155 return MGPP_INDEX_TYPE; 156 } 157 158 public boolean addIndex(String name, String level, String field) 159 { 160 // if (level == "doc_level") { 161 // passExtra = " -J " + level; 162 // } 163 // else { 164 // passExtra = " -K " + level; 165 // } 166 return true; 167 } 225 226 // public boolean addIndex(String name, String level, String field) 227 // { 228 // // if (level == "doc_level") { 229 // // passExtra = " -J " + level; 230 // // } 231 // // else { 232 // // passExtra = " -K " + level; 233 // // } 234 // return true; 235 // } 168 236 169 237 /** … … 176 244 if (this.pass == 0) { 177 245 document.removeAllMetadata("gsdl3", "mgseqno"); 178 } 179 180 // why do this at the start and not at the end??? 181 if (!this.firstDocument) { 182 // Send a '</Document>' at the end of the doc 183 this.indexBuffer.append(END_OF_DOCUMENT); 184 mgppPasses.processDocument(indexBuffer.toString()); 185 this.indexBuffer.delete(0, this.indexBuffer.length()); 186 } 187 246 document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(this.sectionSeqNo)); 247 } 248 249 188 250 String docText = null; 189 251 190 //int startSeqNo = this.sectionSeqNo; 191 //this.sectionSeqNo ++; 192 int startSeqNo = this.documentSeqNo; 193 252 this.sectionSeqNo ++; 253 194 254 Document domDocument = document.getDOMDocument(); 195 255 if (domDocument != null) { … … 198 258 if (sections != null) { 199 259 System.err.println("sections are not null"); 200 docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field); 201 // System.out.println(docText); 260 docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); 202 261 } 203 262 } 204 263 if (docText == null) { 205 264 System.err.println("dom doc or sections was null - asking for doc text"); 206 //if (this.currentIndexField.equals("text")) { 207 //docText = Character.toString(END_OF_DOCUMENT) + document.getDocumentText(); 208 docText = document.getDocumentText(); 209 //} 210 // else { 211 // StringBuffer textBuffer = new StringBuffer(); 212 // //textBuffer.append(END_OF_DOCUMENT); 213 // List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField); 214 // if (values != null) { 215 // Iterator valueIter = values.iterator(); 216 // while (valueIter.hasNext()) { 217 // String value = valueIter.next().toString(); 218 219 // textBuffer.append(value); 220 // if (valueIter.hasNext()) { 221 // //textBuffer.append(END_OF_SECTION); 222 // // sectionSeqNo ++; 223 // } 224 // } 225 // } 226 // else { 227 // textBuffer.append("No data"); 228 // } 229 // docText = textBuffer.toString(); 230 // } 265 StringBuffer doc_text_buffer = new StringBuffer(); 266 List fields = this.current_index.getFields(); 267 for (int i=0; i<fields.size(); i++) { 268 String field = (String)fields.get(i); 269 if (field.equals("text")) { 270 doc_text_buffer.append(document.getDocumentText()); 271 doc_text_buffer.append(" "); 272 } else { 273 // its a metadata - do namespace properly!! 274 List values = document.getDocumentMetadataItem("gsdl3", field); 275 if (values != null) { 276 Iterator valueIter = values.iterator(); 277 while (valueIter.hasNext()) { 278 String value = valueIter.next().toString(); 279 doc_text_buffer.append(value); 280 doc_text_buffer.append(" "); 281 } 282 } 283 } 284 } // for each field 285 docText = doc_text_buffer.toString(); 231 286 sectionSeqNo ++; 232 287 } 233 234 //try { 235 // this.indexerTextfeed.write(documentSeparator.getBytes(), 0, documentSeparator.getBytes().length); 236 // } 237 // catch (IOException ex) { 238 // System.out.println("Bad output on end of document" + ex); 239 // ex.printStackTrace(); 240 // return false; 241 // } 242 // } 243 288 244 289 this.indexBuffer.append(START_OF_DOCUMENT); 245 //String docText = document.getDocumentText();290 this.indexBuffer.append(START_OF_SECTION); 246 291 this.indexBuffer.append(docText); 247 //int startSeqNo = this.documentSeqNo; 248 249 // byte [] bytes = docText.getBytes(); 250 // int pos = 0, end = bytes.length; 251 252 // try { 253 // while (pos < end) { 254 // this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos)); 255 // pos = pos + 512; 256 257 // try { 258 // while (this.indexerFeedback.available() > 0) { 259 // byte b[] = new byte[this.indexerFeedback.available()]; 260 // System.out.println("Feedback of " + this.indexerFeedback.available()); 261 // this.indexerFeedback.read(b); 262 // System.out.println(b); 263 // } 264 // } 265 // catch (IOException ex) { 266 267 // } 268 269 270 // try { 271 // while (this.indexerErrors.available() > 0) { 272 // byte b[] = new byte[this.indexerErrors.available()]; 273 // System.out.println("Feedback of " + this.indexerErrors.available()); 274 // this.indexerErrors.read(b); 275 // System.out.println(new String(b)); 276 // } 277 // } 278 // catch (IOException ex){ 279 280 // } 281 // } 282 // } 283 // catch (IOException ex) { 284 // System.out.println("Bad output during document write " + ex + " " + pos + " " + end); 285 // ex.printStackTrace(); 286 // return false; 287 // } 292 this.indexBuffer.append(END_OF_SECTION); 293 this.indexBuffer.append(END_OF_DOCUMENT); 294 this.mgppPasses.processDocument(indexBuffer.toString()); 295 this.indexBuffer.delete(0, this.indexBuffer.length()); 296 288 297 this.firstDocument = false; 289 298 290 if (this.pass == 0) {291 document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(startSeqNo));292 }293 299 this.documentSeqNo++; 294 300 295 // try {296 // while (this.indexerErrors.available() > 0) {297 // char c = (char) this.indexerErrors.read();298 // System.out.println(c);299 // }300 // while (this.indexerFeedback.available() > 0) {301 // byte b[] = new byte[this.indexerFeedback.available()];302 // System.out.println("Feedback of " + this.indexerFeedback.available());303 // this.indexerFeedback.read(b);304 // }305 // }306 // catch (IOException ex) {307 308 // }309 301 return true; 310 302 } … … 323 315 this.indexBuffer = new StringBuffer(); 324 316 325 MGPPIndex index = null; // do something with this!! 326 317 int indexNo = this.pass/2; 318 this.current_index = (MGPPIndex) this.indexes.get(indexNo); 319 320 if (this.current_index.hasError()) { 321 // an error has already occurred for this index, don't continue 322 System.out.println("pass "+this.pass+": aborted due to errors in the previous pass"); 323 return false; 324 } 325 326 // attempt to ensure that the text/index subdirectory exists 327 this.indexDirectory = new File(outputDirectory, current_index.getName()); 328 if (!indexDirectory.exists()) { 329 if (!indexDirectory.mkdir()) { 330 return false; 331 } 332 } 333 else if (!indexDirectory.isDirectory()) { 334 return false; 335 } 336 337 this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM; // TODO: modify for index 338 // if (this.pass == 0) { 339 // // first pass, also set up the textStem 340 // this.textDirectory = this.indexDirectory; 341 // this.textStem = this.indexStem; 342 // } 343 327 344 // get the parameters for this execution of mg_passes 328 mgppPasses.setFileName( (this.pass < 2 ? this.textStem : this.indexStem ));345 mgppPasses.setFileName(this.indexStem); 329 346 if (!Misc.isWindows()) { 330 347 mgppPasses.setBasePath("/"); 331 348 } 332 349 333 mgppPasses.setDocumentTag("Document"); 334 //mgppPasses.addLevelTag("Section"); 335 336 this.currentIndexLevel = "Document";// index.getLevel(); 337 this.currentIndexField = "text";//index.getField(); 338 this.currentIndexName = "idx"; //index.getName(); 339 340 341 switch (this.pass) { 350 // always use Doc and Sec for now 351 mgppPasses.setDocumentTag(DOCUMENT); 352 mgppPasses.addLevelTag(SECTION); 353 354 //this.currentIndexLevel = "Document";// index.getLevel(); 355 //this.currentIndexField = "text";//index.getField(); 356 //this.currentIndexName = "idx"; //index.getName(); 357 358 int mgppPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); 359 switch (mgppPass) { 342 360 case 0: 343 361 // -T1 344 362 mgppPasses.addPass(MGPPPassesWrapper.TEXT_PASS_1); 345 //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem + " -T1");346 363 break; 347 364 … … 349 366 // -T2 350 367 mgppPasses.addPass(MGPPPassesWrapper.TEXT_PASS_2); 351 //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -T2");352 368 break; 353 369 … … 355 371 // -I1 356 372 mgppPasses.addPass(MGPPPassesWrapper.INDEX_PASS_1); 357 //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -I1");358 373 break; 359 374 360 375 case 3: 361 //Process p = Runtime.getRuntime().exec("mgpp_perf_hash_build -f " + this.outputStem);362 //p.waitFor();363 376 // -I2 364 377 mgppPasses.addPass(MGPPPassesWrapper.INDEX_PASS_2); 365 //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -I2");366 378 break; 367 379 } 368 369 //this.indexerFeedback = mgpp_passes.getInputStream(); 370 // this.indexerErrors = mgpp_passes.getErrorStream(); 371 // this.indexerTextfeed = mgpp_passes.getOutputStream(); 372 // } 373 //catch (IOException ex) { 374 // System.out.println(ex); 375 // ex.printStackTrace(); 376 // return false; 377 //}/ 378 //catch (InterruptedException ex) { 379 // System.out.println(ex); 380 // ex.printStackTrace(); 381 // return false; 382 //} 380 383 381 mgppPasses.init(); 384 382 System.out.println("Pass " + this.pass); … … 391 389 public boolean endPass(int passNumber) 392 390 { 393 // TODO: end pass394 Process p;395 MGPPIndex index = null; // do something with this!!396 391 try { 397 392 this.indexBuffer.append(END_OF_DOCUMENT); … … 403 398 System.out.println(ex); 404 399 } 400 405 401 mgppPasses.finish(); 402 406 403 try { 407 404 Thread.sleep(1000); … … 411 408 System.out.println("Pass " + this.pass + " completed with " + exit_value); 412 409 if (exit_value !=0) { 413 //assume something has gone wrong, don't continue 414 // if (index != null) { 415 // index.setError(true); 416 // return false; 417 // } 418 } 410 this.current_index.setError(true); 411 return false; 412 } 413 414 int mgppPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); 419 415 420 416 String osextra = ""; … … 423 419 } 424 420 425 switch ( this.pass) {421 switch (mgppPass) { 426 422 case 0: 427 //System.exit(1);428 423 System.out.println("Compressing dictionary"); 429 exit_value = Processing.runProcess("mgpp_compression_dict -f " + this. textStem + " -S -H -2 -k 5120"+ osextra);424 exit_value = Processing.runProcess("mgpp_compression_dict -f " + this.indexStem + " -S -H -2 -k 5120"+ osextra); 430 425 431 426 if (exit_value == 0) { … … 433 428 } else { 434 429 System.err.println("Error from mgpp_compression_dict: " + exit_value); 435 //index.setError(true);430 this.current_index.setError(true); 436 431 return false; 437 432 } … … 445 440 } else { 446 441 System.err.println("Unable to build the perfect hash"); 447 //index.setError(true);442 this.current_index.setError(true); 448 443 return false; 449 444 } … … 457 452 } else { 458 453 System.err.println("Unable to create weights file"); 459 //index.setError(true);454 this.current_index.setError(true); 460 455 return false; 461 456 } … … 467 462 } else { 468 463 System.out.println("Unable to create inverted dictionary file"); 469 //index.setError(true);464 this.current_index.setError(true); 470 465 return false; 471 466 } … … 477 472 } else { 478 473 System.out.println("Unable to create stemmed index 1"); 479 //index.setError(true);474 this.current_index.setError(true); 480 475 return false; 481 476 } … … 486 481 } else { 487 482 System.out.println("Unable to create stemmed index 2"); 488 //index.setError(true);483 this.current_index.setError(true); 489 484 return false; 490 485 } … … 494 489 } else { 495 490 System.out.println("Unable to create stemmed index 3"); 496 //index.setError(true);491 this.current_index.setError(true); 497 492 return false; 498 493 } … … 516 511 public int getNumberOfPasses() 517 512 { 513 //return this.indexes.size()*2; 518 514 return 4; 519 515 } 520 516 521 517 public boolean addServiceDescriptions(Element service_rack_list) { 518 519 // we only have one real index at the moment, - the first index in the list will be the text one. 520 MGPPIndex index = (MGPPIndex)this.indexes.get(1); 521 if (index.hasError()) { 522 // we weren't able to create any indexes - don't add a search service 523 return false; 524 } 525 522 526 Document doc = service_rack_list.getOwnerDocument(); 523 524 // generate the list of indexes 527 Element search_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM); 528 service_rack_list.appendChild(search_service_elem); 529 Element retrieve_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM); 530 service_rack_list.appendChild(retrieve_service_elem); 531 532 // generate the list of indexes - with only one index in it 525 533 Element index_list = doc.createElement(GSXML.INDEX_ELEM+GSXML.LIST_MODIFIER); 526 Element e = doc.createElement(GSXML.INDEX_ELEM); 527 e.setAttribute(GSXML.NAME_ATT, "idx"); 528 index_list.appendChild(e); 529 String def_index = "idx"; 530 531 // boolean found_index = false; 532 // String def_index = ""; // the default index will just be the first one created for now. 533 // for (int i=0; i<this.indexes.size(); i++) { 534 // MGIndex index = (MGIndex)this.indexes.get(i); 535 // if (!index.hasError()) { 536 // Element e = doc.createElement(GSXML.INDEX_ELEM); 537 // e.setAttribute(GSXML.NAME_ATT, index.getName()); 538 // index_list.appendChild(e); 539 // if (found_index == false) { 540 // // this is the first index 541 // found_index = true; 542 // def_index = index.getName(); 543 // } 544 // } 545 // } 546 547 // if (!found_index) { 548 // // no indexes were able to be created, so we can't use them or the text 549 // return false; 550 // } 551 552 Element f = doc.createElement(GSXML.FIELD_ELEM+GSXML.LIST_MODIFIER); 534 Element index_elem = doc.createElement(GSXML.INDEX_ELEM); 535 index_elem.setAttribute(GSXML.NAME_ATT, index.getName()); 536 index_list.appendChild(index_elem); 553 537 554 538 Element default_index = doc.createElement("defaultIndex"); 555 default_index.setAttribute(GSXML.NAME_ATT, def_index);539 default_index.setAttribute(GSXML.NAME_ATT, index.getName()); 556 540 557 541 Element base_index_name = doc.createElement("baseIndexPrefix"); 558 base_index_name.setAttribute(GSXML.NAME_ATT, "dtx"); //overallName);542 base_index_name.setAttribute(GSXML.NAME_ATT, overallName); 559 543 560 544 Element index_stem = doc.createElement("indexStem"); 561 index_stem.setAttribute(GSXML.NAME_ATT, "index"); 562 563 Element search_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM); 564 Element retrieve_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM); 545 index_stem.setAttribute(GSXML.NAME_ATT, INDEX_FILE_STEM); 546 565 547 Element default_level = doc.createElement("defaultLevel"); 566 default_level.setAttribute(GSXML.NAME_ATT, "Document"); 567 568 Element level_list = doc.createElement("levelList"); 569 Element level = doc.createElement("level"); 570 level.setAttribute(GSXML.NAME_ATT, "Document"); 548 default_level.setAttribute(GSXML.NAME_ATT, SECTION); 549 550 // always have doc and sec at the moment 551 Element level_list = doc.createElement(GSXML.LEVEL_ELEM+GSXML.LIST_MODIFIER); 552 Element level = doc.createElement(GSXML.LEVEL_ELEM); 553 level.setAttribute(GSXML.NAME_ATT, DOCUMENT); 571 554 level_list.appendChild(level); 572 555 573 Element field_list = doc.createElement("fieldList"); 574 Element field = doc.createElement("field"); 575 field.setAttribute(GSXML.NAME_ATT, "ZZ"); 576 field_list.appendChild(field); 577 578 service_rack_list.appendChild(search_service_elem); 579 service_rack_list.appendChild(retrieve_service_elem); 580 556 level = doc.createElement(GSXML.LEVEL_ELEM); 557 level.setAttribute(GSXML.NAME_ATT, SECTION); 558 level_list.appendChild(level); 559 560 Element field_list = doc.createElement(GSXML.FIELD_ELEM+GSXML.LIST_MODIFIER); 561 Element field; 562 List fields = index.getFields(); 563 for (int i=0; i<fields.size(); i++) { 564 String f = (String) fields.get(i); 565 field = doc.createElement(GSXML.FIELD_ELEM); 566 field.setAttribute(GSXML.NAME_ATT, f); 567 field_list.appendChild(field); 568 } 569 581 570 search_service_elem.setAttribute(GSXML.NAME_ATT, "GS3MGPPSearch"); 582 571 search_service_elem.appendChild(index_list); … … 584 573 search_service_elem.appendChild(level_list); 585 574 search_service_elem.appendChild(default_level); 586 search_service_elem.appendChild(field_list); // do we need this?? 575 search_service_elem.appendChild(field_list); 587 576 search_service_elem.appendChild(base_index_name); 588 577 search_service_elem.appendChild(index_stem); 589 578 590 579 retrieve_service_elem.setAttribute(GSXML.NAME_ATT, "GS3MGPPRetrieve"); 591 580 retrieve_service_elem.appendChild(default_level.cloneNode(true)); … … 597 586 598 587 599 private Node recurseDOM(DocumentInterface metsDoc, Node node, 600 AbstractStructure structure, StringBuffer textBuffer, 601 StringBuffer extraBuffer, String namespace) 602 //String name, String namespace, String field) 603 { 604 // send out the ctrl-c...if this is 605 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { 588 private Node recurseDOM(DocumentInterface metsDoc, Node node, 589 AbstractStructure structure, StringBuffer textBuffer, 590 StringBuffer extraBuffer, String namespace) 591 //String name, String namespace, String field) 592 { 593 List fields = current_index.getFields(); 594 // send out the ctrl-c...if this is 595 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { 596 // try doing this for all index types 597 // actually we should only need to do this once ???? 598 if (this.pass == 0) { 599 System.err.println("division structure"); 600 //if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) { 601 METSDivision division = (METSDivision) structure; 602 603 // get the division metadata block 604 METSDescriptive descriptive; 605 String metadataId = division.getDefaultMetadataReference(); 606 if (metadataId == null) { 607 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel()); 608 division.addMetadataReference(descriptive.getID()); 609 } 610 else { 611 // Get the descriptive item... 612 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); 613 } 614 615 descriptive.addMetadata("gsdl3", "mgseqno", this.overallName + "." + Integer.toString(this.sectionSeqNo)); 616 617 metsDoc.setChanged(true); 618 //metsDoc.setModified(true); 619 // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel()); 620 } // first pass 621 622 // append an 'end of section' marker 623 //textBuffer.append(END_OF_SECTION); 624 this.sectionSeqNo ++; 625 textBuffer.append(END_OF_SECTION); 626 textBuffer.append(START_OF_SECTION); 627 // for document-level indexes, always append an 'end of document' tag at the 628 // end of the document for each section. Otherwise, each section is followed 629 // by an end of document character. This ensures that all indexes use the 630 // same document numbering... 631 632 // if (this.current_index.getLevel().equals(IndexerInterface.DOCUMENT_LEVEL)) { 633 // extraBuffer.append(END_OF_DOCUMENT); 634 // } 635 // else { 636 // textBuffer.append(END_OF_DOCUMENT); 637 // this.documentSeqNo ++; 638 // } 639 640 // produce the body here for metadata output of divisions - in the case of 641 // text output, that will happen below... 642 643 if (fields.size()>1 || !((String)fields.get(0)).equals("text")) { 644 // if there is only text, don't do this 645 METSDescriptive descriptive; 646 647 METSDivision division = (METSDivision) structure; 648 649 String metadataId = division.getDefaultMetadataReference(); 650 // are there other metadata refs to get?? 651 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); 652 if (descriptive != null) { 653 for (int i=0; i<fields.size(); i++) { 654 String field = (String)fields.get(i); 655 if (field.equals("text")) { 656 continue; 657 } 658 List values = descriptive.getMetadata(namespace, field); 659 if (values != null) { 660 Iterator valueIter = values.iterator(); 661 while (valueIter.hasNext()) { 662 String value = valueIter.next().toString(); 663 textBuffer.append("<"+field+">"); 664 textBuffer.append(value); 665 textBuffer.append("</"+field+">"); 666 } 667 } 668 } 669 } 670 } 671 } 672 673 // go through our children as required... 674 Iterator children = structure.getChildIterator(); 675 Node startNode; 676 boolean index_text = fields.contains("text"); 677 while (children.hasNext()) { 678 AbstractStructure child = (AbstractStructure) children.next(); 679 680 // get xpointer for child 681 // get start position node 682 if (metsDoc.getDocumentType() == "METS"){ 683 startNode = ((METSDocument) metsDoc).getSectionStartNode((METSDivision) child); 684 } else { 685 startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); 686 } 687 688 // while this node isn't the child's start node, produce the 689 // HTML node text, if in text field mode... 690 if (index_text) { 691 while (node != startNode) { 692 XPointer.printNode(node, textBuffer, false); 693 node = XPointer.getNextNode(node); 694 } 695 } 696 697 // recurse to child 698 node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, namespace); // name, namespace, field); 699 } // while next child 700 701 // close a document - the actual closing \B will be done by the main 702 // loop, so only a required \C is printed here... 703 // why have we got STRUCTURE_TYPE here and DIVISION_TYPE above???? 704 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) { 705 if (this.pass == 0) { 706 System.err.println("structure structure"); 707 } 708 if (index_text) { 709 while (node != null) { 710 XPointer.printNode(node, textBuffer, false); 711 node = XPointer.getNextNode(node); 712 } 713 } 714 715 //textBuffer.append(END_OF_SECTION); 716 //this.sectionSeqNo ++; 717 718 } 719 return node; 720 } 721 722 723 private void printNode(Node node, StringBuffer buffer, 724 boolean preserve_format, boolean close) { 725 726 if (node.getNodeType() == org.w3c.dom.Node.TEXT_NODE) { 727 if (!close) { 728 buffer.append(node.getNodeValue()); 729 buffer.append(" "); 730 } 731 return; 732 } 733 734 if (node.getNodeType() == org.w3c.dom.Node.ENTITY_NODE) { 735 if (!close) { 736 buffer.append("&"); 737 buffer.append(node.getNodeValue()); 738 buffer.append(";"); 739 buffer.append(" "); 740 } 741 return; 742 } 743 if (!preserve_format) return; // we don't want any xml output 744 /* 745 else if (node.getNodeType() == org.w3c.dom.Node.COMMENT_NODE) { 746 if (!close) { 747 buffer.append("<!-- "); 748 buffer.append(node.getNodeValue()); 749 buffer.append(" -->"); 750 } 751 }*/ 752 if (node.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) { 753 754 boolean hasChildren = (node.getChildNodes().getLength() > 0); 755 756 if (close) { 757 if (hasChildren) { 758 // put the close node 759 buffer.append("</"); 760 buffer.append(node.getNodeName()); 761 buffer.append(">"); 762 } 763 // else it was closed off previously 764 return; 765 } 766 767 768 buffer.append("<"); 769 buffer.append(node.getNodeName()); 770 NamedNodeMap attributes = ((Element) node).getAttributes(); 771 for (int a = 0; a < attributes.getLength(); a ++) { 772 Node attributeNode = attributes.item(a); 773 buffer.append(" "); 774 buffer.append(attributeNode.getNodeName()); 775 776 String value = attributeNode.getNodeValue(); 777 if (value != null && value.length() > 0) { 778 buffer.append("=\""); 779 buffer.append(value); 780 buffer.append("\""); 781 } 782 } 783 784 785 if (!hasChildren) { 786 buffer.append(" /"); 787 } 788 buffer.append(">"); 789 790 //if (close && node.getNodeName()=="Section") { 791 //buffer.append((char) 3); 792 // } 793 } 794 795 } 796 797 /* 798 private Node recurseDOMold(DocumentInterface metsDoc, Node node, 799 AbstractStructure structure, StringBuffer textBuffer, 800 StringBuffer extraBuffer, String namespace) 801 //String name, String namespace, String field) 802 { 803 List fields = current_index.getFields(); 804 // send out the ctrl-c...if this is 805 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { 606 806 // try doing this for all index types 607 if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) {807 if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) { 608 808 METSDivision division = (METSDivision) structure; 609 809 … … 718 918 return node; 719 919 } 720 920 */ 721 921 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace) 722 // String name, String namespace, String field) 723 { StringBuffer extraBuffer = new StringBuffer(); 724 Node node = document.getDocumentElement(); 725 StringBuffer textBuffer = new StringBuffer(); 726 727 this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); //name, namespace, field); 728 textBuffer.append(extraBuffer.toString()); 729 return textBuffer.toString(); 730 } 731 922 // String name, String namespace, String field) 923 { 924 StringBuffer extraBuffer = new StringBuffer(); 925 Node node = document.getDocumentElement(); 926 StringBuffer textBuffer = new StringBuffer(); 927 928 this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); 929 textBuffer.append(extraBuffer.toString()); 930 return textBuffer.toString(); 931 } 932 732 933 }
Note:
See TracChangeset
for help on using the changeset viewer.