package org.greenstone.gsdl3.gs3build.indexers; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import java.io.File; import java.io.InputStream; import java.io.OutputStream; import java.io.IOException; import java.io.BufferedReader; import java.io.InputStreamReader; import org.w3c.dom.*; import org.greenstone.mg.*; import org.greenstone.gsdl3.gs3build.doctypes.DocumentID; import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface; import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument; import org.greenstone.gsdl3.gs3build.doctypes.METSDocument; import org.greenstone.gsdl3.gs3build.metadata.*; import org.greenstone.gsdl3.gs3build.xpointer.XPointer; import org.greenstone.gsdl3.util.GSXML; import org.greenstone.gsdl3.util.Misc; import org.greenstone.gsdl3.util.Processing; public class MGIndexer extends AbstractIndexer { int pass; int documentSeqNo; int sectionSeqNo; boolean firstDocument; String outputDirectory; // InputStream indexerFeedback; // InputStream indexerErrors; //OutputStream indexerTextfeed; StringBuffer indexBuffer; //Process mg_passes; File textDirectory; File indexDirectory; String indexStem; String textStem; List indexes; String overallName; String currentIndexName; String currentIndexLevel; String currentIndexField; MGPassesWrapper mgPasses; static final char END_OF_DOCUMENT = (char) 2; static final char END_OF_SECTION = (char) 3; // actually this is end of para for mg static final char END_OF_STREAM = (char) 4; public static final String MG_INDEX_TYPE = "mg"; public static final String INDEX_FILE_STEM = "index"; class MGIndex { String name=null; String level=null; String field=null; boolean error = false;// assume built until we get an error public MGIndex(String name, String level, String field) { this.name = name; this.level = level; this.field = field; } public MGIndex(String indexLabel) { int colonAt = indexLabel.indexOf(':'); if (colonAt >= 0) { this.field = indexLabel.substring(colonAt+1); this.level = indexLabel.substring(0, colonAt); createIndexName(); } } public String getLevel() { return this.level; } public String getField() { return this.field; } public String getName() { if (this.name==null || this.name.equals("")) { createIndexName(); } return this.name; } public boolean hasError() { return this.error; } public void setError(boolean b) { this.error = b; } private void createIndexName() { StringBuffer new_name = new StringBuffer(); new_name.append(Character.toLowerCase((char) this.level.charAt(0))); int c, w; w = 0; c = 0; while (c < this.field.length() && w < 2) { char ch = this.field.charAt(c); ch = Character.toLowerCase(ch); if (Character.isLetter(ch)) { if (ch != 'a' && ch != 'e' && ch != 'i' && ch != 'o' && ch != 'u') { new_name.append(ch); w++; } } c ++; } this.name = new_name.toString(); } } // MGIndex public MGIndexer(String name) { this.indexes = new ArrayList(); this.overallName = name; } public String getIndexType() { return MG_INDEX_TYPE; } public String getName() { return this.overallName; } // private String getIndexDirectory(String level, String field) // { StringBuffer directory = new StringBuffer(); // directory.append(Character.toLowerCase((char) level.charAt(0))); // int c, w; // w = 0; // c = 0; // while (c < field.length() && w < 2) { // char ch = field.charAt(c); // ch = Character.toLowerCase(ch); // if (Character.isLetter(ch)) { // if (ch != 'a' && ch != 'e' && ch != 'i' && // ch != 'o' && ch != 'u') { // directory.append(ch); // w++; // } // } // c ++; // } // return directory.toString(); // } /** * The output directory should be (collection)/building/text/ for * normal Greenstone builds. * * @param String the label to configure * @param String the value... */ public boolean configure(String label, String value) { if (label.equals(IndexerManager.outputDir)) { this.outputDirectory = value; this.pass = 0; // attempt to ensure that the text subdirectory exists this.textDirectory = new File(outputDirectory, "text"); if (!textDirectory.exists()) { if (!textDirectory.mkdir()) { return false; } } else if (!textDirectory.isDirectory()) { return false; } this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM; // Sign to the user which mg directory is being used... System.out.println("Output MG directory is " + this.textStem); } else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) { this.indexes.add(new MGIndex(value)); } return true; } public boolean addIndex(String name, String level, String field) { MGIndex index = new MGIndex(name, level, field); this.indexes.add(index); return true; } private Node recurseDOM(DocumentInterface metsDoc, Node node, AbstractStructure structure, StringBuffer textBuffer, StringBuffer extraBuffer, String namespace) //String name, String namespace, String field) { // send out the ctrl-c...if this is if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { // try doing this for all index types if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) { METSDivision division = (METSDivision) structure; // get the division metadata block METSDescriptive descriptive; String metadataId = division.getDefaultMetadataReference(); if (metadataId == null) { descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel()); division.addMetadataReference(descriptive.getID()); } else { // Get the descriptive item... descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); } descriptive.addMetadata("gsdl3", "mgseqno", this.overallName + "." + Integer.toString(this.sectionSeqNo)); metsDoc.setChanged(true); //metsDoc.setModified(true); // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel()); } // section level // append an 'end of section' marker //textBuffer.append(END_OF_SECTION); this.sectionSeqNo ++; // for document-level indexes, always append an 'end of document' tag at the // end of the document for each section. Otherwise, each section is followed // by an end of document character. This ensures that all indexes use the // same document numbering... if (this.currentIndexLevel == null || this.currentIndexLevel.equals(IndexerInterface.DOCUMENT_LEVEL)) { extraBuffer.append(END_OF_DOCUMENT); } else { textBuffer.append(END_OF_DOCUMENT); this.documentSeqNo ++; } // produce the body here for metadata output of divisions - in the case of // text output, that will happen below... if (!this.currentIndexField.equals("text")) { METSDescriptive descriptive; METSDivision division = (METSDivision) structure; String metadataId = division.getDefaultMetadataReference(); descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); if (descriptive != null) { List values = descriptive.getMetadata(namespace, this.currentIndexField); if (values != null) { Iterator valueIter = values.iterator(); while (valueIter.hasNext()) { String value = valueIter.next().toString(); textBuffer.append(value); if (valueIter.hasNext()) { //textBuffer.append(END_OF_SECTION); } } } } } } // go through our children as required... Iterator children = structure.getChildIterator(); Node startNode; while (children.hasNext()) { AbstractStructure child = (AbstractStructure) children.next(); // get xpointer for child // get start position node if (metsDoc.getDocumentType() == "METS"){ startNode = ((METSDocument) metsDoc).getSectionStartNode((METSDivision) child); } else { startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); } //Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); // while this node isn't the child's start node, produce the HTML node text, if // in text field mode... if (this.currentIndexField.equals("text")) { while (node != startNode) { XPointer.printNode(node, textBuffer, false); // print buffer to node node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null)); } } // recurse to child node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, namespace); // name, namespace, field); } // while next child // close a document - the actual closing \B will be done by the main // loop, so only a required \C is printed here... if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) { while (node != null) { if (this.currentIndexField.equals("text")) { XPointer.printNode(node, textBuffer, false); } node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null)); } //textBuffer.append(END_OF_SECTION); this.sectionSeqNo ++; } return node; } private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace) { // String name, String namespace, String field) StringBuffer extraBuffer = new StringBuffer(); Node node = document.getDocumentElement(); StringBuffer textBuffer = new StringBuffer(); this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); //name, namespace, field); textBuffer.append(extraBuffer.toString()); return textBuffer.toString(); } /** * Index a single document; the document interface can be used to extract individual * metadata items etc. as required or desired and index those instead or as well as * the body text of the document. */ public boolean indexDocument(DocumentID docID, DocumentInterface document) { if (this.pass == 0) { document.removeAllMetadata("gsdl3", "mgseqno"); } if (!this.firstDocument) { this.indexBuffer.append(END_OF_DOCUMENT); mgPasses.processDocument(indexBuffer.toString()); this.indexBuffer.delete(0, this.indexBuffer.length()); } String docText = null; int startSeqNo = this.sectionSeqNo; this.sectionSeqNo ++; Document domDocument = document.getDOMDocument(); if (domDocument != null) { System.err.println("dom doc is not null"); METSStructure sections = document.getDocumentStructure().getStructure("Section"); if (sections != null) { System.err.println("sections are not null"); docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field); // System.out.println(docText); } } if (docText == null) { System.err.println("dom doc or sections was null - asking for doc text"); if (this.currentIndexField.equals("text")) { //docText = Character.toString(END_OF_DOCUMENT) + document.getDocumentText(); docText = document.getDocumentText(); } else { StringBuffer textBuffer = new StringBuffer(); //textBuffer.append(END_OF_DOCUMENT); List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField); if (values != null) { Iterator valueIter = values.iterator(); while (valueIter.hasNext()) { String value = valueIter.next().toString(); textBuffer.append(value); if (valueIter.hasNext()) { //textBuffer.append(END_OF_SECTION); // sectionSeqNo ++; } } } else { textBuffer.append("No data"); } docText = textBuffer.toString(); } sectionSeqNo ++; } this.indexBuffer.append(docText); // remember that we're not on the first document, this.firstDocument = false; // assign the sequence number on the first pass only, and increment the sequence number. if (this.pass == 0) { document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(startSeqNo)); } this.documentSeqNo += 1; return true; } /** * Initialise the pass: open required files, check status */ public boolean startPass(int passNumber) { this.pass = passNumber; this.firstDocument = true; this.documentSeqNo = 1; this.sectionSeqNo = 1; this.mgPasses = new MGPassesWrapper(); this.indexBuffer = new StringBuffer(); int indexNo = (this.pass - 2) / 2; MGIndex index = null; if (this.pass >= 2) { index = (MGIndex) this.indexes.get(indexNo); if (index.hasError()) { // an error has already occurred for this index, don't continue System.out.println("pass "+this.pass+": aborted due to errors in the previous pass"); return false; } // attempt to ensure that the text subdirectory exists this.indexDirectory = new File(outputDirectory, index.getName()); if (!indexDirectory.exists()) { if (!indexDirectory.mkdir()) { return false; } } else if (!indexDirectory.isDirectory()) { return false; } this.currentIndexLevel = index.getLevel(); this.currentIndexField = index.getField(); this.currentIndexName = index.getName(); if (this.currentIndexLevel == null || this.currentIndexField == null ) { System.out.println("invalid index - level or field was null"); return false; } this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM; // TODO: modify for index if (this.pass % 2 == 1) { this.currentIndexName = null; // why??? } } else { this.currentIndexField = "text"; this.currentIndexLevel = "section"; this.currentIndexName = null; } // get the parameters for this execution of mg_passes mgPasses.setFileName((this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString())+File.separator+ "index"); if (!Misc.isWindows()) { mgPasses.setBasePath("/"); } int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); mgPasses.setBufferSize(100000); switch (mgPass) { case 0: // -b 100000 -T1 mgPasses.addPass(MGPassesWrapper.TEXT_PASS_1); break; case 1: // -b 100000 -T2 mgPasses.addPass(MGPassesWrapper.TEXT_PASS_2); break; case 2: // -b 100000 -2 -m 32 -s 0 -G -t 10 -N1 mgPasses.addPass(MGPassesWrapper.INDEX_PASS_1); mgPasses.setInvfLevel(MGPassesWrapper.INVF_LEVEL_2); mgPasses.setStemOptions(MGPassesWrapper.STEMMER_ENGLISH, MGPassesWrapper.NO_STEM_OR_CASE); mgPasses.setInversionMemLimit(32); mgPasses.ignoreSGMLTags(true); break; case 3: // -b 100000 -2 -c 3 -G -t 10 -N2 mgPasses.addPass(MGPassesWrapper.INDEX_PASS_2); mgPasses.setInvfLevel(MGPassesWrapper.INVF_LEVEL_2); mgPasses.ignoreSGMLTags(true); break; } mgPasses.init(); System.out.println("Pass " + this.pass); return true; } /** * Complete a pass - reset file counters, close files, etc. */ public boolean endPass(int passNumber) { Process p; int indexNo = (passNumber - 2) / 2; MGIndex index = null; if (passNumber >= 2) { index = (MGIndex) this.indexes.get(indexNo); } try { this.indexBuffer.append(END_OF_DOCUMENT); mgPasses.processDocument(indexBuffer.toString()); this.indexBuffer.delete(0, this.indexBuffer.length()); Thread.sleep(1000); // what for?? } catch (InterruptedException ex) { System.out.println(ex); } mgPasses.finish(); try { Thread.sleep(1000); } catch (Exception e) {} int exit_value = 0; System.out.println("Pass " + this.pass + " completed with " + exit_value); if (exit_value !=0) { //assume something has gone wrong, don't continue if (index != null) { index.setError(true); return false; } } int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); String osextra = ""; if (!Misc.isWindows()) { osextra = " -d / "; } switch (mgPass) { case 0: System.out.println("Compressing dictionary"); exit_value = Processing.runProcess("mg_compression_dict -f " + this.textDirectory.toString()+File.separator+"index" + osextra + " -S -H -2 -k 5120"); if (exit_value == 0) { System.out.println("Compressed dictionary successfully written"); } else { System.err.println("Error from mg_compression_dict: " + exit_value); index.setError(true); return false; } break; case 2: System.out.println("Creating perfect hash"); exit_value = Processing.runProcess("mg_perf_hash_build -f " + this.indexDirectory.toString()+File.separator+ "index"+osextra); if (exit_value ==0) { System.out.println("Perfect hashes completed"); } else { System.err.println("Unable to build the perfect hash"); index.setError(true); return false; } break; case 3: System.out.println("Writing weights file"); exit_value = Processing.runProcess("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + osextra); if (exit_value ==0) { System.out.println("Weights file successfully written"); } else { System.err.println("Unable to create weights file"); index.setError(true); return false; } System.out.println("Creating inverted dictionary"); exit_value = Processing.runProcess("mg_invf_dict -f " + this.indexDirectory.toString()+File.separator+"index" + osextra); if (exit_value ==0) { System.out.println("Inverted dictionary file successfully written"); } else { System.out.println("Unable to create inverted dictionary file"); index.setError(true); return false; } System.out.println("Creating Stem indexes"); exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s1 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); if (exit_value == 0) { System.out.println("Stemmed index 1 successfully written"); } else { System.out.println("Unable to create stemmed index 1"); index.setError(true); return false; } exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s2 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); if (exit_value == 0) { System.out.println("Stemmed index 2 successfully written"); } else { System.out.println("Unable to create stemmed index 2"); index.setError(true); return false; } exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s3 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); if (exit_value == 0) { System.out.println("Stemmed index 3 successfully written"); } else { System.out.println("Unable to create stemmed index 3"); index.setError(true); return false; } break; } // switch mgPasses = null; return true; } /** * Do any tidying up */ public void tidyup() { } /** * Return the number of passes required for this index. */ public int getNumberOfPasses() { return 2 + this.indexes.size() * 2; } public boolean addServiceDescriptions(org.w3c.dom.Element service_rack_list) { Document doc = service_rack_list.getOwnerDocument(); // generate the list of indexes Element index_list = doc.createElement(GSXML.INDEX_ELEM+GSXML.LIST_MODIFIER); boolean found_index = false; String def_index = ""; // the default index will just be the first one created for now. for (int i=0; i