package org.greenstone.gsdl3.gs3build.classifier; import java.io.*; import java.net.URL; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import org.apache.xerces.parsers.SAXParser; import org.xml.sax.XMLReader; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.Attributes; import org.xml.sax.helpers.XMLReaderFactory; import org.xml.sax.helpers.DefaultHandler; import org.w3c.dom.Element; import org.greenstone.gsdl3.gs3build.doctypes.DocumentID; import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface; import org.greenstone.gsdl3.gs3build.util.XMLTools; import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection; import org.greenstone.gsdl3.util.GSXML; public class HierarchyClassifier extends AbstractClassifier { class HierarchyClassifierObserver implements ClassifierObserverInterface { List classifications; DocumentInterface document; public HierarchyClassifierObserver(DocumentInterface document) { this.document = document; } public void recordClassification(String label) { this.document.addDocumentMetadata("gsdl3", "classified", label); } public List getClassifications() { return this.classifications; } } class HierarchyNode extends AbstractHierarchyNode { /** * Simple node */ public HierarchyNode() { super(); } public HierarchyNode(String prefix, String name, String id, String descriptor) { super(prefix, name, id, descriptor); } public void getClassifications(DocumentID documentID, List values, String sortKey, ClassifierObserverInterface observer) { Iterator valueList = values.iterator(); while (valueList.hasNext()) { if (this.isMatch(valueList.next().toString())) { observer.recordClassification(this.name); this.addDocument(documentID, sortKey); } } // recurse into the child nodes for them to do the same... Iterator childList = this.childNodes.iterator(); while (childList.hasNext()) { ((HierarchyNode) childList.next()).getClassifications(documentID, values, sortKey, observer); } } } /** * An inner class to handle Classifier files */ class HierarchyHandler extends DefaultHandler { StringBuffer description; StringBuffer name; StringBuffer path; StringBuffer match; String prefix; boolean inElement; AbstractHierarchyNode rootNode = null; AbstractHierarchyNode currentNode = null; HierarchyHandler(String classifierName) { super(); this.name = null; this.description = null; this.path = null; this.match = null; this.prefix = classifierName; } public void startElement(String URI, String localName, String qName, Attributes attributes) { if (localName.equals("Hierarchy")) { // create a new node in the hierarchy HierarchyNode node = new HierarchyNode(prefix, "", "", "Classifier"); this.rootNode = node; this.currentNode = node; } else if (localName.equals("Classification")) { // create a new node in the hierarchy HierarchyNode node = new HierarchyNode(this.prefix, "", "", ""); // Initialise the tree if (this.currentNode == null) { this.rootNode = node; } // Add the node else { this.currentNode.addChild(node); } this.currentNode = node; } else if (localName.equals("Name")) { this.name = new StringBuffer(); } else if (localName.equals("Path")) { this.path = new StringBuffer(); } else if (localName.equals("Description")) { this.description = new StringBuffer(); } else if (localName.equals("Matches")) { this.match = new StringBuffer(); } else if (localName.equals("Children")) { } } public void endElement(String URI, String localName, String qName) { if (localName.equals("Classification")) { if (this.currentNode.getParent() == null) { // TODO: Error state // this.rootNodes.add(this.currentNode); } this.currentNode = this.currentNode.getParent(); } else if (localName.equals("Name")) { this.currentNode.setName(XMLTools.cleanString(this.name.toString())); System.out.println("Name is " + this.name); this.name = null; } else if (localName.equals("Path")) { this.currentNode.setID(XMLTools.cleanString(this.path.toString())); this.path = null; } else if (localName.equals("Description")) { this.currentNode.setDescriptor(XMLTools.cleanString(this.description.toString())); this.description = null; } else if (localName.equals("Matches")) { this.currentNode.addMatch(XMLTools.cleanString(this.match.toString())); this.match = null; } else if (localName.equals("Children")) { } } public void characters(char c[], int start, int length) { String string = new String(c, start, length); if (this.path != null) { this.path.append(string); } else if (this.name != null) { this.name.append(string); } else if (this.description != null) { this.description.append(string); } else if (this.match != null) { this.match.append(string); } } public AbstractHierarchyNode getHierarchy() { return this.rootNode; } } private GS3SQLConnection database; private AbstractHierarchyNode hierarchy; private List fields; private String sortBy; private String name; public HierarchyClassifier(String name, URL basefile, List fields, String sortBy) { try { SAXParser parser = new SAXParser(); HierarchyHandler handler = new HierarchyHandler(name); /* XMLReader reader = XMLReaderFactory.createXMLReader(); reader.setContentHandler(handler); reader.setErrorHandler(handler);*/ parser.setContentHandler(handler); // A Classifier consists of one file only - get it. /* FileReader fileReader = new FileReader(basefile); reader.parse(new InputSource(fileReader)); */ parser.parse(basefile.toString()); this.hierarchy = handler.getHierarchy(); this.fields = fields; this.sortBy = sortBy; this.name = name; System.out.println("Counting " + this.fields.size() + " fields"); } catch (SAXException saxException) { // TODO: log error System.out.println("unable to process hierarchy " + saxException.toString()); } catch (java.io.FileNotFoundException fileException) { System.out.println("unable to process hierarchy " + fileException.toString()); } catch (java.io.IOException ioException) { System.out.println("unable to process hierarchy " + ioException.toString()); } } public void configure(List parameters) { Iterator iterator = parameters.iterator(); while (iterator.hasNext()) { String param = iterator.next().toString(); if (param.equals("-metadata")) { if (iterator.hasNext()) { String field = iterator.next().toString(); if (field != null && field.length() > 0) { this.fields.add(field); } } } } } /** * Indicate the database to be used for saving the hierarchy to. * * @param GS3SQLConnection the database */ public void setDatabase(GS3SQLConnection database) { this.database = database; } /** * Classify a particular document - the document id will be used to record its * membership of any classifier nodes, and the document interface will also be * used to assign back knowledge of any classifiers to which the document is * assigned. * * @param DocumentID the document identifier * @param DocumentInterface the document interface * * @return boolean a return value - currently always * true for success */ public boolean classifyDocument(DocumentID documentID, DocumentInterface document) { // the observer records the assignment of documents to classifications HierarchyClassifierObserver classifyObserver = new HierarchyClassifierObserver(document); // if we have no fields, there is nothing to do... if (this.fields == null) { return true; } // get the metadata item from the document Iterator thisField = this.fields.iterator(); // for every field listed as being a scannable field... while (thisField.hasNext()) { String fieldName = thisField.next().toString(); // ...get the values for that field... List values = document.getDocumentMetadataItem(fieldName); if (values == null) { continue; } if (documentID == null) { System.out.println("Bad documentID"); continue; } // get the sort key for the metadata item if possible String sortKey = null; if (this.sortBy != null) { List sortKeys = document.getDocumentMetadataItem(this.sortBy); if (sortKeys != null && sortKeys.size() > 0) { sortKey = sortKeys.get(0).toString(); } } // ...and send them to the classifier this.hierarchy.getClassifications(documentID, values, sortKey, classifyObserver); } return true; } /** * @see ClassifierInterface:getClassifierPasses */ public int getClassifierPasses() { return 1; } public void startClassifierPass(int pass) { } public void endClassifierPass(int pass) { } public void completeClassification() { // TODO: store the classifications to file... if (this.hierarchy != null) { this.hierarchy.writeSQL(this.database); } } public boolean addClassifierDescription(Element classifier_list) { // TODO check that there are some docs in the classification Element classifier = classifier_list.getOwnerDocument().createElement(GSXML.CLASSIFIER_ELEM); classifier.setAttribute(GSXML.NAME_ATT, this.name); classifier_list.appendChild(classifier); return true; } }