package org.greenstone.gsdl3.gs3build.classifier; import java.io.*; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import org.xml.sax.XMLReader; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.Attributes; import org.xml.sax.helpers.XMLReaderFactory; import org.xml.sax.helpers.DefaultHandler; import org.w3c.dom.Element; import org.greenstone.gsdl3.gs3build.doctypes.DocumentID; import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface; import org.greenstone.gsdl3.gs3build.util.XMLTools; import org.greenstone.gsdl3.gs3build.database.*; import org.greenstone.gsdl3.gs3build.util.MultiMap; import org.greenstone.gsdl3.util.GSXML; public class AZListClassifier extends AbstractClassifier { GS3SQLConnection database; MultiMap alphaMap; List fields; String name; class AZDocumentItem { public DocumentID documentId; public String title; public AZDocumentItem(DocumentID documentId, String title) { this.documentId = documentId; this.title = title; } public String getTitle() { return this.title; } public DocumentID getID() { return this.documentId; } public String toString() { return this.title; } } private void sortList(String key) { List list = this.alphaMap.getAll(key); if (list.size() > 1) { int first, last; first = 0; last = list.size() - 1; while (first != last) { int at = (first + last) / 2; if (list.get(at).toString().compareTo(list.get(list.size() - 1).toString()) > 0) { last = at; } else { first = at + 1; } } Object newObject = list.get(list.size()-1); last = list.size() - 1; while (last > first) { list.set(last, list.get(last-1)); last --; } list.set(first, newObject); } } public AZListClassifier(String name, List fields) { this.fields = fields; this.alphaMap = new MultiMap(); this.name = name; // System.out.println("AZList :"+fields.size()); } public AZListClassifier() { this.fields = new ArrayList(); } public void configure(List parameters) { Iterator iterator = parameters.iterator(); while (iterator.hasNext()) { String param = iterator.next().toString(); if (param.equals("-metadata")) { if (iterator.hasNext()) { String field = iterator.next().toString(); if (field != null && field.length() > 0) { this.fields.add(field); } } } } } public void setDatabase(GS3SQLConnection connection) { this.database = connection; } public boolean classifyDocument(DocumentID documentID, DocumentInterface document) { // get the metadata item from the document Iterator thisField = this.fields.iterator(); // for every field listed as being a scannable field... while (thisField.hasNext()) { String fieldName = thisField.next().toString(); // ...get the values for that field... List values = document.getDocumentMetadataItem(fieldName); if (values == null) { continue; } // ...and send them to the classifier Iterator thisValue = values.iterator(); while (thisValue.hasNext()) { String value = thisValue.next().toString(); String classText = null; int c = 0; while (c < value.length() && classText == null) { if (Character.isDigit(value.charAt(c))) { classText = "0-9"; } else if (Character.isLetter(value.charAt(c))) { classText = value.substring(c, c+1).toLowerCase(); } c ++; } if (classText != null) { // System.out.println(classText + " " + documentID); this.alphaMap.put(classText, new AZDocumentItem(documentID, value)); this.sortList(classText); } } } return true; } public void startClassifierPass(int pass) { } public int getClassifierPasses() { return 1; } public void endClassifierPass(int pass) { } public void completeClassification() { this.writeSQL(this.database); } public int writeSQLClassifyNode(GS3SQLConnection connection, String parentId, int orderRef, String label, String name, String description, int noOfLeafDocs) { GS3SQLAction action; GS3SQLSelect select; GS3SQLInsert insert; int classifyRef; // check for existing node select = new GS3SQLSelect("classifiers"); select.addField("ClassifyRef"); GS3SQLWhereItem whereItem = new GS3SQLWhereItem("ClassifyID", "=", label); GS3SQLWhere where = new GS3SQLWhere(whereItem); select.setWhere(where); try { Statement statement = connection.createStatement(); ResultSet results = statement.executeQuery(select.toString()); if (results.first()) { GS3SQLUpdate update = new GS3SQLUpdate("classifiers"); update.setWhere(where); action = update; classifyRef = results.getInt("ClassifyRef"); } else { insert = new GS3SQLInsert("classifiers"); insert.addValue("ParentID", parentId); action = insert; classifyRef = -1; } action.addValue("ClassifyID", label); action.addValue("Name", name); action.addValue("Description", description); action.addValue("ClassifyOrder", Integer.toString(orderRef), GS3SQLField.INTEGER_TYPE); action.addValue("NumLeafDocs", Integer.toString(noOfLeafDocs), GS3SQLField.INTEGER_TYPE); // do the update/insert statement.execute(action.toString()); // get the ClassifyRef if we don't already have it (have done a // insert action above)... if (classifyRef == -1) { results = statement.executeQuery(select.toString()); if (!results.first()) { return -1; } classifyRef = results.getInt("ClassifyRef"); } statement.close(); } catch (SQLException sqlEx) { System.err.println("AZListClassifier.writeSQLClassifyNode(): "+sqlEx); return -1; } return classifyRef; } public boolean writeSQL(GS3SQLConnection connection) { GS3SQLAction action; GS3SQLSelect select; GS3SQLInsert insert; Iterator keys; String prefix = this.name; if (prefix==null || prefix.equals("")) { prefix = "CLAZ"+this.fields.get(0).toString(); // TODO: write this name back to collectionConfig.xml this.name = prefix; } int leafCount = 0; keys = this.alphaMap.keySet().iterator(); while (keys.hasNext()) { Object key = keys.next(); List childDocs = (List) this.alphaMap.getAll(key); if (childDocs != null) { leafCount += childDocs.size(); } } int parentClassify = this.writeSQLClassifyNode(connection, "", 0, prefix, "", "Classifier", leafCount); if (parentClassify < 0) { return false; } try { Statement statement = connection.createStatement(); List children; // TODO: cope with change rather than create from scratch... int nodeOrder = 1; keys = this.alphaMap.keySet().iterator(); while (keys.hasNext()) { Object key = keys.next(); List childDocs = (List) this.alphaMap.getAll(key); if (childDocs != null) { String className = prefix + "." + key.toString(); int classifyRef = this.writeSQLClassifyNode(connection, prefix, nodeOrder, className, key.toString(), key.toString(), childDocs.size()); // note the child documents... Iterator iterator = childDocs.iterator(); int childOrder = 1; //St while (iterator.hasNext()) { AZDocumentItem documentItem = (AZDocumentItem) iterator.next(); DocumentID docId = documentItem.getID(); insert = new GS3SQLInsert("classdocuments"); insert.addValue("ClassifyRef", Integer.toString(classifyRef), GS3SQLField.INTEGER_TYPE); insert.addValue("DocID", docId.toString()); insert.addValue("DocOrder", Integer.toString(childOrder), GS3SQLField.INTEGER_TYPE); statement.execute(insert.toString()); childOrder ++; } nodeOrder ++; } } /* else { // TODO: clear 'dead' child classifications // delete child documents GS3SQLDelete delete = new GS3SQLDelete("classdocuments"); delete.setWhere(where); statement.execute(delete.toString()); } // post the child nodes... Iterator iterator = this.childNodes.iterator(); while (iterator.hasNext()) { AbstractHierarchyNode childNode = (AbstractHierarchyNode) iterator.next(); if (!childNode.writeSQL(connection)) { return false; } } */ statement.close(); } catch (SQLException e) { System.err.println("AZListClassifier.writeSQL(): "+e); return false; } return true; } public boolean addClassifierDescription(Element classifier_list) { // TODO check that there are some docs in the classification Element classifier = classifier_list.getOwnerDocument().createElement(GSXML.CLASSIFIER_ELEM); classifier.setAttribute(GSXML.NAME_ATT, this.name); classifier_list.appendChild(classifier); return true; } }