1 | package org.greenstone.gsdl3.gs3build.classifier;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 |
|
---|
5 | import java.net.URL;
|
---|
6 |
|
---|
7 | import java.util.List;
|
---|
8 | import java.util.ArrayList;
|
---|
9 | import java.util.Iterator;
|
---|
10 |
|
---|
11 | import org.apache.xerces.parsers.SAXParser;
|
---|
12 | import org.xml.sax.XMLReader;
|
---|
13 | import org.xml.sax.InputSource;
|
---|
14 | import org.xml.sax.SAXException;
|
---|
15 | import org.xml.sax.Attributes;
|
---|
16 | import org.xml.sax.helpers.XMLReaderFactory;
|
---|
17 | import org.xml.sax.helpers.DefaultHandler;
|
---|
18 | import org.w3c.dom.Element;
|
---|
19 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
|
---|
20 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
|
---|
21 | import org.greenstone.gsdl3.gs3build.util.XMLTools;
|
---|
22 | import org.greenstone.gsdl3.gs3build.database.GS3SQLConnection;
|
---|
23 | import org.greenstone.gsdl3.util.GSXML;
|
---|
24 |
|
---|
25 |
|
---|
26 | public class HierarchyClassifier extends AbstractClassifier
|
---|
27 | {
|
---|
28 | class HierarchyClassifierObserver implements ClassifierObserverInterface
|
---|
29 | { List classifications;
|
---|
30 | DocumentInterface document;
|
---|
31 |
|
---|
32 | public HierarchyClassifierObserver(DocumentInterface document)
|
---|
33 | { this.document = document;
|
---|
34 | }
|
---|
35 |
|
---|
36 | public void recordClassification(String label)
|
---|
37 | { this.document.addDocumentMetadata("gsdl3", "classified", label);
|
---|
38 | }
|
---|
39 |
|
---|
40 | public List getClassifications()
|
---|
41 | { return this.classifications;
|
---|
42 | }
|
---|
43 | }
|
---|
44 |
|
---|
45 | class HierarchyNode extends AbstractHierarchyNode
|
---|
46 | { /**
|
---|
47 | * Simple node
|
---|
48 | */
|
---|
49 | public HierarchyNode()
|
---|
50 | { super();
|
---|
51 | }
|
---|
52 |
|
---|
53 | public HierarchyNode(String prefix, String name, String id, String descriptor)
|
---|
54 | { super(prefix, name, id, descriptor);
|
---|
55 | }
|
---|
56 |
|
---|
57 | public void getClassifications(DocumentID documentID, List values, String sortKey,
|
---|
58 | ClassifierObserverInterface observer)
|
---|
59 | { Iterator valueList = values.iterator();
|
---|
60 |
|
---|
61 | while (valueList.hasNext())
|
---|
62 | { if (this.isMatch(valueList.next().toString()))
|
---|
63 | { observer.recordClassification(this.name);
|
---|
64 | this.addDocument(documentID, sortKey);
|
---|
65 | }
|
---|
66 | }
|
---|
67 |
|
---|
68 | // recurse into the child nodes for them to do the same...
|
---|
69 | Iterator childList = this.childNodes.iterator();
|
---|
70 |
|
---|
71 | while (childList.hasNext())
|
---|
72 | { ((HierarchyNode) childList.next()).getClassifications(documentID, values, sortKey, observer);
|
---|
73 | }
|
---|
74 | }
|
---|
75 | }
|
---|
76 |
|
---|
77 | /**
|
---|
78 | * An inner class to handle Classifier files
|
---|
79 | */
|
---|
80 | class HierarchyHandler extends DefaultHandler
|
---|
81 | { StringBuffer description;
|
---|
82 | StringBuffer name;
|
---|
83 | StringBuffer path;
|
---|
84 | StringBuffer match;
|
---|
85 | String prefix;
|
---|
86 | boolean inElement;
|
---|
87 | AbstractHierarchyNode rootNode = null;
|
---|
88 | AbstractHierarchyNode currentNode = null;
|
---|
89 |
|
---|
90 | HierarchyHandler(String classifierName)
|
---|
91 | { super();
|
---|
92 |
|
---|
93 | this.name = null;
|
---|
94 | this.description = null;
|
---|
95 | this.path = null;
|
---|
96 | this.match = null;
|
---|
97 | this.prefix = classifierName;
|
---|
98 | }
|
---|
99 |
|
---|
100 | public void startElement(String URI, String localName, String qName, Attributes attributes)
|
---|
101 | { if (localName.equals("Hierarchy"))
|
---|
102 | { // create a new node in the hierarchy
|
---|
103 | HierarchyNode node = new HierarchyNode(prefix, "", "", "Classifier");
|
---|
104 |
|
---|
105 | this.rootNode = node;
|
---|
106 | this.currentNode = node;
|
---|
107 | }
|
---|
108 | else if (localName.equals("Classification"))
|
---|
109 | { // create a new node in the hierarchy
|
---|
110 | HierarchyNode node = new HierarchyNode(this.prefix, "", "", "");
|
---|
111 |
|
---|
112 | // Initialise the tree
|
---|
113 | if (this.currentNode == null)
|
---|
114 | { this.rootNode = node;
|
---|
115 | }
|
---|
116 | // Add the node
|
---|
117 | else
|
---|
118 | { this.currentNode.addChild(node);
|
---|
119 | }
|
---|
120 | this.currentNode = node;
|
---|
121 | }
|
---|
122 | else if (localName.equals("Name"))
|
---|
123 | { this.name = new StringBuffer();
|
---|
124 | }
|
---|
125 | else if (localName.equals("Path"))
|
---|
126 | { this.path = new StringBuffer();
|
---|
127 | }
|
---|
128 | else if (localName.equals("Description"))
|
---|
129 | { this.description = new StringBuffer();
|
---|
130 | }
|
---|
131 | else if (localName.equals("Matches"))
|
---|
132 | { this.match = new StringBuffer();
|
---|
133 | }
|
---|
134 | else if (localName.equals("Children"))
|
---|
135 | {
|
---|
136 | }
|
---|
137 | }
|
---|
138 |
|
---|
139 | public void endElement(String URI, String localName, String qName)
|
---|
140 | { if (localName.equals("Classification"))
|
---|
141 | { if (this.currentNode.getParent() == null) {
|
---|
142 | // TODO: Error state
|
---|
143 | // this.rootNodes.add(this.currentNode);
|
---|
144 | }
|
---|
145 | this.currentNode = this.currentNode.getParent();
|
---|
146 | }
|
---|
147 | else if (localName.equals("Name"))
|
---|
148 | { this.currentNode.setName(XMLTools.cleanString(this.name.toString()));
|
---|
149 | System.out.println("Name is " + this.name);
|
---|
150 | this.name = null;
|
---|
151 | }
|
---|
152 | else if (localName.equals("Path"))
|
---|
153 | { this.currentNode.setID(XMLTools.cleanString(this.path.toString()));
|
---|
154 | this.path = null;
|
---|
155 | }
|
---|
156 | else if (localName.equals("Description"))
|
---|
157 | { this.currentNode.setDescriptor(XMLTools.cleanString(this.description.toString()));
|
---|
158 | this.description = null;
|
---|
159 | }
|
---|
160 | else if (localName.equals("Matches"))
|
---|
161 | { this.currentNode.addMatch(XMLTools.cleanString(this.match.toString()));
|
---|
162 | this.match = null;
|
---|
163 | }
|
---|
164 | else if (localName.equals("Children"))
|
---|
165 | {
|
---|
166 | }
|
---|
167 | }
|
---|
168 |
|
---|
169 | public void characters(char c[], int start, int length)
|
---|
170 | { String string = new String(c, start, length);
|
---|
171 | if (this.path != null)
|
---|
172 | { this.path.append(string);
|
---|
173 | }
|
---|
174 | else if (this.name != null)
|
---|
175 | { this.name.append(string);
|
---|
176 | }
|
---|
177 | else if (this.description != null)
|
---|
178 | { this.description.append(string);
|
---|
179 | }
|
---|
180 | else if (this.match != null)
|
---|
181 | { this.match.append(string);
|
---|
182 | }
|
---|
183 | }
|
---|
184 |
|
---|
185 | public AbstractHierarchyNode getHierarchy()
|
---|
186 | { return this.rootNode;
|
---|
187 | }
|
---|
188 | }
|
---|
189 |
|
---|
190 | private GS3SQLConnection database;
|
---|
191 | private AbstractHierarchyNode hierarchy;
|
---|
192 | private List fields;
|
---|
193 | private String sortBy;
|
---|
194 | private String name;
|
---|
195 |
|
---|
196 | public HierarchyClassifier(String name, URL basefile, List fields, String sortBy)
|
---|
197 | {
|
---|
198 | try
|
---|
199 | { SAXParser parser = new SAXParser();
|
---|
200 | HierarchyHandler handler = new HierarchyHandler(name);
|
---|
201 | /*
|
---|
202 | XMLReader reader = XMLReaderFactory.createXMLReader();
|
---|
203 | reader.setContentHandler(handler);
|
---|
204 | reader.setErrorHandler(handler);*/
|
---|
205 | parser.setContentHandler(handler);
|
---|
206 |
|
---|
207 | // A Classifier consists of one file only - get it.
|
---|
208 | /*
|
---|
209 | FileReader fileReader = new FileReader(basefile);
|
---|
210 | reader.parse(new InputSource(fileReader));
|
---|
211 | */
|
---|
212 | parser.parse(basefile.toString());
|
---|
213 |
|
---|
214 | this.hierarchy = handler.getHierarchy();
|
---|
215 | this.fields = fields;
|
---|
216 | this.sortBy = sortBy;
|
---|
217 | this.name = name;
|
---|
218 | System.out.println("Counting " + this.fields.size() + " fields");
|
---|
219 | }
|
---|
220 | catch (SAXException saxException)
|
---|
221 | { // TODO: log error
|
---|
222 | System.out.println("unable to process hierarchy " + saxException.toString());
|
---|
223 | }
|
---|
224 | catch (java.io.FileNotFoundException fileException)
|
---|
225 | {
|
---|
226 | System.out.println("unable to process hierarchy " + fileException.toString());
|
---|
227 | }
|
---|
228 | catch (java.io.IOException ioException)
|
---|
229 | {
|
---|
230 | System.out.println("unable to process hierarchy " + ioException.toString());
|
---|
231 | }
|
---|
232 | }
|
---|
233 |
|
---|
234 | public void configure(List parameters)
|
---|
235 | {
|
---|
236 | Iterator iterator = parameters.iterator();
|
---|
237 | while (iterator.hasNext()) {
|
---|
238 | String param = iterator.next().toString();
|
---|
239 | if (param.equals("-metadata")) {
|
---|
240 | if (iterator.hasNext()) {
|
---|
241 | String field = iterator.next().toString();
|
---|
242 | if (field != null && field.length() > 0) {
|
---|
243 | this.fields.add(field);
|
---|
244 | }
|
---|
245 | }
|
---|
246 | }
|
---|
247 | }
|
---|
248 | }
|
---|
249 |
|
---|
250 | /**
|
---|
251 | * Indicate the database to be used for saving the hierarchy to.
|
---|
252 | *
|
---|
253 | * @param <code>GS3SQLConnection</code> the database
|
---|
254 | */
|
---|
255 | public void setDatabase(GS3SQLConnection database)
|
---|
256 | { this.database = database;
|
---|
257 | }
|
---|
258 |
|
---|
259 | /**
|
---|
260 | * Classify a particular document - the document id will be used to record its
|
---|
261 | * membership of any classifier nodes, and the document interface will also be
|
---|
262 | * used to assign back knowledge of any classifiers to which the document is
|
---|
263 | * assigned.
|
---|
264 | *
|
---|
265 | * @param <code>DocumentID</code> the document identifier
|
---|
266 | * @param <code>DocumentInterface</code> the document interface
|
---|
267 | *
|
---|
268 | * @return <code>boolean</code> a return value - currently always
|
---|
269 | * <code>true</code> for success
|
---|
270 | */
|
---|
271 | public boolean classifyDocument(DocumentID documentID, DocumentInterface document)
|
---|
272 | { // the observer records the assignment of documents to classifications
|
---|
273 | HierarchyClassifierObserver classifyObserver = new HierarchyClassifierObserver(document);
|
---|
274 |
|
---|
275 | // if we have no fields, there is nothing to do...
|
---|
276 | if (this.fields == null)
|
---|
277 | { return true;
|
---|
278 | }
|
---|
279 |
|
---|
280 | // get the metadata item from the document
|
---|
281 | Iterator thisField = this.fields.iterator();
|
---|
282 |
|
---|
283 | // for every field listed as being a scannable field...
|
---|
284 | while (thisField.hasNext())
|
---|
285 | { String fieldName = thisField.next().toString();
|
---|
286 |
|
---|
287 | // ...get the values for that field...
|
---|
288 | List values = document.getDocumentMetadataItem(fieldName);
|
---|
289 |
|
---|
290 | if (values == null)
|
---|
291 | { continue;
|
---|
292 | }
|
---|
293 |
|
---|
294 | if (documentID == null) {
|
---|
295 | System.out.println("Bad documentID");
|
---|
296 | continue;
|
---|
297 | }
|
---|
298 |
|
---|
299 | // get the sort key for the metadata item if possible
|
---|
300 | String sortKey = null;
|
---|
301 |
|
---|
302 | if (this.sortBy != null) {
|
---|
303 | List sortKeys = document.getDocumentMetadataItem(this.sortBy);
|
---|
304 | if (sortKeys != null && sortKeys.size() > 0) {
|
---|
305 | sortKey = sortKeys.get(0).toString();
|
---|
306 | }
|
---|
307 | }
|
---|
308 |
|
---|
309 | // ...and send them to the classifier
|
---|
310 | this.hierarchy.getClassifications(documentID, values, sortKey, classifyObserver);
|
---|
311 | }
|
---|
312 | return true;
|
---|
313 | }
|
---|
314 |
|
---|
315 | /**
|
---|
316 | * @see ClassifierInterface:getClassifierPasses
|
---|
317 | */
|
---|
318 | public int getClassifierPasses()
|
---|
319 | { return 1;
|
---|
320 | }
|
---|
321 |
|
---|
322 | public void startClassifierPass(int pass)
|
---|
323 | {
|
---|
324 | }
|
---|
325 |
|
---|
326 | public void endClassifierPass(int pass)
|
---|
327 | {
|
---|
328 | }
|
---|
329 |
|
---|
330 | public void completeClassification()
|
---|
331 | { // TODO: store the classifications to file...
|
---|
332 | if (this.hierarchy != null) {
|
---|
333 | this.hierarchy.writeSQL(this.database);
|
---|
334 | }
|
---|
335 | }
|
---|
336 |
|
---|
337 | public boolean addClassifierDescription(Element classifier_list) {
|
---|
338 | // TODO check that there are some docs in the classification
|
---|
339 | Element classifier = classifier_list.getOwnerDocument().createElement(GSXML.CLASSIFIER_ELEM);
|
---|
340 | classifier.setAttribute(GSXML.NAME_ATT, this.name);
|
---|
341 | classifier_list.appendChild(classifier);
|
---|
342 | return true;
|
---|
343 | }
|
---|
344 | }
|
---|