source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/classifier/HierarchyClassifier.java@ 12188

Last change on this file since 12188 was 12188, checked in by kjdon, 18 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 9.9 KB
Line 
1package org.greenstone.gsdl3.gs3build.classifier;
2
3import java.io.*;
4
5import java.net.URL;
6
7import java.util.List;
8import java.util.ArrayList;
9import java.util.Iterator;
10
11import org.apache.xerces.parsers.SAXParser;
12import org.xml.sax.XMLReader;
13import org.xml.sax.InputSource;
14import org.xml.sax.SAXException;
15import org.xml.sax.Attributes;
16import org.xml.sax.helpers.XMLReaderFactory;
17import org.xml.sax.helpers.DefaultHandler;
18import org.w3c.dom.Element;
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
21import org.greenstone.gsdl3.gs3build.util.XMLTools;
22import org.greenstone.gsdl3.gs3build.database.GS3SQLConnection;
23import org.greenstone.gsdl3.util.GSXML;
24
25
26public class HierarchyClassifier extends AbstractClassifier
27{
28 class HierarchyClassifierObserver implements ClassifierObserverInterface
29 { List classifications;
30 DocumentInterface document;
31
32 public HierarchyClassifierObserver(DocumentInterface document)
33 { this.document = document;
34 }
35
36 public void recordClassification(String label)
37 { this.document.addDocumentMetadata("gsdl3", "classified", label);
38 }
39
40 public List getClassifications()
41 { return this.classifications;
42 }
43 }
44
45 class HierarchyNode extends AbstractHierarchyNode
46 { /**
47 * Simple node
48 */
49 public HierarchyNode()
50 { super();
51 }
52
53 public HierarchyNode(String prefix, String name, String id, String descriptor)
54 { super(prefix, name, id, descriptor);
55 }
56
57 public void getClassifications(DocumentID documentID, List values, String sortKey,
58 ClassifierObserverInterface observer)
59 { Iterator valueList = values.iterator();
60
61 while (valueList.hasNext())
62 { if (this.isMatch(valueList.next().toString()))
63 { observer.recordClassification(this.name);
64 this.addDocument(documentID, sortKey);
65 }
66 }
67
68 // recurse into the child nodes for them to do the same...
69 Iterator childList = this.childNodes.iterator();
70
71 while (childList.hasNext())
72 { ((HierarchyNode) childList.next()).getClassifications(documentID, values, sortKey, observer);
73 }
74 }
75 }
76
77 /**
78 * An inner class to handle Classifier files
79 */
80 class HierarchyHandler extends DefaultHandler
81 { StringBuffer description;
82 StringBuffer name;
83 StringBuffer path;
84 StringBuffer match;
85 String prefix;
86 boolean inElement;
87 AbstractHierarchyNode rootNode = null;
88 AbstractHierarchyNode currentNode = null;
89
90 HierarchyHandler(String classifierName)
91 { super();
92
93 this.name = null;
94 this.description = null;
95 this.path = null;
96 this.match = null;
97 this.prefix = classifierName;
98 }
99
100 public void startElement(String URI, String localName, String qName, Attributes attributes)
101 { if (localName.equals("Hierarchy"))
102 { // create a new node in the hierarchy
103 HierarchyNode node = new HierarchyNode(prefix, "", "", "Classifier");
104
105 this.rootNode = node;
106 this.currentNode = node;
107 }
108 else if (localName.equals("Classification"))
109 { // create a new node in the hierarchy
110 HierarchyNode node = new HierarchyNode(this.prefix, "", "", "");
111
112 // Initialise the tree
113 if (this.currentNode == null)
114 { this.rootNode = node;
115 }
116 // Add the node
117 else
118 { this.currentNode.addChild(node);
119 }
120 this.currentNode = node;
121 }
122 else if (localName.equals("Name"))
123 { this.name = new StringBuffer();
124 }
125 else if (localName.equals("Path"))
126 { this.path = new StringBuffer();
127 }
128 else if (localName.equals("Description"))
129 { this.description = new StringBuffer();
130 }
131 else if (localName.equals("Matches"))
132 { this.match = new StringBuffer();
133 }
134 else if (localName.equals("Children"))
135 {
136 }
137 }
138
139 public void endElement(String URI, String localName, String qName)
140 { if (localName.equals("Classification"))
141 { if (this.currentNode.getParent() == null) {
142 // TODO: Error state
143 // this.rootNodes.add(this.currentNode);
144 }
145 this.currentNode = this.currentNode.getParent();
146 }
147 else if (localName.equals("Name"))
148 { this.currentNode.setName(XMLTools.cleanString(this.name.toString()));
149 System.out.println("Name is " + this.name);
150 this.name = null;
151 }
152 else if (localName.equals("Path"))
153 { this.currentNode.setID(XMLTools.cleanString(this.path.toString()));
154 this.path = null;
155 }
156 else if (localName.equals("Description"))
157 { this.currentNode.setDescriptor(XMLTools.cleanString(this.description.toString()));
158 this.description = null;
159 }
160 else if (localName.equals("Matches"))
161 { this.currentNode.addMatch(XMLTools.cleanString(this.match.toString()));
162 this.match = null;
163 }
164 else if (localName.equals("Children"))
165 {
166 }
167 }
168
169 public void characters(char c[], int start, int length)
170 { String string = new String(c, start, length);
171 if (this.path != null)
172 { this.path.append(string);
173 }
174 else if (this.name != null)
175 { this.name.append(string);
176 }
177 else if (this.description != null)
178 { this.description.append(string);
179 }
180 else if (this.match != null)
181 { this.match.append(string);
182 }
183 }
184
185 public AbstractHierarchyNode getHierarchy()
186 { return this.rootNode;
187 }
188 }
189
190 private GS3SQLConnection database;
191 private AbstractHierarchyNode hierarchy;
192 private List fields;
193 private String sortBy;
194 private String name;
195
196 public HierarchyClassifier(String name, URL basefile, List fields, String sortBy)
197 {
198 try
199 { SAXParser parser = new SAXParser();
200 HierarchyHandler handler = new HierarchyHandler(name);
201 /*
202 XMLReader reader = XMLReaderFactory.createXMLReader();
203 reader.setContentHandler(handler);
204 reader.setErrorHandler(handler);*/
205 parser.setContentHandler(handler);
206
207 // A Classifier consists of one file only - get it.
208 /*
209 FileReader fileReader = new FileReader(basefile);
210 reader.parse(new InputSource(fileReader));
211 */
212 parser.parse(basefile.toString());
213
214 this.hierarchy = handler.getHierarchy();
215 this.fields = fields;
216 this.sortBy = sortBy;
217 this.name = name;
218 System.out.println("Counting " + this.fields.size() + " fields");
219 }
220 catch (SAXException saxException)
221 { // TODO: log error
222 System.out.println("unable to process hierarchy " + saxException.toString());
223 }
224 catch (java.io.FileNotFoundException fileException)
225 {
226 System.out.println("unable to process hierarchy " + fileException.toString());
227 }
228 catch (java.io.IOException ioException)
229 {
230 System.out.println("unable to process hierarchy " + ioException.toString());
231 }
232 }
233
234 public void configure(List parameters)
235 {
236 Iterator iterator = parameters.iterator();
237 while (iterator.hasNext()) {
238 String param = iterator.next().toString();
239 if (param.equals("-metadata")) {
240 if (iterator.hasNext()) {
241 String field = iterator.next().toString();
242 if (field != null && field.length() > 0) {
243 this.fields.add(field);
244 }
245 }
246 }
247 }
248 }
249
250 /**
251 * Indicate the database to be used for saving the hierarchy to.
252 *
253 * @param <code>GS3SQLConnection</code> the database
254 */
255 public void setDatabase(GS3SQLConnection database)
256 { this.database = database;
257 }
258
259 /**
260 * Classify a particular document - the document id will be used to record its
261 * membership of any classifier nodes, and the document interface will also be
262 * used to assign back knowledge of any classifiers to which the document is
263 * assigned.
264 *
265 * @param <code>DocumentID</code> the document identifier
266 * @param <code>DocumentInterface</code> the document interface
267 *
268 * @return <code>boolean</code> a return value - currently always
269 * <code>true</code> for success
270 */
271 public boolean classifyDocument(DocumentID documentID, DocumentInterface document)
272 { // the observer records the assignment of documents to classifications
273 HierarchyClassifierObserver classifyObserver = new HierarchyClassifierObserver(document);
274
275 // if we have no fields, there is nothing to do...
276 if (this.fields == null)
277 { return true;
278 }
279
280 // get the metadata item from the document
281 Iterator thisField = this.fields.iterator();
282
283 // for every field listed as being a scannable field...
284 while (thisField.hasNext())
285 { String fieldName = thisField.next().toString();
286
287 // ...get the values for that field...
288 List values = document.getDocumentMetadataItem(fieldName);
289
290 if (values == null)
291 { continue;
292 }
293
294 if (documentID == null) {
295 System.out.println("Bad documentID");
296 continue;
297 }
298
299 // get the sort key for the metadata item if possible
300 String sortKey = null;
301
302 if (this.sortBy != null) {
303 List sortKeys = document.getDocumentMetadataItem(this.sortBy);
304 if (sortKeys != null && sortKeys.size() > 0) {
305 sortKey = sortKeys.get(0).toString();
306 }
307 }
308
309 // ...and send them to the classifier
310 this.hierarchy.getClassifications(documentID, values, sortKey, classifyObserver);
311 }
312 return true;
313 }
314
315 /**
316 * @see ClassifierInterface:getClassifierPasses
317 */
318 public int getClassifierPasses()
319 { return 1;
320 }
321
322 public void startClassifierPass(int pass)
323 {
324 }
325
326 public void endClassifierPass(int pass)
327 {
328 }
329
330 public void completeClassification()
331 { // TODO: store the classifications to file...
332 if (this.hierarchy != null) {
333 this.hierarchy.writeSQL(this.database);
334 }
335 }
336
337 public boolean addClassifierDescription(Element classifier_list) {
338 // TODO check that there are some docs in the classification
339 Element classifier = classifier_list.getOwnerDocument().createElement(GSXML.CLASSIFIER_ELEM);
340 classifier.setAttribute(GSXML.NAME_ATT, this.name);
341 classifier_list.appendChild(classifier);
342 return true;
343 }
344}
Note: See TracBrowser for help on using the repository browser.