source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/classifier/HierarchyClassifier.java@ 7303

Last change on this file since 7303 was 7268, checked in by cs025, 20 years ago

Updated Hierarchy classifiers; small other changes

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.classifier;
2
3import java.io.*;
4
5import java.net.URL;
6
7import java.util.List;
8import java.util.ArrayList;
9import java.util.Iterator;
10
11import org.apache.xerces.parsers.SAXParser;
12import org.xml.sax.XMLReader;
13import org.xml.sax.InputSource;
14import org.xml.sax.SAXException;
15import org.xml.sax.Attributes;
16import org.xml.sax.helpers.XMLReaderFactory;
17import org.xml.sax.helpers.DefaultHandler;
18
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
21import org.greenstone.gsdl3.gs3build.util.XMLTools;
22import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection;
23
24public class HierarchyClassifier implements ClassifierInterface
25{
26 class HierarchyClassifierObserver implements ClassifierObserverInterface
27 { List classifications;
28 DocumentInterface document;
29
30 public HierarchyClassifierObserver(DocumentInterface document)
31 { this.document = document;
32 }
33
34 public void recordClassification(String label)
35 { this.document.addDocumentMetadata("gsdl3", "classified", label);
36 }
37
38 public List getClassifications()
39 { return this.classifications;
40 }
41 }
42
43 class HierarchyNode extends AbstractHierarchyNode
44 { /**
45 * Simple node
46 */
47 public HierarchyNode()
48 { super();
49 }
50
51 public HierarchyNode(String prefix, String name, String id, String descriptor)
52 { super(prefix, name, id, descriptor);
53 }
54
55 public void getClassifications(DocumentID documentID, List values, String sortKey,
56 ClassifierObserverInterface observer)
57 { Iterator valueList = values.iterator();
58
59 while (valueList.hasNext())
60 { if (this.isMatch(valueList.next().toString()))
61 { observer.recordClassification(this.name);
62 this.addDocument(documentID, sortKey);
63 }
64 }
65
66 // recurse into the child nodes for them to do the same...
67 Iterator childList = this.childNodes.iterator();
68
69 while (childList.hasNext())
70 { ((HierarchyNode) childList.next()).getClassifications(documentID, values, sortKey, observer);
71 }
72 }
73 }
74
75 /**
76 * An inner class to handle Classifier files
77 */
78 class HierarchyHandler extends DefaultHandler
79 { StringBuffer description;
80 StringBuffer name;
81 StringBuffer path;
82 StringBuffer match;
83 String prefix;
84 boolean inElement;
85 AbstractHierarchyNode rootNode = null;
86 AbstractHierarchyNode currentNode = null;
87
88 HierarchyHandler(String classifierName)
89 { super();
90
91 this.name = null;
92 this.description = null;
93 this.path = null;
94 this.match = null;
95 this.prefix = classifierName;
96 }
97
98 public void startElement(String URI, String localName, String qName, Attributes attributes)
99 { if (localName.equals("Hierarchy"))
100 { // create a new node in the hierarchy
101 HierarchyNode node = new HierarchyNode(prefix, "", "", "Classifier");
102
103 this.rootNode = node;
104 this.currentNode = node;
105 }
106 else if (localName.equals("Classification"))
107 { // create a new node in the hierarchy
108 HierarchyNode node = new HierarchyNode(this.prefix, "", "", "");
109
110 // Initialise the tree
111 if (this.currentNode == null)
112 { this.rootNode = node;
113 }
114 // Add the node
115 else
116 { this.currentNode.addChild(node);
117 }
118 this.currentNode = node;
119 }
120 else if (localName.equals("Name"))
121 { this.name = new StringBuffer();
122 }
123 else if (localName.equals("Path"))
124 { this.path = new StringBuffer();
125 }
126 else if (localName.equals("Description"))
127 { this.description = new StringBuffer();
128 }
129 else if (localName.equals("Matches"))
130 { this.match = new StringBuffer();
131 }
132 else if (localName.equals("Children"))
133 {
134 }
135 }
136
137 public void endElement(String URI, String localName, String qName)
138 { if (localName.equals("Classification"))
139 { if (this.currentNode.getParent() == null) {
140 // TODO: Error state
141 // this.rootNodes.add(this.currentNode);
142 }
143 this.currentNode = this.currentNode.getParent();
144 }
145 else if (localName.equals("Name"))
146 { this.currentNode.setName(XMLTools.cleanString(this.name.toString()));
147 System.out.println("Name is " + this.name);
148 this.name = null;
149 }
150 else if (localName.equals("Path"))
151 { this.currentNode.setID(XMLTools.cleanString(this.path.toString()));
152 this.path = null;
153 }
154 else if (localName.equals("Description"))
155 { this.currentNode.setDescriptor(XMLTools.cleanString(this.description.toString()));
156 this.description = null;
157 }
158 else if (localName.equals("Matches"))
159 { this.currentNode.addMatch(XMLTools.cleanString(this.match.toString()));
160 this.match = null;
161 }
162 else if (localName.equals("Children"))
163 {
164 }
165 }
166
167 public void characters(char c[], int start, int length)
168 { String string = new String(c, start, length);
169 if (this.path != null)
170 { this.path.append(string);
171 }
172 else if (this.name != null)
173 { this.name.append(string);
174 }
175 else if (this.description != null)
176 { this.description.append(string);
177 }
178 else if (this.match != null)
179 { this.match.append(string);
180 }
181 }
182
183 public AbstractHierarchyNode getHierarchy()
184 { return this.rootNode;
185 }
186 }
187
188 private GS3SQLConnection database;
189 private AbstractHierarchyNode hierarchy;
190 private List fields;
191 private String sortBy;
192 private String name;
193
194 public HierarchyClassifier(String name, URL basefile, List fields, String sortBy)
195 {
196 try
197 { SAXParser parser = new SAXParser();
198 HierarchyHandler handler = new HierarchyHandler(name);
199 /*
200 XMLReader reader = XMLReaderFactory.createXMLReader();
201 reader.setContentHandler(handler);
202 reader.setErrorHandler(handler);*/
203 parser.setContentHandler(handler);
204
205 // A Classifier consists of one file only - get it.
206 /*
207 FileReader fileReader = new FileReader(basefile);
208 reader.parse(new InputSource(fileReader));
209 */
210 parser.parse(basefile.toString());
211
212 this.hierarchy = handler.getHierarchy();
213 this.fields = fields;
214 this.sortBy = sortBy;
215
216 System.out.println("Counting " + this.fields.size() + " fields");
217 }
218 catch (SAXException saxException)
219 { // TODO: log error
220 System.out.println("unable to process hierarchy " + saxException.toString());
221 }
222 catch (java.io.FileNotFoundException fileException)
223 {
224 System.out.println("unable to process hierarchy " + fileException.toString());
225 }
226 catch (java.io.IOException ioException)
227 {
228 System.out.println("unable to process hierarchy " + ioException.toString());
229 }
230 }
231
232 public void configure(List parameters)
233 {
234 Iterator iterator = parameters.iterator();
235 while (iterator.hasNext()) {
236 String param = iterator.next().toString();
237 if (param.equals("-metadata")) {
238 if (iterator.hasNext()) {
239 String field = iterator.next().toString();
240 if (field != null && field.length() > 0) {
241 this.fields.add(field);
242 }
243 }
244 }
245 }
246 }
247
248 /**
249 * Indicate the database to be used for saving the hierarchy to.
250 *
251 * @param <code>GS3SQLConnection</code> the database
252 */
253 public void setDatabase(GS3SQLConnection database)
254 { this.database = database;
255 }
256
257 /**
258 * Classify a particular document - the document id will be used to record its
259 * membership of any classifier nodes, and the document interface will also be
260 * used to assign back knowledge of any classifiers to which the document is
261 * assigned.
262 *
263 * @param <code>DocumentID</code> the document identifier
264 * @param <code>DocumentInterface</code> the document interface
265 *
266 * @return <code>boolean</code> a return value - currently always
267 * <code>true</code> for success
268 */
269 public boolean classifyDocument(DocumentID documentID, DocumentInterface document)
270 { // the observer records the assignment of documents to classifications
271 HierarchyClassifierObserver classifyObserver = new HierarchyClassifierObserver(document);
272
273 // if we have no fields, there is nothing to do...
274 if (this.fields == null)
275 { return true;
276 }
277
278 // get the metadata item from the document
279 Iterator thisField = this.fields.iterator();
280
281 // for every field listed as being a scannable field...
282 while (thisField.hasNext())
283 { String fieldName = thisField.next().toString();
284
285 // ...get the values for that field...
286 List values = document.getDocumentMetadataItem(fieldName);
287
288 if (values == null)
289 { continue;
290 }
291
292 if (documentID == null) {
293 System.out.println("Bad documentID");
294 continue;
295 }
296
297 // get the sort key for the metadata item if possible
298 String sortKey = null;
299
300 if (this.sortBy != null) {
301 List sortKeys = document.getDocumentMetadataItem(this.sortBy);
302 if (sortKeys != null && sortKeys.size() > 0) {
303 sortKey = sortKeys.get(0).toString();
304 }
305 }
306
307 // ...and send them to the classifier
308 this.hierarchy.getClassifications(documentID, values, sortKey, classifyObserver);
309 }
310 return true;
311 }
312
313 /**
314 * @see ClassifierInterface:getClassifierPasses
315 */
316 public int getClassifierPasses()
317 { return 1;
318 }
319
320 public void startClassifierPass(int pass)
321 {
322 }
323
324 public void endClassifierPass(int pass)
325 {
326 }
327
328 public void completeClassification()
329 { // TODO: store the classifications to file...
330 if (this.hierarchy != null) {
331 this.hierarchy.writeSQL(this.database);
332 }
333 }
334}
Note: See TracBrowser for help on using the repository browser.