source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/classifier/GS2HierarchyClassifier.java@ 6288

Last change on this file since 6288 was 6288, checked in by cs025, 20 years ago

Various changes

  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1package org.greenstone.gsdl3.gs3build.classifier;
2
3import java.io.*;
4
5import java.util.List;
6import java.util.ArrayList;
7import java.util.Iterator;
8
9import org.xml.sax.XMLReader;
10import org.xml.sax.InputSource;
11import org.xml.sax.SAXException;
12import org.xml.sax.Attributes;
13import org.xml.sax.helpers.XMLReaderFactory;
14import org.xml.sax.helpers.DefaultHandler;
15
16import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
17import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
19import org.greenstone.gsdl3.gs3build.util.XMLTools;
20import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
21import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection;
22
23public class GS2HierarchyClassifier implements ClassifierInterface
24{
25 class GS2HierarchyClassifierObserver implements ClassifierObserverInterface
26 { List classifications;
27 DocumentInterface document;
28
29 public GS2HierarchyClassifierObserver(DocumentInterface document)
30 { this.document = document;
31 }
32
33 public void recordClassification(String label)
34 { this.document.addDocumentMetadata("gsdl3", "classified", label);
35 System.out.println("Assigned document " + this.document.getID().toString() + " to " + label);
36 }
37
38 public List getClassifications()
39 { return this.classifications;
40 }
41 }
42
43 class GS2HierarchyNode extends AbstractHierarchyNode
44 { /**
45 * Simple node
46 */
47 public GS2HierarchyNode()
48 { super();
49 }
50
51 public GS2HierarchyNode(String prefix, String name, String id, String descriptor)
52 { super(prefix, name, id, descriptor);
53 }
54
55 /**
56 * Take a document, and find the classifications that it matches against in
57 * the current hierarchy.
58 *
59 * @param <code>DocumentID</code> the id of the document being classified
60 * @param <code>List</code> the values against which the classifier should
61 * test for the document being a match - i.e. the pertinent document
62 * property values.
63 * @param <code>ClassifierObserverInterface</code>
64 * object modifies the document with information about the
65 * classifications that it fell within.
66 */
67 public void getClassifications(DocumentID documentID, List values,
68 ClassifierObserverInterface observer)
69 { // Go through the list of values, and if this node matches, record
70 // the fact in the classifier observer...
71 Iterator valueList = values.iterator();
72
73 while (valueList.hasNext())
74 { if (this.isMatch(valueList.next().toString()))
75 { observer.recordClassification(this.name);
76 this.addDocument(documentID);
77 }
78 }
79
80 // recurse into the child nodes for them to do the same...
81 Iterator childList = this.childNodes.iterator();
82
83 while (childList.hasNext())
84 { ((GS2HierarchyNode) childList.next()).getClassifications(documentID, values, observer);
85 }
86 }
87 }
88
89 /**
90 * An inner class to handle Classifier files
91 */
92 class GS2HierarchyHandler extends GS2TextFileHandler
93 { GS2HierarchyNode rootNode = null;
94
95 GS2HierarchyHandler(String hierarchyName, String content)
96 { super(content);
97
98 this.rootNode = new GS2HierarchyNode();
99 String prefix = "CL"+hierarchyName;
100
101 while (this.hasMoreLines()) {
102 String description;
103 String name, id;
104
105 this.getLine();
106
107 name = this.getEntry(true);
108 if (name == null || name.length() == 0) {
109 continue;
110 }
111
112 id = this.getEntry(true);
113 if (id == null || id.length() == 0) {
114 continue;
115 }
116
117 description = this.getEntry(true);
118 if (description == null || description.length() == 0) {
119 continue;
120 }
121
122 // Initialise the tree
123 GS2HierarchyNode node = new GS2HierarchyNode(prefix, name, id, description);
124
125 this.rootNode.add(node);
126 }
127 }
128
129 public GS2HierarchyNode getHierarchy()
130 { return this.rootNode;
131 }
132 }
133
134 private GS2HierarchyNode hierarchy;
135 List fields;
136 String sortBy;
137 GS3SQLConnection database;
138
/**
 * A 'do nothing' constructor: the classifier starts with an empty field
 * list and no database, and takes the rest of its information from
 * configure.
 */
public GS2HierarchyClassifier()
{
  this.database = null;
  this.fields = new ArrayList();
}
144
145 public GS2HierarchyClassifier(File file)
146 {
147 String documentText =
148 DocumentLoader.getAsString(file);
149
150 if (documentText == null) {
151 System.err.println("GS2HierarchyClassifier: Unable to load any content for " + file);
152 return;
153 }
154
155 GS2HierarchyHandler handler = new GS2HierarchyHandler("CL", documentText);
156
157 this.hierarchy = handler.getHierarchy();
158 this.fields = new ArrayList();
159 System.out.println("loaded hierarchy for "+ file.toString());
160 this.database = null;
161 }
162
163 public void setDatabase(GS3SQLConnection database)
164 { this.database = database;
165 }
166
167 public void configure(List parameters)
168 {
169 File file = null;
170
171 Iterator iterator = parameters.iterator();
172 while (iterator.hasNext()) {
173 String param = iterator.next().toString();
174
175 if (param.equals("-metadata")) {
176 if (iterator.hasNext()) {
177 String field = iterator.next().toString();
178 if (field != null && field.length() > 0) {
179 this.fields.add(field);
180 }
181 }
182 }
183 else if (param.equals("-hfile")) {
184 // this parameter had better have been expanded previously to a full path,
185 // as we're not going to do that here!!!
186 if (iterator.hasNext()) {
187 file = new File(iterator.next().toString());
188 }
189 }
190 else if (param.equals("-sort")) {
191 if (iterator.hasNext()) {
192 String sort = iterator.next().toString();
193 if (sort != null && sort.length() > 0) {
194 this.sortBy = sort;
195 }
196 }
197 }
198 }
199
200 if (file == null) {
201 return;
202 }
203
204 String documentText =
205 DocumentLoader.getAsString(file);
206
207 if (documentText == null) {
208 System.err.println("GS2HierarchyClassifier: Unable to load any content for " + file);
209 return;
210 }
211
212 GS2HierarchyHandler handler = new GS2HierarchyHandler(this.fields.get(0).toString(), documentText);
213
214 this.hierarchy = handler.getHierarchy();
215 }
216
217 /**
218 * Classify a particular document - the document id will be used to record its
219 * membership of any classifier nodes, and the document interface will also be
220 * used to assign back knowledge of any classifiers to which the document is
221 * assigned.
222 *
223 * @param <code>DocumentID</code> the document identifier
224 * @param <code>DocumentInterface</code> the document interface
225 */
226 public boolean classifyDocument(DocumentID documentID, DocumentInterface document)
227 { if (this.hierarchy == null) {
228 return true;
229 }
230
231 // the observer records the assignment of documents to classifications
232 GS2HierarchyClassifierObserver classifyObserver = new GS2HierarchyClassifierObserver(document);
233
234 // get the metadata item from the document
235 Iterator thisField = this.fields.iterator();
236
237 // for every field listed as being a scannable field...
238 while (thisField.hasNext())
239 { String fieldName = thisField.next().toString();
240
241 // ...get the values for that field...
242 List values = document.getDocumentMetadataItem(fieldName);
243
244 if (values == null)
245 { continue;
246 }
247
248 System.out.println("Matching " + values.get(0).toString());
249
250 if (documentID == null) {
251 System.out.println("Bad documentID");
252 continue;
253 }
254
255 // ...and send them to the classifier
256 this.hierarchy.getClassifications(documentID, values, classifyObserver);
257 }
258 return true;
259 }
260
261 public int getClassifierPasses()
262 { return 1;
263 }
264
265 public void startClassifierPass(int pass)
266 {
267 }
268
269 public void endClassifierPass(int pass)
270 {
271 }
272
273 public void completeClassification()
274 { // TODO: store the classifications to file...
275
276 // The hierarchy member may be null if the classifier did not initialise correctly...
277 if (this.hierarchy != null) {
278 this.hierarchy.writeSQL(this.database);
279 }
280 }
281}
Note: See TracBrowser for help on using the repository browser.