source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/classifier/GS2HierarchyClassifier.java@ 6104

Last change on this file since 6104 was 6104, checked in by cs025, 20 years ago

Major changes & improvements to implement the GS2 classifiers

  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.classifier;
2
3import java.io.*;
4
5import java.util.List;
6import java.util.ArrayList;
7import java.util.Iterator;
8
9import org.xml.sax.XMLReader;
10import org.xml.sax.InputSource;
11import org.xml.sax.SAXException;
12import org.xml.sax.Attributes;
13import org.xml.sax.helpers.XMLReaderFactory;
14import org.xml.sax.helpers.DefaultHandler;
15
16import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
17import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
19import org.greenstone.gsdl3.gs3build.util.XMLTools;
20import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
21import org.greenstone.gsdl3.gs3build.util.GS3SQLConnection;
22
23public class GS2HierarchyClassifier implements ClassifierInterface
24{
25 class GS2HierarchyClassifierObserver implements ClassifierObserverInterface
26 { List classifications;
27 DocumentInterface document;
28
29 public GS2HierarchyClassifierObserver(DocumentInterface document)
30 { this.document = document;
31 }
32
33 public void recordClassification(String label)
34 { this.document.addDocumentMetadata("gsdl3", "classified", label);
35 System.out.println("Assigned document " + this.document.getID().toString() + " to " + label);
36 }
37
38 public List getClassifications()
39 { return this.classifications;
40 }
41 }
42
43 class GS2HierarchyNode extends AbstractHierarchyNode
44 { /**
45 * Simple node
46 */
47 public GS2HierarchyNode()
48 { super();
49 }
50
51 public GS2HierarchyNode(String name, String id, String descriptor)
52 { super(name, id, descriptor);
53 }
54
55 /**
56 * Take a document, and find the classifications that it matches against in
57 * the current hierarchy.
58 *
59 * @param <code>DocumentID</code> the id of the document being classified
60 * @param <code>List</code> the values against which the classifier should
61 * test for the document being a match - i.e. the pertinent document
62 * property values.
63 * @param <code>ClassifierObserverInterface</code>
64 * object modifies the document with information about the
65 * classifications that it fell within.
66 */
67 public void getClassifications(DocumentID documentID, List values,
68 ClassifierObserverInterface observer)
69 { // Go through the list of values, and if this node matches, record
70 // the fact in the classifier observer...
71 Iterator valueList = values.iterator();
72
73 while (valueList.hasNext())
74 { if (this.isMatch(valueList.next().toString()))
75 { observer.recordClassification(this.name);
76 this.addDocument(documentID);
77 }
78 }
79
80 // recurse into the child nodes for them to do the same...
81 Iterator childList = this.childNodes.iterator();
82
83 while (childList.hasNext())
84 { ((GS2HierarchyNode) childList.next()).getClassifications(documentID, values, observer);
85 }
86 }
87 }
88
89 /**
90 * An inner class to handle Classifier files
91 */
92 class GS2HierarchyHandler extends GS2TextFileHandler
93 { GS2HierarchyNode rootNode = null;
94
95 GS2HierarchyHandler(String hierarchyName, String content)
96 { super(content);
97
98 this.rootNode = new GS2HierarchyNode();
99
100 while (this.hasMoreLines()) {
101 String description;
102 String name, id;
103
104 this.getLine();
105
106 name = this.getEntry(true);
107 if (name == null || name.length() == 0) {
108 continue;
109 }
110
111 id = this.getEntry(true);
112 if (id == null || id.length() == 0) {
113 continue;
114 }
115
116 description = this.getEntry(true);
117 if (description == null || description.length() == 0) {
118 continue;
119 }
120
121 // Initialise the tree
122 GS2HierarchyNode node = new GS2HierarchyNode(name, "CL"+hierarchyName+"."+id, description);
123
124 this.rootNode.add(node);
125 }
126 }
127
128 public GS2HierarchyNode getHierarchy()
129 { return this.rootNode;
130 }
131 }
132
133 private GS2HierarchyNode hierarchy;
134 List fields;
135 String sortBy;
136 GS3SQLConnection database;
137
/**
 * Create an unconfigured classifier. No hierarchy is loaded here; the
 * metadata fields and hierarchy file are supplied later through configure.
 */
public GS2HierarchyClassifier() {
  this.database = null;
  this.fields = new ArrayList();
}
143
144 public GS2HierarchyClassifier(File file)
145 {
146 String documentText =
147 DocumentLoader.getAsString(file);
148
149 if (documentText == null) {
150 System.err.println("GS2HierarchyClassifier: Unable to load any content for " + file);
151 return;
152 }
153
154 GS2HierarchyHandler handler = new GS2HierarchyHandler("CL", documentText);
155
156 this.hierarchy = handler.getHierarchy();
157 this.fields = new ArrayList();
158 System.out.println("loaded hierarchy for "+ file.toString());
159 this.database = null;
160 }
161
162 public void setDatabase(GS3SQLConnection database)
163 { this.database = database;
164 }
165
166 public void configure(List parameters)
167 {
168 File file = null;
169
170 Iterator iterator = parameters.iterator();
171 while (iterator.hasNext()) {
172 String param = iterator.next().toString();
173
174 if (param.equals("-metadata")) {
175 if (iterator.hasNext()) {
176 String field = iterator.next().toString();
177 if (field != null && field.length() > 0) {
178 this.fields.add(field);
179 }
180 }
181 }
182 else if (param.equals("-hfile")) {
183 // this parameter had better have been expanded previously to a full path,
184 // as we're not going to do that here!!!
185 if (iterator.hasNext()) {
186 file = new File(iterator.next().toString());
187 }
188 }
189 else if (param.equals("-sort")) {
190 if (iterator.hasNext()) {
191 String sort = iterator.next().toString();
192 if (sort != null && sort.length() > 0) {
193 this.sortBy = sort;
194 }
195 }
196 }
197 }
198
199 if (file == null) {
200 return;
201 }
202
203 String documentText =
204 DocumentLoader.getAsString(file);
205
206 if (documentText == null) {
207 System.err.println("GS2HierarchyClassifier: Unable to load any content for " + file);
208 return;
209 }
210
211 GS2HierarchyHandler handler = new GS2HierarchyHandler(this.fields.get(0).toString(), documentText);
212
213 this.hierarchy = handler.getHierarchy();
214 }
215
216 /**
217 * Classify a particular document - the document id will be used to record its
218 * membership of any classifier nodes, and the document interface will also be
219 * used to assign back knowledge of any classifiers to which the document is
220 * assigned.
221 *
222 * @param <code>DocumentID</code> the document identifier
223 * @param <code>DocumentInterface</code> the document interface
224 */
225 public boolean classifyDocument(DocumentID documentID, DocumentInterface document)
226 { // the observer records the assignment of documents to classifications
227 GS2HierarchyClassifierObserver classifyObserver = new GS2HierarchyClassifierObserver(document);
228
229 // get the metadata item from the document
230 Iterator thisField = this.fields.iterator();
231
232 // for every field listed as being a scannable field...
233 while (thisField.hasNext())
234 { String fieldName = thisField.next().toString();
235
236 // ...get the values for that field...
237 List values = document.getDocumentMetadataItem(fieldName);
238
239 if (values == null)
240 { continue;
241 }
242
243 // ...and send them to the classifier
244 this.hierarchy.getClassifications(documentID, values, classifyObserver);
245 }
246 return true;
247 }
248
249 public int getClassifierPasses()
250 { return 1;
251 }
252
253 public void startClassifierPass(int pass)
254 {
255 }
256
257 public void endClassifierPass(int pass)
258 {
259 }
260
261 public void completeClassification()
262 { // TODO: store the classifications to file...
263
264 // The hierarchy member may be null if the classifier did not initialise correctly...
265 if (this.hierarchy != null) {
266 this.hierarchy.writeSQL(this.database);
267 }
268 }
269}
Note: See TracBrowser for help on using the repository browser.