source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/extractor/MetaXMLExtractor.java@ 12188

Last change on this file since 12188 was 12188, checked in by kjdon, 18 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1package org.greenstone.gsdl3.gs3build.extractor;
2
3import java.io.FileReader;
4
5import java.net.URL;
6
7import java.util.List;
8import java.util.ArrayList;
9import java.util.Iterator;
10import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
11
12import org.apache.xerces.parsers.SAXParser;
13import org.xml.sax.XMLReader;
14import org.xml.sax.InputSource;
15import org.xml.sax.SAXException;
16import org.xml.sax.Attributes;
17import org.xml.sax.helpers.XMLReaderFactory;
18import org.xml.sax.helpers.DefaultHandler;
19
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
21import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
22import org.greenstone.gsdl3.gs3build.doctypes.MetadataDocument;
23import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
24import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
25
26import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
27
28public class MetaXMLExtractor implements ExtractorInterface
29{
30 /**
31 * An inner class to handle Metadata files
32 */
33 class MetadataHandler extends DefaultHandler
34 { List files;
35 String label;
36 StringBuffer value;
37 URL url;
38 boolean inElement;
39 boolean accumulate;
40 DocumentList documentList;
41 List documentIds;
42 List documents;
43
44 MetadataHandler(DocumentList documentList)
45 { super();
46
47 this.label = null;
48 this.value = null;
49 this.documentList = documentList;
50 }
51
52 public void startElement(String URI, String localName, String qName, Attributes attributes)
53 { if (localName.equals("FileName"))
54 { this.value = new StringBuffer();
55 }
56 else if (localName.equals("FileSet"))
57 { this.files = new ArrayList();
58 }
59 else if (localName.equals("Description"))
60 { this.documentIds = this.documentList.findDocumentIdsUsingFiles(this.files, this.url.toString());
61
62 if (documentIds != null && documentIds.size() > 0) {
63 this.documents = new ArrayList();
64
65 Iterator idIterator = documentIds.iterator();
66 while (idIterator.hasNext()) {
67 String docIdString = idIterator.next().toString();
68 DocumentID docId = new DocumentID(docIdString);
69 DocumentInterface document = documentList.getDocument(docId);
70 if (document != null) {
71 documents.add(document);
72 }
73 }
74 }
75 }
76 else if (localName.equals("Metadata"))
77 { this.label = attributes.getValue("name");
78 this.value = new StringBuffer();
79
80 String mode = attributes.getValue("mode");
81 this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE);
82 }
83 }
84
85 public void endElement(String URI, String localName, String qName)
86 { if (localName.equals("FileName"))
87 { String file = this.value.toString();
88 this.value = null;
89 this.files.add(file);
90 }
91 else if (localName.equals("FileSet"))
92 { // post the existing files item...
93 }
94 else if (localName.equals("Description"))
95 {
96 if (this.documents != null && documents.size() > 0) {
97 // write out the modified documents
98 // TODO: nicer/more generalised interface for this and related activity in
99 // extractor manager (actually, enricher manager);
100 Iterator docIterator = documents.iterator();
101 while (docIterator.hasNext()) {
102 DocumentInterface document = (DocumentInterface) docIterator.next();
103
104 // System.out.println("Writing modified document " + document.getID());
105 documentList.storeChangedDocument(document);
106 }
107 }
108 }
109 else if (localName.equals("Metadata"))
110 { MetaXMLExtractor.postMetadata(this.url, this.files,
111 this.label, this.value.toString(),
112 this.accumulate);
113 /*
114 if (documentIds != null) {
115 Iterator iterator = documentIds.iterator();
116 while (iterator.hasNext()) {
117 System.out.println("Matches file " + iterator.next().toString());
118 }
119 }
120 */
121
122 if (documentIds != null && documentIds.size() > 0) {
123 Iterator docIterator = this.documents.iterator();
124 while (docIterator.hasNext()) {
125 DocumentInterface document = (DocumentInterface) docIterator.next();
126
127 // Post to document
128 // TODO: tailor this to posting documents to *sections* as required...
129 if (accumulate) {
130 document.addDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
131 }
132 else {
133 document.setDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
134 }
135 }
136 }
137
138 // flatten the metadata items again...
139 this.value = null;
140 this.label = null;
141 }
142 }
143
144 public void characters(char c[], int start, int length)
145 { if (this.value != null)
146 { String string = new String(c, start, length);
147 this.value.append(string);
148 }
149 }
150
151 public void setUrl(URL url)
152 { this.url = url;
153 }
154 }
155
156 private DocumentList documentList;
157
158 /**
159 * Construct of extractor
160 */
161 public MetaXMLExtractor()
162 { // Intentionally left blank
163 }
164
165 /**
166 * This extractor doesn't need to do any preparation/completion work,
167 * so this member function is empty.
168 */
169 public void configure(String outputDir)
170 { // Intentionally left blank
171 }
172
173 public void configure(DocumentList list)
174 { this.documentList = list;
175 }
176
177 /**
178 * This extractor doesn't need to do any preparation/completion work,
179 * so this member function is empty.
180 */
181 public void startPass(int passNo)
182 { // Intentionally left blank
183 }
184
185 /**
186 * Process the document - for a metadata document, this results in the
187 * decoration of other files, for other documents, it does nothing.
188 */
189 public void extractDocument(DocumentID docID, DocumentInterface document)
190 { if (document.getDocumentType().equals(MetadataDocument.METADATA_DOCUMENT_TYPE))
191 { // Extract the content from the metadata file
192 URL url;
193
194 try {
195 SAXParser parser = new SAXParser();
196 MetadataHandler handler = new MetadataHandler(this.documentList);
197 /*
198 XMLReader reader = XMLReaderFactory.createXMLReader();
199 reader.setContentHandler(handler);
200 reader.setErrorHandler(handler);*/
201 parser.setContentHandler(handler);
202
203 // Get path of file; we cheat here by assuming that the url is a file - this
204 // really ought to be done better [TODO: fix to handle full paths & URLs]
205 url = document.getDocumentFiles().getFile(0).getURL();
206 String filePath = url.getPath();
207 handler.setUrl(new URL(url, "."));
208
209 // A metadata document consists of one file only - get it from the 'default'
210 // file group
211 /*
212 FileReader fileReader = new FileReader(filePath);
213 reader.parse(new InputSource(fileReader));
214 */
215 parser.parse(filePath);
216 }
217 catch (SAXException saxException)
218 { // TODO: log error
219 System.err.println(saxException);
220 }
221 catch (java.io.FileNotFoundException fileException)
222 { System.err.println(fileException);
223 }
224 catch (java.io.IOException ioException)
225 { System.err.println(ioException);
226 }
227 /* catch (java.net.MalformedURLException malEx) {
228 System.err.println("Unable to get parent of URL "+url.toString()+" in metadata extraction.");
229 return;
230 }
231 */
232
233 // for each document post it to the corresponding document
234 }
235 }
236
237 protected static void postMetadata(URL url, List files, String label, String value, boolean accumulate)
238 { String file;
239
240 Iterator fileIter = files.iterator();
241 while (fileIter.hasNext()) {
242 file = fileIter.next().toString();
243
244 System.out.println(url.toString() + " " + file + ": " + label + "=" + value);
245 }
246 }
247
248 /**
249 * This extractor doesn't need to do any preparation/completion work,
250 * so this member function is empty.
251 */
252 public void endPass(int passNo)
253 { // Intentionally left blank
254 }
255
256 /**
257 * This extractor is a simple, single-pass extractor
258 *
259 * @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
260 */
261 public int getNumberOfPasses()
262 { return 1;
263 }
264}
265
266
267
268
Note: See TracBrowser for help on using the repository browser.