1 | package org.greenstone.gsdl3.gs3build.extractor;
|
---|
2 |
|
---|
3 | import java.io.FileReader;
|
---|
4 |
|
---|
5 | import java.net.URL;
|
---|
6 |
|
---|
7 | import java.util.List;
|
---|
8 | import java.util.ArrayList;
|
---|
9 | import java.util.Iterator;
|
---|
10 | import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
|
---|
11 |
|
---|
12 | import org.apache.xerces.parsers.SAXParser;
|
---|
13 | import org.xml.sax.XMLReader;
|
---|
14 | import org.xml.sax.InputSource;
|
---|
15 | import org.xml.sax.SAXException;
|
---|
16 | import org.xml.sax.Attributes;
|
---|
17 | import org.xml.sax.helpers.XMLReaderFactory;
|
---|
18 | import org.xml.sax.helpers.DefaultHandler;
|
---|
19 |
|
---|
20 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
|
---|
21 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
|
---|
22 | import org.greenstone.gsdl3.gs3build.doctypes.MetadataDocument;
|
---|
23 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
|
---|
24 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
|
---|
25 |
|
---|
26 | import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
|
---|
27 |
|
---|
28 | public class MetaXMLExtractor implements ExtractorInterface
|
---|
29 | {
|
---|
30 | /**
|
---|
31 | * An inner class to handle Metadata files
|
---|
32 | */
|
---|
33 | class MetadataHandler extends DefaultHandler
|
---|
34 | { List files;
|
---|
35 | String label;
|
---|
36 | StringBuffer value;
|
---|
37 | URL url;
|
---|
38 | boolean inElement;
|
---|
39 | boolean accumulate;
|
---|
40 | DocumentList documentList;
|
---|
41 | List documentIds;
|
---|
42 | List documents;
|
---|
43 |
|
---|
44 | MetadataHandler(DocumentList documentList)
|
---|
45 | { super();
|
---|
46 |
|
---|
47 | this.label = null;
|
---|
48 | this.value = null;
|
---|
49 | this.documentList = documentList;
|
---|
50 | }
|
---|
51 |
|
---|
52 | public void startElement(String URI, String localName, String qName, Attributes attributes)
|
---|
53 | { if (localName.equals("FileName"))
|
---|
54 | { this.value = new StringBuffer();
|
---|
55 | }
|
---|
56 | else if (localName.equals("FileSet"))
|
---|
57 | { this.files = new ArrayList();
|
---|
58 | }
|
---|
59 | else if (localName.equals("Description"))
|
---|
60 | { this.documentIds = this.documentList.findDocumentIdsUsingFiles(this.files, this.url.toString());
|
---|
61 |
|
---|
62 | if (documentIds != null && documentIds.size() > 0) {
|
---|
63 | this.documents = new ArrayList();
|
---|
64 |
|
---|
65 | Iterator idIterator = documentIds.iterator();
|
---|
66 | while (idIterator.hasNext()) {
|
---|
67 | String docIdString = idIterator.next().toString();
|
---|
68 | DocumentID docId = new DocumentID(docIdString);
|
---|
69 | DocumentInterface document = documentList.getDocument(docId);
|
---|
70 | if (document != null) {
|
---|
71 | documents.add(document);
|
---|
72 | }
|
---|
73 | }
|
---|
74 | }
|
---|
75 | }
|
---|
76 | else if (localName.equals("Metadata"))
|
---|
77 | { this.label = attributes.getValue("name");
|
---|
78 | this.value = new StringBuffer();
|
---|
79 |
|
---|
80 | String mode = attributes.getValue("mode");
|
---|
81 | this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE);
|
---|
82 | }
|
---|
83 | }
|
---|
84 |
|
---|
85 | public void endElement(String URI, String localName, String qName)
|
---|
86 | { if (localName.equals("FileName"))
|
---|
87 | { String file = this.value.toString();
|
---|
88 | this.value = null;
|
---|
89 | this.files.add(file);
|
---|
90 | }
|
---|
91 | else if (localName.equals("FileSet"))
|
---|
92 | { // post the existing files item...
|
---|
93 | }
|
---|
94 | else if (localName.equals("Description"))
|
---|
95 | {
|
---|
96 | if (this.documents != null && documents.size() > 0) {
|
---|
97 | // write out the modified documents
|
---|
98 | // TODO: nicer/more generalised interface for this and related activity in
|
---|
99 | // extractor manager (actually, enricher manager);
|
---|
100 | Iterator docIterator = documents.iterator();
|
---|
101 | while (docIterator.hasNext()) {
|
---|
102 | DocumentInterface document = (DocumentInterface) docIterator.next();
|
---|
103 |
|
---|
104 | // System.out.println("Writing modified document " + document.getID());
|
---|
105 | documentList.storeChangedDocument(document);
|
---|
106 | }
|
---|
107 | }
|
---|
108 | }
|
---|
109 | else if (localName.equals("Metadata"))
|
---|
110 | { MetaXMLExtractor.postMetadata(this.url, this.files,
|
---|
111 | this.label, this.value.toString(),
|
---|
112 | this.accumulate);
|
---|
113 | /*
|
---|
114 | if (documentIds != null) {
|
---|
115 | Iterator iterator = documentIds.iterator();
|
---|
116 | while (iterator.hasNext()) {
|
---|
117 | System.out.println("Matches file " + iterator.next().toString());
|
---|
118 | }
|
---|
119 | }
|
---|
120 | */
|
---|
121 |
|
---|
122 | if (documentIds != null && documentIds.size() > 0) {
|
---|
123 | Iterator docIterator = this.documents.iterator();
|
---|
124 | while (docIterator.hasNext()) {
|
---|
125 | DocumentInterface document = (DocumentInterface) docIterator.next();
|
---|
126 |
|
---|
127 | // Post to document
|
---|
128 | // TODO: tailor this to posting documents to *sections* as required...
|
---|
129 | if (accumulate) {
|
---|
130 | document.addDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
|
---|
131 | }
|
---|
132 | else {
|
---|
133 | document.setDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
|
---|
134 | }
|
---|
135 | }
|
---|
136 | }
|
---|
137 |
|
---|
138 | // flatten the metadata items again...
|
---|
139 | this.value = null;
|
---|
140 | this.label = null;
|
---|
141 | }
|
---|
142 | }
|
---|
143 |
|
---|
144 | public void characters(char c[], int start, int length)
|
---|
145 | { if (this.value != null)
|
---|
146 | { String string = new String(c, start, length);
|
---|
147 | this.value.append(string);
|
---|
148 | }
|
---|
149 | }
|
---|
150 |
|
---|
151 | public void setUrl(URL url)
|
---|
152 | { this.url = url;
|
---|
153 | }
|
---|
154 | }
|
---|
155 |
|
---|
156 | private DocumentList documentList;
|
---|
157 |
|
---|
158 | /**
|
---|
159 | * Construct of extractor
|
---|
160 | */
|
---|
161 | public MetaXMLExtractor()
|
---|
162 | { // Intentionally left blank
|
---|
163 | }
|
---|
164 |
|
---|
165 | /**
|
---|
166 | * This extractor doesn't need to do any preparation/completion work,
|
---|
167 | * so this member function is empty.
|
---|
168 | */
|
---|
169 | public void configure(String outputDir)
|
---|
170 | { // Intentionally left blank
|
---|
171 | }
|
---|
172 |
|
---|
173 | public void configure(DocumentList list)
|
---|
174 | { this.documentList = list;
|
---|
175 | }
|
---|
176 |
|
---|
177 | /**
|
---|
178 | * This extractor doesn't need to do any preparation/completion work,
|
---|
179 | * so this member function is empty.
|
---|
180 | */
|
---|
181 | public void startPass(int passNo)
|
---|
182 | { // Intentionally left blank
|
---|
183 | }
|
---|
184 |
|
---|
185 | /**
|
---|
186 | * Process the document - for a metadata document, this results in the
|
---|
187 | * decoration of other files, for other documents, it does nothing.
|
---|
188 | */
|
---|
189 | public void extractDocument(DocumentID docID, DocumentInterface document)
|
---|
190 | { if (document.getDocumentType().equals(MetadataDocument.METADATA_DOCUMENT_TYPE))
|
---|
191 | { // Extract the content from the metadata file
|
---|
192 | URL url;
|
---|
193 |
|
---|
194 | try {
|
---|
195 | SAXParser parser = new SAXParser();
|
---|
196 | MetadataHandler handler = new MetadataHandler(this.documentList);
|
---|
197 | /*
|
---|
198 | XMLReader reader = XMLReaderFactory.createXMLReader();
|
---|
199 | reader.setContentHandler(handler);
|
---|
200 | reader.setErrorHandler(handler);*/
|
---|
201 | parser.setContentHandler(handler);
|
---|
202 |
|
---|
203 | // Get path of file; we cheat here by assuming that the url is a file - this
|
---|
204 | // really ought to be done better [TODO: fix to handle full paths & URLs]
|
---|
205 | url = document.getDocumentFiles().getFile(0).getURL();
|
---|
206 | String filePath = url.getPath();
|
---|
207 | handler.setUrl(new URL(url, "."));
|
---|
208 |
|
---|
209 | // A metadata document consists of one file only - get it from the 'default'
|
---|
210 | // file group
|
---|
211 | /*
|
---|
212 | FileReader fileReader = new FileReader(filePath);
|
---|
213 | reader.parse(new InputSource(fileReader));
|
---|
214 | */
|
---|
215 | parser.parse(filePath);
|
---|
216 | }
|
---|
217 | catch (SAXException saxException)
|
---|
218 | { // TODO: log error
|
---|
219 | System.err.println(saxException);
|
---|
220 | }
|
---|
221 | catch (java.io.FileNotFoundException fileException)
|
---|
222 | { System.err.println(fileException);
|
---|
223 | }
|
---|
224 | catch (java.io.IOException ioException)
|
---|
225 | { System.err.println(ioException);
|
---|
226 | }
|
---|
227 | /* catch (java.net.MalformedURLException malEx) {
|
---|
228 | System.err.println("Unable to get parent of URL "+url.toString()+" in metadata extraction.");
|
---|
229 | return;
|
---|
230 | }
|
---|
231 | */
|
---|
232 |
|
---|
233 | // for each document post it to the corresponding document
|
---|
234 | }
|
---|
235 | }
|
---|
236 |
|
---|
237 | protected static void postMetadata(URL url, List files, String label, String value, boolean accumulate)
|
---|
238 | { String file;
|
---|
239 |
|
---|
240 | Iterator fileIter = files.iterator();
|
---|
241 | while (fileIter.hasNext()) {
|
---|
242 | file = fileIter.next().toString();
|
---|
243 |
|
---|
244 | System.out.println(url.toString() + " " + file + ": " + label + "=" + value);
|
---|
245 | }
|
---|
246 | }
|
---|
247 |
|
---|
248 | /**
|
---|
249 | * This extractor doesn't need to do any preparation/completion work,
|
---|
250 | * so this member function is empty.
|
---|
251 | */
|
---|
252 | public void endPass(int passNo)
|
---|
253 | { // Intentionally left blank
|
---|
254 | }
|
---|
255 |
|
---|
256 | /**
|
---|
257 | * This extractor is a simple, single-pass extractor
|
---|
258 | *
|
---|
259 | * @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
|
---|
260 | */
|
---|
261 | public int getNumberOfPasses()
|
---|
262 | { return 1;
|
---|
263 | }
|
---|
264 | }
|
---|
265 |
|
---|
266 |
|
---|
267 |
|
---|
268 |
|
---|