Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/extractor/MetaXMLExtractor.java@ 12188

Last change on this file since 12188 was 12188, checked in by kjdon, 18 years ago
Initial revision
Property svn:keywords set to `Author Date Id Revision`
File size: 7.7 KB

Line
1	package org.greenstone.gsdl3.gs3build.extractor;
2
3	import java.io.FileReader;
4
5	import java.net.URL;
6
7	import java.util.List;
8	import java.util.ArrayList;
9	import java.util.Iterator;
10	import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
11
12	import org.apache.xerces.parsers.SAXParser;
13	import org.xml.sax.XMLReader;
14	import org.xml.sax.InputSource;
15	import org.xml.sax.SAXException;
16	import org.xml.sax.Attributes;
17	import org.xml.sax.helpers.XMLReaderFactory;
18	import org.xml.sax.helpers.DefaultHandler;
19
20	import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
21	import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
22	import org.greenstone.gsdl3.gs3build.doctypes.MetadataDocument;
23	import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
24	import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
25
26	import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
27
28	public class MetaXMLExtractor implements ExtractorInterface
29	{
30	/**
31	* An inner class to handle Metadata files
32	*/
33	class MetadataHandler extends DefaultHandler
34	{ List files;
35	String label;
36	StringBuffer value;
37	URL url;
38	boolean inElement;
39	boolean accumulate;
40	DocumentList documentList;
41	List documentIds;
42	List documents;
43
44	MetadataHandler(DocumentList documentList)
45	{ super();
46
47	this.label = null;
48	this.value = null;
49	this.documentList = documentList;
50	}
51
52	public void startElement(String URI, String localName, String qName, Attributes attributes)
53	{ if (localName.equals("FileName"))
54	{ this.value = new StringBuffer();
55	}
56	else if (localName.equals("FileSet"))
57	{ this.files = new ArrayList();
58	}
59	else if (localName.equals("Description"))
60	{ this.documentIds = this.documentList.findDocumentIdsUsingFiles(this.files, this.url.toString());
61
62	if (documentIds != null && documentIds.size() > 0) {
63	this.documents = new ArrayList();
64
65	Iterator idIterator = documentIds.iterator();
66	while (idIterator.hasNext()) {
67	String docIdString = idIterator.next().toString();
68	DocumentID docId = new DocumentID(docIdString);
69	DocumentInterface document = documentList.getDocument(docId);
70	if (document != null) {
71	documents.add(document);
72	}
73	}
74	}
75	}
76	else if (localName.equals("Metadata"))
77	{ this.label = attributes.getValue("name");
78	this.value = new StringBuffer();
79
80	String mode = attributes.getValue("mode");
81	this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE);
82	}
83	}
84
85	public void endElement(String URI, String localName, String qName)
86	{ if (localName.equals("FileName"))
87	{ String file = this.value.toString();
88	this.value = null;
89	this.files.add(file);
90	}
91	else if (localName.equals("FileSet"))
92	{ // post the existing files item...
93	}
94	else if (localName.equals("Description"))
95	{
96	if (this.documents != null && documents.size() > 0) {
97	// write out the modified documents
98	// TODO: nicer/more generalised interface for this and related activity in
99	// extractor manager (actually, enricher manager);
100	Iterator docIterator = documents.iterator();
101	while (docIterator.hasNext()) {
102	DocumentInterface document = (DocumentInterface) docIterator.next();
103
104	// System.out.println("Writing modified document " + document.getID());
105	documentList.storeChangedDocument(document);
106	}
107	}
108	}
109	else if (localName.equals("Metadata"))
110	{ MetaXMLExtractor.postMetadata(this.url, this.files,
111	this.label, this.value.toString(),
112	this.accumulate);
113	/*
114	if (documentIds != null) {
115	Iterator iterator = documentIds.iterator();
116	while (iterator.hasNext()) {
117	System.out.println("Matches file " + iterator.next().toString());
118	}
119	}
120	*/
121
122	if (documentIds != null && documentIds.size() > 0) {
123	Iterator docIterator = this.documents.iterator();
124	while (docIterator.hasNext()) {
125	DocumentInterface document = (DocumentInterface) docIterator.next();
126
127	// Post to document
128	// TODO: tailor this to posting documents to sections as required...
129	if (accumulate) {
130	document.addDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
131	}
132	else {
133	document.setDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
134	}
135	}
136	}
137
138	// flatten the metadata items again...
139	this.value = null;
140	this.label = null;
141	}
142	}
143
144	public void characters(char c[], int start, int length)
145	{ if (this.value != null)
146	{ String string = new String(c, start, length);
147	this.value.append(string);
148	}
149	}
150
151	public void setUrl(URL url)
152	{ this.url = url;
153	}
154	}
155
156	private DocumentList documentList;
157
158	/**
159	* Construct of extractor
160	*/
161	public MetaXMLExtractor()
162	{ // Intentionally left blank
163	}
164
165	/**
166	* This extractor doesn't need to do any preparation/completion work,
167	* so this member function is empty.
168	*/
169	public void configure(String outputDir)
170	{ // Intentionally left blank
171	}
172
173	public void configure(DocumentList list)
174	{ this.documentList = list;
175	}
176
177	/**
178	* This extractor doesn't need to do any preparation/completion work,
179	* so this member function is empty.
180	*/
181	public void startPass(int passNo)
182	{ // Intentionally left blank
183	}
184
185	/**
186	* Process the document - for a metadata document, this results in the
187	* decoration of other files, for other documents, it does nothing.
188	*/
189	public void extractDocument(DocumentID docID, DocumentInterface document)
190	{ if (document.getDocumentType().equals(MetadataDocument.METADATA_DOCUMENT_TYPE))
191	{ // Extract the content from the metadata file
192	URL url;
193
194	try {
195	SAXParser parser = new SAXParser();
196	MetadataHandler handler = new MetadataHandler(this.documentList);
197	/*
198	XMLReader reader = XMLReaderFactory.createXMLReader();
199	reader.setContentHandler(handler);
200	reader.setErrorHandler(handler);*/
201	parser.setContentHandler(handler);
202
203	// Get path of file; we cheat here by assuming that the url is a file - this
204	// really ought to be done better [TODO: fix to handle full paths & URLs]
205	url = document.getDocumentFiles().getFile(0).getURL();
206	String filePath = url.getPath();
207	handler.setUrl(new URL(url, "."));
208
209	// A metadata document consists of one file only - get it from the 'default'
210	// file group
211	/*
212	FileReader fileReader = new FileReader(filePath);
213	reader.parse(new InputSource(fileReader));
214	*/
215	parser.parse(filePath);
216	}
217	catch (SAXException saxException)
218	{ // TODO: log error
219	System.err.println(saxException);
220	}
221	catch (java.io.FileNotFoundException fileException)
222	{ System.err.println(fileException);
223	}
224	catch (java.io.IOException ioException)
225	{ System.err.println(ioException);
226	}
227	/* catch (java.net.MalformedURLException malEx) {
228	System.err.println("Unable to get parent of URL "+url.toString()+" in metadata extraction.");
229	return;
230	}
231	*/
232
233	// for each document post it to the corresponding document
234	}
235	}
236
237	protected static void postMetadata(URL url, List files, String label, String value, boolean accumulate)
238	{ String file;
239
240	Iterator fileIter = files.iterator();
241	while (fileIter.hasNext()) {
242	file = fileIter.next().toString();
243
244	System.out.println(url.toString() + " " + file + ": " + label + "=" + value);
245	}
246	}
247
248	/**
249	* This extractor doesn't need to do any preparation/completion work,
250	* so this member function is empty.
251	*/
252	public void endPass(int passNo)
253	{ // Intentionally left blank
254	}
255
256	/**
257	* This extractor is a simple, single-pass extractor
258	*
259	* @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
260	*/
261	public int getNumberOfPasses()
262	{ return 1;
263	}
264	}
265
266
267
268

Note: See TracBrowser for help on using the repository browser.

Download in other formats: