package org.greenstone.gsdl3.gs3build.extractor; import java.io.FileReader; import java.net.URL; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler; import org.apache.xerces.parsers.SAXParser; import org.xml.sax.XMLReader; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.Attributes; import org.xml.sax.helpers.XMLReaderFactory; import org.xml.sax.helpers.DefaultHandler; import org.greenstone.gsdl3.gs3build.doctypes.DocumentID; import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface; import org.greenstone.gsdl3.gs3build.doctypes.MetadataDocument; import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader; import org.greenstone.gsdl3.gs3build.doctypes.DocumentList; import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel; public class MetaXMLExtractor implements ExtractorInterface { /** * An inner class to handle Metadata files */ class MetadataHandler extends DefaultHandler { List files; String label; StringBuffer value; URL url; boolean inElement; boolean accumulate; DocumentList documentList; List documentIds; List documents; MetadataHandler(DocumentList documentList) { super(); this.label = null; this.value = null; this.documentList = documentList; } public void startElement(String URI, String localName, String qName, Attributes attributes) { if (localName.equals("FileName")) { this.value = new StringBuffer(); } else if (localName.equals("FileSet")) { this.files = new ArrayList(); } else if (localName.equals("Description")) { this.documentIds = this.documentList.findDocumentIdsUsingFiles(this.files, this.url.toString()); if (documentIds != null && documentIds.size() > 0) { this.documents = new ArrayList(); Iterator idIterator = documentIds.iterator(); while (idIterator.hasNext()) { String docIdString = idIterator.next().toString(); DocumentID docId = new DocumentID(docIdString); DocumentInterface document = documentList.getDocument(docId); if (document != null) { documents.add(document); } } } } else if (localName.equals("Metadata")) { this.label = attributes.getValue("name"); this.value = new StringBuffer(); String mode = attributes.getValue("mode"); this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE); } } public void endElement(String URI, String localName, String qName) { if (localName.equals("FileName")) { String file = this.value.toString(); this.value = null; this.files.add(file); } else if (localName.equals("FileSet")) { // post the existing files item... } else if (localName.equals("Description")) { if (this.documents != null && documents.size() > 0) { // write out the modified documents // TODO: nicer/more generalised interface for this and related activity in // extractor manager (actually, enricher manager); Iterator docIterator = documents.iterator(); while (docIterator.hasNext()) { DocumentInterface document = (DocumentInterface) docIterator.next(); // System.out.println("Writing modified document " + document.getID()); documentList.storeChangedDocument(document); } } } else if (localName.equals("Metadata")) { MetaXMLExtractor.postMetadata(this.url, this.files, this.label, this.value.toString(), this.accumulate); /* if (documentIds != null) { Iterator iterator = documentIds.iterator(); while (iterator.hasNext()) { System.out.println("Matches file " + iterator.next().toString()); } } */ if (documentIds != null && documentIds.size() > 0) { Iterator docIterator = this.documents.iterator(); while (docIterator.hasNext()) { DocumentInterface document = (DocumentInterface) docIterator.next(); // Post to document // TODO: tailor this to posting documents to *sections* as required... if (accumulate) { document.addDocumentMetadata(new MetadataLabel(this.label), this.value.toString()); } else { document.setDocumentMetadata(new MetadataLabel(this.label), this.value.toString()); } } } // flatten the metadata items again... this.value = null; this.label = null; } } public void characters(char c[], int start, int length) { if (this.value != null) { String string = new String(c, start, length); this.value.append(string); } } public void setUrl(URL url) { this.url = url; } } private DocumentList documentList; /** * Construct of extractor */ public MetaXMLExtractor() { // Intentionally left blank } /** * This extractor doesn't need to do any preparation/completion work, * so this member function is empty. */ public void configure(String outputDir) { // Intentionally left blank } public void configure(DocumentList list) { this.documentList = list; } /** * This extractor doesn't need to do any preparation/completion work, * so this member function is empty. */ public void startPass(int passNo) { // Intentionally left blank } /** * Process the document - for a metadata document, this results in the * decoration of other files, for other documents, it does nothing. */ public void extractDocument(DocumentID docID, DocumentInterface document) { if (document.getDocumentType().equals(MetadataDocument.METADATA_DOCUMENT_TYPE)) { // Extract the content from the metadata file URL url; try { SAXParser parser = new SAXParser(); MetadataHandler handler = new MetadataHandler(this.documentList); /* XMLReader reader = XMLReaderFactory.createXMLReader(); reader.setContentHandler(handler); reader.setErrorHandler(handler);*/ parser.setContentHandler(handler); // Get path of file; we cheat here by assuming that the url is a file - this // really ought to be done better [TODO: fix to handle full paths & URLs] url = document.getDocumentFiles().getFile(0).getURL(); String filePath = url.getPath(); handler.setUrl(new URL(url, ".")); // A metadata document consists of one file only - get it from the 'default' // file group /* FileReader fileReader = new FileReader(filePath); reader.parse(new InputSource(fileReader)); */ parser.parse(filePath); } catch (SAXException saxException) { // TODO: log error System.err.println(saxException); } catch (java.io.FileNotFoundException fileException) { System.err.println(fileException); } catch (java.io.IOException ioException) { System.err.println(ioException); } /* catch (java.net.MalformedURLException malEx) { System.err.println("Unable to get parent of URL "+url.toString()+" in metadata extraction."); return; } */ // for each document post it to the corresponding document } } protected static void postMetadata(URL url, List files, String label, String value, boolean accumulate) { String file; Iterator fileIter = files.iterator(); while (fileIter.hasNext()) { file = fileIter.next().toString(); System.out.println(url.toString() + " " + file + ": " + label + "=" + value); } } /** * This extractor doesn't need to do any preparation/completion work, * so this member function is empty. */ public void endPass(int passNo) { // Intentionally left blank } /** * This extractor is a simple, single-pass extractor * * @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses */ public int getNumberOfPasses() { return 1; } }