Changeset 6453
- Timestamp:
- 2004-01-12T15:55:11+13:00 (20 years ago)
- Location:
- trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorManager.java
r6013 r6453 9 9 public class ExtractorManager 10 10 { 11 public static final String ACCUMULATE_MODE = "accumulate"; 12 11 13 DocumentList documents; 12 14 ExtractorInterface [] list; -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/GMLExtractor.java
r6289 r6453 54 54 55 55 String mode = attributes.getValue("mode"); 56 this.accumulate = mode.equals( ACCUMULATE_MODE);56 this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE); 57 57 } 58 58 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/MetaXMLExtractor.java
r6289 r6453 10 10 import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler; 11 11 12 import org.apache.xerces.parsers.SAXParser; 12 13 import org.xml.sax.XMLReader; 13 14 import org.xml.sax.InputSource; … … 19 20 import org.greenstone.gsdl3.gs3build.doctypes.DocumentID; 20 21 import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface; 21 import org.greenstone.gsdl3.gs3build.doctypes. IndexDocument;22 import org.greenstone.gsdl3.gs3build.doctypes.MetadataDocument; 22 23 import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader; 23 24 import org.greenstone.gsdl3.gs3build.doctypes.DocumentList; … … 27 28 public class MetaXMLExtractor implements ExtractorInterface 28 29 { 29 class IndexHandlerException extends Exception 30 { public IndexHandlerException(String value) 31 { super(value); 32 } 33 } 34 35 /** 36 * An inner class to handle GML files 37 */ 38 class IndexHandler extends GS2TextFileHandler 39 { List labels; 40 URL base; 41 42 IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException 43 { 44 super(content); 45 46 this.labels = new ArrayList(); 47 this.base = url; 48 49 String parentDir; 50 int leaf = this.base.toString().lastIndexOf('/'); 51 if (leaf >= 0) { 52 parentDir = this.base.toString().substring(0, leaf+1); 53 } 54 else { 55 parentDir = this.base.toString(); 56 } 57 58 // get the first line 59 this.getLine(); 60 61 if (!this.hasMore()) 62 { throw new IndexHandlerException("No title line"); 63 } 64 65 // get the first totem - it should be "key:" 66 String entry = this.getEntry(true); 67 68 // now get all the labels 69 while (this.hasMore()) 70 { String label = this.getEntry(true); 71 if (label == null || label.length() == 0) { 72 continue; 73 } 30 /** 31 * An inner class to handle Metadata files 32 */ 33 class MetadataHandler extends DefaultHandler 34 { List files; 35 String label; 36 StringBuffer value; 37 URL url; 38 boolean inElement; 39 boolean accumulate; 40 DocumentList documentList; 41 42 MetadataHandler(DocumentList documentList) 43 { super(); 44 45 this.label = null; 46 this.value = null; 47 this.documentList = documentList; 48 } 49 50 public void startElement(String URI, String localName, String qName, Attributes attributes) 51 { if (localName.equals("FileName")) 52 { this.value = new StringBuffer(); 53 } 54 else if (localName.equals("FileSet")) 55 { this.files = new ArrayList(); 56 } 57 else if (localName.equals("Description")) 58 { 59 } 60 else if (localName.equals("Metadata")) 61 { this.label = attributes.getValue("name"); 62 this.value = new StringBuffer(); 63 64 String mode = attributes.getValue("mode"); 65 this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE); 66 } 67 } 68 69 public void endElement(String URI, String localName, String qName) 70 { if (localName.equals("FileName")) 71 { String file = this.value.toString(); 72 this.value = null; 73 this.files.add(file); 74 } 75 else if (localName.equals("FileSet")) 76 { // post the existing files item... 77 } 78 else if (localName.equals("Description")) 79 { 80 } 81 else if (localName.equals("Metadata")) 82 { List documentIds; 83 84 documentIds = this.documentList.findDocumentIdsUsingFiles(this.files, this.url.toString()); 74 85 75 this.labels.add(label); 76 System.out.println("Adding label: " + label); 77 } 78 79 while (this.hasMoreLines()) { 80 this.getLine(); 81 82 // Get the file pattern itself 83 String filePattern = this.getEntry(true); 84 if (filePattern == null || filePattern.length() == 0) { 85 continue; 86 } 87 88 // get a list of documents that match the file pattern 89 List documentIds = documentList.findDocumentIdsUsingFile(filePattern); 86 MetaXMLExtractor.postMetadata(this.url, this.files, 87 this.label, this.value.toString(), 88 this.accumulate); 90 89 if (documentIds != null) { 91 90 Iterator iterator = documentIds.iterator(); … … 95 94 } 96 95 97 // if no files match this data, then skip this row 98 // TODO: raise a quality error message 99 if (documentIds == null || documentIds.size() == 0) { 100 continue; 101 } 102 103 // cache up the documents that match for speed improvements... 104 List documents = new ArrayList(); 105 Iterator idIterator = documentIds.iterator(); 106 while (idIterator.hasNext()) { 107 String docIdString = idIterator.next().toString(); 108 System.out.println(docIdString); 109 DocumentID docId = new DocumentID(docIdString); 110 DocumentInterface document = documentList.getDocument(docId); 111 if (document != null) { 112 documents.add(document); 96 if (documentIds != null && documentIds.size() > 0) { 97 List documents = new ArrayList(); 98 99 Iterator idIterator = documentIds.iterator(); 100 while (idIterator.hasNext()) { 101 String docIdString = idIterator.next().toString(); 102 DocumentID docId = new DocumentID(docIdString); 103 DocumentInterface document = documentList.getDocument(docId); 104 if (document != null) { 105 documents.add(document); 106 } 107 } 108 109 Iterator docIterator = documents.iterator(); 110 while (docIterator.hasNext()) { 111 DocumentInterface document = (DocumentInterface) docIterator.next(); 112 113 // Post to document 114 // TODO: tailor this to posting documents to *sections* as required... 115 document.addDocumentMetadata(new MetadataLabel(this.label), this.value.toString()); 113 116 } 114 117 } 115 118 116 // Next, split the row into the separate metadata items 117 int entryNo = 0; 118 while (this.hasMore()) { 119 String item = this.getEntry(true); 120 if (item == null || item.length() == 0) { 121 entryNo ++; 122 continue; 123 } 124 125 String label = null; 126 if (item.startsWith("<")) { 127 int labelEnd = item.indexOf('>'); 128 if (labelEnd >= 0) { 129 label = item.substring(1, labelEnd); 130 131 item = item.substring(labelEnd+1, item.length()); 132 133 // eliminate any weird whitespace 134 item.trim(); 135 136 // cope with a solo 'item' label with no following string 137 if (item.length() == 0) { 138 entryNo ++; 139 continue; 140 } 141 } 142 // starts with a bracketed label 143 } 144 else if (entryNo < this.labels.size()) { 145 label = (String) this.labels.get(entryNo); 146 } 147 148 // Actually post the metadata - 149 // it may be good to have cached all the documents that we're going to change 150 // in order to minimise rewrites... 151 if (label != null) { 152 Iterator docIterator = documents.iterator(); 153 while (docIterator.hasNext()) { 154 DocumentInterface document = (DocumentInterface) docIterator.next(); 155 156 // Post to document 157 // TODO: tailor this to posting documents to *sections* as required... 158 document.addDocumentMetadata(new MetadataLabel(label), item); 159 System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern); 160 } 161 } 162 entryNo ++; 163 } 164 165 // write out the modified documents 166 // TODO: nicer/more generalised interface for this and related activity in 167 // extractor manager (actually, enricher manager); 168 Iterator docIterator = documents.iterator(); 169 while (docIterator.hasNext()) { 170 DocumentInterface document = (DocumentInterface) docIterator.next(); 171 172 documentList.modifiedDocument(document); 173 } 174 } 175 } 176 119 // flatten the metadata items again... 120 this.value = null; 121 this.label = null; 122 } 123 } 124 125 public void characters(char c[], int start, int length) 126 { if (this.value != null) 127 { String string = new String(c, start, length); 128 this.value.append(string); 129 } 130 } 131 132 public void setUrl(URL url) 133 { this.url = url; 134 } 177 135 } 178 136 … … 207 165 208 166 /** 209 * Process the document - for a GMLdocument, this results in the167 * Process the document - for a metadata document, this results in the 210 168 * decoration of other files, for other documents, it does nothing. 211 169 */ 212 170 public void extractDocument(DocumentID docID, DocumentInterface document) 213 { if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE)) 214 { // Extract the content from the index file 215 216 // get the file 217 String documentText = 218 DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL()); 219 220 if (documentText == null) { 221 System.err.println("MetaXMLExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString()); 171 { if (document.getDocumentType().equals(MetadataDocument.METADATA_DOCUMENT_TYPE)) 172 { // Extract the content from the metadata file 173 URL url; 174 175 try { 176 SAXParser parser = new SAXParser(); 177 MetadataHandler handler = new MetadataHandler(this.documentList); 178 /* 179 XMLReader reader = XMLReaderFactory.createXMLReader(); 180 reader.setContentHandler(handler); 181 reader.setErrorHandler(handler);*/ 182 parser.setContentHandler(handler); 183 184 // Get path of file; we cheat here by assuming that the url is a file - this 185 // really ought to be done better [TODO: fix to handle full paths & URLs] 186 url = document.getDocumentFiles().getFile(0).getURL(); 187 String filePath = url.getPath(); 188 handler.setUrl(new URL(url, ".")); 189 190 // A metadata document consists of one file only - get it from the 'default' 191 // file group 192 /* 193 FileReader fileReader = new FileReader(filePath); 194 reader.parse(new InputSource(fileReader)); 195 */ 196 parser.parse(filePath); 197 } 198 catch (SAXException saxException) 199 { // TODO: log error 200 System.err.println(saxException); 201 } 202 catch (java.io.FileNotFoundException fileException) 203 { System.err.println(fileException); 204 } 205 catch (java.io.IOException ioException) 206 { System.err.println(ioException); 207 } 208 /* catch (java.net.MalformedURLException malEx) { 209 System.err.println("Unable to get parent of URL "+url.toString()+" in metadata extraction."); 222 210 return; 223 211 } 224 225 try { 226 IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList); 227 } 228 catch (IndexHandlerException ex) { 229 } 230 212 */ 213 231 214 // for each document post it to the corresponding document 232 215 } 233 216 } 234 217 235 protected static void postMetadata(String file, String value, String label) 236 { 218 protected static void postMetadata(URL url, List files, String label, String value, boolean accumulate) 219 { String file; 220 221 Iterator fileIter = files.iterator(); 222 while (fileIter.hasNext()) { 223 file = fileIter.next().toString(); 224 225 System.out.println(url.toString() + " " + file + ": " + label + "=" + value); 226 } 237 227 } 238 228 … … 254 244 } 255 245 } 246 247 248 249
Note:
See TracChangeset
for help on using the changeset viewer.