Changeset 6013
- Timestamp:
- 2003-11-26T15:36:27+13:00 (20 years ago)
- Location:
- trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorInterface.java
r5800  r6013
    3      3      import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
    4      4      import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
           5  +   import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
    5      6
    6      7      public interface ExtractorInterface
    7      8      {
    8         -     public void configure(String outputDir);
    9         -     public void startPass(int passNumber);
   10         -     public void extractDocument(DocumentID documentID, DocumentInterface document);
   11         -     public void endPass(int passNumber);
   12         -     public int getNumberOfPasses();
           9  +     public void configure(String outputDir);
          10  +     public void configure(DocumentList list);
          11  +     public void startPass(int passNumber);
          12  +     public void extractDocument(DocumentID documentID, DocumentInterface document);
          13  +     public void endPass(int passNumber);
          14  +     public int getNumberOfPasses();
   13     15      }
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorManager.java
r5946  r6013
   25     25          this.list[this.used] = extractor;
   26     26          this.used ++;
          27  +
          28  +       extractor.configure(this.documents);
   27     29        }
   28     30
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/GMLExtractor.java
r5946  r6013
   12     12      import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
   13     13      import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
          14  +   import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
   14     15      import org.greenstone.gsdl3.gs3build.doctypes.GMLDocument;
   15     16
    …      …
   78     79        }
   79     80
          81  +     public void configure(DocumentList documentList)
          82  +     { // Intentionally left blank
          83  +     }
          84  +
   80     85        /**
   81     86         * This extractor doesn't need to do any preparation/completion work,
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java
r5946 r6013 2 2 3 3 import java.io.FileReader; 4 5 import java.net.URL; 4 6 5 7 import java.util.List; 6 8 import java.util.ArrayList; 9 import java.util.Iterator; 7 10 8 11 import org.xml.sax.XMLReader; … … 17 20 import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument; 18 21 import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader; 22 import org.greenstone.gsdl3.gs3build.doctypes.DocumentList; 19 23 20 24 public class IndexExtractor implements ExtractorInterface … … 35 39 boolean doneRow; 36 40 List labels; 37 38 IndexHandler(String content) throws IndexHandlerException 41 URL base; 42 43 IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException 39 44 { this.content = content; 40 45 this.doneRow = false; 41 46 this.labels = new ArrayList(); 47 this.base = url; 48 49 String parentDir; 50 int leaf = this.base.toString().lastIndexOf('/'); 51 if (leaf >= 0) { 52 parentDir = this.base.toString().substring(0, leaf+1); 53 } 54 else { 55 parentDir = this.base.toString(); 56 } 42 57 43 58 // get the first line … … 48 63 } 49 64 50 // get the first totem - it should be blank 65 // get the first totem - it should be "key:" 66 String entry = this.getEntry(true); 51 67 68 // now get all the labels 69 while (this.hasMore()) 70 { String label = this.getEntry(true); 71 if (label == null) 72 continue; 73 label.trim(); 74 75 if (label.length() == 0) { 76 continue; 77 } 78 79 this.labels.add(label); 80 System.out.println("Adding label: " + label); 81 } 82 83 while (this.hasMoreLines()) { 84 this.getLine(); 85 86 // Get the file pattern itself 87 String filePattern = this.getEntry(true); 88 if (filePattern == null) { 89 continue; 90 } 91 92 filePattern.trim(); 93 if (filePattern.length() == 0) { 94 continue; 95 } 96 97 // get a list of documents that match the file pattern 98 List files = documentList.findDocumentIdsUsingFile(filePattern); 99 if (files != null) { 100 Iterator iterator = files.iterator(); 101 while (iterator.hasNext()) { 102 
System.out.println("Matches file " + iterator.next().toString()); 103 } 104 } 105 106 // if no files match this data, then skip this row 107 // TODO: raise a quality error message 108 if (files == null || files.size() == 0) { 109 continue; 110 } 111 112 // TODO: cache up the documents that match for speed? 113 114 // Next, split the row into the separate metadata items 115 int entryNo = 0; 116 while (this.hasMore()) { 117 String item = this.getEntry(true); 118 if (item == null) { 119 entryNo ++; 120 continue; 121 } 122 123 item.trim(); 124 if (item.length() == 0) { 125 entryNo ++; 126 continue; 127 } 128 129 String label = null; 130 if (item.startsWith("<")) { 131 int labelEnd = item.indexOf('>'); 132 if (labelEnd >= 0) { 133 label = item.substring(1, labelEnd); 134 135 item = item.substring(labelEnd+1, item.length()); 136 137 // eliminate any weird whitespace 138 item.trim(); 139 140 // cope with a solo 'item' label with no following string 141 if (item.length() == 0) { 142 entryNo ++; 143 continue; 144 } 145 } 146 // starts with a bracketed label 147 } 148 else if (entryNo < this.labels.size()) { 149 label = (String) this.labels.get(entryNo); 150 } 151 152 // Actually post the metadata - 153 // it may be good to have cached all the documents that we're going to change 154 // in order to minimise rewrites... 
155 if (label != null) { 156 System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern); 157 // Post to document 158 } 159 entryNo ++; 160 } 161 } 52 162 } 53 163 … … 61 171 62 172 private String getEntry() 63 { int tab = this.line.indexOf('\t'); 64 String reply; 65 66 if (tab < 0) { 173 { return this.getEntry(false); 174 } 175 176 private String getEntry(boolean breakSpace) 177 { String reply; 178 int start, tab = 0; 179 boolean quoted = false; 180 181 start = 0; 182 while (start < this.line.length() && 183 this.line.charAt(start) == ' ') { 184 start ++; 185 } 186 187 if (start == this.line.length()) { 188 this.line = null; 189 return null; 190 } 191 192 if (this.line.charAt(start) == '"') { 193 quoted = true; 194 breakSpace = false; 195 start ++; 196 } 197 tab = start; 198 199 while (tab != this.line.length() && 200 this.line.charAt(tab) != '\t' && 201 !(quoted && this.line.charAt(tab) == '"') && 202 !(this.line.charAt(tab) == ' ' && breakSpace)) 203 { tab ++; 204 } 205 206 if (start > 0) { 207 this.line = this.line.substring(start); 208 tab -= start; 209 } 210 211 if (tab == this.line.length()) { 67 212 reply = this.line; 68 213 this.line = null; … … 77 222 78 223 private String getLine() 79 { do { 224 { if (this.content == null) { 225 this.line = null; 226 return null; 227 } 228 229 do { 80 230 int eol = this.content.indexOf('\n'); 81 231 if (eol < 0) { … … 95 245 this.line.trim(); 96 246 } 97 } while (this.line != null && this.line.length() == 0); 247 } while (this.content != null && this.line != null && this.line.length() == 0); 248 98 249 return this.line; 99 250 } 100 251 } 252 253 private DocumentList documentList; 101 254 102 255 /** … … 113 266 public void configure(String outputDir) 114 267 { // Intentionally left blank 268 } 269 270 public void configure(DocumentList list) 271 { this.documentList = list; 115 272 } 116 273 … … 132 289 133 290 // get the file 134 String documentText = null; 135 // String documentText = 136 // 
DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).toString()); 291 String documentText = 292 DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL()); 137 293 138 294 if (documentText == null) { … … 142 298 143 299 try { 144 IndexHandler handler = new IndexHandler(documentText);300 IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList); 145 301 } 146 302 catch (IndexHandlerException ex) {
Note:
See TracChangeset
for help on using the changeset viewer.