source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java@ 13242

Last change on this file since 13242 was 12188, checked in by kjdon, 18 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1package org.greenstone.gsdl3.gs3build.extractor;
2
3import java.io.FileReader;
4
5import java.net.URL;
6
7import java.util.List;
8import java.util.ArrayList;
9import java.util.Iterator;
10import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
11
12import org.xml.sax.XMLReader;
13import org.xml.sax.InputSource;
14import org.xml.sax.SAXException;
15import org.xml.sax.Attributes;
16import org.xml.sax.helpers.XMLReaderFactory;
17import org.xml.sax.helpers.DefaultHandler;
18
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
20import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
21import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
22import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
23import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
24
25import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
26
27public class IndexExtractor implements ExtractorInterface
28{
29 class IndexHandlerException extends Exception
30 { public IndexHandlerException(String value)
31 { super(value);
32 }
33 }
34
35 /**
36 * An inner class to handle GML files
37 */
38 class IndexHandler extends GS2TextFileHandler
39 { List labels;
40 URL base;
41
42 IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException
43 {
44 super(content);
45
46 this.labels = new ArrayList();
47 this.base = url;
48
49 String parentDir;
50 int leaf = this.base.toString().lastIndexOf('/');
51 if (leaf >= 0) {
52 parentDir = this.base.toString().substring(0, leaf+1);
53 }
54 else {
55 parentDir = this.base.toString();
56 }
57
58 // get the first line
59 this.getLine();
60
61 if (!this.hasMore())
62 { throw new IndexHandlerException("No title line");
63 }
64
65 // get the first totem - it should be "key:"
66 String entry = this.getEntry(true);
67
68 // now get all the labels
69 while (this.hasMore())
70 { String label = this.getEntry(true);
71 if (label == null || label.length() == 0) {
72 continue;
73 }
74
75 this.labels.add(label);
76 System.out.println("Adding label: " + label);
77 }
78
79 while (this.hasMoreLines()) {
80 this.getLine();
81
82 // Get the file pattern itself
83 String filePattern = this.getEntry(true);
84 if (filePattern == null || filePattern.length() == 0) {
85 continue;
86 }
87
88 // get a list of documents that match the file pattern
89 List documentIds = documentList.findDocumentIdsUsingFile(filePattern);
90 if (documentIds != null) {
91 Iterator iterator = documentIds.iterator();
92 while (iterator.hasNext()) {
93 System.out.println("Matches file " + iterator.next().toString());
94 }
95 }
96
97 // if no files match this data, then skip this row
98 // TODO: raise a quality error message
99 if (documentIds == null || documentIds.size() == 0) {
100 continue;
101 }
102
103 // cache up the documents that match for speed improvements...
104 List documents = new ArrayList();
105 Iterator idIterator = documentIds.iterator();
106 while (idIterator.hasNext()) {
107 String docIdString = idIterator.next().toString();
108 System.out.println(docIdString);
109 DocumentID docId = new DocumentID(docIdString);
110 DocumentInterface document = documentList.getDocument(docId);
111 if (document != null) {
112 documents.add(document);
113 }
114 }
115
116 // Next, split the row into the separate metadata items
117 int entryNo = 0;
118 while (this.hasMore()) {
119 String item = this.getEntry(true);
120 if (item == null || item.length() == 0) {
121 entryNo ++;
122 continue;
123 }
124
125 String label = null;
126 if (item.startsWith("<")) {
127 int labelEnd = item.indexOf('>');
128 if (labelEnd >= 0) {
129 label = item.substring(1, labelEnd);
130
131 item = item.substring(labelEnd+1, item.length());
132
133 // eliminate any weird whitespace
134 item.trim();
135
136 // cope with a solo 'item' label with no following string
137 if (item.length() == 0) {
138 entryNo ++;
139 continue;
140 }
141 }
142 // starts with a bracketed label
143 }
144 else if (entryNo < this.labels.size()) {
145 label = (String) this.labels.get(entryNo);
146 }
147
148 // Actually post the metadata -
149 // it may be good to have cached all the documents that we're going to change
150 // in order to minimise rewrites...
151 if (label != null) {
152 Iterator docIterator = documents.iterator();
153 while (docIterator.hasNext()) {
154 DocumentInterface document = (DocumentInterface) docIterator.next();
155
156 // Post to document
157 // TODO: tailor this to posting documents to *sections* as required...
158 document.addDocumentMetadata(new MetadataLabel(label), item);
159 System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
160 }
161 }
162 entryNo ++;
163 }
164
165 // write out the modified documents
166 // TODO: nicer/more generalised interface for this and related activity in
167 // extractor manager (actually, enricher manager);
168 Iterator docIterator = documents.iterator();
169 while (docIterator.hasNext()) {
170 DocumentInterface document = (DocumentInterface) docIterator.next();
171
172 System.out.println("Writing modified document " + document.getID());
173 documentList.storeChangedDocument(document);
174 }
175 }
176 }
177
178 }
179
180 private DocumentList documentList;
181
182 /**
183 * Construct of extractor
184 */
185 public IndexExtractor()
186 { // Intentionally left blank
187 }
188
189 /**
190 * This extractor doesn't need to do any preparation/completion work,
191 * so this member function is empty.
192 */
193 public void configure(String outputDir)
194 { // Intentionally left blank
195 }
196
197 public void configure(DocumentList list)
198 { this.documentList = list;
199 }
200
201 /**
202 * This extractor doesn't need to do any preparation/completion work,
203 * so this member function is empty.
204 */
205 public void startPass(int passNo)
206 { // Intentionally left blank
207 }
208
209 /**
210 * Process the document - for a GML document, this results in the
211 * decoration of other files, for other documents, it does nothing.
212 */
213 public void extractDocument(DocumentID docID, DocumentInterface document)
214 { if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE))
215 { // Extract the content from the index file
216
217 // get the file
218 String documentText =
219 DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL());
220
221 if (documentText == null) {
222 System.err.println("IndexExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString());
223 return;
224 }
225
226 try {
227 IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList);
228 }
229 catch (IndexHandlerException ex) {
230 }
231
232 // for each document post it to the corresponding document
233 }
234 }
235
236 protected static void postMetadata(String file, String value, String label)
237 {
238 }
239
240 /**
241 * This extractor doesn't need to do any preparation/completion work,
242 * so this member function is empty.
243 */
244 public void endPass(int passNo)
245 { // Intentionally left blank
246 }
247
248 /**
249 * This extractor is a simple, single-pass extractor
250 *
251 * @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
252 */
253 public int getNumberOfPasses()
254 { return 1;
255 }
256}
Note: See TracBrowser for help on using the repository browser.