1 | package org.greenstone.gsdl3.gs3build.extractor;
|
---|
2 |
|
---|
3 | import java.io.FileReader;
|
---|
4 |
|
---|
5 | import java.net.URL;
|
---|
6 |
|
---|
7 | import java.util.List;
|
---|
8 | import java.util.ArrayList;
|
---|
9 | import java.util.Iterator;
|
---|
10 | import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
|
---|
11 |
|
---|
12 | import org.xml.sax.XMLReader;
|
---|
13 | import org.xml.sax.InputSource;
|
---|
14 | import org.xml.sax.SAXException;
|
---|
15 | import org.xml.sax.Attributes;
|
---|
16 | import org.xml.sax.helpers.XMLReaderFactory;
|
---|
17 | import org.xml.sax.helpers.DefaultHandler;
|
---|
18 |
|
---|
19 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
|
---|
20 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
|
---|
21 | import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
|
---|
22 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
|
---|
23 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
|
---|
24 |
|
---|
25 | import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
|
---|
26 |
|
---|
27 | public class IndexExtractor implements ExtractorInterface
|
---|
28 | {
|
---|
29 | class IndexHandlerException extends Exception
|
---|
30 | { public IndexHandlerException(String value)
|
---|
31 | { super(value);
|
---|
32 | }
|
---|
33 | }
|
---|
34 |
|
---|
35 | /**
|
---|
36 | * An inner class to handle GML files
|
---|
37 | */
|
---|
38 | class IndexHandler extends GS2TextFileHandler
|
---|
39 | { List labels;
|
---|
40 | URL base;
|
---|
41 |
|
---|
42 | IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException
|
---|
43 | {
|
---|
44 | super(content);
|
---|
45 |
|
---|
46 | this.labels = new ArrayList();
|
---|
47 | this.base = url;
|
---|
48 |
|
---|
49 | String parentDir;
|
---|
50 | int leaf = this.base.toString().lastIndexOf('/');
|
---|
51 | if (leaf >= 0) {
|
---|
52 | parentDir = this.base.toString().substring(0, leaf+1);
|
---|
53 | }
|
---|
54 | else {
|
---|
55 | parentDir = this.base.toString();
|
---|
56 | }
|
---|
57 |
|
---|
58 | // get the first line
|
---|
59 | this.getLine();
|
---|
60 |
|
---|
61 | if (!this.hasMore())
|
---|
62 | { throw new IndexHandlerException("No title line");
|
---|
63 | }
|
---|
64 |
|
---|
65 | // get the first totem - it should be "key:"
|
---|
66 | String entry = this.getEntry(true);
|
---|
67 |
|
---|
68 | // now get all the labels
|
---|
69 | while (this.hasMore())
|
---|
70 | { String label = this.getEntry(true);
|
---|
71 | if (label == null || label.length() == 0) {
|
---|
72 | continue;
|
---|
73 | }
|
---|
74 |
|
---|
75 | this.labels.add(label);
|
---|
76 | System.out.println("Adding label: " + label);
|
---|
77 | }
|
---|
78 |
|
---|
79 | while (this.hasMoreLines()) {
|
---|
80 | this.getLine();
|
---|
81 |
|
---|
82 | // Get the file pattern itself
|
---|
83 | String filePattern = this.getEntry(true);
|
---|
84 | if (filePattern == null || filePattern.length() == 0) {
|
---|
85 | continue;
|
---|
86 | }
|
---|
87 |
|
---|
88 | // get a list of documents that match the file pattern
|
---|
89 | List documentIds = documentList.findDocumentIdsUsingFile(filePattern);
|
---|
90 | if (documentIds != null) {
|
---|
91 | Iterator iterator = documentIds.iterator();
|
---|
92 | while (iterator.hasNext()) {
|
---|
93 | System.out.println("Matches file " + iterator.next().toString());
|
---|
94 | }
|
---|
95 | }
|
---|
96 |
|
---|
97 | // if no files match this data, then skip this row
|
---|
98 | // TODO: raise a quality error message
|
---|
99 | if (documentIds == null || documentIds.size() == 0) {
|
---|
100 | continue;
|
---|
101 | }
|
---|
102 |
|
---|
103 | // cache up the documents that match for speed improvements...
|
---|
104 | List documents = new ArrayList();
|
---|
105 | Iterator idIterator = documentIds.iterator();
|
---|
106 | while (idIterator.hasNext()) {
|
---|
107 | String docIdString = idIterator.next().toString();
|
---|
108 | System.out.println(docIdString);
|
---|
109 | DocumentID docId = new DocumentID(docIdString);
|
---|
110 | DocumentInterface document = documentList.getDocument(docId);
|
---|
111 | if (document != null) {
|
---|
112 | documents.add(document);
|
---|
113 | }
|
---|
114 | }
|
---|
115 |
|
---|
116 | // Next, split the row into the separate metadata items
|
---|
117 | int entryNo = 0;
|
---|
118 | while (this.hasMore()) {
|
---|
119 | String item = this.getEntry(true);
|
---|
120 | if (item == null || item.length() == 0) {
|
---|
121 | entryNo ++;
|
---|
122 | continue;
|
---|
123 | }
|
---|
124 |
|
---|
125 | String label = null;
|
---|
126 | if (item.startsWith("<")) {
|
---|
127 | int labelEnd = item.indexOf('>');
|
---|
128 | if (labelEnd >= 0) {
|
---|
129 | label = item.substring(1, labelEnd);
|
---|
130 |
|
---|
131 | item = item.substring(labelEnd+1, item.length());
|
---|
132 |
|
---|
133 | // eliminate any weird whitespace
|
---|
134 | item.trim();
|
---|
135 |
|
---|
136 | // cope with a solo 'item' label with no following string
|
---|
137 | if (item.length() == 0) {
|
---|
138 | entryNo ++;
|
---|
139 | continue;
|
---|
140 | }
|
---|
141 | }
|
---|
142 | // starts with a bracketed label
|
---|
143 | }
|
---|
144 | else if (entryNo < this.labels.size()) {
|
---|
145 | label = (String) this.labels.get(entryNo);
|
---|
146 | }
|
---|
147 |
|
---|
148 | // Actually post the metadata -
|
---|
149 | // it may be good to have cached all the documents that we're going to change
|
---|
150 | // in order to minimise rewrites...
|
---|
151 | if (label != null) {
|
---|
152 | Iterator docIterator = documents.iterator();
|
---|
153 | while (docIterator.hasNext()) {
|
---|
154 | DocumentInterface document = (DocumentInterface) docIterator.next();
|
---|
155 |
|
---|
156 | // Post to document
|
---|
157 | // TODO: tailor this to posting documents to *sections* as required...
|
---|
158 | document.addDocumentMetadata(new MetadataLabel(label), item);
|
---|
159 | System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
|
---|
160 | }
|
---|
161 | }
|
---|
162 | entryNo ++;
|
---|
163 | }
|
---|
164 |
|
---|
165 | // write out the modified documents
|
---|
166 | // TODO: nicer/more generalised interface for this and related activity in
|
---|
167 | // extractor manager (actually, enricher manager);
|
---|
168 | Iterator docIterator = documents.iterator();
|
---|
169 | while (docIterator.hasNext()) {
|
---|
170 | DocumentInterface document = (DocumentInterface) docIterator.next();
|
---|
171 |
|
---|
172 | System.out.println("Writing modified document " + document.getID());
|
---|
173 | documentList.storeChangedDocument(document);
|
---|
174 | }
|
---|
175 | }
|
---|
176 | }
|
---|
177 |
|
---|
178 | }
|
---|
179 |
|
---|
180 | private DocumentList documentList;
|
---|
181 |
|
---|
182 | /**
|
---|
183 | * Construct of extractor
|
---|
184 | */
|
---|
185 | public IndexExtractor()
|
---|
186 | { // Intentionally left blank
|
---|
187 | }
|
---|
188 |
|
---|
189 | /**
|
---|
190 | * This extractor doesn't need to do any preparation/completion work,
|
---|
191 | * so this member function is empty.
|
---|
192 | */
|
---|
193 | public void configure(String outputDir)
|
---|
194 | { // Intentionally left blank
|
---|
195 | }
|
---|
196 |
|
---|
197 | public void configure(DocumentList list)
|
---|
198 | { this.documentList = list;
|
---|
199 | }
|
---|
200 |
|
---|
201 | /**
|
---|
202 | * This extractor doesn't need to do any preparation/completion work,
|
---|
203 | * so this member function is empty.
|
---|
204 | */
|
---|
205 | public void startPass(int passNo)
|
---|
206 | { // Intentionally left blank
|
---|
207 | }
|
---|
208 |
|
---|
209 | /**
|
---|
210 | * Process the document - for a GML document, this results in the
|
---|
211 | * decoration of other files, for other documents, it does nothing.
|
---|
212 | */
|
---|
213 | public void extractDocument(DocumentID docID, DocumentInterface document)
|
---|
214 | { if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE))
|
---|
215 | { // Extract the content from the index file
|
---|
216 |
|
---|
217 | // get the file
|
---|
218 | String documentText =
|
---|
219 | DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL());
|
---|
220 |
|
---|
221 | if (documentText == null) {
|
---|
222 | System.err.println("IndexExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString());
|
---|
223 | return;
|
---|
224 | }
|
---|
225 |
|
---|
226 | try {
|
---|
227 | IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList);
|
---|
228 | }
|
---|
229 | catch (IndexHandlerException ex) {
|
---|
230 | }
|
---|
231 |
|
---|
232 | // for each document post it to the corresponding document
|
---|
233 | }
|
---|
234 | }
|
---|
235 |
|
---|
236 | protected static void postMetadata(String file, String value, String label)
|
---|
237 | {
|
---|
238 | }
|
---|
239 |
|
---|
240 | /**
|
---|
241 | * This extractor doesn't need to do any preparation/completion work,
|
---|
242 | * so this member function is empty.
|
---|
243 | */
|
---|
244 | public void endPass(int passNo)
|
---|
245 | { // Intentionally left blank
|
---|
246 | }
|
---|
247 |
|
---|
248 | /**
|
---|
249 | * This extractor is a simple, single-pass extractor
|
---|
250 | *
|
---|
251 | * @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
|
---|
252 | */
|
---|
253 | public int getNumberOfPasses()
|
---|
254 | { return 1;
|
---|
255 | }
|
---|
256 | }
|
---|