Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java@ 13242

Last change on this file since 13242 was 12188, checked in by kjdon, 18 years ago
Initial revision
Property svn:keywords set to `Author Date Id Revision`
File size: 7.0 KB

Line
1	package org.greenstone.gsdl3.gs3build.extractor;
2
3	import java.io.FileReader;
4
5	import java.net.URL;
6
7	import java.util.List;
8	import java.util.ArrayList;
9	import java.util.Iterator;
10	import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
11
12	import org.xml.sax.XMLReader;
13	import org.xml.sax.InputSource;
14	import org.xml.sax.SAXException;
15	import org.xml.sax.Attributes;
16	import org.xml.sax.helpers.XMLReaderFactory;
17	import org.xml.sax.helpers.DefaultHandler;
18
19	import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
20	import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
21	import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
22	import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
23	import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
24
25	import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
26
27	public class IndexExtractor implements ExtractorInterface
28	{
29	class IndexHandlerException extends Exception
30	{ public IndexHandlerException(String value)
31	{ super(value);
32	}
33	}
34
35	/**
36	* An inner class to handle GML files
37	*/
38	class IndexHandler extends GS2TextFileHandler
39	{ List labels;
40	URL base;
41
42	IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException
43	{
44	super(content);
45
46	this.labels = new ArrayList();
47	this.base = url;
48
49	String parentDir;
50	int leaf = this.base.toString().lastIndexOf('/');
51	if (leaf >= 0) {
52	parentDir = this.base.toString().substring(0, leaf+1);
53	}
54	else {
55	parentDir = this.base.toString();
56	}
57
58	// get the first line
59	this.getLine();
60
61	if (!this.hasMore())
62	{ throw new IndexHandlerException("No title line");
63	}
64
65	// get the first totem - it should be "key:"
66	String entry = this.getEntry(true);
67
68	// now get all the labels
69	while (this.hasMore())
70	{ String label = this.getEntry(true);
71	if (label == null \|\| label.length() == 0) {
72	continue;
73	}
74
75	this.labels.add(label);
76	System.out.println("Adding label: " + label);
77	}
78
79	while (this.hasMoreLines()) {
80	this.getLine();
81
82	// Get the file pattern itself
83	String filePattern = this.getEntry(true);
84	if (filePattern == null \|\| filePattern.length() == 0) {
85	continue;
86	}
87
88	// get a list of documents that match the file pattern
89	List documentIds = documentList.findDocumentIdsUsingFile(filePattern);
90	if (documentIds != null) {
91	Iterator iterator = documentIds.iterator();
92	while (iterator.hasNext()) {
93	System.out.println("Matches file " + iterator.next().toString());
94	}
95	}
96
97	// if no files match this data, then skip this row
98	// TODO: raise a quality error message
99	if (documentIds == null \|\| documentIds.size() == 0) {
100	continue;
101	}
102
103	// cache up the documents that match for speed improvements...
104	List documents = new ArrayList();
105	Iterator idIterator = documentIds.iterator();
106	while (idIterator.hasNext()) {
107	String docIdString = idIterator.next().toString();
108	System.out.println(docIdString);
109	DocumentID docId = new DocumentID(docIdString);
110	DocumentInterface document = documentList.getDocument(docId);
111	if (document != null) {
112	documents.add(document);
113	}
114	}
115
116	// Next, split the row into the separate metadata items
117	int entryNo = 0;
118	while (this.hasMore()) {
119	String item = this.getEntry(true);
120	if (item == null \|\| item.length() == 0) {
121	entryNo ++;
122	continue;
123	}
124
125	String label = null;
126	if (item.startsWith("<")) {
127	int labelEnd = item.indexOf('>');
128	if (labelEnd >= 0) {
129	label = item.substring(1, labelEnd);
130
131	item = item.substring(labelEnd+1, item.length());
132
133	// eliminate any weird whitespace
134	item.trim();
135
136	// cope with a solo 'item' label with no following string
137	if (item.length() == 0) {
138	entryNo ++;
139	continue;
140	}
141	}
142	// starts with a bracketed label
143	}
144	else if (entryNo < this.labels.size()) {
145	label = (String) this.labels.get(entryNo);
146	}
147
148	// Actually post the metadata -
149	// it may be good to have cached all the documents that we're going to change
150	// in order to minimise rewrites...
151	if (label != null) {
152	Iterator docIterator = documents.iterator();
153	while (docIterator.hasNext()) {
154	DocumentInterface document = (DocumentInterface) docIterator.next();
155
156	// Post to document
157	// TODO: tailor this to posting documents to sections as required...
158	document.addDocumentMetadata(new MetadataLabel(label), item);
159	System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
160	}
161	}
162	entryNo ++;
163	}
164
165	// write out the modified documents
166	// TODO: nicer/more generalised interface for this and related activity in
167	// extractor manager (actually, enricher manager);
168	Iterator docIterator = documents.iterator();
169	while (docIterator.hasNext()) {
170	DocumentInterface document = (DocumentInterface) docIterator.next();
171
172	System.out.println("Writing modified document " + document.getID());
173	documentList.storeChangedDocument(document);
174	}
175	}
176	}
177
178	}
179
180	private DocumentList documentList;
181
182	/**
183	* Construct of extractor
184	*/
185	public IndexExtractor()
186	{ // Intentionally left blank
187	}
188
189	/**
190	* This extractor doesn't need to do any preparation/completion work,
191	* so this member function is empty.
192	*/
193	public void configure(String outputDir)
194	{ // Intentionally left blank
195	}
196
197	public void configure(DocumentList list)
198	{ this.documentList = list;
199	}
200
201	/**
202	* This extractor doesn't need to do any preparation/completion work,
203	* so this member function is empty.
204	*/
205	public void startPass(int passNo)
206	{ // Intentionally left blank
207	}
208
209	/**
210	* Process the document - for a GML document, this results in the
211	* decoration of other files, for other documents, it does nothing.
212	*/
213	public void extractDocument(DocumentID docID, DocumentInterface document)
214	{ if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE))
215	{ // Extract the content from the index file
216
217	// get the file
218	String documentText =
219	DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL());
220
221	if (documentText == null) {
222	System.err.println("IndexExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString());
223	return;
224	}
225
226	try {
227	IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList);
228	}
229	catch (IndexHandlerException ex) {
230	}
231
232	// for each document post it to the corresponding document
233	}
234	}
235
236	protected static void postMetadata(String file, String value, String label)
237	{
238	}
239
240	/**
241	* This extractor doesn't need to do any preparation/completion work,
242	* so this member function is empty.
243	*/
244	public void endPass(int passNo)
245	{ // Intentionally left blank
246	}
247
248	/**
249	* This extractor is a simple, single-pass extractor
250	*
251	* @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
252	*/
253	public int getNumberOfPasses()
254	{ return 1;
255	}
256	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: