source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/IndexExtractor.java@ 5947

Last change on this file since 5947 was 5946, checked in by cs025, 21 years ago

Extensions and new IndexExtractor

  • Property svn:keywords set to Author Date Id Revision
File size: 4.1 KB
Line 
1package org.greenstone.gsdl3.gs3build.extractor;
2
3import java.io.FileReader;
4
5import java.util.List;
6import java.util.ArrayList;
7
8import org.xml.sax.XMLReader;
9import org.xml.sax.InputSource;
10import org.xml.sax.SAXException;
11import org.xml.sax.Attributes;
12import org.xml.sax.helpers.XMLReaderFactory;
13import org.xml.sax.helpers.DefaultHandler;
14
15import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
16import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
17import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
19
20public class IndexExtractor implements ExtractorInterface
21{
22 class IndexHandlerException extends Exception
23 { public IndexHandlerException(String value)
24 { super(value);
25 }
26 }
27
28 /**
29 * An inner class to handle GML files
30 */
31 class IndexHandler
32 { String content;
33 String line;
34 int pos;
35 boolean doneRow;
36 List labels;
37
38 IndexHandler(String content) throws IndexHandlerException
39 { this.content = content;
40 this.doneRow = false;
41 this.labels = new ArrayList();
42
43 // get the first line
44 this.getLine();
45
46 if (!this.hasMore())
47 { throw new IndexHandlerException("No title line");
48 }
49
50 // get the first totem - it should be blank
51
52 }
53
54 private boolean hasMore()
55 { return this.line != null;
56 }
57
58 private boolean hasMoreLines()
59 { return this.content != null;
60 }
61
62 private String getEntry()
63 { int tab = this.line.indexOf('\t');
64 String reply;
65
66 if (tab < 0) {
67 reply = this.line;
68 this.line = null;
69 }
70 else {
71 reply = this.line.substring(0, tab);
72 this.line = this.line.substring(tab+1);
73 }
74
75 return reply;
76 }
77
78 private String getLine()
79 { do {
80 int eol = this.content.indexOf('\n');
81 if (eol < 0) {
82 this.line = this.content;
83 this.content = null;
84 }
85 else {
86 this.line = this.content.substring(0, eol);
87 this.content = this.content.substring(eol+1);
88 while (this.content.length() > 0 &&
89 this.content.charAt(0) < ' ')
90 { this.content = this.content.substring(1);
91 }
92 }
93
94 if (this.line != null) {
95 this.line.trim();
96 }
97 } while (this.line != null && this.line.length() == 0);
98 return this.line;
99 }
100 }
101
102 /**
103 * Construct of extractor
104 */
105 public IndexExtractor()
106 { // Intentionally left blank
107 }
108
109 /**
110 * This extractor doesn't need to do any preparation/completion work,
111 * so this member function is empty.
112 */
113 public void configure(String outputDir)
114 { // Intentionally left blank
115 }
116
117 /**
118 * This extractor doesn't need to do any preparation/completion work,
119 * so this member function is empty.
120 */
121 public void startPass(int passNo)
122 { // Intentionally left blank
123 }
124
125 /**
126 * Process the document - for a GML document, this results in the
127 * decoration of other files, for other documents, it does nothing.
128 */
129 public void extractDocument(DocumentID docID, DocumentInterface document)
130 { if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE))
131 { // Extract the content from the index file
132
133 // get the file
134 String documentText = null;
135 // String documentText =
136 // DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).toString());
137
138 if (documentText == null) {
139 System.err.println("IndexExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString());
140 return;
141 }
142
143 try {
144 IndexHandler handler = new IndexHandler(documentText);
145 }
146 catch (IndexHandlerException ex) {
147 }
148
149 // for each document post it to the corresponding document
150 }
151 }
152
153 protected static void postMetadata(String file, String value, String label)
154 {
155 }
156
157 /**
158 * This extractor doesn't need to do any preparation/completion work,
159 * so this member function is empty.
160 */
161 public void endPass(int passNo)
162 { // Intentionally left blank
163 }
164
165 /**
166 * This extractor is a simple, single-pass extractor
167 *
168 * @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
169 */
170 public int getNumberOfPasses()
171 { return 1;
172 }
173}
Note: See TracBrowser for help on using the repository browser.