1 | package org.greenstone.gsdl3.gs3build.extractor;
|
---|
2 |
|
---|
3 | import java.io.FileReader;
|
---|
4 |
|
---|
5 | import java.util.List;
|
---|
6 | import java.util.ArrayList;
|
---|
7 |
|
---|
8 | import org.xml.sax.XMLReader;
|
---|
9 | import org.xml.sax.InputSource;
|
---|
10 | import org.xml.sax.SAXException;
|
---|
11 | import org.xml.sax.Attributes;
|
---|
12 | import org.xml.sax.helpers.XMLReaderFactory;
|
---|
13 | import org.xml.sax.helpers.DefaultHandler;
|
---|
14 |
|
---|
15 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
|
---|
16 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
|
---|
17 | import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
|
---|
18 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
|
---|
19 |
|
---|
20 | public class IndexExtractor implements ExtractorInterface
|
---|
21 | {
|
---|
22 | class IndexHandlerException extends Exception
|
---|
23 | { public IndexHandlerException(String value)
|
---|
24 | { super(value);
|
---|
25 | }
|
---|
26 | }
|
---|
27 |
|
---|
28 | /**
|
---|
29 | * An inner class to handle GML files
|
---|
30 | */
|
---|
31 | class IndexHandler
|
---|
32 | { String content;
|
---|
33 | String line;
|
---|
34 | int pos;
|
---|
35 | boolean doneRow;
|
---|
36 | List labels;
|
---|
37 |
|
---|
38 | IndexHandler(String content) throws IndexHandlerException
|
---|
39 | { this.content = content;
|
---|
40 | this.doneRow = false;
|
---|
41 | this.labels = new ArrayList();
|
---|
42 |
|
---|
43 | // get the first line
|
---|
44 | this.getLine();
|
---|
45 |
|
---|
46 | if (!this.hasMore())
|
---|
47 | { throw new IndexHandlerException("No title line");
|
---|
48 | }
|
---|
49 |
|
---|
50 | // get the first totem - it should be blank
|
---|
51 |
|
---|
52 | }
|
---|
53 |
|
---|
54 | private boolean hasMore()
|
---|
55 | { return this.line != null;
|
---|
56 | }
|
---|
57 |
|
---|
58 | private boolean hasMoreLines()
|
---|
59 | { return this.content != null;
|
---|
60 | }
|
---|
61 |
|
---|
62 | private String getEntry()
|
---|
63 | { int tab = this.line.indexOf('\t');
|
---|
64 | String reply;
|
---|
65 |
|
---|
66 | if (tab < 0) {
|
---|
67 | reply = this.line;
|
---|
68 | this.line = null;
|
---|
69 | }
|
---|
70 | else {
|
---|
71 | reply = this.line.substring(0, tab);
|
---|
72 | this.line = this.line.substring(tab+1);
|
---|
73 | }
|
---|
74 |
|
---|
75 | return reply;
|
---|
76 | }
|
---|
77 |
|
---|
78 | private String getLine()
|
---|
79 | { do {
|
---|
80 | int eol = this.content.indexOf('\n');
|
---|
81 | if (eol < 0) {
|
---|
82 | this.line = this.content;
|
---|
83 | this.content = null;
|
---|
84 | }
|
---|
85 | else {
|
---|
86 | this.line = this.content.substring(0, eol);
|
---|
87 | this.content = this.content.substring(eol+1);
|
---|
88 | while (this.content.length() > 0 &&
|
---|
89 | this.content.charAt(0) < ' ')
|
---|
90 | { this.content = this.content.substring(1);
|
---|
91 | }
|
---|
92 | }
|
---|
93 |
|
---|
94 | if (this.line != null) {
|
---|
95 | this.line.trim();
|
---|
96 | }
|
---|
97 | } while (this.line != null && this.line.length() == 0);
|
---|
98 | return this.line;
|
---|
99 | }
|
---|
100 | }
|
---|
101 |
|
---|
102 | /**
|
---|
103 | * Construct of extractor
|
---|
104 | */
|
---|
105 | public IndexExtractor()
|
---|
106 | { // Intentionally left blank
|
---|
107 | }
|
---|
108 |
|
---|
109 | /**
|
---|
110 | * This extractor doesn't need to do any preparation/completion work,
|
---|
111 | * so this member function is empty.
|
---|
112 | */
|
---|
113 | public void configure(String outputDir)
|
---|
114 | { // Intentionally left blank
|
---|
115 | }
|
---|
116 |
|
---|
117 | /**
|
---|
118 | * This extractor doesn't need to do any preparation/completion work,
|
---|
119 | * so this member function is empty.
|
---|
120 | */
|
---|
121 | public void startPass(int passNo)
|
---|
122 | { // Intentionally left blank
|
---|
123 | }
|
---|
124 |
|
---|
125 | /**
|
---|
126 | * Process the document - for a GML document, this results in the
|
---|
127 | * decoration of other files, for other documents, it does nothing.
|
---|
128 | */
|
---|
129 | public void extractDocument(DocumentID docID, DocumentInterface document)
|
---|
130 | { if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE))
|
---|
131 | { // Extract the content from the index file
|
---|
132 |
|
---|
133 | // get the file
|
---|
134 | String documentText = null;
|
---|
135 | // String documentText =
|
---|
136 | // DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).toString());
|
---|
137 |
|
---|
138 | if (documentText == null) {
|
---|
139 | System.err.println("IndexExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString());
|
---|
140 | return;
|
---|
141 | }
|
---|
142 |
|
---|
143 | try {
|
---|
144 | IndexHandler handler = new IndexHandler(documentText);
|
---|
145 | }
|
---|
146 | catch (IndexHandlerException ex) {
|
---|
147 | }
|
---|
148 |
|
---|
149 | // for each document post it to the corresponding document
|
---|
150 | }
|
---|
151 | }
|
---|
152 |
|
---|
153 | protected static void postMetadata(String file, String value, String label)
|
---|
154 | {
|
---|
155 | }
|
---|
156 |
|
---|
157 | /**
|
---|
158 | * This extractor doesn't need to do any preparation/completion work,
|
---|
159 | * so this member function is empty.
|
---|
160 | */
|
---|
161 | public void endPass(int passNo)
|
---|
162 | { // Intentionally left blank
|
---|
163 | }
|
---|
164 |
|
---|
165 | /**
|
---|
166 | * This extractor is a simple, single-pass extractor
|
---|
167 | *
|
---|
168 | * @see: org.greenstone.gsdl3.gs3build.extractor.ExtractorInterface:getNumberOfPasses
|
---|
169 | */
|
---|
170 | public int getNumberOfPasses()
|
---|
171 | { return 1;
|
---|
172 | }
|
---|
173 | }
|
---|