source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGPPIndexer.java@ 6283

Last change on this file since 6283 was 6283, checked in by cs025, 20 years ago

Changes to indexer interface, improving configuration options. Also,
added section support to MGIndexer.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
4import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
5
6import java.io.InputStream;
7import java.io.OutputStream;
8import java.io.IOException;
9
10public class MGPPIndexer implements IndexerInterface
11{
12 int pass;
13 boolean firstDocument;
14 String outputDirectory;
15 String outputStem;
16 InputStream indexerFeedback;
17 InputStream indexerErrors;
18 OutputStream indexerTextfeed;
19 Process mgpp_passes;
20 static final String documentSeparator = "<Document>";
21
22 public MGPPIndexer()
23 {
24 }
25
26 /**
27 * The output directory should be (collection)/building/text/ for
28 * normal Greenstone builds
29 */
30 public boolean configure(String label, String value)
31 {
32 if (label.equals("outputDir")) {
33 this.outputDirectory = value;
34 this.outputStem = value + "/index";
35 }
36 this.pass = 0;
37 return true;
38 }
39
40 /**
41 * Index a single document; the document interface can be used to extract individual
42 * metadata items etc. as required or desired and index those instead or as well as
43 * the body text of the document.
44 */
45 public boolean indexDocument(DocumentID docID, DocumentInterface document)
46 {
47 if (!this.firstDocument)
48 { // Send a '<document>' before the document itself
49 try {
50 this.indexerTextfeed.write(documentSeparator.getBytes(), 0, documentSeparator.getBytes().length);
51 }
52 catch (IOException ex)
53 { System.out.println("Bad output on end of document" + ex);
54 ex.printStackTrace();
55 return false;
56 }
57 }
58 String docText = document.getDocumentText();
59
60 byte [] bytes = docText.getBytes();
61 int pos = 0, end = bytes.length;
62
63 try {
64 while (pos < end) {
65 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
66 pos = pos + 512;
67
68 try {
69 while (this.indexerFeedback.available() > 0)
70 { byte b[] = new byte[this.indexerFeedback.available()];
71 System.out.println("Feedback of " + this.indexerFeedback.available());
72 this.indexerFeedback.read(b);
73 System.out.println(b);
74 }
75 }
76 catch (IOException ex)
77 {
78 }
79
80
81 try {
82 while (this.indexerErrors.available() > 0)
83 { byte b[] = new byte[this.indexerErrors.available()];
84 System.out.println("Feedback of " + this.indexerErrors.available());
85 this.indexerErrors.read(b);
86 System.out.println(new String(b));
87 }
88 }
89 catch (IOException ex)
90 {
91 }
92 }
93 }
94 catch (IOException ex)
95 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
96 ex.printStackTrace();
97 return false;
98 }
99 this.firstDocument = false;
100
101 try {
102 while (this.indexerErrors.available() > 0)
103 { char c = (char) this.indexerErrors.read();
104 System.out.println(c);
105 }
106 while (this.indexerFeedback.available() > 0)
107 { byte b[] = new byte[this.indexerFeedback.available()];
108 System.out.println("Feedback of " + this.indexerFeedback.available());
109 this.indexerFeedback.read(b);
110 }
111 }
112 catch (IOException ex)
113 {
114 }
115 return true;
116 }
117
118 /**
119 * Initialise the pass: open required files, check status
120 */
121 public boolean startPass(int passNumber)
122 { this.pass = passNumber;
123 this.firstDocument = true;
124
125 try {
126 switch (this.pass) {
127 case 0:
128 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem + " -T1");
129 break;
130
131 case 1:
132 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -T2");
133 break;
134
135 case 2:
136 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I1");
137 break;
138
139 case 3:
140 Process p = Runtime.getRuntime().exec("mgpp_perf_hash_build -f " + this.outputStem);
141 p.waitFor();
142
143 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I2");
144 break;
145 }
146
147 this.indexerFeedback = mgpp_passes.getInputStream();
148 this.indexerErrors = mgpp_passes.getErrorStream();
149 this.indexerTextfeed = mgpp_passes.getOutputStream();
150 }
151 catch (IOException ex)
152 { System.out.println(ex);
153 ex.printStackTrace();
154 return false;
155 }
156 catch (InterruptedException ex)
157 { System.out.println(ex);
158 ex.printStackTrace();
159 return false;
160 }
161 System.out.println("Pass " + this.pass);
162 return true;
163 }
164
165 /**
166 * Complete a pass - reset file counters, close files, etc.
167 */
168 public boolean endPass(int passNumber)
169 { // TODO: end pass
170 Process p;
171
172 try {
173 this.indexerTextfeed.write((char) 2);
174 this.indexerTextfeed.write(4);
175 while (this.indexerErrors.available() > 0)
176 { char c = (char) this.indexerErrors.read();
177 System.out.print(c);
178 }
179 while (this.indexerFeedback.available() > 0)
180 { byte b[] = new byte[this.indexerFeedback.available()];
181 System.out.print("Feedback of " + this.indexerFeedback.available());
182 this.indexerFeedback.read(b);
183 }
184
185 this.indexerTextfeed.close();
186 Thread.sleep(1000);
187 this.mgpp_passes.waitFor();
188 }
189 catch (IOException ex)
190 { System.out.println(ex);
191 }
192 catch (InterruptedException ex)
193 { System.out.println(ex);
194 }
195 System.out.println("Completed with " + this.mgpp_passes.exitValue());
196
197 try {
198 switch (this.pass)
199 {
200 case 0:
201 System.out.println("Compressing dictionary");
202 p = Runtime.getRuntime().exec("mgpp_compression_dict -f " + this.outputStem + " -S -H -2 -k 5120");
203 p.waitFor();
204 System.out.println(p.exitValue());
205 break;
206
207 case 3:
208 p = Runtime.getRuntime().exec("mgpp_weights_build -f " + this.outputStem);
209 p.waitFor();
210 System.out.println(p.exitValue());
211
212 p = Runtime.getRuntime().exec("mgpp_invf_dict -f " + this.outputStem);
213 p.waitFor();
214 System.out.println(p.exitValue());
215
216 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s1 -f " + this.outputStem + " -d " + this.outputDirectory);
217 p.waitFor();
218 System.out.println(p.exitValue());
219 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s2 -f " + this.outputStem + " -d " + this.outputDirectory);
220 p.waitFor();
221 System.out.println(p.exitValue());
222 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s3 -f " + this.outputStem + " -d " + this.outputDirectory);
223 p.waitFor();
224 System.out.println(p.exitValue());
225 break;
226 }
227 }
228 catch (IOException ex)
229 { System.out.println(ex);
230 ex.printStackTrace();
231 return false;
232 }
233 catch (InterruptedException ex)
234 { System.out.println(ex);
235 ex.printStackTrace();
236 return false;
237 }
238 return true;
239 }
240
241 /**
242 * Do any tidying up
243 */
244 public void tidyup()
245 {
246 }
247
248 /**
249 * Return the number of passes required for this index.
250 */
251 public int getNumberOfPasses()
252 { return 4;
253 }
254}
Note: See TracBrowser for help on using the repository browser.