source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGPPIndexer.java@ 6349

Last change on this file since 6349 was 6349, checked in by cs025, 20 years ago

Modified indexerinterface to allow easier configuration, improved MG section handling.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
4import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
5
6import java.io.InputStream;
7import java.io.OutputStream;
8import java.io.IOException;
9
10public class MGPPIndexer implements IndexerInterface
11{
12 int pass;
13 boolean firstDocument;
14 String outputDirectory;
15 String outputStem;
16 InputStream indexerFeedback;
17 InputStream indexerErrors;
18 OutputStream indexerTextfeed;
19 Process mgpp_passes;
20 static final String documentSeparator = "<Document>";
21
22 public MGPPIndexer()
23 {
24 }
25
26 /**
27 * The output directory should be (collection)/building/text/ for
28 * normal Greenstone builds
29 */
30 public boolean configure(String label, String value)
31 {
32 if (label.equals("outputDir")) {
33 this.outputDirectory = value;
34 this.outputStem = value + "/index";
35 }
36 this.pass = 0;
37 return true;
38 }
39
40 public boolean addIndex(String level, String field)
41 { return true;
42 }
43
44 /**
45 * Index a single document; the document interface can be used to extract individual
46 * metadata items etc. as required or desired and index those instead or as well as
47 * the body text of the document.
48 */
49 public boolean indexDocument(DocumentID docID, DocumentInterface document)
50 {
51 if (!this.firstDocument)
52 { // Send a '<document>' before the document itself
53 try {
54 this.indexerTextfeed.write(documentSeparator.getBytes(), 0, documentSeparator.getBytes().length);
55 }
56 catch (IOException ex)
57 { System.out.println("Bad output on end of document" + ex);
58 ex.printStackTrace();
59 return false;
60 }
61 }
62 String docText = document.getDocumentText();
63
64 byte [] bytes = docText.getBytes();
65 int pos = 0, end = bytes.length;
66
67 try {
68 while (pos < end) {
69 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
70 pos = pos + 512;
71
72 try {
73 while (this.indexerFeedback.available() > 0)
74 { byte b[] = new byte[this.indexerFeedback.available()];
75 System.out.println("Feedback of " + this.indexerFeedback.available());
76 this.indexerFeedback.read(b);
77 System.out.println(b);
78 }
79 }
80 catch (IOException ex)
81 {
82 }
83
84
85 try {
86 while (this.indexerErrors.available() > 0)
87 { byte b[] = new byte[this.indexerErrors.available()];
88 System.out.println("Feedback of " + this.indexerErrors.available());
89 this.indexerErrors.read(b);
90 System.out.println(new String(b));
91 }
92 }
93 catch (IOException ex)
94 {
95 }
96 }
97 }
98 catch (IOException ex)
99 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
100 ex.printStackTrace();
101 return false;
102 }
103 this.firstDocument = false;
104
105 try {
106 while (this.indexerErrors.available() > 0)
107 { char c = (char) this.indexerErrors.read();
108 System.out.println(c);
109 }
110 while (this.indexerFeedback.available() > 0)
111 { byte b[] = new byte[this.indexerFeedback.available()];
112 System.out.println("Feedback of " + this.indexerFeedback.available());
113 this.indexerFeedback.read(b);
114 }
115 }
116 catch (IOException ex)
117 {
118 }
119 return true;
120 }
121
122 /**
123 * Initialise the pass: open required files, check status
124 */
125 public boolean startPass(int passNumber)
126 { this.pass = passNumber;
127 this.firstDocument = true;
128
129 try {
130 switch (this.pass) {
131 case 0:
132 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem + " -T1");
133 break;
134
135 case 1:
136 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -T2");
137 break;
138
139 case 2:
140 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I1");
141 break;
142
143 case 3:
144 Process p = Runtime.getRuntime().exec("mgpp_perf_hash_build -f " + this.outputStem);
145 p.waitFor();
146
147 mgpp_passes = Runtime.getRuntime().exec("mgpp_passes -f " + this.outputStem +" -I2");
148 break;
149 }
150
151 this.indexerFeedback = mgpp_passes.getInputStream();
152 this.indexerErrors = mgpp_passes.getErrorStream();
153 this.indexerTextfeed = mgpp_passes.getOutputStream();
154 }
155 catch (IOException ex)
156 { System.out.println(ex);
157 ex.printStackTrace();
158 return false;
159 }
160 catch (InterruptedException ex)
161 { System.out.println(ex);
162 ex.printStackTrace();
163 return false;
164 }
165 System.out.println("Pass " + this.pass);
166 return true;
167 }
168
169 /**
170 * Complete a pass - reset file counters, close files, etc.
171 */
172 public boolean endPass(int passNumber)
173 { // TODO: end pass
174 Process p;
175
176 try {
177 this.indexerTextfeed.write((char) 2);
178 this.indexerTextfeed.write(4);
179 while (this.indexerErrors.available() > 0)
180 { char c = (char) this.indexerErrors.read();
181 System.out.print(c);
182 }
183 while (this.indexerFeedback.available() > 0)
184 { byte b[] = new byte[this.indexerFeedback.available()];
185 System.out.print("Feedback of " + this.indexerFeedback.available());
186 this.indexerFeedback.read(b);
187 }
188
189 this.indexerTextfeed.close();
190 Thread.sleep(1000);
191 this.mgpp_passes.waitFor();
192 }
193 catch (IOException ex)
194 { System.out.println(ex);
195 }
196 catch (InterruptedException ex)
197 { System.out.println(ex);
198 }
199 System.out.println("Completed with " + this.mgpp_passes.exitValue());
200
201 try {
202 switch (this.pass)
203 {
204 case 0:
205 System.out.println("Compressing dictionary");
206 p = Runtime.getRuntime().exec("mgpp_compression_dict -f " + this.outputStem + " -S -H -2 -k 5120");
207 p.waitFor();
208 System.out.println(p.exitValue());
209 break;
210
211 case 3:
212 p = Runtime.getRuntime().exec("mgpp_weights_build -f " + this.outputStem);
213 p.waitFor();
214 System.out.println(p.exitValue());
215
216 p = Runtime.getRuntime().exec("mgpp_invf_dict -f " + this.outputStem);
217 p.waitFor();
218 System.out.println(p.exitValue());
219
220 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s1 -f " + this.outputStem + " -d " + this.outputDirectory);
221 p.waitFor();
222 System.out.println(p.exitValue());
223 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s2 -f " + this.outputStem + " -d " + this.outputDirectory);
224 p.waitFor();
225 System.out.println(p.exitValue());
226 p = Runtime.getRuntime().exec("mgpp_stem_idx -b 4096 -s3 -f " + this.outputStem + " -d " + this.outputDirectory);
227 p.waitFor();
228 System.out.println(p.exitValue());
229 break;
230 }
231 }
232 catch (IOException ex)
233 { System.out.println(ex);
234 ex.printStackTrace();
235 return false;
236 }
237 catch (InterruptedException ex)
238 { System.out.println(ex);
239 ex.printStackTrace();
240 return false;
241 }
242 return true;
243 }
244
245 /**
246 * Do any tidying up
247 */
248 public void tidyup()
249 {
250 }
251
252 /**
253 * Return the number of passes required for this index.
254 */
255 public int getNumberOfPasses()
256 { return 4;
257 }
258}
Note: See TracBrowser for help on using the repository browser.