source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 5952

Last change on this file since 5952 was 5800, checked in by cs025, 21 years ago

Adding gs3build

  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
4import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
5
6import java.io.File;
7import java.io.InputStream;
8import java.io.OutputStream;
9import java.io.IOException;
10
11public class MGIndexer implements IndexerInterface
12{
13 int pass;
14 int documentSeqNo;
15 boolean firstDocument;
16 String outputDirectory;
17 String outputStem;
18 InputStream indexerFeedback;
19 InputStream indexerErrors;
20 OutputStream indexerTextfeed;
21 Process mg_passes;
22 File textDirectory;
23 File dtxDirectory;
24 String textStem;
25
26 public MGIndexer()
27 {
28 }
29
30 /**
31 * The output directory should be (collection)/building/text/ for
32 * normal Greenstone builds
33 */
34 public boolean configure(String outputDirectory)
35 { this.outputDirectory = outputDirectory;
36 this.outputStem = outputDirectory + "/dtx/index"; // TODO: modify for index
37 this.textStem = outputDirectory + "/text/index";
38 this.pass = 0;
39
40 // attempt to ensure that the text subdirectory exists
41 this.textDirectory = new File(outputDirectory, "text");
42 if (!textDirectory.exists()) {
43 if (!textDirectory.mkdir()) {
44 return false;
45 }
46 }
47 else if (!textDirectory.isDirectory()) {
48 return false;
49 }
50
51 // attempt to ensure that the text subdirectory exists
52 this.dtxDirectory = new File(outputDirectory, "dtx");
53 if (!dtxDirectory.exists()) {
54 if (!dtxDirectory.mkdir()) {
55 return false;
56 }
57 }
58 else if (!dtxDirectory.isDirectory()) {
59 return false;
60 }
61
62 System.out.println("Output MG directory is " + this.textStem);
63 return true;
64 }
65
66 /**
67 * Index a single document; the document interface can be used to extract individual
68 * metadata items etc. as required or desired and index those instead or as well as
69 * the body text of the document.
70 */
71 public boolean indexDocument(DocumentID docID, DocumentInterface document)
72 {
73 if (!this.firstDocument)
74 { // Send a 'CTRL-B' before the document itself
75 try {
76 this.indexerTextfeed.write(2);
77 }
78 catch (IOException ex)
79 { System.out.println("Bad output on end of document" + ex);
80 ex.printStackTrace();
81 return false;
82 }
83 }
84 String docText = document.getDocumentText();
85
86 byte [] bytes = docText.getBytes();
87 int pos = 0, end = bytes.length;
88
89 try {
90 while (pos < end) {
91 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
92 pos = pos + 512;
93
94 try {
95 while (this.indexerFeedback.available() > 0)
96 { byte b[] = new byte[this.indexerFeedback.available()];
97 System.out.println("Feedback of " + this.indexerFeedback.available());
98 this.indexerFeedback.read(b);
99 System.out.println(b);
100 }
101 }
102 catch (IOException ex)
103 {
104 }
105
106
107 try {
108 while (this.indexerErrors.available() > 0)
109 { byte b[] = new byte[this.indexerErrors.available()];
110 System.out.println("Feedback of " + this.indexerErrors.available());
111 this.indexerErrors.read(b);
112 System.out.println(new String(b));
113 }
114 }
115 catch (IOException ex)
116 {
117 }
118 }
119 }
120 catch (IOException ex)
121 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
122 ex.printStackTrace();
123 return false;
124 }
125
126 // remember that we're not on the first document, assign the sequence number
127 // on the first pass only, and increment the sequence number.
128 this.firstDocument = false;
129 if (this.pass == 0) {
130 document.addDocumentMetadata("gsdl3", "mgseqno", Integer.toString(this.documentSeqNo));
131 }
132 this.documentSeqNo += 1;
133
134 try {
135 while (this.indexerErrors.available() > 0)
136 { char c = (char) this.indexerErrors.read();
137 System.out.println(c);
138 }
139 while (this.indexerFeedback.available() > 0)
140 { byte b[] = new byte[this.indexerFeedback.available()];
141 System.out.println("Feedback of " + this.indexerFeedback.available());
142 this.indexerFeedback.read(b);
143 }
144 }
145 catch (IOException ex)
146 {
147 }
148 return true;
149 }
150
151 /**
152 * Initialise the pass: open required files, check status
153 */
154 public boolean startPass(int passNumber)
155 {
156 this.pass = passNumber;
157 this.firstDocument = true;
158 this.documentSeqNo = 0;
159
160 String pathParams = "-f index -d " + (pass < 2 ? this.textDirectory.toString() : this.dtxDirectory.toString());
161
162 try {
163 switch (this.pass) {
164 case 0:
165 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
166 break;
167
168 case 1:
169 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2 -M 10");
170 break;
171
172 case 2:
173 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
174 break;
175
176 case 3:
177 Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.dtxDirectory.toString());
178 p.waitFor();
179 System.out.println(p.exitValue());
180
181 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
182 break;
183 }
184
185 this.indexerFeedback = mg_passes.getInputStream();
186 this.indexerErrors = mg_passes.getErrorStream();
187 this.indexerTextfeed = mg_passes.getOutputStream();
188 }
189 catch (IOException ex)
190 { System.out.println(ex);
191 ex.printStackTrace();
192 return false;
193 }
194 catch (InterruptedException ex)
195 { System.out.println(ex);
196 ex.printStackTrace();
197 return false;
198 }
199 System.out.println("Pass " + this.pass);
200 return true;
201 }
202
203 /**
204 * Complete a pass - reset file counters, close files, etc.
205 */
206 public boolean endPass(int passNumber)
207 { Process p;
208
209 try {
210 this.indexerTextfeed.write((char) 2);
211 this.indexerTextfeed.write(4);
212 while (this.indexerErrors.available() > 0)
213 { char c = (char) this.indexerErrors.read();
214 System.out.print(c);
215 }
216 while (this.indexerFeedback.available() > 0)
217 { byte b[] = new byte[this.indexerFeedback.available()];
218 System.out.print("Feedback of " + this.indexerFeedback.available());
219 this.indexerFeedback.read(b);
220 }
221
222 this.indexerTextfeed.close();
223 Thread.sleep(1000);
224 this.mg_passes.waitFor();
225 }
226 catch (IOException ex)
227 { System.out.println(ex);
228 }
229 catch (InterruptedException ex)
230 { System.out.println(ex);
231 }
232 System.out.println("Completed with " + this.mg_passes.exitValue());
233
234 try {
235 switch (this.pass)
236 {
237 case 0:
238 System.out.println("Compressing dictionary");
239 p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
240 p.waitFor();
241 if (p.exitValue() != 0) {
242 System.out.println("Error from mg_compression_dict: " + p.exitValue());
243 }
244 break;
245
246 case 3:
247 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.outputStem + " -t " + this.textStem + " -d /");
248 p.waitFor();
249 System.out.println(p.exitValue());
250
251 p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.dtxDirectory.toString());
252 p.waitFor();
253 System.out.println(p.exitValue());
254
255 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.dtxDirectory.toString());
256 p.waitFor();
257 System.out.println(p.exitValue());
258 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.dtxDirectory.toString());
259 p.waitFor();
260 System.out.println(p.exitValue());
261 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.dtxDirectory.toString());
262 p.waitFor();
263 System.out.println(p.exitValue());
264 break;
265 }
266 }
267 catch (IOException ex)
268 { System.out.println(ex);
269 ex.printStackTrace();
270 return false;
271 }
272 catch (InterruptedException ex)
273 { System.out.println(ex);
274 ex.printStackTrace();
275 return false;
276 }
277 return true;
278 }
279
280 /**
281 * Do any tidying up
282 */
283 public void tidyup()
284 {
285 }
286
287 /**
288 * Return the number of passes required for this index.
289 */
290 public int getNumberOfPasses()
291 { return 4;
292 }
293}
294
295
296
Note: See TracBrowser for help on using the repository browser.