source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 6283

Last change on this file since 6283 was 6283, checked in by cs025, 20 years ago

Changes to indexer interface, improving configuration options. Also,
added section support to MGIndexer.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import java.util.List;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import java.io.File;
8import java.io.InputStream;
9import java.io.OutputStream;
10import java.io.IOException;
11
12import org.w3c.dom.*;
13
14import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
15import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
16import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
17import org.greenstone.gsdl3.gs3build.metadata.*;
18import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
19
20public class MGIndexer implements IndexerInterface
21{
22 int pass;
23 int documentSeqNo;
24 int sectionSeqNo;
25 boolean firstDocument;
26 String outputDirectory;
27 InputStream indexerFeedback;
28 InputStream indexerErrors;
29 OutputStream indexerTextfeed;
30 Process mg_passes;
31 File textDirectory;
32 File indexDirectory;
33 String indexStem;
34 String textStem;
35 List indexes;
36 String level;
37
38 public static final String MG_INDEX = "Index";
39
/**
 * A single index specification, parsed from a label of the form
 * "level:field". The text before the first colon is the level and
 * everything after it is the field.
 */
class MGIndex
{
    String level;
    String field;

    /**
     * Split the label at its first colon. A label that contains no colon
     * leaves both the level and the field as null.
     *
     * @param indexLabel the index specification, e.g. "section:Title"
     */
    public MGIndex(String indexLabel)
    {
        int separator = indexLabel.indexOf(':');
        if (separator < 0) {
            // No colon present: nothing is recorded for this label.
            return;
        }
        this.level = indexLabel.substring(0, separator);
        this.field = indexLabel.substring(separator + 1);
    }

    /** @return the part of the label before the first colon, or null */
    public String getLevel()
    {
        return level;
    }

    /** @return the part of the label after the first colon, or null */
    public String getField()
    {
        return field;
    }
}
63
/**
 * Create an indexer that has no indexes configured yet; indexes are
 * added afterwards through configure().
 */
public MGIndexer()
{
    this.indexes = new ArrayList();
}
67
68 private String getIndexDirectory(String level, String field)
69 { StringBuffer directory = new StringBuffer();
70 directory.append(Character.toLowerCase((char) level.charAt(0)));
71
72 int c, w;
73 w = 0;
74 c = 0;
75 while (c < field.length() && w < 2) {
76 char ch = field.charAt(c);
77
78 ch = Character.toLowerCase(ch);
79 if (Character.isLetter(ch)) {
80 if (ch != 'a' && ch != 'e' && ch != 'i' &&
81 ch != 'o' && ch != 'u') {
82 directory.append(ch);
83 w++;
84 }
85 }
86 c ++;
87 }
88 return directory.toString();
89 }
90
91 /**
92 * The output directory should be (collection)/building/text/ for
93 * normal Greenstone builds
94 */
95 public boolean configure(String label, String value)
96 {
97 if (label.equals(IndexerManager.outputDir)) {
98 this.outputDirectory = value;
99 this.textStem = value + "/text/index";
100 this.pass = 0;
101
102 // attempt to ensure that the text subdirectory exists
103 this.textDirectory = new File(outputDirectory, "text");
104 if (!textDirectory.exists()) {
105 if (!textDirectory.mkdir()) {
106 return false;
107 }
108 }
109 else if (!textDirectory.isDirectory()) {
110 return false;
111 }
112
113 // Sign to the user which mg directory is being used...
114 System.out.println("Output MG directory is " + this.textStem);
115 }
116 else if (label.equals(MG_INDEX)) {
117 this.indexes.add(new MGIndex(value));
118 }
119
120 return true;
121 }
122
123 private Node recurseDOM(DocumentInterface metsDoc, Node node,
124 AbstractStructure structure, StringBuffer buffer)
125 {
126 // send out the ctrl-c...if this is
127 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
128 if (this.pass == 0) {
129 METSDivision division = (METSDivision) structure;
130
131 // get the division metadata block
132 METSDescriptive descriptive;
133 String metadataId = division.getDefaultMetadataReference();
134 if (metadataId == null) {
135 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
136 division.addMetadataReference(descriptive.getID());
137 }
138 else {
139 // Get the descriptive item...
140 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
141 }
142
143 descriptive.addMetadata("gsdl3", "mgseqno", Integer.toString(this.sectionSeqNo));
144 }
145
146 buffer.append((char) 3);
147 if (this.level != null &&
148 this.level.equals("section")) {
149 buffer.append((char) 2);
150 }
151 this.sectionSeqNo ++;
152 }
153
154 // go through our children as required...
155 Iterator children = structure.getChildIterator();
156 while (children.hasNext()) {
157 AbstractStructure child = (AbstractStructure) children.next();
158
159 // get xpointer for child
160 // get start position node
161 Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
162
163 // while this node isn't the child's start node, produce the node
164 while (node != startNode) {
165 XPointer.printNode(node, buffer, false);
166 // print buffer to node
167 node = XPointer.getNextNode(node, buffer);
168 }
169
170 // recurse to child
171 this.recurseDOM(metsDoc, node, child, buffer);
172 }
173
174 // close a document - the actual closing \B will be done by the main
175 // loop, so only a required \C is printed here...
176 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
177 while (node != null) {
178 XPointer.printNode(node, buffer, false);
179 node = XPointer.getNextNode(node, buffer);
180 }
181 buffer.append((char) 3);
182 this.sectionSeqNo ++;
183 }
184 return node;
185 }
186
187 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure)
188 { Node node = document.getDocumentElement();
189 StringBuffer textBuffer = new StringBuffer();
190
191 this.recurseDOM(metsDoc, node, structure, textBuffer);
192 return textBuffer.toString();
193 }
194
195 /**
196 * Index a single document; the document interface can be used to extract individual
197 * metadata items etc. as required or desired and index those instead or as well as
198 * the body text of the document.
199 */
200 public boolean indexDocument(DocumentID docID, DocumentInterface document)
201 {
202 if (!this.firstDocument)
203 { // Send a 'CTRL-B' before the document itself
204 try {
205 this.indexerTextfeed.write(2);
206 }
207 catch (IOException ex)
208 { System.out.println("Bad output on end of document" + ex);
209 ex.printStackTrace();
210 return false;
211 }
212 }
213 String docText = null;
214
215 Document domDocument = document.getDOMDocument();
216 if (domDocument != null) {
217 METSStructure sections = document.getDocumentStructure().getStructure("Section");
218 if (sections != null) {
219 docText = this.prepareDOM(document, domDocument, sections);
220 // System.out.println(docText);
221 }
222 }
223 if (docText == null) {
224 docText = document.getDocumentText();
225 }
226
227 byte [] bytes = docText.getBytes();
228 int pos = 0, end = bytes.length;
229
230 try {
231 while (pos < end) {
232 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
233 pos = pos + 512;
234
235 try {
236 while (this.indexerFeedback.available() > 0)
237 { byte b[] = new byte[this.indexerFeedback.available()];
238 System.out.println("Feedback of " + this.indexerFeedback.available());
239 this.indexerFeedback.read(b);
240 System.out.println(b);
241 }
242 }
243 catch (IOException ex)
244 {
245 }
246
247
248 try {
249 while (this.indexerErrors.available() > 0)
250 { byte b[] = new byte[this.indexerErrors.available()];
251 System.out.println("Feedback of " + this.indexerErrors.available());
252 this.indexerErrors.read(b);
253 System.out.println(new String(b));
254 }
255 }
256 catch (IOException ex)
257 {
258 }
259 }
260 }
261 catch (IOException ex)
262 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
263 ex.printStackTrace();
264 return false;
265 }
266
267 // remember that we're not on the first document, assign the sequence number
268 // on the first pass only, and increment the sequence number.
269 this.firstDocument = false;
270 if (this.pass == 0) {
271 document.addDocumentMetadata("gsdl3", "mgseqno", Integer.toString(this.documentSeqNo));
272 }
273 this.documentSeqNo += 1;
274
275 try {
276 while (this.indexerErrors.available() > 0)
277 { char c = (char) this.indexerErrors.read();
278 System.out.println(c);
279 }
280 while (this.indexerFeedback.available() > 0)
281 { byte b[] = new byte[this.indexerFeedback.available()];
282 System.out.println("Feedback of " + this.indexerFeedback.available());
283 this.indexerFeedback.read(b);
284 }
285 }
286 catch (IOException ex)
287 {
288 }
289 return true;
290 }
291
292 /**
293 * Initialise the pass: open required files, check status
294 */
295 public boolean startPass(int passNumber)
296 {
297 this.pass = passNumber;
298 this.firstDocument = true;
299 this.documentSeqNo = 1;
300 this.sectionSeqNo = 1;
301
302 int indexNo = (this.pass - 2) / 2;
303 if (indexNo >= 0) {
304 MGIndex index = (MGIndex) this.indexes.get(indexNo);
305
306 // attempt to ensure that the text subdirectory exists
307 this.indexDirectory = new File(outputDirectory, this.getIndexDirectory(index.getLevel(), index.getField()));
308 if (!indexDirectory.exists()) {
309 if (!indexDirectory.mkdir()) {
310 return false;
311 }
312 }
313 else if (!indexDirectory.isDirectory()) {
314 return false;
315 }
316
317 this.indexStem = this.outputDirectory + File.separatorChar +
318 this.getIndexDirectory("document", "text") +
319 File.separatorChar + "index"; // TODO: modify for index
320 this.level = index.getLevel();
321 }
322
323 // get the parameters for this execution of mg_passes
324 String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
325
326 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
327
328 try {
329 switch (mgPass) {
330 case 0:
331 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
332 break;
333
334 case 1:
335 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
336 break;
337
338 case 2:
339 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
340 break;
341
342 case 3:
343 Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());
344 p.waitFor();
345 System.out.println(p.exitValue());
346
347 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
348 break;
349 }
350
351 this.indexerFeedback = mg_passes.getInputStream();
352 this.indexerErrors = mg_passes.getErrorStream();
353 this.indexerTextfeed = mg_passes.getOutputStream();
354 }
355 catch (IOException ex)
356 { System.out.println(ex);
357 ex.printStackTrace();
358 return false;
359 }
360 catch (InterruptedException ex)
361 { System.out.println(ex);
362 ex.printStackTrace();
363 return false;
364 }
365 System.out.println("Pass " + this.pass);
366 return true;
367 }
368
369 /**
370 * Complete a pass - reset file counters, close files, etc.
371 */
372 public boolean endPass(int passNumber)
373 { Process p;
374
375 try {
376 this.indexerTextfeed.write((char) 2);
377 this.indexerTextfeed.write(4);
378 while (this.indexerErrors.available() > 0)
379 { char c = (char) this.indexerErrors.read();
380 System.out.print(c);
381 }
382 while (this.indexerFeedback.available() > 0)
383 { byte b[] = new byte[this.indexerFeedback.available()];
384 System.out.print("Feedback of " + this.indexerFeedback.available());
385 this.indexerFeedback.read(b);
386 }
387
388 this.indexerTextfeed.close();
389 Thread.sleep(1000);
390 this.mg_passes.waitFor();
391 }
392 catch (IOException ex)
393 { System.out.println(ex);
394 }
395 catch (InterruptedException ex)
396 { System.out.println(ex);
397 }
398 System.out.println("Completed with " + this.mg_passes.exitValue());
399
400 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
401
402 try {
403 switch (mgPass)
404 {
405 case 0:
406 System.out.println("Compressing dictionary");
407 p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
408 p.waitFor();
409 if (p.exitValue() != 0) {
410 System.out.println("Error from mg_compression_dict: " + p.exitValue());
411 }
412 break;
413
414 case 3:
415 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
416 p.waitFor();
417 System.out.println(p.exitValue());
418
419 p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());
420 p.waitFor();
421 System.out.println(p.exitValue());
422
423 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());
424 p.waitFor();
425 System.out.println(p.exitValue());
426 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());
427 p.waitFor();
428 System.out.println(p.exitValue());
429 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());
430 p.waitFor();
431 System.out.println(p.exitValue());
432 break;
433 }
434 }
435 catch (IOException ex)
436 { System.out.println(ex);
437 ex.printStackTrace();
438 return false;
439 }
440 catch (InterruptedException ex)
441 { System.out.println(ex);
442 ex.printStackTrace();
443 return false;
444 }
445 return true;
446 }
447
448 /**
449 * Do any tidying up
450 */
451 public void tidyup()
452 {
453 }
454
455 /**
456 * Return the number of passes required for this index.
457 */
458 public int getNumberOfPasses()
459 { return 2 + this.indexes.size() * 2;
460 }
461}
Note: See TracBrowser for help on using the repository browser.