source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 6349

Last change on this file since 6349 was 6349, checked in by cs025, 20 years ago

Modified indexerinterface to allow easier configuration, improved MG section handling.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import java.util.List;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import java.io.File;
8import java.io.InputStream;
9import java.io.OutputStream;
10import java.io.IOException;
11
12import org.w3c.dom.*;
13
14import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
15import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
16import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
17import org.greenstone.gsdl3.gs3build.metadata.*;
18import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
19
20public class MGIndexer implements IndexerInterface
21{
// Number of the pass currently being run; assigned by startPass() and reset to 0 by configure().
22 int pass;
// Sequence number of the next document; reset to 1 by startPass() and incremented once per indexDocument().
23 int documentSeqNo;
// Sequence number of the next section; reset to 1 by startPass() and incremented in recurseDOM().
24 int sectionSeqNo;
// True until the first document of a pass has been written; later documents are preceded by a CTRL-B byte.
25 boolean firstDocument;
// Root output directory, as supplied to configure() with the output-directory label.
26 String outputDirectory;
// Standard output of the running mg_passes process.
27 InputStream indexerFeedback;
// Standard error of the running mg_passes process.
28 InputStream indexerErrors;
// Standard input of the running mg_passes process; document text is written here.
29 OutputStream indexerTextfeed;
// The mg_passes process started by startPass() for the current pass.
30 Process mg_passes;
// The "text" subdirectory of the output directory; created by configure().
31 File textDirectory;
// Directory of the index being built in the current pass; created by startPass().
32 File indexDirectory;
// Path stem (<output>/<index directory>/index) of the current index; set by startPass().
33 String indexStem;
// Path stem (<output>/text/index) of the compressed text; set by configure().
34 String textStem;
// The configured indexes, held as MGIndex objects in the order they were added.
35 List indexes;
// Level of the index handled by the current pass; assigned by startPass() for index passes.
36 String level;
// Field handled by the current pass; "text" for the text passes, otherwise the index's field.
37 String field;
38
/**
 * Describes one MG index as a pair of indexing level and indexed field.
 */
class MGIndex
{
    String level;
    String field;

    /**
     * Creates an index description from an explicit level and field.
     *
     * @param level the indexing level
     * @param field the field to be indexed
     */
    public MGIndex(String level, String field)
    {
        this.field = field;
        this.level = level;
    }

    /**
     * Creates an index description from a combined label of the form
     * <code>level:field</code>, split at the first colon.
     *
     * If the label contains no colon, both the level and the field are
     * left unset (<code>null</code>).
     *
     * @param indexLabel the combined label
     */
    public MGIndex(String indexLabel)
    {
        final int separator = indexLabel.indexOf(':');
        if (separator < 0) {
            // nothing to split on: level and field stay null
            return;
        }

        this.level = indexLabel.substring(0, separator);
        this.field = indexLabel.substring(separator + 1);
    }

    /**
     * @return the indexing level, or <code>null</code> if none was parsed
     */
    public String getLevel()
    {
        return level;
    }

    /**
     * @return the indexed field, or <code>null</code> if none was parsed
     */
    public String getField()
    {
        return field;
    }
}
65
/**
 * Creates an indexer that has no indexes registered yet; indexes are
 * added afterwards through configure() or addIndex().
 */
public MGIndexer()
{
    indexes = new ArrayList();
}
69
70 private String getIndexDirectory(String level, String field)
71 { StringBuffer directory = new StringBuffer();
72 directory.append(Character.toLowerCase((char) level.charAt(0)));
73
74 int c, w;
75 w = 0;
76 c = 0;
77 while (c < field.length() && w < 2) {
78 char ch = field.charAt(c);
79
80 ch = Character.toLowerCase(ch);
81 if (Character.isLetter(ch)) {
82 if (ch != 'a' && ch != 'e' && ch != 'i' &&
83 ch != 'o' && ch != 'u') {
84 directory.append(ch);
85 w++;
86 }
87 }
88 c ++;
89 }
90 return directory.toString();
91 }
92
93 /**
94 * The output directory should be (collection)/building/text/ for
95 * normal Greenstone builds.
96 *
97 * @param <code>String</code> the label to configure
98 * @param <code>String</code> the value...
99 */
100 public boolean configure(String label, String value)
101 {
102 if (label.equals(IndexerManager.outputDir)) {
103 this.outputDirectory = value;
104 this.textStem = value + "/text/index";
105 this.pass = 0;
106
107 // attempt to ensure that the text subdirectory exists
108 this.textDirectory = new File(outputDirectory, "text");
109 if (!textDirectory.exists()) {
110 if (!textDirectory.mkdir()) {
111 return false;
112 }
113 }
114 else if (!textDirectory.isDirectory()) {
115 return false;
116 }
117
118 // Sign to the user which mg directory is being used...
119 System.out.println("Output MG directory is " + this.textStem);
120 }
121 else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) {
122 this.indexes.add(new MGIndex(value));
123 }
124
125 return true;
126 }
127
128 public boolean addIndex(String level, String field)
129 {
130 MGIndex index = new MGIndex(level, field);
131 this.indexes.add(index);
132 return true;
133 }
134
135 private Node recurseDOM(DocumentInterface metsDoc, Node node,
136 AbstractStructure structure, StringBuffer buffer,
137 String namespace, String field)
138 {
139 // send out the ctrl-c...if this is
140 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
141 if (this.pass == 0) {
142 METSDivision division = (METSDivision) structure;
143
144 // get the division metadata block
145 METSDescriptive descriptive;
146 String metadataId = division.getDefaultMetadataReference();
147 if (metadataId == null) {
148 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
149 division.addMetadataReference(descriptive.getID());
150 }
151 else {
152 // Get the descriptive item...
153 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
154 }
155
156 descriptive.addMetadata("gsdl3", "mgseqno", Integer.toString(this.sectionSeqNo));
157 }
158
159 buffer.append((char) 3);
160 if (this.level != null &&
161 this.level.equals(IndexerInterface.SECTION_LEVEL)) {
162 buffer.append((char) 2);
163 }
164 this.sectionSeqNo ++;
165 }
166
167 // go through our children if required...
168 Iterator children = structure.getChildIterator();
169 while (children.hasNext()) {
170 AbstractStructure child = (AbstractStructure) children.next();
171
172 // get xpointer for child
173 // get start position node
174 Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
175
176 // while this node isn't the child's start node, produce the node text
177 if (field.equals("text")) {
178 while (node != startNode) {
179 XPointer.printNode(node, buffer, false);
180
181 // print buffer to node
182 node = XPointer.getNextNode(node, (field.equals("text") ? buffer : null));
183 }
184 }
185
186 // recurse to child
187 this.recurseDOM(metsDoc, node, child, buffer, namespace, field);
188 }
189
190 // close a document - the actual closing \B will be done by the main
191 // loop, so only a required \C is printed here...
192 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
193 while (node != null) {
194 if (field.equals("text")) {
195 XPointer.printNode(node, buffer, false);
196 }
197 else {
198 METSDescriptive descriptive;
199
200 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
201 METSDivision division = (METSDivision) structure;
202
203 String metadataId = division.getDefaultMetadataReference();
204
205 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
206 if (descriptive != null) {
207 List values = descriptive.getMetadata(namespace, field);
208
209 Iterator valueIter = values.iterator();
210 while (valueIter.hasNext()) {
211 String value = valueIter.next().toString();
212
213 buffer.append(value);
214 if (valueIter.hasNext()) {
215 buffer.append((char) 3);
216 }
217 }
218 }
219 }
220 }
221 node = XPointer.getNextNode(node, (field.equals("text") ? buffer : null));
222 }
223 buffer.append((char) 3);
224 this.sectionSeqNo ++;
225 }
226 return node;
227 }
228
229 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace, String field)
230 { Node node = document.getDocumentElement();
231 StringBuffer textBuffer = new StringBuffer();
232
233 this.recurseDOM(metsDoc, node, structure, textBuffer, namespace, field);
234 return textBuffer.toString();
235 }
236
237 /**
238 * Index a single document; the document interface can be used to extract individual
239 * metadata items etc. as required or desired and index those instead or as well as
240 * the body text of the document.
241 */
242 public boolean indexDocument(DocumentID docID, DocumentInterface document)
243 {
244 if (!this.firstDocument)
245 { // Send a 'CTRL-B' before the document itself
246 try {
247 this.indexerTextfeed.write(2);
248 }
249 catch (IOException ex)
250 { System.out.println("Bad output on end of document" + ex);
251 ex.printStackTrace();
252 return false;
253 }
254 }
255 String docText = null;
256
257 Document domDocument = document.getDOMDocument();
258 if (domDocument != null) {
259 METSStructure sections = document.getDocumentStructure().getStructure("Section");
260 if (sections != null) {
261 docText = this.prepareDOM(document, domDocument, sections, "gsdl3", this.field);
262 // System.out.println(docText);
263 }
264 }
265 if (docText == null) {
266 docText = document.getDocumentText();
267 }
268
269 byte [] bytes = docText.getBytes();
270 int pos = 0, end = bytes.length;
271
272 try {
273 while (pos < end) {
274 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
275 pos = pos + 512;
276
277 try {
278 while (this.indexerFeedback.available() > 0)
279 { byte b[] = new byte[this.indexerFeedback.available()];
280 System.out.println("Feedback of " + this.indexerFeedback.available());
281 this.indexerFeedback.read(b);
282 System.out.println(b);
283 }
284 }
285 catch (IOException ex)
286 {
287 }
288
289
290 try {
291 while (this.indexerErrors.available() > 0)
292 { byte b[] = new byte[this.indexerErrors.available()];
293 System.out.println("Feedback of " + this.indexerErrors.available());
294 this.indexerErrors.read(b);
295 System.out.println(new String(b));
296 }
297 }
298 catch (IOException ex)
299 {
300 }
301 }
302 }
303 catch (IOException ex)
304 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
305 ex.printStackTrace();
306 return false;
307 }
308
309 // remember that we're not on the first document, assign the sequence number
310 // on the first pass only, and increment the sequence number.
311 this.firstDocument = false;
312 if (this.pass == 0) {
313 document.addDocumentMetadata("gsdl3", "mgseqno", Integer.toString(this.documentSeqNo));
314 }
315 this.documentSeqNo += 1;
316
317 try {
318 while (this.indexerErrors.available() > 0)
319 { char c = (char) this.indexerErrors.read();
320 System.out.println(c);
321 }
322 while (this.indexerFeedback.available() > 0)
323 { byte b[] = new byte[this.indexerFeedback.available()];
324 System.out.println("Feedback of " + this.indexerFeedback.available());
325 this.indexerFeedback.read(b);
326 }
327 }
328 catch (IOException ex)
329 {
330 }
331 return true;
332 }
333
334 /**
335 * Initialise the pass: open required files, check status
336 */
337 public boolean startPass(int passNumber)
338 {
339 this.pass = passNumber;
340 this.firstDocument = true;
341 this.documentSeqNo = 1;
342 this.sectionSeqNo = 1;
343
344 int indexNo = (this.pass - 2) / 2;
345 if (indexNo >= 0) {
346 MGIndex index = (MGIndex) this.indexes.get(indexNo);
347
348 // attempt to ensure that the text subdirectory exists
349 this.indexDirectory = new File(outputDirectory, this.getIndexDirectory(index.getLevel(), index.getField()));
350 if (!indexDirectory.exists()) {
351 if (!indexDirectory.mkdir()) {
352 return false;
353 }
354 }
355 else if (!indexDirectory.isDirectory()) {
356 return false;
357 }
358
359 this.indexStem = this.outputDirectory + File.separatorChar +
360 this.getIndexDirectory(index.getLevel(), index.getField()) +
361 File.separatorChar + "index"; // TODO: modify for index
362 this.level = index.getLevel();
363 this.field = index.getField();
364 }
365 else {
366 this.field = "text";
367 }
368
369 // get the parameters for this execution of mg_passes
370 String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
371
372 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
373
374 try {
375 switch (mgPass) {
376 case 0:
377 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
378 break;
379
380 case 1:
381 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
382 break;
383
384 case 2:
385 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
386 break;
387
388 case 3:
389 Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());
390 p.waitFor();
391 if (p.exitValue() == 0) {
392 System.out.println("Perfect hashes completed");
393 }
394
395 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
396 break;
397 }
398
399 this.indexerFeedback = mg_passes.getInputStream();
400 this.indexerErrors = mg_passes.getErrorStream();
401 this.indexerTextfeed = mg_passes.getOutputStream();
402 }
403 catch (IOException ex)
404 { System.out.println(ex);
405 ex.printStackTrace();
406 return false;
407 }
408 catch (InterruptedException ex)
409 { System.out.println(ex);
410 ex.printStackTrace();
411 return false;
412 }
413 System.out.println("Pass " + this.pass);
414 return true;
415 }
416
417 /**
418 * Complete a pass - reset file counters, close files, etc.
419 */
420 public boolean endPass(int passNumber)
421 { Process p;
422
423 try {
424 this.indexerTextfeed.write((char) 2);
425 this.indexerTextfeed.write(4);
426 while (this.indexerErrors.available() > 0)
427 { char c = (char) this.indexerErrors.read();
428 System.out.print(c);
429 }
430 while (this.indexerFeedback.available() > 0)
431 { byte b[] = new byte[this.indexerFeedback.available()];
432 System.out.print("Feedback of " + this.indexerFeedback.available());
433 this.indexerFeedback.read(b);
434 }
435
436 this.indexerTextfeed.close();
437 Thread.sleep(1000);
438 this.mg_passes.waitFor();
439 }
440 catch (IOException ex)
441 { System.out.println(ex);
442 }
443 catch (InterruptedException ex)
444 { System.out.println(ex);
445 }
446 System.out.println("Completed with " + this.mg_passes.exitValue());
447
448 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
449
450 try {
451 switch (mgPass)
452 {
453 case 0:
454 System.out.println("Compressing dictionary");
455 p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
456 p.waitFor();
457 if (p.exitValue() != 0) {
458 System.out.println("Error from mg_compression_dict: " + p.exitValue());
459 }
460 break;
461
462 case 3:
463 System.out.println("Writing weights file");
464 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
465 p.waitFor();
466 if (p.exitValue() == 0) {
467 System.out.println("Weights file successfully written");
468 }
469 else {
470 System.out.println("Unable to create weights file");
471 }
472
473 p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());
474 p.waitFor();
475 if (p.exitValue() == 0) {
476 System.out.println("Inverted dictionary file successfully written");
477 }
478 else {
479 System.out.println("Unable to create inverted dictionary file");
480 }
481
482 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());
483 p.waitFor();
484 if (p.exitValue() == 0) {
485 System.out.println("Stemmed index successfully written");
486 }
487 else {
488 System.out.println("Unable to create stemmed index");
489 }
490
491 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());
492 p.waitFor();
493 if (p.exitValue() == 0) {
494 System.out.println("Stemmed index successfully written");
495 }
496 else {
497 System.out.println("Unable to create stemmed index");
498 }
499
500 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());
501 p.waitFor();
502 if (p.exitValue() == 0) {
503 System.out.println("Stemmed index successfully written");
504 }
505 else {
506 System.out.println("Unable to create stemmed index");
507 }
508 break;
509 }
510 }
511 catch (IOException ex)
512 { System.out.println(ex);
513 ex.printStackTrace();
514 return false;
515 }
516 catch (InterruptedException ex)
517 { System.out.println(ex);
518 ex.printStackTrace();
519 return false;
520 }
521 return true;
522 }
523
524 /**
525 * Do any tidying up
526 */
527 public void tidyup()
528 {
529 }
530
531 /**
532 * Return the number of passes required for this index.
533 */
534 public int getNumberOfPasses()
535 { return 2 + this.indexes.size() * 2;
536 }
537}
Note: See TracBrowser for help on using the repository browser.