source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 6698

Last change on this file since 6698 was 6698, checked in by cs025, 20 years ago

Fixed minor issues in indexermanager.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import java.util.List;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import java.io.File;
8import java.io.InputStream;
9import java.io.OutputStream;
10import java.io.IOException;
11
12import org.w3c.dom.*;
13
14import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
15import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
16import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
17import org.greenstone.gsdl3.gs3build.metadata.*;
18import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
19
20public class MGIndexer implements IndexerInterface
21{
22 int pass;
23 int documentSeqNo;
24 int sectionSeqNo;
25 boolean firstDocument;
26 String outputDirectory;
27 InputStream indexerFeedback;
28 InputStream indexerErrors;
29 OutputStream indexerTextfeed;
30 Process mg_passes;
31 File textDirectory;
32 File indexDirectory;
33 String indexStem;
34 String textStem;
35 List indexes;
36 String indexName;
37 String level;
38 String field;
39
40 static final char END_OF_DOCUMENT = (char) 2;
41 static final char END_OF_SECTION = (char) 3;
42 static final char END_OF_STREAM = (char) 4;
43
/**
 * Describes one MG index: the level at which text is divided up and the
 * field whose content is indexed.
 */
class MGIndex
{
    String level;
    String field;

    /**
     * Creates an index description from its two components.
     *
     * @param level the level of the index
     * @param field the field of the index
     */
    public MGIndex(String level, String field)
    {
        this.level = level;
        this.field = field;
    }

    /**
     * Creates an index description from a label of the form
     * <code>level:field</code>. The label is split at the first colon, so
     * the field part may itself contain colons.
     *
     * @param indexLabel the label to parse
     * @throws IllegalArgumentException if the label is null, has no colon,
     *         or has nothing before the colon; previously such labels
     *         produced an object with null or empty members that only
     *         failed later, far from the offending configuration value
     */
    public MGIndex(String indexLabel)
    {
        int colonAt = (indexLabel == null) ? -1 : indexLabel.indexOf(':');

        if (colonAt < 1) {
            throw new IllegalArgumentException(
                "MG index label must have the form level:field, but was: " + indexLabel);
        }
        this.level = indexLabel.substring(0, colonAt);
        this.field = indexLabel.substring(colonAt + 1);
    }

    /**
     * @return the level of this index
     */
    public String getLevel()
    {
        return this.level;
    }

    /**
     * @return the field of this index
     */
    public String getField()
    {
        return this.field;
    }
}
70
/**
 * Creates an indexer that has no indexes configured yet.
 */
public MGIndexer()
{
    indexes = new ArrayList();
}
74
75 private String getIndexDirectory(String level, String field)
76 { StringBuffer directory = new StringBuffer();
77 directory.append(Character.toLowerCase((char) level.charAt(0)));
78
79 int c, w;
80 w = 0;
81 c = 0;
82 while (c < field.length() && w < 2) {
83 char ch = field.charAt(c);
84
85 ch = Character.toLowerCase(ch);
86 if (Character.isLetter(ch)) {
87 if (ch != 'a' && ch != 'e' && ch != 'i' &&
88 ch != 'o' && ch != 'u') {
89 directory.append(ch);
90 w++;
91 }
92 }
93 c ++;
94 }
95 return directory.toString();
96 }
97
98 /**
99 * The output directory should be (collection)/building/text/ for
100 * normal Greenstone builds.
101 *
102 * @param <code>String</code> the label to configure
103 * @param <code>String</code> the value...
104 */
105 public boolean configure(String label, String value)
106 {
107 if (label.equals(IndexerManager.outputDir)) {
108 this.outputDirectory = value;
109 this.textStem = value + "/text/index";
110 this.pass = 0;
111
112 // attempt to ensure that the text subdirectory exists
113 this.textDirectory = new File(outputDirectory, "text");
114 if (!textDirectory.exists()) {
115 if (!textDirectory.mkdir()) {
116 return false;
117 }
118 }
119 else if (!textDirectory.isDirectory()) {
120 return false;
121 }
122
123 // Sign to the user which mg directory is being used...
124 System.out.println("Output MG directory is " + this.textStem);
125 }
126 else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) {
127 this.indexes.add(new MGIndex(value));
128 }
129
130 return true;
131 }
132
133 public boolean addIndex(String level, String field)
134 {
135 MGIndex index = new MGIndex(level, field);
136 this.indexes.add(index);
137 return true;
138 }
139
140 private Node recurseDOM(DocumentInterface metsDoc, Node node,
141 AbstractStructure structure, StringBuffer textBuffer,
142 StringBuffer extraBuffer, String indexName,
143 String namespace, String field)
144 {
145 // send out the ctrl-c...if this is
146 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
147 if ((indexName != null) && indexName.startsWith("s")) {
148 METSDivision division = (METSDivision) structure;
149
150 // get the division metadata block
151 METSDescriptive descriptive;
152 String metadataId = division.getDefaultMetadataReference();
153 if (metadataId == null) {
154 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
155 division.addMetadataReference(descriptive.getID());
156 }
157 else {
158 // Get the descriptive item...
159 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
160 }
161
162 descriptive.setMetadata("gsdl3", "mgseqno", indexName + "." + Integer.toString(this.sectionSeqNo));
163 metsDoc.setModified(true);
164 // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
165 }
166
167 // append an 'end of section' marker
168 textBuffer.append(END_OF_SECTION);
169 this.sectionSeqNo ++;
170
171 // for document-level indexes, always append an 'end of document' tag at the
172 // end of the document for each section. Otherwise, each section is followed
173 // by an end of document character. This ensures that all indexes use the
174 // same document numbering...
175 if (this.level == null ||
176 this.level.equals(IndexerInterface.DOCUMENT_LEVEL)) {
177 extraBuffer.append(END_OF_DOCUMENT);
178 }
179 else {
180 textBuffer.append(END_OF_DOCUMENT);
181 this.documentSeqNo ++;
182 }
183
184 // produce the body here for metadata output of divisions - in the case of
185 // text output, that will happen below...
186 if (!this.field.equals("text"))
187 { METSDescriptive descriptive;
188
189 METSDivision division = (METSDivision) structure;
190
191 String metadataId = division.getDefaultMetadataReference();
192
193 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
194 if (descriptive != null) {
195 List values = descriptive.getMetadata(namespace, field);
196
197 if (values != null) {
198 Iterator valueIter = values.iterator();
199 while (valueIter.hasNext()) {
200 String value = valueIter.next().toString();
201
202 textBuffer.append(value);
203 if (valueIter.hasNext()) {
204 textBuffer.append(END_OF_SECTION);
205 }
206 }
207 }
208 }
209 }
210 }
211
212 // go through our children as required...
213 Iterator children = structure.getChildIterator();
214 while (children.hasNext()) {
215 AbstractStructure child = (AbstractStructure) children.next();
216
217 // get xpointer for child
218 // get start position node
219 Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
220
221 // while this node isn't the child's start node, produce the HTML node text, if
222 // in text field mode...
223 if (field.equals("text")) {
224 while (node != startNode) {
225 XPointer.printNode(node, textBuffer, false);
226
227 // print buffer to node
228 node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
229 }
230 }
231
232 // recurse to child
233 node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, indexName, namespace, field);
234 }
235
236 // close a document - the actual closing \B will be done by the main
237 // loop, so only a required \C is printed here...
238 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
239 while (node != null) {
240 if (field.equals("text")) {
241 XPointer.printNode(node, textBuffer, false);
242 }
243 node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
244 }
245 /*
246 textBuffer.append(END_OF_SECTION);
247 this.sectionSeqNo ++;
248 */
249 }
250 return node;
251 }
252
253 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure,
254 String indexName, String namespace, String field)
255 { StringBuffer extraBuffer = new StringBuffer();
256 Node node = document.getDocumentElement();
257 StringBuffer textBuffer = new StringBuffer();
258
259 this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, indexName, namespace, field);
260 textBuffer.append(extraBuffer.toString());
261 return textBuffer.toString();
262 }
263
264 /**
265 * Index a single document; the document interface can be used to extract individual
266 * metadata items etc. as required or desired and index those instead or as well as
267 * the body text of the document.
268 */
269 public boolean indexDocument(DocumentID docID, DocumentInterface document)
270 {
271 if (!this.firstDocument)
272 { // Send a 'CTRL-B' before the document itself
273 try {
274 this.indexerTextfeed.write(END_OF_DOCUMENT);
275 }
276 catch (IOException ex)
277 { System.out.println("Bad output on end of document" + ex);
278 ex.printStackTrace();
279 return false;
280 }
281 }
282 String docText = null;
283
284 int startSeqNo = this.sectionSeqNo;
285 this.sectionSeqNo ++;
286
287 Document domDocument = document.getDOMDocument();
288 if (domDocument != null) {
289 METSStructure sections = document.getDocumentStructure().getStructure("Section");
290 if (sections != null) {
291 docText = this.prepareDOM(document, domDocument, sections, this.indexName, "gsdl3", this.field);
292 // System.out.println(docText);
293 }
294 }
295 if (docText == null) {
296 if (this.field.equals("text")) {
297 docText = Character.toString(END_OF_DOCUMENT) + Character.toString(END_OF_SECTION) +
298 document.getDocumentText();
299 }
300 else {
301 StringBuffer textBuffer = new StringBuffer();
302 textBuffer.append(END_OF_DOCUMENT);
303 textBuffer.append(END_OF_SECTION);
304 List values = document.getDocumentMetadataItem("gsdl3", this.field);
305 if (values != null) {
306 Iterator valueIter = values.iterator();
307 while (valueIter.hasNext()) {
308 String value = valueIter.next().toString();
309
310 textBuffer.append(value);
311 if (valueIter.hasNext()) {
312 textBuffer.append(END_OF_SECTION);
313 // sectionSeqNo ++;
314 }
315 }
316 }
317 else {
318 textBuffer.append("No data");
319 }
320 docText = textBuffer.toString();
321 }
322 sectionSeqNo ++;
323 }
324
325 /* if (this.pass == 0) {
326 System.err.println(docText);
327 }
328 */
329
330 byte [] bytes = docText.getBytes();
331 int pos = 0, end = bytes.length;
332
333 try {
334 while (pos < end) {
335 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
336 pos = pos + 512;
337
338 try {
339 while (this.indexerFeedback.available() > 0)
340 { byte b[] = new byte[this.indexerFeedback.available()];
341 System.out.println("Feedback of " + this.indexerFeedback.available());
342 this.indexerFeedback.read(b);
343 System.out.println(b);
344 }
345 }
346 catch (IOException ex)
347 { System.out.println(ex);
348 }
349
350
351 try {
352 while (this.indexerErrors.available() > 0)
353 { byte b[] = new byte[this.indexerErrors.available()];
354 System.out.println("Feedback of " + this.indexerErrors.available());
355 this.indexerErrors.read(b);
356 System.out.println(new String(b));
357 }
358 }
359 catch (IOException ex)
360 { System.out.println(ex);
361 }
362 }
363 }
364 catch (IOException ex)
365 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
366 ex.printStackTrace();
367 return false;
368 }
369
370 // remember that we're not on the first document, assign the sequence number
371 // on the first pass only, and increment the sequence number.
372 this.firstDocument = false;
373 if (this.pass == 0) {
374 document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(startSeqNo));
375 //System.out.println("Assigning " + startSeqNo + " to " + document.getID());
376 }
377 this.documentSeqNo += 1;
378
379 try {
380 while (this.indexerErrors.available() > 0)
381 { char c = (char) this.indexerErrors.read();
382 System.out.println(c);
383 }
384 while (this.indexerFeedback.available() > 0)
385 { byte b[] = new byte[this.indexerFeedback.available()];
386 System.out.println("Feedback of " + this.indexerFeedback.available());
387 this.indexerFeedback.read(b);
388 }
389 }
390 catch (IOException ex)
391 {
392 }
393 return true;
394 }
395
396 /**
397 * Initialise the pass: open required files, check status
398 */
399 public boolean startPass(int passNumber)
400 {
401 this.pass = passNumber;
402 this.firstDocument = true;
403 this.documentSeqNo = 1;
404 this.sectionSeqNo = 1;
405
406 int indexNo = (this.pass - 2) / 2;
407 if (this.pass >= 2) {
408 MGIndex index = (MGIndex) this.indexes.get(indexNo);
409
410 // attempt to ensure that the text subdirectory exists
411 this.indexDirectory = new File(outputDirectory, this.getIndexDirectory(index.getLevel(), index.getField()));
412 if (!indexDirectory.exists()) {
413 if (!indexDirectory.mkdir()) {
414 return false;
415 }
416 }
417 else if (!indexDirectory.isDirectory()) {
418 return false;
419 }
420
421 this.level = index.getLevel();
422 this.field = index.getField();
423 this.indexName = this.getIndexDirectory(index.getLevel(), index.getField());
424 this.indexStem = this.outputDirectory + File.separatorChar +
425 this.indexName + File.separatorChar + "index"; // TODO: modify for index
426 if (this.pass % 2 == 1) {
427 this.indexName = null;
428 }
429 }
430 else {
431 this.field = "text";
432 this.level = "section";
433 this.indexName = null;
434 }
435 System.out.println("level is " + this.level);
436 System.out.println("field is " + this.field);
437 System.out.println("index name is " + this.indexName);
438
439 // get the parameters for this execution of mg_passes
440 String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
441
442 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
443
444 try {
445 switch (mgPass) {
446 case 0:
447 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
448 break;
449
450 case 1:
451 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
452 break;
453
454 case 2:
455 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
456 break;
457
458 case 3:
459 Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());
460 p.waitFor();
461 if (p.exitValue() == 0) {
462 System.out.println("Perfect hashes completed");
463 }
464
465 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
466 break;
467 }
468
469 this.indexerFeedback = mg_passes.getInputStream();
470 this.indexerErrors = mg_passes.getErrorStream();
471 this.indexerTextfeed = mg_passes.getOutputStream();
472 }
473 catch (IOException ex)
474 { System.out.println(ex);
475 ex.printStackTrace();
476 return false;
477 }
478 catch (InterruptedException ex)
479 { System.out.println(ex);
480 ex.printStackTrace();
481 return false;
482 }
483 System.out.println("Pass " + this.pass);
484 return true;
485 }
486
487 /**
488 * Complete a pass - reset file counters, close files, etc.
489 */
490 public boolean endPass(int passNumber)
491 { Process p;
492
493 try {
494 this.indexerTextfeed.write(END_OF_DOCUMENT);
495 this.indexerTextfeed.write(END_OF_STREAM);
496 while (this.indexerErrors.available() > 0)
497 { char c = (char) this.indexerErrors.read();
498 System.out.print(c);
499 }
500 while (this.indexerFeedback.available() > 0)
501 { byte b[] = new byte[this.indexerFeedback.available()];
502 System.out.print("Feedback of " + this.indexerFeedback.available());
503 this.indexerFeedback.read(b);
504 }
505
506 this.indexerTextfeed.close();
507 Thread.sleep(1000);
508 this.mg_passes.waitFor();
509 }
510 catch (IOException ex)
511 { System.out.println(ex);
512 }
513 catch (InterruptedException ex)
514 { System.out.println(ex);
515 }
516 System.out.println("Pass " + this.pass + " completed with " + this.mg_passes.exitValue());
517
518 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
519
520 try {
521 switch (mgPass)
522 {
523 case 0:
524 System.out.println("Compressing dictionary");
525 p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
526 p.waitFor();
527 if (p.exitValue() != 0) {
528 System.out.println("Error from mg_compression_dict: " + p.exitValue());
529 }
530 else {
531 System.out.println("Compressed dictionary successfully written");
532 }
533 break;
534
535 case 3:
536 System.out.println("Writing weights file");
537 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
538 p.waitFor();
539 if (p.exitValue() == 0) {
540 System.out.println("Weights file successfully written");
541 }
542 else {
543 System.out.println("Unable to create weights file " + "mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
544 }
545
546 p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());
547 p.waitFor();
548 if (p.exitValue() == 0) {
549 System.out.println("Inverted dictionary file successfully written");
550 }
551 else {
552 System.out.println("Unable to create inverted dictionary file");
553 }
554
555 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());
556 p.waitFor();
557 if (p.exitValue() == 0) {
558 System.out.println("Stemmed index successfully written");
559 }
560 else {
561 System.out.println("Unable to create stemmed index");
562 }
563
564 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());
565 p.waitFor();
566 if (p.exitValue() == 0) {
567 System.out.println("Stemmed index successfully written");
568 }
569 else {
570 System.out.println("Unable to create stemmed index");
571 }
572
573 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());
574 p.waitFor();
575 if (p.exitValue() == 0) {
576 System.out.println("Stemmed index successfully written");
577 }
578 else {
579 System.out.println("Unable to create stemmed index");
580 }
581 break;
582 }
583 }
584 catch (IOException ex)
585 { System.out.println(ex);
586 ex.printStackTrace();
587 return false;
588 }
589 catch (InterruptedException ex)
590 { System.out.println(ex);
591 ex.printStackTrace();
592 return false;
593 }
594 return true;
595 }
596
597 /**
598 * Do any tidying up
599 */
600 public void tidyup()
601 {
602 }
603
604 /**
605 * Return the number of passes required for this index.
606 */
607 public int getNumberOfPasses()
608 { return 2 + this.indexes.size() * 2;
609 }
610}
Note: See TracBrowser for help on using the repository browser.