source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 9940

Last change on this file since 9940 was 9940, checked in by kjdon, 19 years ago

reformatted the code

  • Property svn:keywords set to Author Date Id Revision
File size: 21.4 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import java.util.List;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import java.io.File;
8import java.io.InputStream;
9import java.io.OutputStream;
10import java.io.IOException;
11import java.io.BufferedReader;
12import java.io.InputStreamReader;
13
14import org.w3c.dom.*;
15
16import org.greenstone.mg.*;
17
18import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
19import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
20import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
21import org.greenstone.gsdl3.gs3build.doctypes.METSDocument;
22import org.greenstone.gsdl3.gs3build.metadata.*;
23import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
24import org.greenstone.gsdl3.util.GSXML;
25import org.greenstone.gsdl3.util.Misc;
26import org.greenstone.gsdl3.util.Processing;
27
28public class MGIndexer extends AbstractIndexer
29{
30 int pass;
31 int documentSeqNo;
32 int sectionSeqNo;
33 boolean firstDocument;
34 String outputDirectory;
35 // InputStream indexerFeedback;
36 // InputStream indexerErrors;
37 //OutputStream indexerTextfeed;
38 StringBuffer indexBuffer;
39 //Process mg_passes;
40 File textDirectory;
41 File indexDirectory;
42 String indexStem;
43 String textStem;
44 List indexes;
45 String overallName;
46
47 String currentIndexName;
48 String currentIndexLevel;
49 String currentIndexField;
50
51 MGPassesWrapper mgPasses;
52
53 static final char END_OF_DOCUMENT = (char) 2;
54 static final char END_OF_SECTION = (char) 3; // actually this is end of para for mg
55 static final char END_OF_STREAM = (char) 4;
56
57 public static final String MG_INDEX_TYPE = "mg";
58 public static final String INDEX_FILE_STEM = "index";
59
/**
 * Describes a single MG index: the level it is built at, the field it
 * covers, and the short name used for it.
 *
 * An index created from a label that does not contain a colon has a null
 * level and field; callers can detect that through getLevel()/getField().
 * getName() never throws for such an index - it returns an empty string.
 */
class MGIndex
{
    String name = null;
    String level = null;
    String field = null;
    boolean error = false; // assume built until we get an error

    /**
     * @param name  explicit index name; when null or empty a name is derived
     *              from level and field the first time getName() is called
     * @param level the index level
     * @param field the field being indexed
     */
    public MGIndex(String name, String level, String field)
    {
        this.name = name;
        this.level = level;
        this.field = field;
    }

    /**
     * Builds an index from a "level:field" label.  The label is split at the
     * first colon.  When there is no colon (or the label is null) level and
     * field are left null.
     *
     * @param indexLabel the label to parse
     */
    public MGIndex(String indexLabel)
    {
        int colonAt = (indexLabel == null) ? -1 : indexLabel.indexOf(':');

        if (colonAt >= 0) {
            this.field = indexLabel.substring(colonAt + 1);
            this.level = indexLabel.substring(0, colonAt);
            createIndexName();
        }
    }

    public String getLevel()
    {
        return this.level;
    }

    public String getField()
    {
        return this.field;
    }

    /**
     * @return the explicit name if one was given, otherwise the derived name
     *         (possibly empty when level and field are missing)
     */
    public String getName()
    {
        if (this.name == null || this.name.equals("")) {
            createIndexName();
        }
        return this.name;
    }

    public boolean hasError() {
        return this.error;
    }

    public void setError(boolean b) {
        this.error = b;
    }

    /**
     * Derives the name: the lower-cased first character of the level followed
     * by the first two letters of the field that are not a, e, i, o or u
     * (lower-cased).  A missing or empty level, or a missing field, simply
     * contributes nothing instead of raising an exception.
     */
    private void createIndexName() {
        StringBuffer newName = new StringBuffer();

        if (this.level != null && this.level.length() > 0) {
            newName.append(Character.toLowerCase(this.level.charAt(0)));
        }

        if (this.field != null) {
            int taken = 0;
            for (int pos = 0; pos < this.field.length() && taken < 2; pos++) {
                char ch = Character.toLowerCase(this.field.charAt(pos));
                boolean vowel = (ch == 'a' || ch == 'e' || ch == 'i' ||
                                 ch == 'o' || ch == 'u');
                if (Character.isLetter(ch) && !vowel) {
                    newName.append(ch);
                    taken++;
                }
            }
        }
        this.name = newName.toString();
    }
} // MGIndex
133
/**
 * Creates an indexer that has no indexes registered yet.
 *
 * @param name the overall name of this indexer, returned by getName()
 */
public MGIndexer(String name)
{
    this.overallName = name;
    this.indexes = new ArrayList();
}
139
140 public String getIndexType()
141 {
142 return MG_INDEX_TYPE;
143 }
144
145 public String getName()
146 {
147 return this.overallName;
148 }
149
150 // private String getIndexDirectory(String level, String field)
151 // { StringBuffer directory = new StringBuffer();
152 // directory.append(Character.toLowerCase((char) level.charAt(0)));
153
154 // int c, w;
155 // w = 0;
156 // c = 0;
157 // while (c < field.length() && w < 2) {
158 // char ch = field.charAt(c);
159
160 // ch = Character.toLowerCase(ch);
161 // if (Character.isLetter(ch)) {
162 // if (ch != 'a' && ch != 'e' && ch != 'i' &&
163 // ch != 'o' && ch != 'u') {
164 // directory.append(ch);
165 // w++;
166 // }
167 // }
168 // c ++;
169 // }
170 // return directory.toString();
171 // }
172
173 /**
174 * The output directory should be (collection)/building/text/ for
175 * normal Greenstone builds.
176 *
177 * @param <code>String</code> the label to configure
178 * @param <code>String</code> the value...
179 */
180 public boolean configure(String label, String value)
181 {
182 if (label.equals(IndexerManager.outputDir)) {
183 this.outputDirectory = value;
184 this.pass = 0;
185
186 // attempt to ensure that the text subdirectory exists
187 this.textDirectory = new File(outputDirectory, "text");
188 if (!textDirectory.exists()) {
189 if (!textDirectory.mkdir()) {
190 return false;
191 }
192 }
193 else if (!textDirectory.isDirectory()) {
194 return false;
195 }
196 this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM;
197
198 // Sign to the user which mg directory is being used...
199 System.out.println("Output MG directory is " + this.textStem);
200 }
201 else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) {
202 this.indexes.add(new MGIndex(value));
203 }
204
205 return true;
206 }
207
208 public boolean addIndex(String name, String level, String field)
209 {
210 MGIndex index = new MGIndex(name, level, field);
211 this.indexes.add(index);
212 return true;
213 }
214
215 private Node recurseDOM(DocumentInterface metsDoc, Node node,
216 AbstractStructure structure, StringBuffer textBuffer,
217 StringBuffer extraBuffer, String namespace)
218 //String name, String namespace, String field)
219 {
220 // send out the ctrl-c...if this is
221 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
222 // try doing this for all index types
223 if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) {
224 METSDivision division = (METSDivision) structure;
225
226 // get the division metadata block
227 METSDescriptive descriptive;
228 String metadataId = division.getDefaultMetadataReference();
229 if (metadataId == null) {
230 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
231 division.addMetadataReference(descriptive.getID());
232 }
233 else {
234 // Get the descriptive item...
235 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
236 }
237
238 descriptive.addMetadata("gsdl3", "mgseqno", this.overallName + "." + Integer.toString(this.sectionSeqNo));
239
240 metsDoc.setChanged(true);
241 //metsDoc.setModified(true);
242 // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
243 } // section level
244
245 // append an 'end of section' marker
246 //textBuffer.append(END_OF_SECTION);
247 this.sectionSeqNo ++;
248
249 // for document-level indexes, always append an 'end of document' tag at the
250 // end of the document for each section. Otherwise, each section is followed
251 // by an end of document character. This ensures that all indexes use the
252 // same document numbering...
253 if (this.currentIndexLevel == null ||
254 this.currentIndexLevel.equals(IndexerInterface.DOCUMENT_LEVEL)) {
255 extraBuffer.append(END_OF_DOCUMENT);
256 }
257 else {
258 textBuffer.append(END_OF_DOCUMENT);
259 this.documentSeqNo ++;
260 }
261
262 // produce the body here for metadata output of divisions - in the case of
263 // text output, that will happen below...
264 if (!this.currentIndexField.equals("text")) {
265 METSDescriptive descriptive;
266
267 METSDivision division = (METSDivision) structure;
268
269 String metadataId = division.getDefaultMetadataReference();
270
271 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
272 if (descriptive != null) {
273 List values = descriptive.getMetadata(namespace, this.currentIndexField);
274
275 if (values != null) {
276 Iterator valueIter = values.iterator();
277 while (valueIter.hasNext()) {
278 String value = valueIter.next().toString();
279
280 textBuffer.append(value);
281 if (valueIter.hasNext()) {
282 //textBuffer.append(END_OF_SECTION);
283 }
284 }
285 }
286 }
287 }
288 }
289
290 // go through our children as required...
291 Iterator children = structure.getChildIterator();
292 Node startNode;
293 while (children.hasNext()) {
294 AbstractStructure child = (AbstractStructure) children.next();
295
296 // get xpointer for child
297 // get start position node
298 if (metsDoc.getDocumentType() == "METS"){
299 startNode = ((METSDocument) metsDoc).getSectionStartNode((METSDivision) child);
300 } else {
301 startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
302 }
303 //Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
304
305 // while this node isn't the child's start node, produce the HTML node text, if
306 // in text field mode...
307 if (this.currentIndexField.equals("text")) {
308 while (node != startNode) {
309 XPointer.printNode(node, textBuffer, false);
310
311 // print buffer to node
312 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));
313 }
314 }
315
316 // recurse to child
317 node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, namespace); // name, namespace, field);
318 } // while next child
319
320 // close a document - the actual closing \B will be done by the main
321 // loop, so only a required \C is printed here...
322 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
323 while (node != null) {
324 if (this.currentIndexField.equals("text")) {
325 XPointer.printNode(node, textBuffer, false);
326 }
327 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));
328 }
329
330 //textBuffer.append(END_OF_SECTION);
331 this.sectionSeqNo ++;
332
333 }
334 return node;
335 }
336
337 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace)
338 {
339 // String name, String namespace, String field)
340 StringBuffer extraBuffer = new StringBuffer();
341 Node node = document.getDocumentElement();
342 StringBuffer textBuffer = new StringBuffer();
343
344 this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); //name, namespace, field);
345 textBuffer.append(extraBuffer.toString());
346 return textBuffer.toString();
347 }
348
349 /**
350 * Index a single document; the document interface can be used to extract individual
351 * metadata items etc. as required or desired and index those instead or as well as
352 * the body text of the document.
353 */
354 public boolean indexDocument(DocumentID docID, DocumentInterface document)
355 {
356 if (this.pass == 0) {
357 document.removeAllMetadata("gsdl3", "mgseqno");
358 }
359
360 if (!this.firstDocument) {
361 this.indexBuffer.append(END_OF_DOCUMENT);
362 mgPasses.processDocument(indexBuffer.toString());
363 this.indexBuffer.delete(0, this.indexBuffer.length());
364
365 }
366
367 String docText = null;
368
369 int startSeqNo = this.sectionSeqNo;
370 this.sectionSeqNo ++;
371
372 Document domDocument = document.getDOMDocument();
373 if (domDocument != null) {
374 System.err.println("dom doc is not null");
375 METSStructure sections = document.getDocumentStructure().getStructure("Section");
376 if (sections != null) {
377 System.err.println("sections are not null");
378 docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field);
379 // System.out.println(docText);
380 }
381 }
382 if (docText == null) {
383 System.err.println("dom doc or sections was null - asking for doc text");
384 if (this.currentIndexField.equals("text")) {
385 //docText = Character.toString(END_OF_DOCUMENT) + document.getDocumentText();
386 docText = document.getDocumentText();
387 }
388 else {
389 StringBuffer textBuffer = new StringBuffer();
390 //textBuffer.append(END_OF_DOCUMENT);
391 List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField);
392 if (values != null) {
393 Iterator valueIter = values.iterator();
394 while (valueIter.hasNext()) {
395 String value = valueIter.next().toString();
396
397 textBuffer.append(value);
398 if (valueIter.hasNext()) {
399 //textBuffer.append(END_OF_SECTION);
400 // sectionSeqNo ++;
401 }
402 }
403 }
404 else {
405 textBuffer.append("No data");
406 }
407 docText = textBuffer.toString();
408 }
409 sectionSeqNo ++;
410 }
411
412
413 this.indexBuffer.append(docText);
414 // remember that we're not on the first document,
415 this.firstDocument = false;
416 // assign the sequence number on the first pass only, and increment the sequence number.
417 if (this.pass == 0) {
418 document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(startSeqNo));
419 }
420 this.documentSeqNo += 1;
421
422 return true;
423 }
424
425 /**
426 * Initialise the pass: open required files, check status
427 */
428 public boolean startPass(int passNumber) {
429
430
431 this.pass = passNumber;
432 this.firstDocument = true;
433 this.documentSeqNo = 1;
434 this.sectionSeqNo = 1;
435
436 this.mgPasses = new MGPassesWrapper();
437 this.indexBuffer = new StringBuffer();
438 int indexNo = (this.pass - 2) / 2;
439 MGIndex index = null;
440 if (this.pass >= 2) {
441 index = (MGIndex) this.indexes.get(indexNo);
442 if (index.hasError()) {
443 // an error has already occurred for this index, don't continue
444 System.out.println("pass "+this.pass+": aborted due to errors in the previous pass");
445 return false;
446 }
447 // attempt to ensure that the text subdirectory exists
448 this.indexDirectory = new File(outputDirectory, index.getName());
449 if (!indexDirectory.exists()) {
450 if (!indexDirectory.mkdir()) {
451 return false;
452 }
453 }
454 else if (!indexDirectory.isDirectory()) {
455 return false;
456 }
457
458 this.currentIndexLevel = index.getLevel();
459 this.currentIndexField = index.getField();
460 this.currentIndexName = index.getName();
461
462 if (this.currentIndexLevel == null || this.currentIndexField == null ) {
463 System.out.println("invalid index - level or field was null");
464 return false;
465 }
466 this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM; // TODO: modify for index
467 if (this.pass % 2 == 1) {
468 this.currentIndexName = null; // why???
469 }
470 }
471 else {
472
473 this.currentIndexField = "text";
474 this.currentIndexLevel = "section";
475 this.currentIndexName = null;
476 }
477
478 // get the parameters for this execution of mg_passes
479 mgPasses.setFileName((this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString())+File.separator+ "index");
480 if (!Misc.isWindows()) {
481 mgPasses.setBasePath("/");
482 }
483 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
484
485 mgPasses.setBufferSize(100000);
486
487 switch (mgPass) {
488 case 0:
489 // -b 100000 -T1
490 mgPasses.addPass(MGPassesWrapper.TEXT_PASS_1);
491
492
493 break;
494
495 case 1:
496 // -b 100000 -T2
497 mgPasses.addPass(MGPassesWrapper.TEXT_PASS_2);
498 break;
499
500 case 2:
501 // -b 100000 -2 -m 32 -s 0 -G -t 10 -N1
502 mgPasses.addPass(MGPassesWrapper.INDEX_PASS_1);
503 mgPasses.setInvfLevel(MGPassesWrapper.INVF_LEVEL_2);
504 mgPasses.setStemOptions(MGPassesWrapper.STEMMER_ENGLISH, MGPassesWrapper.NO_STEM_OR_CASE);
505 mgPasses.setInversionMemLimit(32);
506 mgPasses.ignoreSGMLTags(true);
507 break;
508
509 case 3:
510 // -b 100000 -2 -c 3 -G -t 10 -N2
511 mgPasses.addPass(MGPassesWrapper.INDEX_PASS_2);
512 mgPasses.setInvfLevel(MGPassesWrapper.INVF_LEVEL_2);
513 mgPasses.ignoreSGMLTags(true);
514 break;
515 }
516
517 mgPasses.init();
518 System.out.println("Pass " + this.pass);
519 return true;
520 }
521
522 /**
523 * Complete a pass - reset file counters, close files, etc.
524 */
525 public boolean endPass(int passNumber) {
526 Process p;
527
528 int indexNo = (passNumber - 2) / 2;
529 MGIndex index = null;
530 if (passNumber >= 2) {
531 index = (MGIndex) this.indexes.get(indexNo);
532 }
533 try {
534 this.indexBuffer.append(END_OF_DOCUMENT);
535 mgPasses.processDocument(indexBuffer.toString());
536 this.indexBuffer.delete(0, this.indexBuffer.length());
537 Thread.sleep(1000); // what for??
538 }
539 catch (InterruptedException ex) {
540 System.out.println(ex);
541 }
542 mgPasses.finish();
543 try {
544 Thread.sleep(1000);
545 } catch (Exception e) {}
546
547 int exit_value = 0;
548 System.out.println("Pass " + this.pass + " completed with " + exit_value);
549 if (exit_value !=0) {
550 //assume something has gone wrong, don't continue
551 if (index != null) {
552 index.setError(true);
553 return false;
554 }
555 }
556 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
557 String osextra = "";
558 if (!Misc.isWindows()) {
559 osextra = " -d / ";
560 }
561
562 switch (mgPass) {
563
564 case 0:
565 System.out.println("Compressing dictionary");
566 exit_value = Processing.runProcess("mg_compression_dict -f " + this.textDirectory.toString()+File.separator+"index" + osextra + " -S -H -2 -k 5120");
567 if (exit_value == 0) {
568 System.out.println("Compressed dictionary successfully written");
569 } else {
570 System.err.println("Error from mg_compression_dict: " + exit_value);
571 index.setError(true);
572
573 return false;
574 }
575 break;
576
577 case 2:
578 System.out.println("Creating perfect hash");
579 exit_value = Processing.runProcess("mg_perf_hash_build -f " + this.indexDirectory.toString()+File.separator+ "index"+osextra);
580 if (exit_value ==0) {
581 System.out.println("Perfect hashes completed");
582 } else {
583 System.err.println("Unable to build the perfect hash");
584 index.setError(true);
585 return false;
586 }
587 break;
588
589 case 3:
590 System.out.println("Writing weights file");
591 exit_value = Processing.runProcess("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + osextra);
592 if (exit_value ==0) {
593 System.out.println("Weights file successfully written");
594 } else {
595 System.err.println("Unable to create weights file");
596 index.setError(true);
597 return false;
598 }
599
600 System.out.println("Creating inverted dictionary");
601 exit_value = Processing.runProcess("mg_invf_dict -f " + this.indexDirectory.toString()+File.separator+"index" + osextra);
602 if (exit_value ==0) {
603 System.out.println("Inverted dictionary file successfully written");
604 } else {
605 System.out.println("Unable to create inverted dictionary file");
606 index.setError(true);
607 return false;
608 }
609
610 System.out.println("Creating Stem indexes");
611 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s1 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra);
612 if (exit_value == 0) {
613 System.out.println("Stemmed index 1 successfully written");
614 } else {
615 System.out.println("Unable to create stemmed index 1");
616 index.setError(true);
617 return false;
618 }
619
620 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s2 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra);
621 if (exit_value == 0) {
622 System.out.println("Stemmed index 2 successfully written");
623 } else {
624 System.out.println("Unable to create stemmed index 2");
625 index.setError(true);
626 return false;
627 }
628 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s3 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra);
629 if (exit_value == 0) {
630 System.out.println("Stemmed index 3 successfully written");
631 } else {
632 System.out.println("Unable to create stemmed index 3");
633 index.setError(true);
634 return false;
635 }
636
637 break;
638 } // switch
639
640 mgPasses = null;
641 return true;
642 }
643
644 /**
645 * Do any tidying up
646 */
647 public void tidyup()
648 {
649 }
650
651 /**
652 * Return the number of passes required for this index.
653 */
654 public int getNumberOfPasses()
655 {
656 return 2 + this.indexes.size() * 2;
657 }
658
659 public boolean addServiceDescriptions(org.w3c.dom.Element service_rack_list)
660 {
661 Document doc = service_rack_list.getOwnerDocument();
662
663 // generate the list of indexes
664 Element index_list = doc.createElement(GSXML.INDEX_ELEM+GSXML.LIST_MODIFIER);
665 boolean found_index = false;
666 String def_index = ""; // the default index will just be the first one created for now.
667 for (int i=0; i<this.indexes.size(); i++) {
668 MGIndex index = (MGIndex)this.indexes.get(i);
669 if (!index.hasError()) {
670 Element e = doc.createElement(GSXML.INDEX_ELEM);
671 e.setAttribute(GSXML.NAME_ATT, index.getName());
672 index_list.appendChild(e);
673 if (found_index == false) {
674 // this is the first index
675 found_index = true;
676 def_index = index.getName();
677 }
678 }
679 }
680
681 if (!found_index) {
682 // no indexes were able to be created, so we can't use them or the text
683 return false;
684 }
685 Element default_index = doc.createElement("defaultIndex");
686 default_index.setAttribute(GSXML.NAME_ATT, def_index);
687 Element base_index_name = doc.createElement("baseIndexPrefix");
688 base_index_name.setAttribute(GSXML.NAME_ATT, overallName);
689 Element index_stem = doc.createElement("indexStem");
690 index_stem.setAttribute(GSXML.NAME_ATT, "index");
691
692 Element search_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM);
693 Element retrieve_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM);
694 service_rack_list.appendChild(search_service_elem);
695 service_rack_list.appendChild(retrieve_service_elem);
696
697 search_service_elem.setAttribute(GSXML.NAME_ATT, "GS3MGSearch");
698 search_service_elem.appendChild(index_list);
699 search_service_elem.appendChild(default_index);
700 search_service_elem.appendChild(base_index_name);
701 search_service_elem.appendChild(index_stem);
702
703 retrieve_service_elem.setAttribute(GSXML.NAME_ATT, "GS3MGRetrieve");
704 retrieve_service_elem.appendChild(default_index.cloneNode(true));
705 retrieve_service_elem.appendChild(base_index_name.cloneNode(true));
706 retrieve_service_elem.appendChild(index_stem.cloneNode(true));
707
708 return true;
709 }
710
711}
712
Note: See TracBrowser for help on using the repository browser.