source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 6376

Last change on this file since 6376 was 6376, checked in by cs025, 20 years ago

Fixed some problems in MG indexing; also changed use of mgseqno to a
by-index basis.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
Line 
1package org.greenstone.gsdl3.gs3build.indexers;
2
3import java.util.List;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import java.io.File;
8import java.io.InputStream;
9import java.io.OutputStream;
10import java.io.IOException;
11
12import org.w3c.dom.*;
13
14import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
15import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
16import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
17import org.greenstone.gsdl3.gs3build.metadata.*;
18import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
19
20public class MGIndexer implements IndexerInterface
21{
22 int pass;
23 int documentSeqNo;
24 int sectionSeqNo;
25 boolean firstDocument;
26 String outputDirectory;
27 InputStream indexerFeedback;
28 InputStream indexerErrors;
29 OutputStream indexerTextfeed;
30 Process mg_passes;
31 File textDirectory;
32 File indexDirectory;
33 String indexStem;
34 String textStem;
35 List indexes;
36 String indexName;
37 String level;
38 String field;
39
40 static final char END_OF_DOCUMENT = (char) 2;
41 static final char END_OF_SECTION = (char) 3;
42 static final char END_OF_STREAM = (char) 4;
43
/**
 * Description of one MG index: the level it is built at and the field
 * whose content is fed to the indexer.
 */
class MGIndex
{
    String level;
    String field;

    /**
     * Create an index description from an explicit level and field.
     *
     * @param level the level of the index
     * @param field the field to index
     */
    public MGIndex(String level, String field)
    {
        this.level = level;
        this.field = field;
    }

    /**
     * Create an index description from a label of the form
     * <code>level:field</code>, split at the first colon.
     *
     * A label without a colon used to leave both properties null, which
     * surfaced later as a NullPointerException when the index directory
     * name was derived.  Such a label is now taken to be a bare field
     * name at "document" level.
     *
     * @param indexLabel the label to parse; must not be null
     */
    public MGIndex(String indexLabel)
    {
        int colonAt = indexLabel.indexOf(':');

        if (colonAt >= 0) {
            this.level = indexLabel.substring(0, colonAt);
            this.field = indexLabel.substring(colonAt + 1);
        }
        else {
            // NOTE(review): "document" mirrors the level startPass() uses
            // for the text passes - confirm it is the wanted default.
            this.level = "document";
            this.field = indexLabel;
        }
    }

    /**
     * @return the level of this index
     */
    public String getLevel()
    {
        return this.level;
    }

    /**
     * @return the field of this index
     */
    public String getField()
    {
        return this.field;
    }
}
70
/**
 * Create an indexer with an empty list of indexes; entries are added
 * afterwards through configure() or addIndex().
 */
public MGIndexer()
{
    indexes = new ArrayList();
}
74
75 private String getIndexDirectory(String level, String field)
76 { StringBuffer directory = new StringBuffer();
77 directory.append(Character.toLowerCase((char) level.charAt(0)));
78
79 int c, w;
80 w = 0;
81 c = 0;
82 while (c < field.length() && w < 2) {
83 char ch = field.charAt(c);
84
85 ch = Character.toLowerCase(ch);
86 if (Character.isLetter(ch)) {
87 if (ch != 'a' && ch != 'e' && ch != 'i' &&
88 ch != 'o' && ch != 'u') {
89 directory.append(ch);
90 w++;
91 }
92 }
93 c ++;
94 }
95 return directory.toString();
96 }
97
98 /**
99 * The output directory should be (collection)/building/text/ for
100 * normal Greenstone builds.
101 *
102 * @param <code>String</code> the label to configure
103 * @param <code>String</code> the value...
104 */
105 public boolean configure(String label, String value)
106 {
107 if (label.equals(IndexerManager.outputDir)) {
108 this.outputDirectory = value;
109 this.textStem = value + "/text/index";
110 this.pass = 0;
111
112 // attempt to ensure that the text subdirectory exists
113 this.textDirectory = new File(outputDirectory, "text");
114 if (!textDirectory.exists()) {
115 if (!textDirectory.mkdir()) {
116 return false;
117 }
118 }
119 else if (!textDirectory.isDirectory()) {
120 return false;
121 }
122
123 // Sign to the user which mg directory is being used...
124 System.out.println("Output MG directory is " + this.textStem);
125 }
126 else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) {
127 this.indexes.add(new MGIndex(value));
128 }
129
130 return true;
131 }
132
133 public boolean addIndex(String level, String field)
134 {
135 MGIndex index = new MGIndex(level, field);
136 this.indexes.add(index);
137 return true;
138 }
139
140 private Node recurseDOM(DocumentInterface metsDoc, Node node,
141 AbstractStructure structure, StringBuffer textBuffer,
142 StringBuffer extraBuffer, String indexName,
143 String namespace, String field)
144 {
145 // send out the ctrl-c...if this is
146 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
147 if ((indexName != null) && indexName.startsWith("s")) {
148 METSDivision division = (METSDivision) structure;
149
150 // get the division metadata block
151 METSDescriptive descriptive;
152 String metadataId = division.getDefaultMetadataReference();
153 if (metadataId == null) {
154 descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
155 division.addMetadataReference(descriptive.getID());
156 }
157 else {
158 // Get the descriptive item...
159 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
160 }
161
162 descriptive.addMetadata("gsdl3", "mgseqno", indexName + "." + Integer.toString(this.documentSeqNo));
163 metsDoc.setModified(true);
164 // System.out.println("Assigning " + this.documentSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
165 }
166
167 // append an 'end of section' marker
168 textBuffer.append(END_OF_SECTION);
169
170 // for document-level indexes, always append an 'end of document' tag at the
171 // end of the document for each section. Otherwise, each section is followed
172 // by an end of document character. This ensures that all indexes use the
173 // same document numbering...
174 if (this.level == null ||
175 this.level.equals(IndexerInterface.DOCUMENT_LEVEL)) {
176 // extraBuffer.append(END_OF_DOCUMENT);
177 }
178 else {
179 textBuffer.append(END_OF_DOCUMENT);
180 this.documentSeqNo ++;
181 }
182 this.sectionSeqNo ++;
183
184 // produce the body here for metadata output of divisions - in the case of
185 // text output, that will happen below...
186 if (!this.field.equals("text"))
187 { METSDescriptive descriptive;
188
189 METSDivision division = (METSDivision) structure;
190
191 String metadataId = division.getDefaultMetadataReference();
192
193 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
194 if (descriptive != null) {
195 List values = descriptive.getMetadata(namespace, field);
196
197 if (values != null) {
198 Iterator valueIter = values.iterator();
199 while (valueIter.hasNext()) {
200 String value = valueIter.next().toString();
201
202 textBuffer.append(value);
203 if (valueIter.hasNext()) {
204 textBuffer.append(END_OF_SECTION);
205 }
206 }
207 }
208 }
209 }
210 }
211
212 // go through our children as required...
213 Iterator children = structure.getChildIterator();
214 while (children.hasNext()) {
215 AbstractStructure child = (AbstractStructure) children.next();
216
217 // get xpointer for child
218 // get start position node
219 Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
220
221 // while this node isn't the child's start node, produce the HTML node text, if
222 // in text field mode...
223 if (field.equals("text")) {
224 while (node != startNode) {
225 XPointer.printNode(node, textBuffer, false);
226
227 // print buffer to node
228 node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
229 }
230 }
231
232 // recurse to child
233 this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, indexName, namespace, field);
234 }
235
236 // close a document - the actual closing \B will be done by the main
237 // loop, so only a required \C is printed here...
238 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
239 while (node != null) {
240 if (field.equals("text")) {
241 XPointer.printNode(node, textBuffer, false);
242 }
243 node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
244 }
245 /*
246 textBuffer.append(END_OF_SECTION);
247 this.sectionSeqNo ++;
248 */
249 }
250 return node;
251 }
252
253 private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure,
254 String indexName, String namespace, String field)
255 { StringBuffer extraBuffer = new StringBuffer();
256 Node node = document.getDocumentElement();
257 StringBuffer textBuffer = new StringBuffer();
258
259 this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, indexName, namespace, field);
260 textBuffer.append(extraBuffer.toString());
261 return textBuffer.toString();
262 }
263
264 /**
265 * Index a single document; the document interface can be used to extract individual
266 * metadata items etc. as required or desired and index those instead or as well as
267 * the body text of the document.
268 */
269 public boolean indexDocument(DocumentID docID, DocumentInterface document)
270 {
271 if (!this.firstDocument)
272 { // Send a 'CTRL-B' before the document itself
273 try {
274 this.indexerTextfeed.write(2);
275 }
276 catch (IOException ex)
277 { System.out.println("Bad output on end of document" + ex);
278 ex.printStackTrace();
279 return false;
280 }
281 }
282 String docText = null;
283
284 int startSeqNo = this.sectionSeqNo;
285
286 Document domDocument = document.getDOMDocument();
287 if (domDocument != null) {
288 METSStructure sections = document.getDocumentStructure().getStructure("Section");
289 if (sections != null) {
290 docText = this.prepareDOM(document, domDocument, sections, this.indexName, "gsdl3", this.field);
291 // System.out.println(docText);
292 }
293 }
294 if (docText == null) {
295 if (this.field.equals("text")) {
296 docText = Character.toString(END_OF_DOCUMENT) + Character.toString(END_OF_SECTION) +
297 document.getDocumentText();
298 }
299 else {
300 StringBuffer textBuffer = new StringBuffer();
301 textBuffer.append(END_OF_DOCUMENT);
302 textBuffer.append(END_OF_SECTION);
303 List values = document.getDocumentMetadataItem("gsdl3", this.field);
304 if (values != null) {
305 Iterator valueIter = values.iterator();
306 while (valueIter.hasNext()) {
307 String value = valueIter.next().toString();
308
309 textBuffer.append(value);
310 if (valueIter.hasNext()) {
311 textBuffer.append(END_OF_SECTION);
312 sectionSeqNo ++;
313 }
314 }
315 }
316 else {
317 textBuffer.append("No data");
318 }
319 docText = textBuffer.toString();
320 }
321 sectionSeqNo ++;
322 }
323
324 byte [] bytes = docText.getBytes();
325 int pos = 0, end = bytes.length;
326
327 try {
328 while (pos < end) {
329 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
330 pos = pos + 512;
331
332 try {
333 while (this.indexerFeedback.available() > 0)
334 { byte b[] = new byte[this.indexerFeedback.available()];
335 System.out.println("Feedback of " + this.indexerFeedback.available());
336 this.indexerFeedback.read(b);
337 System.out.println(b);
338 }
339 }
340 catch (IOException ex)
341 { System.out.println(ex);
342 }
343
344
345 try {
346 while (this.indexerErrors.available() > 0)
347 { byte b[] = new byte[this.indexerErrors.available()];
348 System.out.println("Feedback of " + this.indexerErrors.available());
349 this.indexerErrors.read(b);
350 System.out.println(new String(b));
351 }
352 }
353 catch (IOException ex)
354 { System.out.println(ex);
355 }
356 }
357 }
358 catch (IOException ex)
359 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
360 ex.printStackTrace();
361 return false;
362 }
363
364 // remember that we're not on the first document, assign the sequence number
365 // on the first pass only, and increment the sequence number.
366 this.firstDocument = false;
367 if (this.pass == 0) {
368 document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(this.documentSeqNo));
369 // System.out.println("Assigning " + startSeqNo + " to " + document.getID());
370 }
371 this.documentSeqNo += 1;
372
373 try {
374 while (this.indexerErrors.available() > 0)
375 { char c = (char) this.indexerErrors.read();
376 System.out.println(c);
377 }
378 while (this.indexerFeedback.available() > 0)
379 { byte b[] = new byte[this.indexerFeedback.available()];
380 System.out.println("Feedback of " + this.indexerFeedback.available());
381 this.indexerFeedback.read(b);
382 }
383 }
384 catch (IOException ex)
385 {
386 }
387 return true;
388 }
389
390 /**
391 * Initialise the pass: open required files, check status
392 */
393 public boolean startPass(int passNumber)
394 {
395 this.pass = passNumber;
396 this.firstDocument = true;
397 this.documentSeqNo = 1;
398 this.sectionSeqNo = 1;
399
400 int indexNo = (this.pass - 2) / 2;
401 if (this.pass >= 2) {
402 MGIndex index = (MGIndex) this.indexes.get(indexNo);
403
404 // attempt to ensure that the text subdirectory exists
405 this.indexDirectory = new File(outputDirectory, this.getIndexDirectory(index.getLevel(), index.getField()));
406 if (!indexDirectory.exists()) {
407 if (!indexDirectory.mkdir()) {
408 return false;
409 }
410 }
411 else if (!indexDirectory.isDirectory()) {
412 return false;
413 }
414
415 this.level = index.getLevel();
416 this.field = index.getField();
417 this.indexName = this.getIndexDirectory(index.getLevel(), index.getField());
418 this.indexStem = this.outputDirectory + File.separatorChar +
419 this.indexName + File.separatorChar + "index"; // TODO: modify for index
420 if (this.pass % 2 == 1) {
421 this.indexName = null;
422 }
423 }
424 else {
425 this.field = "text";
426 this.level = "document";
427 this.indexName = null;
428 }
429 System.out.println("level is " + this.level);
430 System.out.println("field is " + this.field);
431 System.out.println("index name is " + this.indexName);
432
433 // get the parameters for this execution of mg_passes
434 String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
435
436 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
437
438 try {
439 switch (mgPass) {
440 case 0:
441 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
442 break;
443
444 case 1:
445 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
446 break;
447
448 case 2:
449 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
450 break;
451
452 case 3:
453 Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());
454 p.waitFor();
455 if (p.exitValue() == 0) {
456 System.out.println("Perfect hashes completed");
457 }
458
459 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
460 break;
461 }
462
463 this.indexerFeedback = mg_passes.getInputStream();
464 this.indexerErrors = mg_passes.getErrorStream();
465 this.indexerTextfeed = mg_passes.getOutputStream();
466 }
467 catch (IOException ex)
468 { System.out.println(ex);
469 ex.printStackTrace();
470 return false;
471 }
472 catch (InterruptedException ex)
473 { System.out.println(ex);
474 ex.printStackTrace();
475 return false;
476 }
477 System.out.println("Pass " + this.pass);
478 return true;
479 }
480
481 /**
482 * Complete a pass - reset file counters, close files, etc.
483 */
484 public boolean endPass(int passNumber)
485 { Process p;
486
487 try {
488 this.indexerTextfeed.write(END_OF_DOCUMENT);
489 this.indexerTextfeed.write(END_OF_STREAM);
490 while (this.indexerErrors.available() > 0)
491 { char c = (char) this.indexerErrors.read();
492 System.out.print(c);
493 }
494 while (this.indexerFeedback.available() > 0)
495 { byte b[] = new byte[this.indexerFeedback.available()];
496 System.out.print("Feedback of " + this.indexerFeedback.available());
497 this.indexerFeedback.read(b);
498 }
499
500 this.indexerTextfeed.close();
501 Thread.sleep(1000);
502 this.mg_passes.waitFor();
503 }
504 catch (IOException ex)
505 { System.out.println(ex);
506 }
507 catch (InterruptedException ex)
508 { System.out.println(ex);
509 }
510 System.out.println("Pass " + this.pass + " completed with " + this.mg_passes.exitValue());
511
512 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
513
514 try {
515 switch (mgPass)
516 {
517 case 0:
518 System.out.println("Compressing dictionary");
519 p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
520 p.waitFor();
521 if (p.exitValue() != 0) {
522 System.out.println("Error from mg_compression_dict: " + p.exitValue());
523 }
524 else {
525 System.out.println("Compressed dictionary successfully written");
526 }
527 break;
528
529 case 3:
530 System.out.println("Writing weights file");
531 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
532 p.waitFor();
533 if (p.exitValue() == 0) {
534 System.out.println("Weights file successfully written");
535 }
536 else {
537 System.out.println("Unable to create weights file " + "mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
538 }
539
540 p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());
541 p.waitFor();
542 if (p.exitValue() == 0) {
543 System.out.println("Inverted dictionary file successfully written");
544 }
545 else {
546 System.out.println("Unable to create inverted dictionary file");
547 }
548
549 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());
550 p.waitFor();
551 if (p.exitValue() == 0) {
552 System.out.println("Stemmed index successfully written");
553 }
554 else {
555 System.out.println("Unable to create stemmed index");
556 }
557
558 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());
559 p.waitFor();
560 if (p.exitValue() == 0) {
561 System.out.println("Stemmed index successfully written");
562 }
563 else {
564 System.out.println("Unable to create stemmed index");
565 }
566
567 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());
568 p.waitFor();
569 if (p.exitValue() == 0) {
570 System.out.println("Stemmed index successfully written");
571 }
572 else {
573 System.out.println("Unable to create stemmed index");
574 }
575 break;
576 }
577 }
578 catch (IOException ex)
579 { System.out.println(ex);
580 ex.printStackTrace();
581 return false;
582 }
583 catch (InterruptedException ex)
584 { System.out.println(ex);
585 ex.printStackTrace();
586 return false;
587 }
588 return true;
589 }
590
591 /**
592 * Do any tidying up
593 */
594 public void tidyup()
595 {
596 }
597
598 /**
599 * Return the number of passes required for this index.
600 */
601 public int getNumberOfPasses()
602 { return 2 + this.indexes.size() * 2;
603 }
604}
Note: See TracBrowser for help on using the repository browser.