1 | package org.greenstone.gsdl3.gs3build.indexers;
|
---|
2 |
|
---|
3 | import org.greenstone.mgpp.MGPPPassesWrapper;
|
---|
4 |
|
---|
5 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
|
---|
6 | import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
|
---|
7 | import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
|
---|
8 | import org.greenstone.gsdl3.gs3build.doctypes.METSDocument;
|
---|
9 | import org.greenstone.gsdl3.util.Misc;
|
---|
10 | import org.greenstone.gsdl3.util.GSXML;
|
---|
11 | import org.greenstone.gsdl3.util.Processing;
|
---|
12 | import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
|
---|
13 | import org.greenstone.gsdl3.gs3build.metadata.*;
|
---|
14 | import java.io.InputStream;
|
---|
15 | import java.io.OutputStream;
|
---|
16 | import java.io.IOException;
|
---|
17 | import java.util.ArrayList;
|
---|
18 | import java.util.List;
|
---|
19 | import java.io.File;
|
---|
20 | import java.util.Iterator;
|
---|
21 |
|
---|
22 | import org.w3c.dom.Element;
|
---|
23 | import org.w3c.dom.Node;
|
---|
24 | import org.w3c.dom.Document;
|
---|
25 |
|
---|
26 | public class MGPPIndexer extends AbstractIndexer
|
---|
27 | {
|
---|
28 | int pass;
|
---|
29 | int documentSeqNo;
|
---|
30 | int sectionSeqNo;
|
---|
31 | String name;
|
---|
32 | boolean firstDocument;
|
---|
33 | File indexDirectory;
|
---|
34 | File textDirectory;
|
---|
35 | String indexStem;
|
---|
36 | String textStem;
|
---|
37 | StringBuffer indexBuffer;
|
---|
38 | String outputDirectory;
|
---|
39 | //String outputStem;
|
---|
40 | // String passExtra;
|
---|
41 | // InputStream indexerFeedback;
|
---|
42 | // InputStream indexerErrors;
|
---|
43 | // OutputStream indexerTextfeed;
|
---|
44 | // Process mgpp_passes;
|
---|
45 | //String overallName;
|
---|
46 | String currentIndexName;
|
---|
47 | String currentIndexLevel;
|
---|
48 | String currentIndexField;
|
---|
49 | MGPPPassesWrapper mgppPasses;
|
---|
50 |
|
---|
51 |
|
---|
52 | static final String documentSeparator = "<Document>";
|
---|
53 | static final String sectionSeparator = "<Section>";
|
---|
54 |
|
---|
55 | static final String START_OF_DOCUMENT = "<Document>";
|
---|
56 | static final String END_OF_DOCUMENT = "</Document>";
|
---|
57 | static final String START_OF_SECTION = "<Section>";
|
---|
58 | static final String END_OF_SECTION = "</Section>";
|
---|
59 |
|
---|
60 |
|
---|
61 | public static final String MGPP_INDEX_TYPE = "mgpp";
|
---|
62 | public static final String INDEX_FILE_STEM = "index";
|
---|
63 |
|
---|
64 |
|
---|
/**
 * Describes one MGPP index: its name, the document-level tag, the levels and
 * fields it covers, and whether building it has failed.
 */
class MGPPIndex
{
    public String name = null;
    public String doc_level = null;
    // Created eagerly: addLevel()/addField() previously dereferenced these
    // while they were still null and always threw NullPointerException.
    public ArrayList levels = new ArrayList();
    public ArrayList fields = new ArrayList();
    /** Assume the index built correctly until an error is recorded. */
    boolean error = false;

    /**
     * Creates an index description with the document level defaulting to
     * "Document" and empty level and field lists.
     *
     * @param name the name of the index
     */
    public MGPPIndex(String name) {
        this.name = name;
        this.doc_level = "Document";
    }

    /** Replaces the document-level tag. */
    public void setDocLevel(String doc_level) {
        this.doc_level = doc_level;
    }

    /** Appends a level to {@link #levels}. */
    public void addLevel(String level) {
        this.levels.add(level);
    }

    /** Appends a field to {@link #fields}. TODO: change to allow nested fields. */
    public void addField(String field) {
        this.fields.add(field);
    }

    /** @return true once an error has been recorded with setError(true) */
    public boolean hasError() {
        return this.error;
    }

    /** Records (or clears) the error state of this index. */
    public void setError(boolean b) {
        this.error = b;
    }
}
99 |
|
---|
100 |
|
---|
/**
 * Creates an MGPP indexer. No directories or passes are set up here; that is
 * done by configure() and startPass().
 *
 * @param name the name reported by getName() and used as the prefix of the
 *             section sequence numbers written by recurseDOM()
 */
public MGPPIndexer(String name)
{
    this.name = name;
}
106 |
|
---|
107 | public String getName()
|
---|
108 | {
|
---|
109 | return this.name;
|
---|
110 | }
|
---|
111 |
|
---|
112 | /**
|
---|
113 | * The output directory should be (collection)/building/text/ for
|
---|
114 | * normal Greenstone builds
|
---|
115 | */
|
---|
116 | public boolean configure(String label, String value)
|
---|
117 | {
|
---|
118 | if (label.equals(IndexerManager.outputDir)) {
|
---|
119 | this.outputDirectory = value;
|
---|
120 |
|
---|
121 | // attempt to ensure that the text subdirectory exists
|
---|
122 | this.textDirectory = new File(outputDirectory, "text");
|
---|
123 | if (!textDirectory.exists()) {
|
---|
124 | if (!textDirectory.mkdir()) {
|
---|
125 | return false;
|
---|
126 | }
|
---|
127 | }
|
---|
128 | else if (!textDirectory.isDirectory()) {
|
---|
129 | return false;
|
---|
130 | }
|
---|
131 | this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM;
|
---|
132 |
|
---|
133 | // attempt to ensure that the index subdir exists
|
---|
134 | this.indexDirectory = new File(outputDirectory, "idx");
|
---|
135 | if (!indexDirectory.exists()) {
|
---|
136 | if (!indexDirectory.mkdir()) {
|
---|
137 | return false;
|
---|
138 | }
|
---|
139 | }
|
---|
140 | else if (!indexDirectory.isDirectory()) {
|
---|
141 | return false;
|
---|
142 | }
|
---|
143 | this.indexStem = this.indexDirectory.getPath() + File.separator + INDEX_FILE_STEM;
|
---|
144 |
|
---|
145 | // Sign to the user which mg directory is being used...
|
---|
146 | System.out.println("Output MGPP text directory is " + this.textStem);
|
---|
147 | System.out.println("Output MGPP index directory is " + this.indexStem);
|
---|
148 | }
|
---|
149 | this.pass = 0;
|
---|
150 | return true;
|
---|
151 | }
|
---|
152 |
|
---|
153 | public String getIndexType()
|
---|
154 | {
|
---|
155 | return MGPP_INDEX_TYPE;
|
---|
156 | }
|
---|
157 |
|
---|
158 | public boolean addIndex(String name, String level, String field)
|
---|
159 | {
|
---|
160 | // if (level == "doc_level") {
|
---|
161 | // passExtra = " -J " + level;
|
---|
162 | // }
|
---|
163 | // else {
|
---|
164 | // passExtra = " -K " + level;
|
---|
165 | // }
|
---|
166 | return true;
|
---|
167 | }
|
---|
168 |
|
---|
169 | /**
|
---|
170 | * Index a single document; the document interface can be used to extract individual
|
---|
171 | * metadata items etc. as required or desired and index those instead or as well as
|
---|
172 | * the body text of the document.
|
---|
173 | */
|
---|
174 | public boolean indexDocument(DocumentID docID, DocumentInterface document)
|
---|
175 | {
|
---|
176 | if (this.pass == 0) {
|
---|
177 | document.removeAllMetadata("gsdl3", "mgseqno");
|
---|
178 | }
|
---|
179 |
|
---|
180 | // why do this at the start and not at the end???
|
---|
181 | if (!this.firstDocument) {
|
---|
182 | // Send a '</Document>' at the end of the doc
|
---|
183 | this.indexBuffer.append(END_OF_DOCUMENT);
|
---|
184 | mgppPasses.processDocument(indexBuffer.toString());
|
---|
185 | this.indexBuffer.delete(0, this.indexBuffer.length());
|
---|
186 | }
|
---|
187 |
|
---|
188 | String docText = null;
|
---|
189 |
|
---|
190 | //int startSeqNo = this.sectionSeqNo;
|
---|
191 | //this.sectionSeqNo ++;
|
---|
192 | int startSeqNo = this.documentSeqNo;
|
---|
193 |
|
---|
194 | Document domDocument = document.getDOMDocument();
|
---|
195 | if (domDocument != null) {
|
---|
196 | System.err.println("dom doc is not null");
|
---|
197 | METSStructure sections = document.getDocumentStructure().getStructure("Section");
|
---|
198 | if (sections != null) {
|
---|
199 | System.err.println("sections are not null");
|
---|
200 | docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field);
|
---|
201 | // System.out.println(docText);
|
---|
202 | }
|
---|
203 | }
|
---|
204 | if (docText == null) {
|
---|
205 | System.err.println("dom doc or sections was null - asking for doc text");
|
---|
206 | //if (this.currentIndexField.equals("text")) {
|
---|
207 | //docText = Character.toString(END_OF_DOCUMENT) + document.getDocumentText();
|
---|
208 | docText = document.getDocumentText();
|
---|
209 | //}
|
---|
210 | // else {
|
---|
211 | // StringBuffer textBuffer = new StringBuffer();
|
---|
212 | // //textBuffer.append(END_OF_DOCUMENT);
|
---|
213 | // List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField);
|
---|
214 | // if (values != null) {
|
---|
215 | // Iterator valueIter = values.iterator();
|
---|
216 | // while (valueIter.hasNext()) {
|
---|
217 | // String value = valueIter.next().toString();
|
---|
218 |
|
---|
219 | // textBuffer.append(value);
|
---|
220 | // if (valueIter.hasNext()) {
|
---|
221 | // //textBuffer.append(END_OF_SECTION);
|
---|
222 | // // sectionSeqNo ++;
|
---|
223 | // }
|
---|
224 | // }
|
---|
225 | // }
|
---|
226 | // else {
|
---|
227 | // textBuffer.append("No data");
|
---|
228 | // }
|
---|
229 | // docText = textBuffer.toString();
|
---|
230 | // }
|
---|
231 | sectionSeqNo ++;
|
---|
232 | }
|
---|
233 |
|
---|
234 | //try {
|
---|
235 | // this.indexerTextfeed.write(documentSeparator.getBytes(), 0, documentSeparator.getBytes().length);
|
---|
236 | // }
|
---|
237 | // catch (IOException ex) {
|
---|
238 | // System.out.println("Bad output on end of document" + ex);
|
---|
239 | // ex.printStackTrace();
|
---|
240 | // return false;
|
---|
241 | // }
|
---|
242 | // }
|
---|
243 |
|
---|
244 | this.indexBuffer.append(START_OF_DOCUMENT);
|
---|
245 | //String docText = document.getDocumentText();
|
---|
246 | this.indexBuffer.append(docText);
|
---|
247 | //int startSeqNo = this.documentSeqNo;
|
---|
248 |
|
---|
249 | // byte [] bytes = docText.getBytes();
|
---|
250 | // int pos = 0, end = bytes.length;
|
---|
251 |
|
---|
252 | // try {
|
---|
253 | // while (pos < end) {
|
---|
254 | // this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
|
---|
255 | // pos = pos + 512;
|
---|
256 |
|
---|
257 | // try {
|
---|
258 | // while (this.indexerFeedback.available() > 0) {
|
---|
259 | // byte b[] = new byte[this.indexerFeedback.available()];
|
---|
260 | // System.out.println("Feedback of " + this.indexerFeedback.available());
|
---|
261 | // this.indexerFeedback.read(b);
|
---|
262 | // System.out.println(b);
|
---|
263 | // }
|
---|
264 | // }
|
---|
265 | // catch (IOException ex) {
|
---|
266 |
|
---|
267 | // }
|
---|
268 |
|
---|
269 |
|
---|
270 | // try {
|
---|
271 | // while (this.indexerErrors.available() > 0) {
|
---|
272 | // byte b[] = new byte[this.indexerErrors.available()];
|
---|
273 | // System.out.println("Feedback of " + this.indexerErrors.available());
|
---|
274 | // this.indexerErrors.read(b);
|
---|
275 | // System.out.println(new String(b));
|
---|
276 | // }
|
---|
277 | // }
|
---|
278 | // catch (IOException ex){
|
---|
279 |
|
---|
280 | // }
|
---|
281 | // }
|
---|
282 | // }
|
---|
283 | // catch (IOException ex) {
|
---|
284 | // System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
|
---|
285 | // ex.printStackTrace();
|
---|
286 | // return false;
|
---|
287 | // }
|
---|
288 | this.firstDocument = false;
|
---|
289 |
|
---|
290 | if (this.pass == 0) {
|
---|
291 | document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(startSeqNo));
|
---|
292 | }
|
---|
293 | this.documentSeqNo++;
|
---|
294 |
|
---|
295 | // try {
|
---|
296 | // while (this.indexerErrors.available() > 0) {
|
---|
297 | // char c = (char) this.indexerErrors.read();
|
---|
298 | // System.out.println(c);
|
---|
299 | // }
|
---|
300 | // while (this.indexerFeedback.available() > 0) {
|
---|
301 | // byte b[] = new byte[this.indexerFeedback.available()];
|
---|
302 | // System.out.println("Feedback of " + this.indexerFeedback.available());
|
---|
303 | // this.indexerFeedback.read(b);
|
---|
304 | // }
|
---|
305 | // }
|
---|
306 | // catch (IOException ex) {
|
---|
307 |
|
---|
308 | // }
|
---|
309 | return true;
|
---|
310 | }
|
---|
311 |
|
---|
312 | /**
|
---|
313 | * Initialise the pass: open required files, check status
|
---|
314 | */
|
---|
315 | public boolean startPass(int passNumber)
|
---|
316 | {
|
---|
317 | this.pass = passNumber;
|
---|
318 | this.firstDocument = true;
|
---|
319 | this.documentSeqNo = 1;
|
---|
320 | this.sectionSeqNo = 1;
|
---|
321 |
|
---|
322 | this.mgppPasses = new MGPPPassesWrapper();
|
---|
323 | this.indexBuffer = new StringBuffer();
|
---|
324 |
|
---|
325 | MGPPIndex index = null; // do something with this!!
|
---|
326 |
|
---|
327 | // get the parameters for this execution of mg_passes
|
---|
328 | mgppPasses.setFileName((this.pass < 2 ? this.textStem : this.indexStem ));
|
---|
329 | if (!Misc.isWindows()) {
|
---|
330 | mgppPasses.setBasePath("/");
|
---|
331 | }
|
---|
332 |
|
---|
333 | mgppPasses.setDocumentTag("Document");
|
---|
334 | //mgppPasses.addLevelTag("Section");
|
---|
335 |
|
---|
336 | this.currentIndexLevel = "Document";// index.getLevel();
|
---|
337 | this.currentIndexField = "text";//index.getField();
|
---|
338 | this.currentIndexName = "idx"; //index.getName();
|
---|
339 |
|
---|
340 |
|
---|
341 | switch (this.pass) {
|
---|
342 | case 0:
|
---|
343 | // -T1
|
---|
344 | mgppPasses.addPass(MGPPPassesWrapper.TEXT_PASS_1);
|
---|
345 | //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem + " -T1");
|
---|
346 | break;
|
---|
347 |
|
---|
348 | case 1:
|
---|
349 | // -T2
|
---|
350 | mgppPasses.addPass(MGPPPassesWrapper.TEXT_PASS_2);
|
---|
351 | //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -T2");
|
---|
352 | break;
|
---|
353 |
|
---|
354 | case 2:
|
---|
355 | // -I1
|
---|
356 | mgppPasses.addPass(MGPPPassesWrapper.INDEX_PASS_1);
|
---|
357 | //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -I1");
|
---|
358 | break;
|
---|
359 |
|
---|
360 | case 3:
|
---|
361 | //Process p = Runtime.getRuntime().exec("mgpp_perf_hash_build -f " + this.outputStem);
|
---|
362 | //p.waitFor();
|
---|
363 | // -I2
|
---|
364 | mgppPasses.addPass(MGPPPassesWrapper.INDEX_PASS_2);
|
---|
365 | //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -I2");
|
---|
366 | break;
|
---|
367 | }
|
---|
368 |
|
---|
369 | //this.indexerFeedback = mgpp_passes.getInputStream();
|
---|
370 | // this.indexerErrors = mgpp_passes.getErrorStream();
|
---|
371 | // this.indexerTextfeed = mgpp_passes.getOutputStream();
|
---|
372 | // }
|
---|
373 | //catch (IOException ex) {
|
---|
374 | // System.out.println(ex);
|
---|
375 | // ex.printStackTrace();
|
---|
376 | // return false;
|
---|
377 | //}/
|
---|
378 | //catch (InterruptedException ex) {
|
---|
379 | // System.out.println(ex);
|
---|
380 | // ex.printStackTrace();
|
---|
381 | // return false;
|
---|
382 | //}
|
---|
383 | mgppPasses.init();
|
---|
384 | System.out.println("Pass " + this.pass);
|
---|
385 | return true;
|
---|
386 | }
|
---|
387 |
|
---|
388 | /**
|
---|
389 | * Complete a pass - reset file counters, close files, etc.
|
---|
390 | */
|
---|
391 | public boolean endPass(int passNumber)
|
---|
392 | {
|
---|
393 | // TODO: end pass
|
---|
394 | Process p;
|
---|
395 | MGPPIndex index = null; // do something with this!!
|
---|
396 | try {
|
---|
397 | this.indexBuffer.append(END_OF_DOCUMENT);
|
---|
398 | mgppPasses.processDocument(indexBuffer.toString());
|
---|
399 | this.indexBuffer.delete(0, this.indexBuffer.length());
|
---|
400 | Thread.sleep(1000); // what for??
|
---|
401 | }
|
---|
402 | catch (InterruptedException ex) {
|
---|
403 | System.out.println(ex);
|
---|
404 | }
|
---|
405 | mgppPasses.finish();
|
---|
406 | try {
|
---|
407 | Thread.sleep(1000);
|
---|
408 | } catch (Exception e) {}
|
---|
409 |
|
---|
410 | int exit_value = mgppPasses.exitValue();
|
---|
411 | System.out.println("Pass " + this.pass + " completed with " + exit_value);
|
---|
412 | if (exit_value !=0) {
|
---|
413 | //assume something has gone wrong, don't continue
|
---|
414 | // if (index != null) {
|
---|
415 | // index.setError(true);
|
---|
416 | // return false;
|
---|
417 | // }
|
---|
418 | }
|
---|
419 |
|
---|
420 | String osextra = "";
|
---|
421 | if (!Misc.isWindows()) {
|
---|
422 | osextra = " -d / ";
|
---|
423 | }
|
---|
424 |
|
---|
425 | switch (this.pass) {
|
---|
426 | case 0:
|
---|
427 | //System.exit(1);
|
---|
428 | System.out.println("Compressing dictionary");
|
---|
429 | exit_value = Processing.runProcess("mgpp_compression_dict -f " + this.textStem + " -S -H -2 -k 5120"+ osextra);
|
---|
430 |
|
---|
431 | if (exit_value == 0) {
|
---|
432 | System.out.println("Compressed dictionary successfully written");
|
---|
433 | } else {
|
---|
434 | System.err.println("Error from mgpp_compression_dict: " + exit_value);
|
---|
435 | //index.setError(true);
|
---|
436 | return false;
|
---|
437 | }
|
---|
438 | break;
|
---|
439 |
|
---|
440 | case 2:
|
---|
441 | System.out.println("Creating perfect hash");
|
---|
442 | exit_value = Processing.runProcess("mgpp_perf_hash_build -f " + this.indexStem + osextra);
|
---|
443 | if (exit_value ==0) {
|
---|
444 | System.out.println("Perfect hashes completed");
|
---|
445 | } else {
|
---|
446 | System.err.println("Unable to build the perfect hash");
|
---|
447 | //index.setError(true);
|
---|
448 | return false;
|
---|
449 | }
|
---|
450 | break;
|
---|
451 |
|
---|
452 | case 3:
|
---|
453 | System.out.println("Writing weights file");
|
---|
454 | exit_value = Processing.runProcess("mgpp_weights_build -f " + this.indexStem + osextra);
|
---|
455 | if (exit_value ==0) {
|
---|
456 | System.out.println("Weights file successfully written");
|
---|
457 | } else {
|
---|
458 | System.err.println("Unable to create weights file");
|
---|
459 | //index.setError(true);
|
---|
460 | return false;
|
---|
461 | }
|
---|
462 |
|
---|
463 | System.out.println("Creating inverted dictionary");
|
---|
464 | exit_value = Processing.runProcess("mgpp_invf_dict -f " + this.indexStem + osextra);
|
---|
465 | if (exit_value ==0) {
|
---|
466 | System.out.println("Inverted dictionary file successfully written");
|
---|
467 | } else {
|
---|
468 | System.out.println("Unable to create inverted dictionary file");
|
---|
469 | //index.setError(true);
|
---|
470 | return false;
|
---|
471 | }
|
---|
472 |
|
---|
473 | System.out.println("Creating Stem indexes");
|
---|
474 | exit_value = Processing.runProcess("mgpp_stem_idx -b 4096 -s1 -f " + this.indexStem +osextra);
|
---|
475 | if (exit_value == 0) {
|
---|
476 | System.out.println("Stemmed index 1 successfully written");
|
---|
477 | } else {
|
---|
478 | System.out.println("Unable to create stemmed index 1");
|
---|
479 | //index.setError(true);
|
---|
480 | return false;
|
---|
481 | }
|
---|
482 |
|
---|
483 | exit_value = Processing.runProcess("mgpp_stem_idx -b 4096 -s2 -f " + this.indexStem + osextra);
|
---|
484 | if (exit_value == 0) {
|
---|
485 | System.out.println("Stemmed index 2 successfully written");
|
---|
486 | } else {
|
---|
487 | System.out.println("Unable to create stemmed index 2");
|
---|
488 | //index.setError(true);
|
---|
489 | return false;
|
---|
490 | }
|
---|
491 | exit_value = Processing.runProcess("mgpp_stem_idx -b 4096 -s3 -f " + this.indexStem + osextra);
|
---|
492 | if (exit_value == 0) {
|
---|
493 | System.out.println("Stemmed index 3 successfully written");
|
---|
494 | } else {
|
---|
495 | System.out.println("Unable to create stemmed index 3");
|
---|
496 | //index.setError(true);
|
---|
497 | return false;
|
---|
498 | }
|
---|
499 |
|
---|
500 | break;
|
---|
501 | } // switch
|
---|
502 |
|
---|
503 | return true;
|
---|
504 | }
|
---|
505 |
|
---|
506 | /**
|
---|
507 | * Do any tidying up
|
---|
508 | */
|
---|
509 | public void tidyup()
|
---|
510 | {
|
---|
511 | }
|
---|
512 |
|
---|
513 | /**
|
---|
514 | * Return the number of passes required for this index.
|
---|
515 | */
|
---|
516 | public int getNumberOfPasses()
|
---|
517 | {
|
---|
518 | return 4;
|
---|
519 | }
|
---|
520 |
|
---|
/**
 * Appends the descriptions of the MGPP search and retrieve services to the
 * given service rack list.
 *
 * Two service class elements are added, in this order: "GS3MGPPSearch" and
 * "GS3MGPPRetrieve". The search service describes the single "idx" index
 * (also the default index), the "Document" level (also the default level),
 * a field list holding "ZZ", the base index prefix "dtx" and the index file
 * stem. The retrieve service receives copies of the default level, base
 * index prefix and index stem elements.
 *
 * @param service_rack_list the element to append the service classes to;
 *                          new elements are created in its owner document
 * @return always true
 */
public boolean addServiceDescriptions(Element service_rack_list) {
    Document doc = service_rack_list.getOwnerDocument();

    // Only one index, level and field exist for now, so each is also the default
    String def_index = "idx";
    String def_level = "Document";

    Element index_list = doc.createElement(GSXML.INDEX_ELEM + GSXML.LIST_MODIFIER);
    index_list.appendChild(createNamedElement(doc, GSXML.INDEX_ELEM, def_index));

    Element level_list = doc.createElement("levelList");
    level_list.appendChild(createNamedElement(doc, "level", def_level));

    Element field_list = doc.createElement("fieldList");
    field_list.appendChild(createNamedElement(doc, "field", "ZZ"));

    Element default_index = createNamedElement(doc, "defaultIndex", def_index);
    Element default_level = createNamedElement(doc, "defaultLevel", def_level);
    Element base_index_name = createNamedElement(doc, "baseIndexPrefix", "dtx");
    // same stem that configure() uses when naming the files on disk
    Element index_stem = createNamedElement(doc, "indexStem", INDEX_FILE_STEM);

    Element search_service_elem = createNamedElement(doc, GSXML.SERVICE_CLASS_ELEM, "GS3MGPPSearch");
    Element retrieve_service_elem = createNamedElement(doc, GSXML.SERVICE_CLASS_ELEM, "GS3MGPPRetrieve");
    service_rack_list.appendChild(search_service_elem);
    service_rack_list.appendChild(retrieve_service_elem);

    search_service_elem.appendChild(index_list);
    search_service_elem.appendChild(default_index);
    search_service_elem.appendChild(level_list);
    search_service_elem.appendChild(default_level);
    search_service_elem.appendChild(field_list); // do we need this??
    search_service_elem.appendChild(base_index_name);
    search_service_elem.appendChild(index_stem);

    // The retrieve service gets its own copies: a node can have only one parent
    retrieve_service_elem.appendChild(default_level.cloneNode(true));
    retrieve_service_elem.appendChild(base_index_name.cloneNode(true));
    retrieve_service_elem.appendChild(index_stem.cloneNode(true));

    return true;
}

/**
 * Creates an element with the given tag whose GSXML name attribute is set
 * to the given value.
 */
private static Element createNamedElement(Document doc, String tag, String nameValue) {
    Element element = doc.createElement(tag);
    element.setAttribute(GSXML.NAME_ATT, nameValue);
    return element;
}
597 |
|
---|
598 |
|
---|
599 | private Node recurseDOM(DocumentInterface metsDoc, Node node,
|
---|
600 | AbstractStructure structure, StringBuffer textBuffer,
|
---|
601 | StringBuffer extraBuffer, String namespace)
|
---|
602 | //String name, String namespace, String field)
|
---|
603 | {
|
---|
604 | // send out the ctrl-c...if this is
|
---|
605 | if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
|
---|
606 | // try doing this for all index types
|
---|
607 | if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) {
|
---|
608 | METSDivision division = (METSDivision) structure;
|
---|
609 |
|
---|
610 | // get the division metadata block
|
---|
611 | METSDescriptive descriptive;
|
---|
612 | String metadataId = division.getDefaultMetadataReference();
|
---|
613 | if (metadataId == null) {
|
---|
614 | descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
|
---|
615 | division.addMetadataReference(descriptive.getID());
|
---|
616 | }
|
---|
617 | else {
|
---|
618 | // Get the descriptive item...
|
---|
619 | descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
|
---|
620 | }
|
---|
621 |
|
---|
622 | descriptive.addMetadata("gsdl3", "mgseqno", this.name + "." + Integer.toString(this.sectionSeqNo));
|
---|
623 |
|
---|
624 | metsDoc.setChanged(true);
|
---|
625 | //metsDoc.setModified(true);
|
---|
626 | // System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
|
---|
627 | } // section level
|
---|
628 |
|
---|
629 | // append an 'end of section' marker
|
---|
630 | //textBuffer.append(END_OF_SECTION);
|
---|
631 | this.sectionSeqNo ++;
|
---|
632 |
|
---|
633 | // for document-level indexes, always append an 'end of document' tag at the
|
---|
634 | // end of the document for each section. Otherwise, each section is followed
|
---|
635 | // by an end of document character. This ensures that all indexes use the
|
---|
636 | // same document numbering...
|
---|
637 | if (this.currentIndexLevel == null ||
|
---|
638 | this.currentIndexLevel.equals(IndexerInterface.DOCUMENT_LEVEL)) {
|
---|
639 | extraBuffer.append(END_OF_DOCUMENT);
|
---|
640 | }
|
---|
641 | else {
|
---|
642 | textBuffer.append(END_OF_DOCUMENT);
|
---|
643 | this.documentSeqNo ++;
|
---|
644 | }
|
---|
645 |
|
---|
646 | // produce the body here for metadata output of divisions - in the case of
|
---|
647 | // text output, that will happen below...
|
---|
648 | if (!this.currentIndexField.equals("text"))
|
---|
649 | { METSDescriptive descriptive;
|
---|
650 |
|
---|
651 | METSDivision division = (METSDivision) structure;
|
---|
652 |
|
---|
653 | String metadataId = division.getDefaultMetadataReference();
|
---|
654 |
|
---|
655 | descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
|
---|
656 | if (descriptive != null) {
|
---|
657 | List values = descriptive.getMetadata(namespace, this.currentIndexField);
|
---|
658 |
|
---|
659 | if (values != null) {
|
---|
660 | Iterator valueIter = values.iterator();
|
---|
661 | while (valueIter.hasNext()) {
|
---|
662 | String value = valueIter.next().toString();
|
---|
663 |
|
---|
664 | textBuffer.append(value);
|
---|
665 | if (valueIter.hasNext()) {
|
---|
666 | //textBuffer.append(END_OF_SECTION);
|
---|
667 | }
|
---|
668 | }
|
---|
669 | }
|
---|
670 | }
|
---|
671 | }
|
---|
672 | }
|
---|
673 |
|
---|
674 | // go through our children as required...
|
---|
675 | Iterator children = structure.getChildIterator();
|
---|
676 | Node startNode;
|
---|
677 | while (children.hasNext()) {
|
---|
678 | AbstractStructure child = (AbstractStructure) children.next();
|
---|
679 |
|
---|
680 | // get xpointer for child
|
---|
681 | // get start position node
|
---|
682 | if (metsDoc.getDocumentType() == "METS"){
|
---|
683 | startNode = ((METSDocument) metsDoc).getSectionStartNode((METSDivision) child);
|
---|
684 | } else {
|
---|
685 | startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
|
---|
686 | }
|
---|
687 | //Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
|
---|
688 |
|
---|
689 | // while this node isn't the child's start node, produce the HTML node text, if
|
---|
690 | // in text field mode...
|
---|
691 | if (this.currentIndexField.equals("text")) {
|
---|
692 | while (node != startNode) {
|
---|
693 | XPointer.printNode(node, textBuffer, false);
|
---|
694 |
|
---|
695 | // print buffer to node
|
---|
696 | node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));
|
---|
697 | }
|
---|
698 | }
|
---|
699 |
|
---|
700 | // recurse to child
|
---|
701 | node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, namespace); // name, namespace, field);
|
---|
702 | } // while next child
|
---|
703 |
|
---|
704 | // close a document - the actual closing \B will be done by the main
|
---|
705 | // loop, so only a required \C is printed here...
|
---|
706 | if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
|
---|
707 | while (node != null) {
|
---|
708 | if (this.currentIndexField.equals("text")) {
|
---|
709 | XPointer.printNode(node, textBuffer, false);
|
---|
710 | }
|
---|
711 | node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));
|
---|
712 | }
|
---|
713 |
|
---|
714 | //textBuffer.append(END_OF_SECTION);
|
---|
715 | this.sectionSeqNo ++;
|
---|
716 |
|
---|
717 | }
|
---|
718 | return node;
|
---|
719 | }
|
---|
720 |
|
---|
721 | private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace)
|
---|
722 | // String name, String namespace, String field)
|
---|
723 | { StringBuffer extraBuffer = new StringBuffer();
|
---|
724 | Node node = document.getDocumentElement();
|
---|
725 | StringBuffer textBuffer = new StringBuffer();
|
---|
726 |
|
---|
727 | this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); //name, namespace, field);
|
---|
728 | textBuffer.append(extraBuffer.toString());
|
---|
729 | return textBuffer.toString();
|
---|
730 | }
|
---|
731 |
|
---|
732 | }
|
---|