Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 6698

Last change on this file since 6698 was 6698, checked in by cs025, 20 years ago
Fixed minor issues in indexermanager.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.8 KB

Line
1	package org.greenstone.gsdl3.gs3build.indexers;
2
3	import java.util.List;
4	import java.util.ArrayList;
5	import java.util.Iterator;
6
7	import java.io.File;
8	import java.io.InputStream;
9	import java.io.OutputStream;
10	import java.io.IOException;
11
12	import org.w3c.dom.*;
13
14	import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
15	import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
16	import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
17	import org.greenstone.gsdl3.gs3build.metadata.*;
18	import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
19
20	public class MGIndexer implements IndexerInterface
21	{
22	int pass;
23	int documentSeqNo;
24	int sectionSeqNo;
25	boolean firstDocument;
26	String outputDirectory;
27	InputStream indexerFeedback;
28	InputStream indexerErrors;
29	OutputStream indexerTextfeed;
30	Process mg_passes;
31	File textDirectory;
32	File indexDirectory;
33	String indexStem;
34	String textStem;
35	List indexes;
36	String indexName;
37	String level;
38	String field;
39
40	static final char END_OF_DOCUMENT = (char) 2;
41	static final char END_OF_SECTION = (char) 3;
42	static final char END_OF_STREAM = (char) 4;
43
/**
 * Describes one index to be built: the level at which it is built and the
 * field whose content it covers.
 */
class MGIndex
{
    String level;
    String field;

    /**
     * Creates an index description from an explicit level and field.
     *
     * @param level the level of the index
     * @param field the field covered by the index
     */
    public MGIndex(String level, String field)
    {
        this.field = field;
        this.level = level;
    }

    /**
     * Creates an index description from a label of the form
     * <code>level:field</code>.  The label is split at its first colon.
     * When the label contains no colon, both level and field are left null.
     *
     * @param indexLabel the label to split
     */
    public MGIndex(String indexLabel)
    {
        int separator = indexLabel.indexOf(':');
        if (separator < 0) {
            // nothing to split on: level and field keep their null defaults
            return;
        }
        this.level = indexLabel.substring(0, separator);
        this.field = indexLabel.substring(separator + 1);
    }

    /**
     * @return the level of this index, possibly null
     */
    public String getLevel()
    {
        return level;
    }

    /**
     * @return the field of this index, possibly null
     */
    public String getField()
    {
        return field;
    }
}
70
/**
 * Creates an indexer that has no indexes configured yet.
 */
public MGIndexer()
{
    indexes = new ArrayList();
}
74
75	private String getIndexDirectory(String level, String field)
76	{ StringBuffer directory = new StringBuffer();
77	directory.append(Character.toLowerCase((char) level.charAt(0)));
78
79	int c, w;
80	w = 0;
81	c = 0;
82	while (c < field.length() && w < 2) {
83	char ch = field.charAt(c);
84
85	ch = Character.toLowerCase(ch);
86	if (Character.isLetter(ch)) {
87	if (ch != 'a' && ch != 'e' && ch != 'i' &&
88	ch != 'o' && ch != 'u') {
89	directory.append(ch);
90	w++;
91	}
92	}
93	c ++;
94	}
95	return directory.toString();
96	}
97
98	/**
99	* The output directory should be (collection)/building/text/ for
100	* normal Greenstone builds.
101	*
102	* @param <code>String</code> the label to configure
103	* @param <code>String</code> the value...
104	*/
105	public boolean configure(String label, String value)
106	{
107	if (label.equals(IndexerManager.outputDir)) {
108	this.outputDirectory = value;
109	this.textStem = value + "/text/index";
110	this.pass = 0;
111
112	// attempt to ensure that the text subdirectory exists
113	this.textDirectory = new File(outputDirectory, "text");
114	if (!textDirectory.exists()) {
115	if (!textDirectory.mkdir()) {
116	return false;
117	}
118	}
119	else if (!textDirectory.isDirectory()) {
120	return false;
121	}
122
123	// Sign to the user which mg directory is being used...
124	System.out.println("Output MG directory is " + this.textStem);
125	}
126	else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) {
127	this.indexes.add(new MGIndex(value));
128	}
129
130	return true;
131	}
132
133	public boolean addIndex(String level, String field)
134	{
135	MGIndex index = new MGIndex(level, field);
136	this.indexes.add(index);
137	return true;
138	}
139
140	private Node recurseDOM(DocumentInterface metsDoc, Node node,
141	AbstractStructure structure, StringBuffer textBuffer,
142	StringBuffer extraBuffer, String indexName,
143	String namespace, String field)
144	{
145	// send out the ctrl-c...if this is
146	if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
147	if ((indexName != null) && indexName.startsWith("s")) {
148	METSDivision division = (METSDivision) structure;
149
150	// get the division metadata block
151	METSDescriptive descriptive;
152	String metadataId = division.getDefaultMetadataReference();
153	if (metadataId == null) {
154	descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
155	division.addMetadataReference(descriptive.getID());
156	}
157	else {
158	// Get the descriptive item...
159	descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
160	}
161
162	descriptive.setMetadata("gsdl3", "mgseqno", indexName + "." + Integer.toString(this.sectionSeqNo));
163	metsDoc.setModified(true);
164	// System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
165	}
166
167	// append an 'end of section' marker
168	textBuffer.append(END_OF_SECTION);
169	this.sectionSeqNo ++;
170
171	// for document-level indexes, always append an 'end of document' tag at the
172	// end of the document for each section. Otherwise, each section is followed
173	// by an end of document character. This ensures that all indexes use the
174	// same document numbering...
175	if (this.level == null \|\|
176	this.level.equals(IndexerInterface.DOCUMENT_LEVEL)) {
177	extraBuffer.append(END_OF_DOCUMENT);
178	}
179	else {
180	textBuffer.append(END_OF_DOCUMENT);
181	this.documentSeqNo ++;
182	}
183
184	// produce the body here for metadata output of divisions - in the case of
185	// text output, that will happen below...
186	if (!this.field.equals("text"))
187	{ METSDescriptive descriptive;
188
189	METSDivision division = (METSDivision) structure;
190
191	String metadataId = division.getDefaultMetadataReference();
192
193	descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
194	if (descriptive != null) {
195	List values = descriptive.getMetadata(namespace, field);
196
197	if (values != null) {
198	Iterator valueIter = values.iterator();
199	while (valueIter.hasNext()) {
200	String value = valueIter.next().toString();
201
202	textBuffer.append(value);
203	if (valueIter.hasNext()) {
204	textBuffer.append(END_OF_SECTION);
205	}
206	}
207	}
208	}
209	}
210	}
211
212	// go through our children as required...
213	Iterator children = structure.getChildIterator();
214	while (children.hasNext()) {
215	AbstractStructure child = (AbstractStructure) children.next();
216
217	// get xpointer for child
218	// get start position node
219	Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
220
221	// while this node isn't the child's start node, produce the HTML node text, if
222	// in text field mode...
223	if (field.equals("text")) {
224	while (node != startNode) {
225	XPointer.printNode(node, textBuffer, false);
226
227	// print buffer to node
228	node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
229	}
230	}
231
232	// recurse to child
233	node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, indexName, namespace, field);
234	}
235
236	// close a document - the actual closing \B will be done by the main
237	// loop, so only a required \C is printed here...
238	if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
239	while (node != null) {
240	if (field.equals("text")) {
241	XPointer.printNode(node, textBuffer, false);
242	}
243	node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
244	}
245	/*
246	textBuffer.append(END_OF_SECTION);
247	this.sectionSeqNo ++;
248	*/
249	}
250	return node;
251	}
252
253	private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure,
254	String indexName, String namespace, String field)
255	{ StringBuffer extraBuffer = new StringBuffer();
256	Node node = document.getDocumentElement();
257	StringBuffer textBuffer = new StringBuffer();
258
259	this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, indexName, namespace, field);
260	textBuffer.append(extraBuffer.toString());
261	return textBuffer.toString();
262	}
263
264	/**
265	* Index a single document; the document interface can be used to extract individual
266	* metadata items etc. as required or desired and index those instead or as well as
267	* the body text of the document.
268	*/
269	public boolean indexDocument(DocumentID docID, DocumentInterface document)
270	{
271	if (!this.firstDocument)
272	{ // Send a 'CTRL-B' before the document itself
273	try {
274	this.indexerTextfeed.write(END_OF_DOCUMENT);
275	}
276	catch (IOException ex)
277	{ System.out.println("Bad output on end of document" + ex);
278	ex.printStackTrace();
279	return false;
280	}
281	}
282	String docText = null;
283
284	int startSeqNo = this.sectionSeqNo;
285	this.sectionSeqNo ++;
286
287	Document domDocument = document.getDOMDocument();
288	if (domDocument != null) {
289	METSStructure sections = document.getDocumentStructure().getStructure("Section");
290	if (sections != null) {
291	docText = this.prepareDOM(document, domDocument, sections, this.indexName, "gsdl3", this.field);
292	// System.out.println(docText);
293	}
294	}
295	if (docText == null) {
296	if (this.field.equals("text")) {
297	docText = Character.toString(END_OF_DOCUMENT) + Character.toString(END_OF_SECTION) +
298	document.getDocumentText();
299	}
300	else {
301	StringBuffer textBuffer = new StringBuffer();
302	textBuffer.append(END_OF_DOCUMENT);
303	textBuffer.append(END_OF_SECTION);
304	List values = document.getDocumentMetadataItem("gsdl3", this.field);
305	if (values != null) {
306	Iterator valueIter = values.iterator();
307	while (valueIter.hasNext()) {
308	String value = valueIter.next().toString();
309
310	textBuffer.append(value);
311	if (valueIter.hasNext()) {
312	textBuffer.append(END_OF_SECTION);
313	// sectionSeqNo ++;
314	}
315	}
316	}
317	else {
318	textBuffer.append("No data");
319	}
320	docText = textBuffer.toString();
321	}
322	sectionSeqNo ++;
323	}
324
325	/* if (this.pass == 0) {
326	System.err.println(docText);
327	}
328	*/
329
330	byte [] bytes = docText.getBytes();
331	int pos = 0, end = bytes.length;
332
333	try {
334	while (pos < end) {
335	this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
336	pos = pos + 512;
337
338	try {
339	while (this.indexerFeedback.available() > 0)
340	{ byte b[] = new byte[this.indexerFeedback.available()];
341	System.out.println("Feedback of " + this.indexerFeedback.available());
342	this.indexerFeedback.read(b);
343	System.out.println(b);
344	}
345	}
346	catch (IOException ex)
347	{ System.out.println(ex);
348	}
349
350
351	try {
352	while (this.indexerErrors.available() > 0)
353	{ byte b[] = new byte[this.indexerErrors.available()];
354	System.out.println("Feedback of " + this.indexerErrors.available());
355	this.indexerErrors.read(b);
356	System.out.println(new String(b));
357	}
358	}
359	catch (IOException ex)
360	{ System.out.println(ex);
361	}
362	}
363	}
364	catch (IOException ex)
365	{ System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
366	ex.printStackTrace();
367	return false;
368	}
369
370	// remember that we're not on the first document, assign the sequence number
371	// on the first pass only, and increment the sequence number.
372	this.firstDocument = false;
373	if (this.pass == 0) {
374	document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(startSeqNo));
375	//System.out.println("Assigning " + startSeqNo + " to " + document.getID());
376	}
377	this.documentSeqNo += 1;
378
379	try {
380	while (this.indexerErrors.available() > 0)
381	{ char c = (char) this.indexerErrors.read();
382	System.out.println(c);
383	}
384	while (this.indexerFeedback.available() > 0)
385	{ byte b[] = new byte[this.indexerFeedback.available()];
386	System.out.println("Feedback of " + this.indexerFeedback.available());
387	this.indexerFeedback.read(b);
388	}
389	}
390	catch (IOException ex)
391	{
392	}
393	return true;
394	}
395
396	/**
397	* Initialise the pass: open required files, check status
398	*/
399	public boolean startPass(int passNumber)
400	{
401	this.pass = passNumber;
402	this.firstDocument = true;
403	this.documentSeqNo = 1;
404	this.sectionSeqNo = 1;
405
406	int indexNo = (this.pass - 2) / 2;
407	if (this.pass >= 2) {
408	MGIndex index = (MGIndex) this.indexes.get(indexNo);
409
410	// attempt to ensure that the text subdirectory exists
411	this.indexDirectory = new File(outputDirectory, this.getIndexDirectory(index.getLevel(), index.getField()));
412	if (!indexDirectory.exists()) {
413	if (!indexDirectory.mkdir()) {
414	return false;
415	}
416	}
417	else if (!indexDirectory.isDirectory()) {
418	return false;
419	}
420
421	this.level = index.getLevel();
422	this.field = index.getField();
423	this.indexName = this.getIndexDirectory(index.getLevel(), index.getField());
424	this.indexStem = this.outputDirectory + File.separatorChar +
425	this.indexName + File.separatorChar + "index"; // TODO: modify for index
426	if (this.pass % 2 == 1) {
427	this.indexName = null;
428	}
429	}
430	else {
431	this.field = "text";
432	this.level = "section";
433	this.indexName = null;
434	}
435	System.out.println("level is " + this.level);
436	System.out.println("field is " + this.field);
437	System.out.println("index name is " + this.indexName);
438
439	// get the parameters for this execution of mg_passes
440	String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
441
442	int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
443
444	try {
445	switch (mgPass) {
446	case 0:
447	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
448	break;
449
450	case 1:
451	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
452	break;
453
454	case 2:
455	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
456	break;
457
458	case 3:
459	Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());
460	p.waitFor();
461	if (p.exitValue() == 0) {
462	System.out.println("Perfect hashes completed");
463	}
464
465	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
466	break;
467	}
468
469	this.indexerFeedback = mg_passes.getInputStream();
470	this.indexerErrors = mg_passes.getErrorStream();
471	this.indexerTextfeed = mg_passes.getOutputStream();
472	}
473	catch (IOException ex)
474	{ System.out.println(ex);
475	ex.printStackTrace();
476	return false;
477	}
478	catch (InterruptedException ex)
479	{ System.out.println(ex);
480	ex.printStackTrace();
481	return false;
482	}
483	System.out.println("Pass " + this.pass);
484	return true;
485	}
486
487	/**
488	* Complete a pass - reset file counters, close files, etc.
489	*/
490	public boolean endPass(int passNumber)
491	{ Process p;
492
493	try {
494	this.indexerTextfeed.write(END_OF_DOCUMENT);
495	this.indexerTextfeed.write(END_OF_STREAM);
496	while (this.indexerErrors.available() > 0)
497	{ char c = (char) this.indexerErrors.read();
498	System.out.print(c);
499	}
500	while (this.indexerFeedback.available() > 0)
501	{ byte b[] = new byte[this.indexerFeedback.available()];
502	System.out.print("Feedback of " + this.indexerFeedback.available());
503	this.indexerFeedback.read(b);
504	}
505
506	this.indexerTextfeed.close();
507	Thread.sleep(1000);
508	this.mg_passes.waitFor();
509	}
510	catch (IOException ex)
511	{ System.out.println(ex);
512	}
513	catch (InterruptedException ex)
514	{ System.out.println(ex);
515	}
516	System.out.println("Pass " + this.pass + " completed with " + this.mg_passes.exitValue());
517
518	int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
519
520	try {
521	switch (mgPass)
522	{
523	case 0:
524	System.out.println("Compressing dictionary");
525	p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
526	p.waitFor();
527	if (p.exitValue() != 0) {
528	System.out.println("Error from mg_compression_dict: " + p.exitValue());
529	}
530	else {
531	System.out.println("Compressed dictionary successfully written");
532	}
533	break;
534
535	case 3:
536	System.out.println("Writing weights file");
537	p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
538	p.waitFor();
539	if (p.exitValue() == 0) {
540	System.out.println("Weights file successfully written");
541	}
542	else {
543	System.out.println("Unable to create weights file " + "mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
544	}
545
546	p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());
547	p.waitFor();
548	if (p.exitValue() == 0) {
549	System.out.println("Inverted dictionary file successfully written");
550	}
551	else {
552	System.out.println("Unable to create inverted dictionary file");
553	}
554
555	p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());
556	p.waitFor();
557	if (p.exitValue() == 0) {
558	System.out.println("Stemmed index successfully written");
559	}
560	else {
561	System.out.println("Unable to create stemmed index");
562	}
563
564	p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());
565	p.waitFor();
566	if (p.exitValue() == 0) {
567	System.out.println("Stemmed index successfully written");
568	}
569	else {
570	System.out.println("Unable to create stemmed index");
571	}
572
573	p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());
574	p.waitFor();
575	if (p.exitValue() == 0) {
576	System.out.println("Stemmed index successfully written");
577	}
578	else {
579	System.out.println("Unable to create stemmed index");
580	}
581	break;
582	}
583	}
584	catch (IOException ex)
585	{ System.out.println(ex);
586	ex.printStackTrace();
587	return false;
588	}
589	catch (InterruptedException ex)
590	{ System.out.println(ex);
591	ex.printStackTrace();
592	return false;
593	}
594	return true;
595	}
596
597	/**
598	* Do any tidying up
599	*/
600	public void tidyup()
601	{
602	}
603
604	/**
605	* Return the number of passes required for this index.
606	*/
607	public int getNumberOfPasses()
608	{ return 2 + this.indexes.size() * 2;
609	}
610	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: