Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java@ 6376

Last change on this file since 6376 was 6376, checked in by cs025, 20 years ago
Fixed some problems in MG indexing; also changed use of mgseqno to a by-index basis.
Property svn:keywords set to `Author Date Id Revision`
File size: 17.7 KB

Line
1	package org.greenstone.gsdl3.gs3build.indexers;
2
3	import java.util.List;
4	import java.util.ArrayList;
5	import java.util.Iterator;
6
7	import java.io.File;
8	import java.io.InputStream;
9	import java.io.OutputStream;
10	import java.io.IOException;
11
12	import org.w3c.dom.*;
13
14	import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
15	import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
16	import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
17	import org.greenstone.gsdl3.gs3build.metadata.*;
18	import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
19
20	public class MGIndexer implements IndexerInterface
21	{
22	int pass;
23	int documentSeqNo;
24	int sectionSeqNo;
25	boolean firstDocument;
26	String outputDirectory;
27	InputStream indexerFeedback;
28	InputStream indexerErrors;
29	OutputStream indexerTextfeed;
30	Process mg_passes;
31	File textDirectory;
32	File indexDirectory;
33	String indexStem;
34	String textStem;
35	List indexes;
36	String indexName;
37	String level;
38	String field;
39
40	static final char END_OF_DOCUMENT = (char) 2;
41	static final char END_OF_SECTION = (char) 3;
42	static final char END_OF_STREAM = (char) 4;
43
/**
 * Describes one index to build: the level at which text is indexed and the
 * field whose content is indexed.
 */
class MGIndex
{
    String level;
    String field;

    /**
     * Creates an index description from explicit parts.
     *
     * @param level the index level
     * @param field the field to index
     */
    public MGIndex(String level, String field)
    {
        this.level = level;
        this.field = field;
    }

    /**
     * Creates an index description from a label of the form
     * <code>level:field</code>, split at the first colon.
     *
     * A label without a colon, or with nothing before the colon, used to
     * leave the level null or empty, which made the later directory-name
     * computation fail. Such labels now get the level "document", and a
     * label with no colon at all is taken to be the field name.
     *
     * @param indexLabel the label to parse; must not be null
     */
    public MGIndex(String indexLabel)
    {
        int colonAt = indexLabel.indexOf(':');

        if (colonAt > 0) {
            this.level = indexLabel.substring(0, colonAt);
        }
        else {
            this.level = "document";
        }

        if (colonAt >= 0) {
            this.field = indexLabel.substring(colonAt + 1);
        }
        else {
            this.field = indexLabel;
        }
    }

    /**
     * @return the index level
     */
    public String getLevel()
    {
        return this.level;
    }

    /**
     * @return the indexed field
     */
    public String getField()
    {
        return this.field;
    }
}
70
/**
 * Creates an indexer whose list of indexes starts out empty.
 */
public MGIndexer()
{
    indexes = new ArrayList();
}
74
75	private String getIndexDirectory(String level, String field)
76	{ StringBuffer directory = new StringBuffer();
77	directory.append(Character.toLowerCase((char) level.charAt(0)));
78
79	int c, w;
80	w = 0;
81	c = 0;
82	while (c < field.length() && w < 2) {
83	char ch = field.charAt(c);
84
85	ch = Character.toLowerCase(ch);
86	if (Character.isLetter(ch)) {
87	if (ch != 'a' && ch != 'e' && ch != 'i' &&
88	ch != 'o' && ch != 'u') {
89	directory.append(ch);
90	w++;
91	}
92	}
93	c ++;
94	}
95	return directory.toString();
96	}
97
98	/**
99	* The output directory should be (collection)/building/text/ for
100	* normal Greenstone builds.
101	*
102	* @param <code>String</code> the label to configure
103	* @param <code>String</code> the value...
104	*/
105	public boolean configure(String label, String value)
106	{
107	if (label.equals(IndexerManager.outputDir)) {
108	this.outputDirectory = value;
109	this.textStem = value + "/text/index";
110	this.pass = 0;
111
112	// attempt to ensure that the text subdirectory exists
113	this.textDirectory = new File(outputDirectory, "text");
114	if (!textDirectory.exists()) {
115	if (!textDirectory.mkdir()) {
116	return false;
117	}
118	}
119	else if (!textDirectory.isDirectory()) {
120	return false;
121	}
122
123	// Sign to the user which mg directory is being used...
124	System.out.println("Output MG directory is " + this.textStem);
125	}
126	else if (label.equals(IndexerInterface.GS2_INDEX_LABEL)) {
127	this.indexes.add(new MGIndex(value));
128	}
129
130	return true;
131	}
132
133	public boolean addIndex(String level, String field)
134	{
135	MGIndex index = new MGIndex(level, field);
136	this.indexes.add(index);
137	return true;
138	}
139
140	private Node recurseDOM(DocumentInterface metsDoc, Node node,
141	AbstractStructure structure, StringBuffer textBuffer,
142	StringBuffer extraBuffer, String indexName,
143	String namespace, String field)
144	{
145	// send out the ctrl-c...if this is
146	if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
147	if ((indexName != null) && indexName.startsWith("s")) {
148	METSDivision division = (METSDivision) structure;
149
150	// get the division metadata block
151	METSDescriptive descriptive;
152	String metadataId = division.getDefaultMetadataReference();
153	if (metadataId == null) {
154	descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
155	division.addMetadataReference(descriptive.getID());
156	}
157	else {
158	// Get the descriptive item...
159	descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
160	}
161
162	descriptive.addMetadata("gsdl3", "mgseqno", indexName + "." + Integer.toString(this.documentSeqNo));
163	metsDoc.setModified(true);
164	// System.out.println("Assigning " + this.documentSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
165	}
166
167	// append an 'end of section' marker
168	textBuffer.append(END_OF_SECTION);
169
170	// for document-level indexes, always append an 'end of document' tag at the
171	// end of the document for each section. Otherwise, each section is followed
172	// by an end of document character. This ensures that all indexes use the
173	// same document numbering...
174	if (this.level == null \|\|
175	this.level.equals(IndexerInterface.DOCUMENT_LEVEL)) {
176	// extraBuffer.append(END_OF_DOCUMENT);
177	}
178	else {
179	textBuffer.append(END_OF_DOCUMENT);
180	this.documentSeqNo ++;
181	}
182	this.sectionSeqNo ++;
183
184	// produce the body here for metadata output of divisions - in the case of
185	// text output, that will happen below...
186	if (!this.field.equals("text"))
187	{ METSDescriptive descriptive;
188
189	METSDivision division = (METSDivision) structure;
190
191	String metadataId = division.getDefaultMetadataReference();
192
193	descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
194	if (descriptive != null) {
195	List values = descriptive.getMetadata(namespace, field);
196
197	if (values != null) {
198	Iterator valueIter = values.iterator();
199	while (valueIter.hasNext()) {
200	String value = valueIter.next().toString();
201
202	textBuffer.append(value);
203	if (valueIter.hasNext()) {
204	textBuffer.append(END_OF_SECTION);
205	}
206	}
207	}
208	}
209	}
210	}
211
212	// go through our children as required...
213	Iterator children = structure.getChildIterator();
214	while (children.hasNext()) {
215	AbstractStructure child = (AbstractStructure) children.next();
216
217	// get xpointer for child
218	// get start position node
219	Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
220
221	// while this node isn't the child's start node, produce the HTML node text, if
222	// in text field mode...
223	if (field.equals("text")) {
224	while (node != startNode) {
225	XPointer.printNode(node, textBuffer, false);
226
227	// print buffer to node
228	node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
229	}
230	}
231
232	// recurse to child
233	this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, indexName, namespace, field);
234	}
235
236	// close a document - the actual closing \B will be done by the main
237	// loop, so only a required \C is printed here...
238	if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
239	while (node != null) {
240	if (field.equals("text")) {
241	XPointer.printNode(node, textBuffer, false);
242	}
243	node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
244	}
245	/*
246	textBuffer.append(END_OF_SECTION);
247	this.sectionSeqNo ++;
248	*/
249	}
250	return node;
251	}
252
253	private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure,
254	String indexName, String namespace, String field)
255	{ StringBuffer extraBuffer = new StringBuffer();
256	Node node = document.getDocumentElement();
257	StringBuffer textBuffer = new StringBuffer();
258
259	this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, indexName, namespace, field);
260	textBuffer.append(extraBuffer.toString());
261	return textBuffer.toString();
262	}
263
264	/**
265	* Index a single document; the document interface can be used to extract individual
266	* metadata items etc. as required or desired and index those instead or as well as
267	* the body text of the document.
268	*/
269	public boolean indexDocument(DocumentID docID, DocumentInterface document)
270	{
271	if (!this.firstDocument)
272	{ // Send a 'CTRL-B' before the document itself
273	try {
274	this.indexerTextfeed.write(2);
275	}
276	catch (IOException ex)
277	{ System.out.println("Bad output on end of document" + ex);
278	ex.printStackTrace();
279	return false;
280	}
281	}
282	String docText = null;
283
284	int startSeqNo = this.sectionSeqNo;
285
286	Document domDocument = document.getDOMDocument();
287	if (domDocument != null) {
288	METSStructure sections = document.getDocumentStructure().getStructure("Section");
289	if (sections != null) {
290	docText = this.prepareDOM(document, domDocument, sections, this.indexName, "gsdl3", this.field);
291	// System.out.println(docText);
292	}
293	}
294	if (docText == null) {
295	if (this.field.equals("text")) {
296	docText = Character.toString(END_OF_DOCUMENT) + Character.toString(END_OF_SECTION) +
297	document.getDocumentText();
298	}
299	else {
300	StringBuffer textBuffer = new StringBuffer();
301	textBuffer.append(END_OF_DOCUMENT);
302	textBuffer.append(END_OF_SECTION);
303	List values = document.getDocumentMetadataItem("gsdl3", this.field);
304	if (values != null) {
305	Iterator valueIter = values.iterator();
306	while (valueIter.hasNext()) {
307	String value = valueIter.next().toString();
308
309	textBuffer.append(value);
310	if (valueIter.hasNext()) {
311	textBuffer.append(END_OF_SECTION);
312	sectionSeqNo ++;
313	}
314	}
315	}
316	else {
317	textBuffer.append("No data");
318	}
319	docText = textBuffer.toString();
320	}
321	sectionSeqNo ++;
322	}
323
324	byte [] bytes = docText.getBytes();
325	int pos = 0, end = bytes.length;
326
327	try {
328	while (pos < end) {
329	this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
330	pos = pos + 512;
331
332	try {
333	while (this.indexerFeedback.available() > 0)
334	{ byte b[] = new byte[this.indexerFeedback.available()];
335	System.out.println("Feedback of " + this.indexerFeedback.available());
336	this.indexerFeedback.read(b);
337	System.out.println(b);
338	}
339	}
340	catch (IOException ex)
341	{ System.out.println(ex);
342	}
343
344
345	try {
346	while (this.indexerErrors.available() > 0)
347	{ byte b[] = new byte[this.indexerErrors.available()];
348	System.out.println("Feedback of " + this.indexerErrors.available());
349	this.indexerErrors.read(b);
350	System.out.println(new String(b));
351	}
352	}
353	catch (IOException ex)
354	{ System.out.println(ex);
355	}
356	}
357	}
358	catch (IOException ex)
359	{ System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
360	ex.printStackTrace();
361	return false;
362	}
363
364	// remember that we're not on the first document, assign the sequence number
365	// on the first pass only, and increment the sequence number.
366	this.firstDocument = false;
367	if (this.pass == 0) {
368	document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(this.documentSeqNo));
369	// System.out.println("Assigning " + startSeqNo + " to " + document.getID());
370	}
371	this.documentSeqNo += 1;
372
373	try {
374	while (this.indexerErrors.available() > 0)
375	{ char c = (char) this.indexerErrors.read();
376	System.out.println(c);
377	}
378	while (this.indexerFeedback.available() > 0)
379	{ byte b[] = new byte[this.indexerFeedback.available()];
380	System.out.println("Feedback of " + this.indexerFeedback.available());
381	this.indexerFeedback.read(b);
382	}
383	}
384	catch (IOException ex)
385	{
386	}
387	return true;
388	}
389
390	/**
391	* Initialise the pass: open required files, check status
392	*/
393	public boolean startPass(int passNumber)
394	{
395	this.pass = passNumber;
396	this.firstDocument = true;
397	this.documentSeqNo = 1;
398	this.sectionSeqNo = 1;
399
400	int indexNo = (this.pass - 2) / 2;
401	if (this.pass >= 2) {
402	MGIndex index = (MGIndex) this.indexes.get(indexNo);
403
404	// attempt to ensure that the text subdirectory exists
405	this.indexDirectory = new File(outputDirectory, this.getIndexDirectory(index.getLevel(), index.getField()));
406	if (!indexDirectory.exists()) {
407	if (!indexDirectory.mkdir()) {
408	return false;
409	}
410	}
411	else if (!indexDirectory.isDirectory()) {
412	return false;
413	}
414
415	this.level = index.getLevel();
416	this.field = index.getField();
417	this.indexName = this.getIndexDirectory(index.getLevel(), index.getField());
418	this.indexStem = this.outputDirectory + File.separatorChar +
419	this.indexName + File.separatorChar + "index"; // TODO: modify for index
420	if (this.pass % 2 == 1) {
421	this.indexName = null;
422	}
423	}
424	else {
425	this.field = "text";
426	this.level = "document";
427	this.indexName = null;
428	}
429	System.out.println("level is " + this.level);
430	System.out.println("field is " + this.field);
431	System.out.println("index name is " + this.indexName);
432
433	// get the parameters for this execution of mg_passes
434	String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
435
436	int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
437
438	try {
439	switch (mgPass) {
440	case 0:
441	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
442	break;
443
444	case 1:
445	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
446	break;
447
448	case 2:
449	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
450	break;
451
452	case 3:
453	Process p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());
454	p.waitFor();
455	if (p.exitValue() == 0) {
456	System.out.println("Perfect hashes completed");
457	}
458
459	mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
460	break;
461	}
462
463	this.indexerFeedback = mg_passes.getInputStream();
464	this.indexerErrors = mg_passes.getErrorStream();
465	this.indexerTextfeed = mg_passes.getOutputStream();
466	}
467	catch (IOException ex)
468	{ System.out.println(ex);
469	ex.printStackTrace();
470	return false;
471	}
472	catch (InterruptedException ex)
473	{ System.out.println(ex);
474	ex.printStackTrace();
475	return false;
476	}
477	System.out.println("Pass " + this.pass);
478	return true;
479	}
480
481	/**
482	* Complete a pass - reset file counters, close files, etc.
483	*/
484	public boolean endPass(int passNumber)
485	{ Process p;
486
487	try {
488	this.indexerTextfeed.write(END_OF_DOCUMENT);
489	this.indexerTextfeed.write(END_OF_STREAM);
490	while (this.indexerErrors.available() > 0)
491	{ char c = (char) this.indexerErrors.read();
492	System.out.print(c);
493	}
494	while (this.indexerFeedback.available() > 0)
495	{ byte b[] = new byte[this.indexerFeedback.available()];
496	System.out.print("Feedback of " + this.indexerFeedback.available());
497	this.indexerFeedback.read(b);
498	}
499
500	this.indexerTextfeed.close();
501	Thread.sleep(1000);
502	this.mg_passes.waitFor();
503	}
504	catch (IOException ex)
505	{ System.out.println(ex);
506	}
507	catch (InterruptedException ex)
508	{ System.out.println(ex);
509	}
510	System.out.println("Pass " + this.pass + " completed with " + this.mg_passes.exitValue());
511
512	int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
513
514	try {
515	switch (mgPass)
516	{
517	case 0:
518	System.out.println("Compressing dictionary");
519	p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
520	p.waitFor();
521	if (p.exitValue() != 0) {
522	System.out.println("Error from mg_compression_dict: " + p.exitValue());
523	}
524	else {
525	System.out.println("Compressed dictionary successfully written");
526	}
527	break;
528
529	case 3:
530	System.out.println("Writing weights file");
531	p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
532	p.waitFor();
533	if (p.exitValue() == 0) {
534	System.out.println("Weights file successfully written");
535	}
536	else {
537	System.out.println("Unable to create weights file " + "mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
538	}
539
540	p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());
541	p.waitFor();
542	if (p.exitValue() == 0) {
543	System.out.println("Inverted dictionary file successfully written");
544	}
545	else {
546	System.out.println("Unable to create inverted dictionary file");
547	}
548
549	p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());
550	p.waitFor();
551	if (p.exitValue() == 0) {
552	System.out.println("Stemmed index successfully written");
553	}
554	else {
555	System.out.println("Unable to create stemmed index");
556	}
557
558	p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());
559	p.waitFor();
560	if (p.exitValue() == 0) {
561	System.out.println("Stemmed index successfully written");
562	}
563	else {
564	System.out.println("Unable to create stemmed index");
565	}
566
567	p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());
568	p.waitFor();
569	if (p.exitValue() == 0) {
570	System.out.println("Stemmed index successfully written");
571	}
572	else {
573	System.out.println("Unable to create stemmed index");
574	}
575	break;
576	}
577	}
578	catch (IOException ex)
579	{ System.out.println(ex);
580	ex.printStackTrace();
581	return false;
582	}
583	catch (InterruptedException ex)
584	{ System.out.println(ex);
585	ex.printStackTrace();
586	return false;
587	}
588	return true;
589	}
590
591	/**
592	* Do any tidying up
593	*/
594	public void tidyup()
595	{
596	}
597
598	/**
599	* Return the number of passes required for this index.
600	*/
601	public int getNumberOfPasses()
602	{ return 2 + this.indexes.size() * 2;
603	}
604	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: