source: main/trunk/greenstone2/bin/script/lucene_passes.pl@ 21323

Last change on this file since 21323 was 21323, checked in by ak19, 14 years ago

Minor corrective changes before major commits for new compile settings to compile indexers individually.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
BEGIN {
    # Refuse to start unless both Greenstone environment variables are
    # defined; GSDLHOME is needed straight away to extend the module path.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";

    # Put GSDLHOME/perllib at the front of the module search path so that
    # modules found there take precedence when they are loaded.
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
34
35
36use strict;
37use util;
38
39
# Starts the Java Lucene indexer and attaches its standard input to the
# PIPEOUT filehandle.
#
# A LuceneWrapper.jar in the collection's own bin/java directory (found via
# GSDLCOLLECTDIR) takes precedence; otherwise the one in GSDLHOME/bin/java
# is used.
#
# Arguments:
#   $doc_tag_level       - passed through to GS2LuceneIndexer
#   $full_builddir       - passed through (double-quoted) to GS2LuceneIndexer
#   $indexdir            - passed through (double-quoted) to GS2LuceneIndexer
#   $java_lucene_options - ready-made option string that is placed in front
#                          of the positional arguments
#
# Dies if no LuceneWrapper.jar can be found or if the pipe cannot be opened.
# On success PIPEOUT is left open for writing.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$java_lucene_options) = @_;

    my $jar_name = "LuceneWrapper.jar";
    my $classpath;

    # Is there a collection-specific bin/java/LuceneWrapper.jar file?
    # Only look when GSDLCOLLECTDIR is set, so that an undefined value is
    # never turned into a path.
    if (defined $ENV{'GSDLCOLLECTDIR'}) {
        my $collect_bin_java = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"bin","java");
        my $collect_jar = &util::filename_cat($collect_bin_java,$jar_name);
        $classpath = $collect_jar if (-f $collect_jar);
    }

    if (!defined $classpath) {
        # No, so use the Greenstone one
        my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
        $classpath = &util::filename_cat($bin_java,$jar_name);
        if (!-f $classpath) {
            die "***** ERROR: $classpath does not exist\n";
        }
    }

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper.GS2LuceneIndexer";

    # Both directory arguments are quoted so that paths containing spaces
    # reach the indexer as single arguments.
    my $java_cmd = "$java_lucene $java_lucene_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # Three-argument form: the mode is given separately instead of being
    # parsed out of the command string.
    open(PIPEOUT, "|-", $java_cmd)
        or die "lucene_passes.pl - couldn't run $java_cmd: $!\n";
}
62
63
# Closes the PIPEOUT pipe and waits for the process on the other end of it
# to finish.
#
# Returns the result of close(): true on success, false if closing failed
# or the child process exited with a non-zero status.  A failure is also
# reported on STDERR, because a failing child would otherwise go unnoticed.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    # Capture both immediately; later calls may overwrite them
    my ($os_error, $wait_status) = ($!, $?);

    if (!$closed_ok) {
        if ($os_error) {
            print STDERR "lucene_passes.pl - error closing pipe to Lucene indexer: $os_error\n";
        }
        else {
            # For a piped open, $! is 0 when the only problem was the
            # child's exit status
            print STDERR "lucene_passes.pl - Lucene indexer exited abnormally (exit code "
                . ($wait_status >> 8) . ", wait status $wait_status)\n";
        }
    }

    return $closed_ok;
}
68
69
# Writes one document's XML to a file beneath the text directory, creating
# the file's parent directory first.
#
# Arguments:
#   $full_textdir    - directory the output file name is relative to
#   $output_filename - file name (may include sub-directories)
#   $doc_xml         - the text to write
#
# Dies if the file cannot be opened, written or closed.  Returns nothing.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last separator is the directory.
    # NOTE(review): $dir_sep is interpolated into the pattern unescaped,
    # presumably because util::get_os_dirsep returns a regex-ready value -
    # confirm before changing.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &util::mk_all_dir($full_output_dir) if (defined $full_output_dir);

    # Three-argument open so that characters in the file name can never be
    # mistaken for part of the open mode
    open(my $doc_out, ">", $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors only surface here
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    return;
}
88
89
# Compresses a previously saved document by running gzip on it.
#
# Arguments:
#   $full_textdir    - directory the file name is relative to
#   $output_filename - file name (may include sub-directories)
#
# A failure to run gzip, or a non-zero status from it, is reported on
# STDERR.  Returns nothing.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # List form of system: the file name is handed to gzip as a single
    # argument without going through a shell, so spaces or shell
    # metacharacters in it are harmless.
    my $status = system("gzip", $full_output_filename);

    if ($status == -1) {
        print STDERR "lucene_passes.pl - couldn't run gzip: $!\n";
    }
    elsif ($status != 0) {
        print STDERR "lucene_passes.pl - gzip failed for $full_output_filename (wait status $status)\n";
    }

    return;
}
99
100
# Reads the document stream arriving on STDIN one line at a time and
# gathers lines until a closing </Doc> line completes a document.  Each
# completed document is then either saved to a file beneath $full_textdir
# ("text" mode) or forwarded to the indexer listening on PIPEOUT ("index"
# mode); for any other $mode it is simply dropped.  Lines that follow the
# last </Doc> are never output.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml = "";
    my $target_file = "";

    while (defined (my $input_line = <STDIN>)) {
        $pending_xml .= $input_line;

        # The opening <Doc ...> line carries the name of the output file
        if ($input_line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $target_file = $1;
        }

        # Keep accumulating until the document is complete
        next unless ($input_line =~ m/^<\/Doc>$/);

        if ($mode eq "index") {
            # notify lucene indexer

            # SAX parser seems to be sensitive to blank lines
            # => squeeze every run of newlines down to a single one
            $pending_xml =~ s/\n+/\n/g;

            print PIPEOUT $pending_xml;
        }
        elsif ($mode eq "text") {
            save_xml_doc($full_textdir,$target_file,$pending_xml);
        }
        # (compressing the saved file with compress_xml_doc is currently
        # disabled)

        # Start afresh for the next document
        $pending_xml = "";
        $target_file = "";
    }
}
146
147
# /** Checks the arguments on the command line, separates the minus
#  *  options from the positional arguments and then runs one pass over
#  *  the document stream on standard input.
#  *
#  *  Minus options:
#  *    -removeold       forwarded to the Java indexer
#  *    -verbosity <num> forwarded to the Java indexer (ignored if the
#  *                     number is missing)
#  *  Any other minus option is reported on STDERR and skipped.
#  *
#  *  Positional arguments (at least four are required, otherwise a usage
#  *  message is printed and the script exits with status 1):
#  *    mode ("text" or "index"), doc-tag-level, build-dir, index-name
#  *
#  *  The Java indexer is only started (and later closed) in "index" mode.
#  *
#  *  @param argv the command line arguments
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $java_lucene_options = "";
    my @filtered_argv = ();

    my $i = 0;
    while ($i < $argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set\n";
                $java_lucene_options .= "-removeold ";
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                $i++;
                if ($i < $argc) {
                    # The trailing space keeps a following option from
                    # being glued onto the number ("-verbosity 3-removeold")
                    $java_lucene_options .= "-verbosity " . $argv[$i] . " ";
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: lucene_passes.pl [-removeold|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my $mode = $filtered_argv[0];
    my $doc_tag_level = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir = $filtered_argv[3];

    # We only need the Lucene handle opened if we are indexing the documents, not if we are just storing the text
    if ($mode eq "index") {
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $java_lucene_options);
    }

    print STDERR "Monitoring for input!\n";
    my $full_textdir = &util::filename_cat($full_builddir,"text");
    monitor_xml_stream($mode, $full_textdir);

    if ($mode eq "index") {
        close_java_lucene();
    }
}
217
218
# Script entry point: hand the command line arguments to main()
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.