source: gsdl/trunk/bin/script/lucene_passes.pl@ 16968

Last change on this file since 16968 was 16264, checked in by mdewsnip, 16 years ago

Changed "-create" to "-removeold", and removed the non-functional "-remove <OID>" option.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone perllib directory at the front of
# @INC so that the "use util" below can be resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
34
35
36use strict;
37use util;
38
39
# Starts the Java GS2LuceneIndexer and attaches the global PIPEOUT
# filehandle to its standard input.
#
# Arguments:
#   $doc_tag_level       - passed to the indexer as its first positional argument
#   $full_builddir       - build directory (quoted on the command line)
#   $indexdir            - index name/directory (quoted on the command line)
#   $java_lucene_options - pre-formatted option string placed before the
#                          positional arguments; may be empty
#
# The jar is looked up as $GSDLHOME/bin/java/LuceneWrapper.jar.
# Dies if the pipe to the java command cannot be opened.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$java_lucene_options) = @_;

    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    my $classpath = &util::filename_cat($bin_java,"LuceneWrapper.jar");

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper.GS2LuceneIndexer";

    # Both path-like arguments are quoted so that directories containing
    # spaces reach the indexer as a single argument each.
    my $java_cmd = "$java_lucene $java_lucene_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # PIPEOUT is deliberately a global handle: monitor_xml_stream writes to
    # it and close_java_lucene closes it.
    open (PIPEOUT, "| $java_cmd") or die "lucene_passes.pl - couldn't run $java_cmd: $!\n";
}
52
53
# Closes the global PIPEOUT handle opened by open_java_lucene.
#
# Closing a pipe waits for the child process, so a false result means
# either the close itself failed or the indexer exited with a non-zero
# status.  The problem is reported on STDERR (it is not fatal) and the
# result of close() is returned, as before.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        # $! is set for a genuine close error; otherwise $? holds the
        # child's wait status.
        my $reason = $! ? "error closing pipe: $!"
                        : "indexer exited with status " . ($? >> 8);
        print STDERR "lucene_passes.pl - $reason\n";
    }

    return $closed_ok;
}
58
59
# Writes one document's XML to a file below the text directory.
#
# Arguments:
#   $full_textdir    - directory the output filename is relative to
#   $output_filename - relative filename (may contain sub-directories)
#   $doc_xml         - the XML text to store
#
# Any missing parent directories are created via util::mk_all_dir.
# Dies if the file cannot be opened or cannot be completely written.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last directory separator is the
    # directory that has to exist before the file can be created.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    # Three-argument open with a lexical handle: the filename is used
    # verbatim (no whitespace stripping or mode characters interpreted).
    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print $doc_out $doc_xml;

    # Buffered write errors (e.g. disk full) only surface at close time.
    close($doc_out)
        or die "Unable to finish writing $full_output_filename: $!";

    return;
}
78
79
# Compresses a previously saved document in place by running gzip on it.
#
# Arguments:
#   $full_textdir    - directory the filename is relative to
#   $output_filename - relative filename of the document to compress
#
# A failure to run gzip, or a non-zero gzip exit status, is reported on
# STDERR but is not fatal.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # List-form system() bypasses the shell, so spaces and shell
    # metacharacters in the path cannot split or alter the command.
    my $status = system("gzip", $full_output_filename);
    if ($status != 0) {
        print STDERR "lucene_passes.pl - gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
89
90
# Reads the XML stream produced during the build from STDIN and handles
# it one document at a time.
#
# Lines are accumulated until a closing </Doc> line is seen.  The
# file="..." attribute of the opening <Doc ...> line supplies the output
# filename.  What happens to a completed document depends on $mode:
#   "text"  - it is written below $full_textdir via save_xml_doc
#   "index" - it is written to PIPEOUT (the Lucene indexer)
# Any other mode just discards the document.  Compression of the saved
# file (compress_xml_doc) is currently not performed.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $buffer = "";
    my $current_file = "";

    while (defined(my $input = <STDIN>)) {
        $buffer .= $input;

        # Opening tag: remember which file this document belongs to
        if ($input =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $current_file = $1;
        }

        # Keep accumulating until the document is complete
        next unless ($input =~ m/^<\/Doc>$/);

        if ($mode eq "text") {
            save_xml_doc($full_textdir, $current_file, $buffer);
        }
        elsif ($mode eq "index") {
            # SAX parser seems to be sensitive to blank lines
            # => collapse every run of newlines into a single one
            $buffer =~ tr/\n//s;

            print PIPEOUT $buffer;
        }

        # Start afresh for the next document
        $buffer = "";
        $current_file = "";
    }
}
136
137
# /** This checks the arguments on the command line, filters out the
#  *  minus options, and then processes the XML stream arriving on STDIN.
#  *
#  *  Recognised options are collected into an option string for the java
#  *  wrapper:
#  *    -removeold        causes the existing index to be overwritten
#  *    -verbosity <num>  passed through together with its value
#  *  Unrecognised minus options are reported on STDERR and dropped.
#  *
#  *  The remaining (positional) arguments must be, in order:
#  *    mode ("text" or "index"), doc-tag-level, build-dir, index-name
#  *  If fewer than four are given a usage message is printed and the
#  *  script exits with status 1.
#  *
#  *  In "index" mode the Lucene indexer is started before the stream is
#  *  read and closed afterwards; documents being stored as text do not
#  *  need it.
#  *
#  *  @author John Rowe, DL Consulting
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $java_lucene_options = "";
    my @filtered_argv = ();

    my $i = 0;
    while ($i<$argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set\n";
                $java_lucene_options .= "-removeold ";
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                $i++;
                if ($i<$argc)
                {
                    # The trailing space keeps a following option from
                    # being glued onto the verbosity value.
                    $java_lucene_options .= "-verbosity " . $argv[$i] . " ";
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: lucene_passes.pl [-removeold|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my $mode = $filtered_argv[0];
    my $doc_tag_level = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir = $filtered_argv[3];

    # We only need the Lucene handle opened if we are indexing the documents, not if we are just storing the text
    if ($mode eq "index") {
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $java_lucene_options);
    }

    print STDERR "Monitoring for input!\n";
    my $full_textdir = &util::filename_cat($full_builddir,"text");
    monitor_xml_stream($mode, $full_textdir);

    if ($mode eq "index") {
        close_java_lucene();
    }
}
208
209
# Script entry point: hand the command-line arguments to main()
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.