source: main/trunk/greenstone2/bin/script/lucene_passes.pl@ 30741

Last change on this file since 30741 was 30741, checked in by kjdon, 8 years ago

Lucene expects the stored text file to be named doc.xml, so output it as such no matter what the input file was called. Usually the input will have been doc.xml, but it may not be, e.g. when using METS as the archive format.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
BEGIN {
    # Both variables must be present in the environment before anything
    # else is compiled; GSDLHOME also locates the perllib directory.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    # Put the Greenstone perllib directory at the front of the module
    # search path so the 'use' statements below resolve against it.
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
34
35
36use strict;
37use util;
38use FileUtils;
39
# Start the Java GS2LuceneIndexer and attach its standard input to the
# PIPEOUT filehandle (which monitor_xml_stream writes to and
# close_java_lucene closes).
#
# Arguments:
#   $doc_tag_level       - passed through as the indexer's first positional argument
#   $full_builddir       - build directory, passed (quoted) as the second argument
#   $indexdir            - index name, passed (quoted) as the third argument
#   $java_lucene_options - ready-made string of minus options for the indexer
#
# Dies if LuceneWrapper4.jar cannot be found in either the collection or
# the Greenstone bin/java directory, or if the pipe cannot be opened.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$java_lucene_options) = @_;

    # Is there a collection-specific bin/java/LuceneWrapper4.jar file?
    my $bin_java = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"bin","java");
    my $classpath = &FileUtils::javaFilenameConcatenate($bin_java,"LuceneWrapper4.jar");
    if (!-f $classpath)
    {
        # No, so use the Greenstone one
        $bin_java = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java");
        $classpath = &FileUtils::javaFilenameConcatenate($bin_java,"LuceneWrapper4.jar");
        if (!-f $classpath) {
            die "***** ERROR: $classpath does not exist\n";
        }
    }

    $full_builddir = &util::makeFilenameJavaCygwinCompatible($full_builddir);

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper4.GS2LuceneIndexer";

    # The command goes through the shell, so quote the index name the same
    # way as the build directory; otherwise a name containing whitespace
    # would be split into several arguments.
    my $java_cmd = "$java_lucene $java_lucene_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # PIPEOUT stays a package filehandle because the sibling subs use it.
    # Include $! so a failure to fork/exec is actually diagnosable.
    open (PIPEOUT, "| $java_cmd") or die "lucene_passes.pl - couldn't run $java_cmd: $!\n";
}
64
65
# Close the PIPEOUT pipe to the Java Lucene indexer.
#
# close() on a piped handle waits for the child and returns false if the
# child exited with a non-zero status (in which case $! is 0 and $? holds
# the wait status) or if a system call failed (in which case $! is set).
# Either situation is reported on STDERR rather than silently ignored.
#
# Returns the result of close(), as before.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        if ($!) {
            print STDERR "lucene_passes.pl - error closing pipe to Lucene indexer: $!\n";
        }
        else {
            print STDERR "lucene_passes.pl - Lucene indexer exited with status " . ($? >> 8) . "\n";
        }
    }

    return $closed_ok;
}
70
71
# Write one document's XML to disk beneath the text directory.
#
# Arguments:
#   $full_textdir    - directory under which stored documents are kept
#   $output_filename - path of the file to write, relative to $full_textdir
#   $doc_xml         - the complete XML text of the document
#
# Any missing parent directories are created first. Dies if the file
# cannot be opened, written, or closed.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    # NOTE(review): the separator is interpolated into the pattern below
    # unescaped, so get_os_dirsep() presumably returns a regex-ready
    # string -- confirm against util.pm before changing this.
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &FileUtils::filenameConcatenate($full_textdir,$output_filename);

    # Greedy match: everything up to and including the last separator,
    # i.e. the directory portion of the output path.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &FileUtils::makeAllDirectories($full_output_dir);

    # Three-argument open with a lexical handle: the filename can no
    # longer be misread as part of the open mode, and the handle is
    # scoped to this sub.
    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print $doc_out $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors (e.g. disk full) only surface at close.
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";
}
90
91
# Compress a stored document in place by running gzip on it.
#
# Arguments:
#   $full_textdir    - directory under which stored documents are kept
#   $output_filename - path of the file to compress, relative to $full_textdir
#
# A failure to run gzip, or a non-zero exit from it, is reported on
# STDERR; the sub does not die.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &FileUtils::filenameConcatenate($full_textdir,$output_filename);

    # List-form system() hands the filename to gzip as a single argument
    # without going through the shell, so spaces or shell metacharacters
    # in the path cannot split or alter the command.
    my $status = system("gzip", $full_output_filename);

    if ($status == -1) {
        print STDERR "lucene_passes.pl - couldn't run gzip on $full_output_filename: $!\n";
    }
    elsif ($status != 0) {
        print STDERR "lucene_passes.pl - gzip failed on $full_output_filename (exit status " . ($status >> 8) . ")\n";
    }

    return;
}
101
102
# Read the XML stream arriving on STDIN and process it one document at a
# time. Lines are accumulated until a closing </Doc> line is seen; the
# accumulated document is then either saved to disk ("text" mode) or
# printed to the PIPEOUT handle feeding the Lucene indexer ("index" mode).
#
# Arguments:
#   $mode         - "text" or "index"; any other value just discards input
#   $full_textdir - directory handed to save_xml_doc in "text" mode
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml = "";   # text of the document currently being read
    my $target_file = "";   # where that document is stored in text mode

    while (defined (my $input = <STDIN>)) {
        $pending_xml .= $input;

        # An opening <Doc ... file="..."> line names the source file.
        if ($input =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            my $source_file = $1;
            # change the filename to doc.xml, keeping any path
            my $source_dir = &util::filename_head($source_file);
            $target_file = &util::filename_cat($source_dir, "doc.xml");
        }

        # Nothing more to do until the document is complete.
        next unless ($input =~ m/^<\/Doc>$/);

        if ($mode eq "text") {
            save_xml_doc($full_textdir, $target_file, $pending_xml);
        }
        elsif ($mode eq "index") {
            # SAX parser seems to be sensitive to blank lines
            # => collapse runs of newlines before notifying the indexer
            $pending_xml =~ s/\n+/\n/g;

            print PIPEOUT "$pending_xml";
        }

        # Compression of the stored file is currently disabled:
        ### compress_xml_doc($full_textdir,$target_file);

        # Start afresh for the next document.
        $pending_xml = "";
        $target_file = "";
    }
}
150
151
# /** Parses the command line, separating recognised minus options (which
#  *  are forwarded to the Java wrapper) from the positional arguments,
#  *  then runs the requested pass over the XML arriving on STDIN.
#  *
#  *  Usage:
#  *    lucene_passes.pl [-removeold|-verbosity num] "text"|"index"
#  *                     doc-tag-level build-dir index-name
#  *
#  *  Unrecognised minus options are reported on STDERR and dropped.
#  *  Prints the usage message and exits with status 1 if fewer than
#  *  four positional arguments remain.
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    # Options are collected as a list and joined with single spaces so
    # that consecutive options can never run together; previously
    # "-verbosity 3 -removeold" produced "-verbosity 3-removeold".
    my @java_lucene_opts = ();
    my @filtered_argv = ();

    my $i = 0;
    while ($i<$argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set\n";
                push(@java_lucene_opts, "-removeold");
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                $i++;
                if ($i<$argc) {
                    push(@java_lucene_opts, "-verbosity", $argv[$i]);
                }
                else {
                    print STDERR "Option -verbosity requires a value; ignoring it\n";
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    my $java_lucene_options = join(" ", @java_lucene_opts);

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: lucene_passes.pl [-removeold|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my $mode          = $filtered_argv[0];
    my $doc_tag_level = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir      = $filtered_argv[3];

    # We only need the Lucene handle opened if we are indexing the
    # documents, not if we are just storing the text
    if ($mode eq "index") {
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $java_lucene_options);
    }

    print STDERR "Monitoring for input!\n";
    my $full_textdir = &FileUtils::filenameConcatenate($full_builddir,"text");

    monitor_xml_stream($mode, $full_textdir);

    if ($mode eq "index") {
        close_java_lucene();
    }
}


&main(@ARGV);
Note: See TracBrowser for help on using the repository browser.