root/main/trunk/greenstone2/bin/script/lucene_passes.pl @ 29144

Revision 29144, 6.0 KB (checked in by ak19, 6 years ago)

Part of the port from Lucene 3.3.0 to Lucene 4.7.2 (LuceneWrapper related). Changes the Lucene-related Perl scripts in gs2build/Greenstone 2's perllib and bin/script to switch over from using Lucene3Wrapper to Lucene4Wrapper.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
BEGIN {
    # Both variables must be defined before any Greenstone module is loaded;
    # they are checked in this order so the first missing one is reported.
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Put the Greenstone perllib directory at the front of the module search path.
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
34
35
36use strict;
37use util;
38use FileUtils;
39
# Start the Java GS2LuceneIndexer and attach the global PIPEOUT handle to
# its standard input.
#
# Arguments:
#   $doc_tag_level       - passed through unchanged on the indexer command line
#   $full_builddir       - build directory; converted with
#                          util::makeFilenameJavaCygwinCompatible before use
#   $indexdir            - index name, passed through on the indexer command line
#   $java_lucene_options - pre-formatted option string built by main()
#
# Returns the result of the pipe open.
# Dies if LuceneWrapper4.jar cannot be found in either location, or if the
# pipe to java cannot be opened.
sub open_java_lucene
{
    my ($doc_tag_level, $full_builddir, $indexdir, $java_lucene_options) = @_;

    # A collection-specific bin/java/LuceneWrapper4.jar takes precedence;
    # otherwise fall back to the one under GSDLHOME.
    my $classpath;
    foreach my $base_dir ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLHOME'}) {
        my $bin_java = FileUtils::filenameConcatenate($base_dir, "bin", "java");
        $classpath = FileUtils::javaFilenameConcatenate($bin_java, "LuceneWrapper4.jar");
        last if -f $classpath;
    }
    if (!-f $classpath) {
        # $classpath holds the GSDLHOME candidate here, as it was tried last
        die "***** ERROR: $classpath does not exist\n";
    }

    $full_builddir = util::makeFilenameJavaCygwinCompatible($full_builddir);

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper4.GS2LuceneIndexer";

    # The command still goes through the shell (the option string is
    # pre-formatted), so every path-like argument is double-quoted to
    # survive embedded spaces.
    my $java_cmd = "$java_lucene $java_lucene_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # Explicit '|-' mode keeps the command text from being parsed for an
    # open mode. PIPEOUT stays a global handle because monitor_xml_stream
    # and close_java_lucene use it by name.
    open(PIPEOUT, '|-', $java_cmd)
        or die "lucene_passes.pl - couldn't run $java_cmd: $!\n";
}
64
65
# Close the pipe to the Java indexer that open_java_lucene attached to the
# global PIPEOUT handle.
#
# Closing a piped handle waits for the child process, so this is where a
# failed indexer run becomes visible. A failure is reported on STDERR but
# does not abort the script.
#
# Returns the result of close(): true on success, false if the close
# failed or the child exited with a non-zero status.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        if ($!) {
            # A system call involved in closing the pipe failed
            warn "lucene_passes.pl - error closing pipe to java: $!\n";
        }
        else {
            # $! is 0 when the only problem was the child's exit status
            warn "lucene_passes.pl - java indexer finished with exit status " . ($? >> 8) . "\n";
        }
    }

    return $closed_ok;
}
70
71
# Write one document's XML to a file below the text directory, creating
# any missing parent directories first.
#
# Arguments:
#   $full_textdir    - directory the output file name is joined onto
#   $output_filename - file name (may include sub-directories)
#   $doc_xml         - the XML text to write
#
# Dies if the file cannot be opened, written or closed.
sub save_xml_doc
{
    my ($full_textdir, $output_filename, $doc_xml) = @_;

    my $dir_sep = util::get_os_dirsep();

    my $full_output_filename = FileUtils::filenameConcatenate($full_textdir, $output_filename);

    # Everything up to and including the last separator is the directory part.
    # NOTE(review): $dir_sep is interpolated as a regex fragment; presumably
    # get_os_dirsep returns an already regex-escaped separator - confirm.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);

    # No separator in the name means there is no directory part to create
    FileUtils::makeAllDirectories($full_output_dir) if defined $full_output_dir;

    # Three-argument open with a lexical handle: the file name is taken
    # literally and the handle cannot leak to other code.
    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors only surface at close time
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    return;
}
90
91
# Compress a previously saved document with the external gzip program.
#
# Arguments:
#   $full_textdir    - directory the file name is joined onto
#   $output_filename - file name of the document to compress
#
# Returns true if gzip ran and exited with status 0, false otherwise.
# A failure is reported on STDERR but does not abort the script.
sub compress_xml_doc
{
    my ($full_textdir, $output_filename) = @_;

    my $full_output_filename
        = FileUtils::filenameConcatenate($full_textdir, $output_filename);

    # List-form system() bypasses the shell, so spaces or shell
    # metacharacters in the path cannot break or alter the command.
    my $status = system('gzip', $full_output_filename);

    if ($status != 0) {
        print STDERR "lucene_passes.pl - gzip failed for $full_output_filename (status $status)\n";
    }

    return $status == 0;
}
101
102
# Consume the XML stream arriving on STDIN during the build process.
#
# Lines are accumulated until a line consisting of "</Doc>" is seen, which
# completes one document. What happens to a completed document depends on
# $mode:
#   "text"  - it is handed to save_xml_doc, using the file="..." attribute
#             of the most recent <Doc ...> line as the output file name
#   "index" - runs of newlines are collapsed and the document is printed
#             to the global PIPEOUT handle for the Lucene indexer
# Any other mode discards the document. Input after the last "</Doc>" line
# is never emitted.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml = "";
    my $target_file = "";

    while (my $input = <STDIN>) {
        $pending_xml .= $input;

        # Remember the file attribute carried by the opening <Doc ...> line
        $target_file = $1 if $input =~ m/^<Doc.+file=\"(.*?)\".*>$/;

        # Keep accumulating until the document's closing tag arrives
        next unless $input =~ m/^<\/Doc>$/;

        if ($mode eq "text") {
            save_xml_doc($full_textdir, $target_file, $pending_xml);
        }
        elsif ($mode eq "index") {
            # The SAX parser on the receiving end seems to be sensitive to
            # blank lines, so squeeze every run of newlines down to one.
            $pending_xml =~ s/\n+/\n/g;

            print PIPEOUT "$pending_xml";
        }

        # Start afresh for the next document
        $pending_xml = "";
        $target_file = "";
    }
}
148
149
# /** This checks the arguments on the command line, filters out the
#  *  minus options and then, in "index" mode, calls the open_java_lucene
#  *  function to begin processing. The recognised minus options are
#  *  passed on the command line of the java wrapper.
#  *
#  *  Usage: lucene_passes.pl [-removeold|-verbosity num] "text"|"index"
#  *                          doc-tag-level build-dir index-name
#  *
#  *  @param argv  the command line arguments
#  *
#  *  Prints a usage message and exits with status 1 when fewer than four
#  *  non-option arguments are given.
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $java_lucene_options = "";
    my @filtered_argv = ();

    my $i = 0;
    while ($i < $argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set\n";
                $java_lucene_options .= "-removeold ";
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                $i++;
                if ($i < $argc) {
                    # The trailing space keeps a following option from being
                    # glued onto the number (e.g. "-verbosity 3-removeold").
                    $java_lucene_options .= "-verbosity " . $argv[$i] . " ";
                }
                else {
                    print STDERR "-verbosity given without a value, ignoring\n";
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv, $argv[$i]);
        }
        $i++;
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: lucene_passes.pl [-removeold|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my $mode          = $filtered_argv[0];
    my $doc_tag_level = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir      = $filtered_argv[3];
###    print STDERR "**** ARGS = ", join(" ", @argv), "\n";

    # We only need the Lucene handle opened if we are indexing the documents,
    # not if we are just storing the text
    if ($mode eq "index") {
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $java_lucene_options);
    }

    print STDERR "Monitoring for input!\n";
    my $full_textdir = FileUtils::filenameConcatenate($full_builddir, "text");

    monitor_xml_stream($mode, $full_textdir);

    if ($mode eq "index") {
        close_java_lucene();
    }
}
220
221
# Script entry point: hand the command line arguments to main.
main(@ARGV);
Note: See TracBrowser for help on using the browser.