root/gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl @ 31490

Revision 31490, 6.3 KB (checked in by ak19, 4 years ago)

1. Fix for the issue of a tomcat host/port change not propagating to the solr host/port when rebuilding a solr collection after the tomcat host/port change. To encounter the problem upon a solr build, the change to the tomcat server properties needs to be made after gs3-setup.sh has already been run in the terminal. The bug was reproduced on Linux, and the fix for it was also tested on Linux. Still need to test the fix on Windows. 2. Simultaneously made the http protocol used in solr more robust to whether it's http or https.

  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31#  http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server. 
36
37
BEGIN {
    # All three environment variables must be present before any of the
    # Greenstone / Solr-extension modules can be located.
    defined $ENV{'GSDLHOME'}  or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}    or die "GSDLOS not set\n";
    defined $ENV{'GEXT_SOLR'} or die "GEXT_SOLR not set\n";

    # Search the Solr extension's perllib first, then Greenstone's own.
    unshift (@INC, "$ENV{'GEXT_SOLR'}/perllib", "$ENV{'GSDLHOME'}/perllib");
}
47
48use strict;
49use util;
50use solrutil;
51use solrserver;
52
# Script-wide state, kept in a hash as a stand-in for a proper object.
# The 'solr_server' slot starts out empty.
my $self = { solr_server => undef };
56
sub open_java_solr
{
  # Prepare for indexing: create and start a solrserver object for the
  # build directory, remember it in $self, then open the solr-post pipe
  # for the given core against that server's base URL.
  #
  #   $core          - name of the Solr core to post to
  #   $full_builddir - build directory, passed to the solrserver constructor
  #   $indexdir      - accepted from the caller but not used here

  my ($core,$full_builddir,$indexdir) = @_;

  # Per the original notes: if the Solr/Jetty server is not already
  # running this starts it, returning only once the server is listening.
  my $server = solrserver->new($full_builddir);
  $server->start();

  # Kept so that close_java_solr() can find the same server later
  $self->{'solr_server'} = $server;

  # Now start up the solr-post command
  &solrutil::open_post_pipe($core, $server->get_solr_base_url());
}
71
sub close_java_solr
{
    # Counterpart of open_java_solr(): close the solr-post pipe, then stop
    # the server remembered in $self -- but only when the server object
    # reports that it was explicitly started.

    &solrutil::close_post_pipe();

    my $server = $self->{'solr_server'};
    $server->stop() if $server->explicitly_started();
}
81
82#----
83
sub save_xml_doc
{
    # Write one document's XML to a file below the text directory, creating
    # the file's parent directories first.
    #
    # This mirrors the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####
    #
    #   $full_textdir    - directory the document files are stored under
    #   $output_filename - file name to write, joined onto $full_textdir
    #   $doc_xml         - the text to write
    #
    # Dies if the file cannot be opened, written or closed.

    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last separator is the directory part
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &FileUtils::makeAllDirectories($full_output_dir);

    # Three-argument open with a lexical handle: the file name can never be
    # misread as part of the mode, and no global DOCOUT handle is left around.
    open(my $doc_out, '>', $full_output_filename)
	or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
	or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors only surface when the handle is closed
    close($doc_out)
	or die "Unable to close $full_output_filename: $!";

    return;
}
106
107
sub compress_xml_doc
{
    # gzip a previously saved document file in place.
    #
    # This mirrors the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####
    #
    #   $full_textdir    - directory the document files are stored under
    #   $output_filename - file name, joined onto $full_textdir
    #
    # A failure to run gzip is reported on STDERR rather than being fatal.

    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
	= &util::filename_cat($full_textdir,$output_filename);

    # Greenstone ships with gzip for Windows
    #
    # List-form system() hands the file name to gzip as a single argument
    # without going through a shell, so paths containing spaces or shell
    # metacharacters are compressed correctly instead of being re-parsed.
    my $status = system('gzip', $full_output_filename);
    if ($status != 0) {
	print STDERR "Warning: gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
121
122
sub monitor_xml_stream
{
    # Text-mode reader, derived from lucene's monitor_xml_stream but
    # simplified.  Lines read from STDIN are accumulated; each time a
    # closing </Doc> line arrives, everything gathered so far is handed to
    # save_xml_doc() under the file name taken from the most recent
    # <Doc ... file="..."> line, and accumulation starts afresh.
    #
    #   $full_textdir - directory passed through to save_xml_doc()

    my ($full_textdir) = @_;

    my ($pending_xml, $target_file) = ("", "");

    while (defined (my $input = <STDIN>)) {

	$pending_xml .= $input;

	# Opening tag: note which file this document belongs in
	$target_file = $1 if ($input =~ m/^<Doc.+file=\"(.*?)\".*>$/);

	# Anything other than the closing tag just keeps accumulating
	next unless ($input =~ m/^<\/Doc>$/);

	save_xml_doc($full_textdir,$target_file,$pending_xml);

	# Compress file
	#
	# The compress option was taken out for efficiency
	# reasons.  Consider putting it back in but making it a
	# switch so a collection builder can decide for themselves on a
	# case by case basis if they want to save on diskspace, but have
	# the overhead of uncompressing at runtime

###	compress_xml_doc($full_textdir,$target_file);

	($pending_xml, $target_file) = ("", "");
    }
}
160
161
sub pass_on_xml_stream
{
    # Relay STDIN to the solr post pipe, one line at a time and unchanged,
    # until end of input.
    while (defined (my $xml_line = <STDIN>)) {
	&solrutil::print_to_post_pipe($xml_line);
    }
}
169
170
171
172
# /** Entry point.  Filters the minus options out of the command line,
#  *  checks the remaining positional arguments, and then either stores
#  *  the incoming documents as text or streams them on to Solr.
#  *
#  *  Positional arguments: core "text"|"index" build-dir index-name
#  *
#  *  -verbosity <num> is accepted (and its value skipped) for
#  *  compatibility with the calling program; any other minus option is
#  *  reported on STDERR and otherwise ignored.
#  *
#  *  Prints a usage message and exits with status 1 when fewer than four
#  *  positional arguments are given or the mode is neither "text" nor
#  *  "index".
#  */
sub main
{
  my (@argv) = @_;
  my $argc = scalar(@argv);

  my $usage = "Usage: solr_passes.pl [-verbosity num] core \"text\"|\"index\" build-dir index-name\n";

  my @filtered_argv = ();

  my $i = 0;
  while ($i<$argc) {
    if ($argv[$i] =~ m/^\-(.*)$/) {

      my $option = $1;

      # -verbosity <num>
      if ($option eq "verbosity") {
        # solr indexing has no support for verbosity
        # => parse to be compatible with calling program, but suppress it
        #    for solr-post.jar by stepping over the value
        $i++;
      }
      else {
        print STDERR "Unrecognised minus option: -$option\n";
      }
    }
    else {
        push(@filtered_argv,$argv[$i]);
    }
    $i++;
  }

  my $filtered_argc = scalar(@filtered_argv);

  if ($filtered_argc < 4) {
    print STDERR $usage;
    exit 1;
  }

  my $core          = $filtered_argv[0];
  my $mode          = $filtered_argv[1];
  my $full_builddir = $filtered_argv[2];
  my $indexdir      = $filtered_argv[3];

  # Reject anything but the two documented modes up front.  Otherwise an
  # unknown mode would fall through to the streaming branch below without
  # open_java_solr() ever having been called.
  if ($mode ne "text" && $mode ne "index") {
    print STDERR "Unrecognised mode: $mode\n";
    print STDERR $usage;
    exit 1;
  }

  if ($mode eq "text") {
      print STDERR "Monitoring for input!\n";
      my $full_textdir = &util::filename_cat($full_builddir,"text");
      monitor_xml_stream($full_textdir);
  }
  else {
      # We only need the Solr handle opened if we are indexing the
      # documents, not if we are just storing the text
      open_java_solr($core, $full_builddir, $indexdir);

      print STDERR "Streaming document input onto Solr server!\n";
      pass_on_xml_stream();

      close_java_solr();
  }
}
243
244
# Run the script with the command-line arguments it was given.
main(@ARGV);
Note: See TracBrowser for help on using the browser.