root/gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl @ 24501

Revision 24501, 6.2 KB (checked in by davidb, 9 years ago)

Relocation of files to make solr.solr.home more natural. Also, more carefully control the order in which the build_dir/index_dir folder is deleted: for Solr we need to do this earlier than for Lucene.

Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31#  http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server. 
36
37
BEGIN {
    # Fetch an environment variable, aborting start-up when it is undefined.
    my $require_env = sub {
        my ($name) = @_;
        die "$name not set\n" unless defined $ENV{$name};
        return $ENV{$name};
    };

    # The core Greenstone settings must be present before its perllib
    # directory is added to the module search path.
    my $gsdl_home = $require_env->('GSDLHOME');
    $require_env->('GSDLOS');
    unshift(@INC, "$gsdl_home/perllib");

    # The Solr extension's perllib is added last, so it ends up at the
    # front of @INC, ahead of the core Greenstone one.
    my $solr_ext = $require_env->('GEXT_SOLR');
    unshift(@INC, "$solr_ext/perllib");
}
47
48use strict;
49use util;
50use solrutil;
51use solrserver;
52
53
# File-level state for this script -- a plain hash standing in for an
# object for now.  The 'solr_server' slot is empty until open_java_solr()
# stores the server object in it.
my $self = { solr_server => undef };
57
# Make sure a Solr server is available and open the posting pipe.
#
#   $core          - core name, passed on to solrutil::open_post_pipe()
#   $full_builddir - build directory, passed to the solrserver constructor
#   (third arg)    - index directory; accepted so existing callers keep
#                    working, but not used by this routine
#
# The server object is stored in $self->{'solr_server'} so that
# close_java_solr() can consult it later.
sub open_java_solr
{
  my ($core, $full_builddir) = @_;

  # If the Solr/Jetty server is not already running, the following starts
  # it up, and only returns when the server is "ready and listening"

  # Direct method-call syntax: the indirect-object form "new solrserver(...)"
  # depends on how the parser guesses at the bareword.
  my $solr_server = solrserver->new($full_builddir);
  $solr_server->start();
  $self->{'solr_server'} = $solr_server;

  # Now start up the solr-post command
  solrutil::open_post_pipe($core);

  return;
}
72
# Counterpart of open_java_solr(): closes the posting pipe and then stops
# the Solr server, but only when the server object reports that it was
# explicitly started (presumably meaning this run launched it -- see
# solrserver for the exact rule).
sub close_java_solr
{
    solrutil::close_post_pipe();

    my $server = $self->{'solr_server'};
    $server->stop() if $server->explicitly_started();
}
82
83#----
84
# Write one document's XML to a file beneath the text directory.
#
#   $full_textdir    - directory under which the file is written
#   $output_filename - file name, possibly including sub-directories,
#                      relative to $full_textdir
#   $doc_xml         - the XML text to store
#
# Missing parent directories are created first.  Dies if the file cannot
# be opened, or if closing it reports an error.  Returns nothing useful.
#
# This mirrors the routine in lucene_passes.pl, and should be moved in to
# a package and shared ####
sub save_xml_doc
{
    my ($full_textdir, $output_filename, $doc_xml) = @_;

    my $dir_sep = util::get_os_dirsep();

    my $full_output_filename = util::filename_cat($full_textdir, $output_filename);

    # Everything up to and including the last separator is the directory.
    # NOTE(review): $dir_sep is interpolated as regex source, so it is
    # presumably returned already escaped by util::get_os_dirsep() --
    # confirm before adding \Q...\E here.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    # Three-argument open takes the file name literally: no whitespace
    # trimming and no mode characters picked up from the name itself.
    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml;

    # Buffered write errors only surface when the handle is closed.
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    return;
}
107
108
# Compress a previously saved document file in place with gzip.
#
#   $full_textdir    - directory the file lives under
#   $output_filename - file name relative to $full_textdir
#
# Prints a warning to STDERR when gzip cannot be run or exits non-zero.
# Returns nothing useful.
#
# This mirrors the routine in lucene_passes.pl, and should be moved in to
# a package and shared ####
sub compress_xml_doc
{
    my ($full_textdir, $output_filename) = @_;

    my $full_output_filename
        = util::filename_cat($full_textdir, $output_filename);

    # Greenstone ships with gzip for Windows
    #
    # List-form system() hands the file name to gzip as a single argument
    # without going through a shell, so spaces and shell metacharacters in
    # the path are harmless.
    my $status = system('gzip', $full_output_filename);
    if ($status != 0) {
        print STDERR "Warning: gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
122
123
# Read Greenstone document XML from STDIN and store each document as its
# own file beneath $full_textdir, via save_xml_doc().
#
# Based on lucene's monitor_xml_stream, but simplified as it is now only
# used when in "text" mode.
sub monitor_xml_stream
{
    my ($full_textdir) = @_;

    # Text gathered so far for the current document, and the file it is
    # destined for
    my ($pending_xml, $target_file) = ("", "");

    while (my $input_line = <STDIN>) {

        $pending_xml .= $input_line;

        # The opening <Doc ...> line carries the output file name
        $target_file = $1 if $input_line =~ m/^<Doc.+file=\"(.*?)\".*>$/;

        # Keep accumulating until the closing tag turns up
        next unless $input_line =~ m/^<\/Doc>$/;

        save_xml_doc($full_textdir, $target_file, $pending_xml);

        # Compress file
        #
        # The compress option was taken out for efficiency reasons.
        # Consider putting it back in, but as a switch, so a collection
        # builder can decide case by case whether saving disk space is
        # worth the overhead of uncompressing at runtime.

###     compress_xml_doc($full_textdir,$target_file);

        # Start afresh for the next document
        ($pending_xml, $target_file) = ("", "");
    }
}
161
162
# Relay STDIN, one line at a time and unmodified, to the solr-post pipe
# through solrutil::print_to_post_pipe().
sub pass_on_xml_stream
{
    while (my $incoming = <STDIN>) {
        solrutil::print_to_post_pipe($incoming);
    }
}
170
171
172
173
# /** Entry point.  Checks the arguments on the command line, filters out
#  *  the minus options, and then processes the documents arriving on STDIN
#  *  according to the requested mode:
#  *
#  *    "text"  - store each document as a file under <build-dir>/text
#  *    "index" - start/locate the Solr server via open_java_solr(), stream
#  *              the input on to it, then call close_java_solr()
#  *
#  *  Usage: solr_passes.pl [-verbosity num] core "text"|"index" build-dir index-name
#  *
#  *  Prints the usage message to STDERR and exits with status 1 when fewer
#  *  than four positional arguments are given, or when the mode is neither
#  *  "text" nor "index".
#  */
sub main
{
  my (@argv) = @_;
  my $argc = scalar(@argv);

  my $usage = "Usage: solr_passes.pl [-verbosity num] core \"text\"|\"index\" build-dir index-name\n";

  my @filtered_argv = ();

  my $i = 0;
  while ($i < $argc) {
    if ($argv[$i] =~ m/^\-(.*)$/) {

      my $option = $1;

      # -verbosity <num>
      if ($option eq "verbosity") {
        # solr indexing has no support for verbosity
        # => parse to be compatible with calling program, but suppress it
        #    for solr-post.jar, by stepping over the option's value
        $i++;
      }
      else {
        print STDERR "Unrecognised minus option: -$option\n";
      }
    }
    else {
      push(@filtered_argv, $argv[$i]);
    }
    $i++;
  }

  if (scalar(@filtered_argv) < 4) {
    print STDERR $usage;
    exit 1;
  }

  # Any positional arguments beyond the first four are ignored
  my ($core, $mode, $full_builddir, $indexdir) = @filtered_argv;

  if ($mode eq "text") {
    print STDERR "Monitoring for input!\n";
    my $full_textdir = util::filename_cat($full_builddir, "text");
    monitor_xml_stream($full_textdir);
  }
  elsif ($mode eq "index") {
    # We only need the Solr handle opened if we are indexing the
    # documents, not if we are just storing the text
    open_java_solr($core, $full_builddir, $indexdir);

    print STDERR "Streaming document input onto Solr server!\n";
    pass_on_xml_stream();

    close_java_solr();
  }
  else {
    # Previously any other mode fell through to pass_on_xml_stream()
    # without the post pipe ever having been opened.
    print STDERR "Unrecognised mode: $mode\n";
    print STDERR $usage;
    exit 1;
  }

  return;
}
244
245
# Run the script with the command-line arguments
main(@ARGV);
Note: See TracBrowser for help on using the browser.