root/gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl @ 27780

Revision 27780, 6.2 KB (checked in by jmt12, 7 years ago)

replacing deprecated call to make all directories

Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31#  http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server. 
36
37
# Compile-time setup: insist on the Greenstone environment variables this
# script relies on, then put the Greenstone and Solr-extension perllib
# directories at the front of @INC so the 'use' lines below can find
# their modules.
BEGIN {
    for my $required (qw(GSDLHOME GSDLOS GEXT_SOLR)) {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # The Solr extension's perllib ends up ahead of the core Greenstone one
    unshift(@INC, "$ENV{'GEXT_SOLR'}/perllib", "$ENV{'GSDLHOME'}/perllib");
}
47
48use strict;
49use util;
50use solrutil;
51use solrserver;
52
# File-wide state kept in one hash reference (not quite OO, but close
# enough for now).  The 'solr_server' slot is filled in by
# open_java_solr() and read back by close_java_solr().
my $self = { solr_server => undef };
56
# Make a Solr server available for the given build directory and open the
# solr-post pipe for the given core.
#
# Arguments:
#   $core          - passed to solrutil::open_post_pipe()
#   $full_builddir - passed to the solrserver constructor
#   $indexdir      - accepted for call compatibility; not used here
#
# The server object is stored in $self->{'solr_server'}.
sub open_java_solr
{
    my ($core, $full_builddir, $indexdir) = @_;

    # start() launches the Solr/Jetty server when it is not already
    # running, and only returns once the server is ready and listening
    my $server = solrserver->new($full_builddir);
    $server->start();
    $self->{'solr_server'} = $server;

    # With the server up, open the solr-post command
    &solrutil::open_post_pipe($core);
}
71
# Close the solr-post pipe, then stop the Solr server recorded in
# $self->{'solr_server'} -- but only when its explicitly_started() method
# reports true (presumably meaning this run launched it; confirm in
# solrserver.pm).
sub close_java_solr
{
    &solrutil::close_post_pipe();

    my $server = $self->{'solr_server'};
    $server->stop() if $server->explicitly_started();
}
81
82#----
83
# Write one document's XML text to a file below the text directory,
# creating any missing parent directories first.
#
# Arguments:
#   $full_textdir    - directory the output filename is joined onto
#   $output_filename - name (possibly with sub-directories) of the file
#   $doc_xml         - the XML text to write
#
# Dies if the output file cannot be opened, or if closing it reports an
# error (buffered write failures only surface at close time).
#
# This mirrors the routine of the same name in lucene_passes.pl and should
# eventually be moved into a package and shared.
sub save_xml_doc
{
    my ($full_textdir, $output_filename, $doc_xml) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir, $output_filename);

    # The separator is interpolated into the pattern exactly as
    # util::get_os_dirsep() returns it.
    # NOTE(review): it is presumably already in regex-ready form -- confirm
    # before wrapping it in \Q...\E.
    # The greedy match keeps everything up to and including the last
    # separator, i.e. the directory part of the path.
    my $dir_sep = &util::get_os_dirsep();
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);

    # Nothing to create when the path has no directory component
    &FileUtils::makeAllDirectories($full_output_dir)
        if defined $full_output_dir;

    # Three-argument open with a lexical handle: the filename can never be
    # misread as part of the open mode, and the handle cannot leak
    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml;

    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    return;
}
106
107
# Compress a stored document file in place by running gzip on it.
#
# Arguments:
#   $full_textdir    - directory the output filename is joined onto
#   $output_filename - name of the file to compress
#
# A failure to run gzip is reported on STDERR rather than being fatal.
#
# This mirrors the routine of the same name in lucene_passes.pl and should
# eventually be moved into a package and shared.
sub compress_xml_doc
{
    my ($full_textdir, $output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir, $output_filename);

    # Greenstone ships with gzip for Windows.
    # The list form of system() hands the path to gzip as a single
    # argument without going through a shell, so spaces or shell
    # metacharacters in the path cannot split or alter the command.
    system('gzip', $full_output_filename) == 0
        or print STDERR "gzip failed for $full_output_filename (status $?)\n";

    return;
}
121
122
# Read the document XML stream from STDIN and save each complete <Doc>
# element to its own file under $full_textdir.
#
# Based on lucene's monitor_xml_stream, but simplified because it is now
# only used in "text" mode.
#
# Arguments:
#   $full_textdir - directory handed to save_xml_doc() for every document
sub monitor_xml_stream
{
    my ($full_textdir) = @_;

    my $pending_xml  = "";   # lines gathered for the current document
    my $pending_file = "";   # value of the file="..." attribute, once seen

    while (defined(my $line = <STDIN>)) {
        $pending_xml .= $line;

        # The opening <Doc ...> tag names the file this document goes to
        if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $pending_file = $1;
        }

        # Keep gathering lines until the closing tag turns up
        next unless $line =~ m/^<\/Doc>$/;

        save_xml_doc($full_textdir, $pending_file, $pending_xml);

        # Compress file
        #
        # The compress option was taken out for efficiency
        # reasons.  Consider putting it back in but making it a
        # switch so a collection builder can decide for themselves on a
        # case by case basis if they want to save on diskspace, but have
        # the overhead of uncompressing at runtime

###     compress_xml_doc($full_textdir,$output_filename);

        # Start afresh for the next document
        ($pending_xml, $pending_file) = ("", "");
    }
}
160
161
# Forward STDIN, one line at a time and unmodified, to the solr-post pipe
# via solrutil::print_to_post_pipe().
sub pass_on_xml_stream
{
    while (defined(my $xml_line = <STDIN>)) {
        &solrutil::print_to_post_pipe($xml_line);
    }
}
169
170
171
172
# /** Checks the arguments on the command line, filters out the minus
#  *  options, and then runs the requested mode.
#  *
#  *  Expected positional arguments (after filtering):
#  *    core  "text"|"index"  build-dir  index-name
#  *
#  *  "text"  saves each document from STDIN under <build-dir>/text.
#  *  "index" opens the Solr handle, streams STDIN to it, then closes it.
#  *
#  *  Too few arguments, or a mode other than the two above, prints a
#  *  usage message to STDERR and exits with status 1.
#  */
sub main
{
    my (@argv) = @_;

    my $usage = "Usage: solr_passes.pl [-verbosity num] core \"text\"|\"index\" build-dir index-name\n";

    my @filtered_argv = ();

    # An index loop is used because -verbosity consumes the argument
    # that follows it
    my $argc = scalar(@argv);
    my $i = 0;
    while ($i < $argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {
            my $option = $1;

            # -verbosity <num>
            if ($option eq "verbosity") {
                # solr indexing has no support for verbosity
                # => parse to be compatible with calling program, but
                #    suppress it for solr-post.jar
                $i++;
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv, $argv[$i]);
        }
        $i++;
    }

    if (scalar(@filtered_argv) < 4) {
        print STDERR $usage;
        exit 1;
    }

    my ($core, $mode, $full_builddir, $indexdir) = @filtered_argv;

    if ($mode eq "text") {
        # Just storing the text: no Solr handle is needed
        print STDERR "Monitoring for input!\n";
        my $full_textdir = &util::filename_cat($full_builddir, "text");
        monitor_xml_stream($full_textdir);
    }
    elsif ($mode eq "index") {
        # The Solr handle is only opened when indexing the documents
        open_java_solr($core, $full_builddir, $indexdir);

        print STDERR "Streaming document input onto Solr server!\n";
        pass_on_xml_stream();

        close_java_solr();
    }
    else {
        # Any other mode would stream into a post pipe that was never
        # opened, so reject it up front
        print STDERR "Unrecognised mode: $mode\n";
        print STDERR $usage;
        exit 1;
    }
}
243
244
# Script entry point: hand the command-line arguments to main()
main(@ARGV);
Note: See TracBrowser for help on using the browser.