source: gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl@ 24501

Last change on this file since 24501 was 24501, checked in by davidb, 13 years ago

Relocation of files to make solr.solr.home more natural. Also, control more carefully the order in which the build_dir/index_dir folders are deleted: for Solr this needs to happen earlier than it does for Lucene.

File size: 6.2 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31# http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server.
36
37
# Compile-time setup: make sure the Greenstone environment is configured and
# put both the core perllib and the Solr extension's perllib on @INC before
# the 'use' statements that follow are resolved.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");

    defined $ENV{'GEXT_SOLR'} or die "GEXT_SOLR not set\n";

    # Added last, so the extension's perllib ends up first on the search path
    unshift(@INC, $ENV{'GEXT_SOLR'} . "/perllib");
}
47
48use strict;
49use util;
50use solrutil;
51use solrserver;
52
53
# Script-wide state held in one hash reference (not quite OO, but close
# enough for now). 'solr_server' is filled in by open_java_solr().
my $self = { solr_server => undef };
57
# Bring up the Solr server for this build and open the pipe used to post
# documents to the given core.
#
#   $core          - name of the Solr core handed to the post pipe
#   $full_builddir - build directory passed to the solrserver constructor
#   $indexdir      - accepted for the caller's benefit; not used in this sub
sub open_java_solr
{
    my ($core, $full_builddir, $indexdir) = @_;

    # If the Solr/Jetty server is not already running, start() launches it
    # and (according to the original author) only returns once the server
    # is up and listening.
    my $server = solrserver->new($full_builddir);
    $server->start();

    # Remember the handle so close_java_solr() can shut it down again
    $self->{'solr_server'} = $server;

    # Now start up the solr-post command
    solrutil::open_post_pipe($core);
}
72
# Close the post pipe, then stop the Solr server -- but only when the server
# object reports that it was explicitly started (explicitly_started()).
sub close_java_solr
{
    solrutil::close_post_pipe();

    my $server = $self->{'solr_server'};
    $server->stop() if $server->explicitly_started();
}
82
83#----
84
# Write one document's XML to <full_textdir>/<output_filename>, creating the
# containing directory first.
#
# The original notes that this routine is shared in spirit with the one in
# lucene_passes.pl and should be moved into a common package.
#
#   $full_textdir    - root directory under which document text is stored
#   $output_filename - path of the document file, relative to $full_textdir
#   $doc_xml         - complete XML text to write
#
# Dies if the output file cannot be opened, written or closed.
sub save_xml_doc
{
    my ($full_textdir, $output_filename, $doc_xml) = @_;

    # NOTE(review): the separator is interpolated into the pattern below
    # unquoted, so this assumes util::get_os_dirsep() returns a value that is
    # already safe to use inside a regex -- confirm against util.pm
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir, $output_filename);

    # Everything up to and including the last separator is the directory part
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    # Three-arg open with a lexical handle: the filename can never be
    # misread as a mode, and the handle cannot clash with a global bareword
    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors only surface at close, so check it
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    return;
}
107
108
# Compress a stored document file in place by running gzip on it.
#
# The original notes that this routine mirrors the one in lucene_passes.pl
# and should be moved into a shared package.
#
#   $full_textdir    - root directory under which document text is stored
#   $output_filename - path of the document file, relative to $full_textdir
#
# A warning is printed to STDERR if gzip cannot be run or reports failure.
sub compress_xml_doc
{
    my ($full_textdir, $output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir, $output_filename);

    # Greenstone ships with gzip for Windows.
    #
    # List-form system() hands the filename to gzip as a single argument
    # without involving a shell, so spaces or shell metacharacters in the
    # name (which originates from the document stream) cannot split the
    # command or inject another one.
    my $status = system('gzip', $full_output_filename);
    if ($status != 0) {
        print STDERR "Warning: gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
122
123
# Read Greenstone document XML from STDIN and save each complete document
# under $full_textdir. Based on lucene's monitor_xml_stream, but simplified
# because it is now only used in "text" mode.
#
# A line matching the opening <Doc ... file="..."> tag supplies the output
# filename; a line consisting of </Doc> ends the document and triggers
# save_xml_doc() with everything accumulated since the previous document.
#
#   $full_textdir - directory the document files are written beneath
sub monitor_xml_stream
{
    my ($full_textdir) = @_;

    my $pending_xml = "";
    my $target_file = "";

    while (defined(my $line = <STDIN>)) {

        $pending_xml .= $line;

        if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $target_file = $1;
        }

        # Keep accumulating until the closing tag of the document arrives
        next unless $line =~ m/^<\/Doc>$/;

        save_xml_doc($full_textdir, $target_file, $pending_xml);

        # Compression of the saved file was taken out for efficiency
        # reasons. Consider putting it back in as a switch, so a collection
        # builder can decide case by case whether saving disk space is
        # worth the overhead of uncompressing at runtime:
        #
        ### compress_xml_doc($full_textdir,$target_file);

        # Start afresh for the next document
        $pending_xml = "";
        $target_file = "";
    }
}
161
162
# Forward STDIN, one line at a time and unmodified, to the Solr post pipe.
sub pass_on_xml_stream
{
    while (defined(my $line = <STDIN>)) {
        solrutil::print_to_post_pipe($line);
    }
}
170
171
172
173
# /** Entry point. Checks the arguments on the command line, filters out
#  *  the minus options, validates the requested mode and then either stores
#  *  the incoming documents as text files ("text" mode) or streams them to
#  *  the Solr server ("index" mode).
#  *
#  *  Usage: solr_passes.pl [-verbosity num] core "text"|"index" build-dir index-name
#  *
#  *  -verbosity <num> is accepted, and its value skipped, purely for
#  *  compatibility with the calling program. Any other minus option is
#  *  reported on STDERR and otherwise ignored.
#  *
#  *  Exits with status 1 (after printing the usage line) when fewer than
#  *  four positional arguments are given or the mode is not recognised.
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my @filtered_argv = ();

    my $i = 0;
    while ($i < $argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -verbosity <num>
            if ($option eq "verbosity") {
                # solr indexing has no support for verbosity
                # => parse to be compatible with calling program, but
                # suppress it for solr-post.jar by stepping over its value
                $i++;
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv, $argv[$i]);
        }
        $i++;
    }

    my $usage = "Usage: solr_passes.pl [-verbosity num] core \"text\"|\"index\" build-dir index-name\n";

    if (scalar(@filtered_argv) < 4) {
        print STDERR $usage;
        exit 1;
    }

    my ($core, $mode, $full_builddir, $indexdir) = @filtered_argv;

    # Reject anything other than the two documented modes up front. Without
    # this check an unknown mode would fall through to the streaming branch
    # even though the post pipe is only opened for "index".
    if ($mode ne "text" && $mode ne "index") {
        print STDERR "Unrecognised mode: $mode\n";
        print STDERR $usage;
        exit 1;
    }

    if ($mode eq "text") {
        print STDERR "Monitoring for input!\n";
        my $full_textdir = &util::filename_cat($full_builddir, "text");
        monitor_xml_stream($full_textdir);
    }
    else {
        # We only need the Solr handle opened if we are indexing the
        # documents, not if we are just storing the text
        open_java_solr($core, $full_builddir, $indexdir);

        print STDERR "Streaming document input onto Solr server!\n";
        pass_on_xml_stream();

        close_java_solr();
    }
}
244
245
# Run the script with the command-line arguments
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.