source: gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl@ 24453

Last change on this file since 24453 was 24453, checked in by davidb, 13 years ago

Tidy up of code. Better structuring of classes

File size: 6.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31# http://wiki.apache.org/solr/UpdateXmlMessages
32#
33# This syntax is rather similar to what we already use, so the
34# main task of monitor_xml() is to translate the XML syntax Greenstone uses
35# into that needed by the solr server.
36
37
BEGIN {
    # Compile-time setup: refuse to run outside a configured Greenstone
    # environment, then make the Greenstone and Solr-extension perl
    # libraries visible to the 'use' statements that follow.
    defined $ENV{'GSDLHOME'}  or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}    or die "GSDLOS not set\n";
    defined $ENV{'GEXT_SOLR'} or die "GEXT_SOLR not set\n";

    my $gsdl_perllib = "$ENV{'GSDLHOME'}/perllib";
    my $solr_perllib = "$ENV{'GEXT_SOLR'}/perllib";

    # The extension's perllib ends up ahead of the core Greenstone perllib
    # in the search path.
    unshift(@INC, $solr_perllib, $gsdl_perllib);
}
47
48use strict;
49use util;
50use solrutil;
51use solrserver;
52
53
# Script-wide state, standing in for a proper object for now.
# 'solr_server' holds the solrserver handle: it is set by open_java_solr()
# and read back by close_java_solr(); it stays undef until then.
my $self = {
    'solr_server' => undef,
};
57
sub open_java_solr
{
    # Get everything ready for indexing: optionally clear out the old index,
    # start the Solr server, and open the pipe that documents are posted
    # through.
    #
    #   $collect        - passed on to solrutil::open_post_pipe()
    #   $doc_tag_level  - passed on to solrutil::open_post_pipe()
    #   $full_builddir  - build directory that contains the index directory
    #   $indexdir       - name of the index directory inside $full_builddir
    #   $removeold      - when true, the existing index directory is removed

    my ($collect, $doc_tag_level, $full_builddir, $indexdir, $removeold) = @_;

    # -removeold: delete the current index directory under $full_builddir
    if ($removeold) {
        my $stale_indexdir = &util::filename_cat($full_builddir, $indexdir);
        &util::rm_r($stale_indexdir);
    }

    # Per the original author's note: if the Solr/Jetty server is not
    # already running, start() brings it up and only returns once the
    # server is ready and listening.
    my $server = solrserver->new();
    $server->start();

    # Remember the handle so close_java_solr() can stop the server again.
    $self->{'solr_server'} = $server;

    # Finally, start up the solr-post command
    &solrutil::open_post_pipe($collect, $doc_tag_level);
}
78
sub close_java_solr
{
    # Counterpart of open_java_solr(): close the posting pipe and, if the
    # server handle reports that it was explicitly started, stop the server.

    &solrutil::close_post_pipe();

    my $solr_server = $self->{'solr_server'};

    # The handle is only set by open_java_solr().  If that was never called
    # there is no server object to query, so do not attempt a method call on
    # an undefined value.
    return unless defined $solr_server;

    if ($solr_server->explicitly_started()) {
        $solr_server->stop();
    }
}
88
89#----
90
sub save_xml_doc
{
    # Write one document's XML to <full_textdir>/<output_filename>, creating
    # any missing parent directories first.
    #
    #   $full_textdir    - directory under which documents are stored
    #   $output_filename - file name (may include sub-directories), relative
    #                      to $full_textdir
    #   $doc_xml         - the XML text to write
    #
    # Dies if the file cannot be opened, written or closed.
    #
    # This is the same routine as the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####

    my ($full_textdir,$output_filename,$doc_xml) = @_;

    # NOTE(review): the separator is interpolated straight into the regex
    # below, so get_os_dirsep() is assumed to return a regex-ready (already
    # escaped) separator -- confirm against util.pm.
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last directory separator
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);

    # No separator in the path means there is no directory part to create
    &util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    # Three-argument open with a lexical handle: the filename can never be
    # interpreted as an open mode or a pipe, and the handle cannot clash with
    # a DOCOUT handle used elsewhere.
    open(my $doc_out, '>', $full_output_filename)
        || die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors (e.g. disk full) only surface at close time
    close($doc_out)
        || die "Unable to close $full_output_filename: $!";
}
113
114
sub compress_xml_doc
{
    # gzip the stored document <full_textdir>/<output_filename> in place.
    #
    #   $full_textdir    - directory under which documents are stored
    #   $output_filename - file name relative to $full_textdir
    #
    # A failure to run gzip is reported on STDERR but is not fatal.
    #
    # This is the same routine as the one in lucene_passes.pl, and should be
    # moved in to a package and shared ####

    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # Greenstone ships with gzip for Windows.
    # List-form system() hands the filename to gzip as a single argument
    # without going through a shell, so spaces or shell metacharacters in the
    # path can neither break the command nor be executed.
    my $status = system('gzip', $full_output_filename);

    if ($status != 0) {
        print STDERR "Warning: gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
128
129
sub monitor_xml_stream
{
    # Read the document stream arriving on STDIN and store each document as
    # its own file below $full_textdir.  Lines are accumulated until a
    # closing </Doc> line is seen; the target file name is taken from the
    # file="..." attribute of the opening <Doc ...> line.
    #
    # Based on lucene's monitor_xml_stream, but simplified as it is now only
    # used when in "text" mode.

    my ($full_textdir) = @_;

    my ($pending_xml, $pending_filename) = ("", "");

    while (my $line = <STDIN>) {

        $pending_xml .= $line;

        # Opening tag: remember where this document is to be written
        $pending_filename = $1 if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/);

        # Keep collecting until the document is complete
        next unless ($line =~ m/^<\/Doc>$/);

        save_xml_doc($full_textdir, $pending_filename, $pending_xml);

        # Compression of the saved file was taken out for efficiency
        # reasons.  Consider putting it back in as a switch, so a collection
        # builder can decide case by case whether saving disk space is worth
        # the overhead of uncompressing at runtime:
        #
        ### compress_xml_doc($full_textdir,$pending_filename);

        # Start afresh for the next document
        ($pending_xml, $pending_filename) = ("", "");
    }
}
167
168
sub pass_on_xml_stream
{
    # Relay STDIN, one line at a time and unmodified, to the post pipe
    # managed by solrutil.  Returns once STDIN is exhausted.
    while (my $xml_line = <STDIN>) {
        &solrutil::print_to_post_pipe($xml_line);
    }
}
176
177
178
179
# /** Entry point.  Parses the command line, filters out the minus options,
#  *  validates the mode and then either stores the incoming documents as
#  *  text files ("text") or streams them on to the Solr server ("index").
#  *
#  *  Usage:
#  *    solr_passes.pl [-removeold|-verbosity num] collect "text"|"index"
#  *                   doc-tag-level build-dir index-name
#  *
#  *  Exits with status 1 when too few arguments are given or the mode is
#  *  neither "text" nor "index".
#  */
sub main
{
    my (@argv) = @_;

    my $usage = "Usage: solr_passes.pl [-removeold|-verbosity num] collect \"text\"|\"index\" doc-tag-level build-dir index-name\n";

    my $removeold = 0;
    my @filtered_argv = ();

    # Work through a copy of the arguments so options that take a value can
    # simply consume the next entry.
    my @pending = @argv;
    while (@pending) {
        my $arg = shift(@pending);

        if ($arg =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set (new index will be created)\n";
                $removeold = 1;
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                # solr indexing has no support for verbosity
                # => parse it to stay compatible with the calling program,
                #    but suppress it for solr-post.jar by discarding the value
                shift(@pending);
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv, $arg);
        }
    }

    if (scalar(@filtered_argv) < 5) {
        print STDERR $usage;
        exit 1;
    }

    my ($collect, $mode, $doc_tag_level, $full_builddir, $indexdir)
        = @filtered_argv;

    if ($mode eq "text") {
        # Only storing the text: no Solr handle is needed
        print STDERR "Monitoring for input!\n";
        my $full_textdir = &util::filename_cat($full_builddir,"text");
        monitor_xml_stream($full_textdir);
    }
    elsif ($mode eq "index") {
        # Indexing: the Solr handle must be open before anything is streamed
        open_java_solr($collect, $doc_tag_level, $full_builddir, $indexdir, $removeold);

        print STDERR "Streaming document input onto Solr server!\n";
        pass_on_xml_stream();

        close_java_solr();
    }
    else {
        # Previously any other mode fell through to the streaming branch
        # without the post pipe ever having been opened.
        print STDERR "Unrecognised mode: $mode\n";
        print STDERR $usage;
        exit 1;
    }
}


main(@ARGV);
Note: See TracBrowser for help on using the repository browser.