source: gs3-extensions/solr/trunk/src/bin/script/solr_passes.pl@ 27780

Last change on this file since 27780 was 27780, checked in by jmt12, 11 years ago

replacing deprecated call to make all directories

File size: 6.2 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# solr_passes.pl -- perl wrapper, akin to mgpp_passes, for Solr
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# Heavily based on lucene_passes.pl, but does not need a SolrWrapper.jar
29# style solution as Solr has its own XML syntax:
30#
31# http://wiki.apache.org/solr/UpdateXmlMessages
32#
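# This syntax is rather similar to what we already use, so the
# main task of monitor_xml() is to translate the XML syntax Greenstone uses
# into that needed by the solr server.
#
# NOTE(review): this script defines no monitor_xml(); the streaming sub
# pass_on_xml_stream() forwards its input to the solr-post pipe unchanged,
# so any translation presumably happens outside this script -- confirm.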
36
37
# Compile-time setup: check that the required environment variables are
# set and put the Greenstone and Solr-extension perllib directories on
# @INC so that the 'use' lines further down can be resolved.
BEGIN {
    # The two core Greenstone variables are checked first, in this order
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");

    # The extension's perllib is unshifted last, so it is searched before
    # the core perllib added above
    die "GEXT_SOLR not set\n" unless defined $ENV{'GEXT_SOLR'};
    unshift(@INC, "$ENV{'GEXT_SOLR'}/perllib");
}
47
48use strict;
49use util;
50use solrutil;
51use solrserver;
52
# Script-wide state shared by the subroutines below (not quite OO, but
# close enough for now).  The 'solr_server' slot starts out undefined and
# is filled in by open_java_solr().
my $self = {};
$self->{'solr_server'} = undef;
56
# Bring up the Solr server for this build and open the pipe used to post
# documents to it.
#
#   $core          - handed to solrutil::open_post_pipe()
#   $full_builddir - handed to the solrserver constructor
#   $indexdir      - accepted from the caller but not used in this sub
#
# The server object is remembered in $self->{'solr_server'} so that
# close_java_solr() can find it later.
sub open_java_solr
{
    my ($core, $full_builddir, $indexdir) = @_;

    # If the Solr/Jetty server is not already running, the following starts
    # it up, and only returns when the server is "ready and listening"
    my $server = solrserver->new($full_builddir);
    $server->start();

    # Only record the server once start() has returned
    $self->{'solr_server'} = $server;

    # Now start up the solr-post command
    &solrutil::open_post_pipe($core);
}
71
# Close the solr-post pipe and, if this script was the one that started the
# Solr server, shut the server down again.
#
# Takes no arguments.  Safe to call even when open_java_solr() never ran
# (or failed before recording a server): in that case only the pipe is
# closed and no server method is invoked.
sub close_java_solr
{
    &solrutil::close_post_pipe();

    my $solr_server = $self->{'solr_server'};

    # 'solr_server' is initialised to undef; calling a method on it would
    # die with "Can't call method ... on an undefined value"
    return unless defined $solr_server;

    # Leave the server alone unless explicitly_started() reports true
    if ($solr_server->explicitly_started()) {
        $solr_server->stop();
    }
}
81
82#----
83
# Write one document's XML to a file below the text directory, creating any
# missing parent directories first.
#
#   $full_textdir    - directory the document is stored under
#   $output_filename - file name (possibly with sub-directories) relative
#                      to $full_textdir
#   $doc_xml         - the complete XML text to write
#
# Dies if the file cannot be opened, written or closed.
#
# This is identical in purpose to the one in lucene_passes.pl, and should
# be moved in to a package and shared ####
sub save_xml_doc
{
    my ($full_textdir, $output_filename, $doc_xml) = @_;

    # NOTE(review): $dir_sep is interpolated into the pattern below as-is,
    # so util::get_os_dirsep() is presumably returning a regex-safe
    # separator -- confirm before adding any quoting here.
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename
        = &util::filename_cat($full_textdir, $output_filename);

    # Everything up to and including the last separator is the directory
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    if (defined $full_output_dir) {
        # NOTE(review): FileUtils is not loaded by this script directly;
        # presumably it is pulled in via util -- confirm.
        &FileUtils::makeAllDirectories($full_output_dir);
    }

    # Three-argument open with a lexical handle: the file name can never be
    # misread as part of the open mode, and the handle cannot clash with
    # another DOCOUT elsewhere
    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors (e.g. disk full) only surface at close
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    # The original ended by collecting every <Sec> element of $doc_xml into
    # a variable that was never used; that scan has been dropped.
}
106
107
# Compress a previously saved document file in place using gzip.
#
#   $full_textdir    - directory the document was stored under
#   $output_filename - file name relative to $full_textdir
#
# Prints a warning to STDERR if gzip could not be run or reported failure.
#
# This is identical in purpose to the one in lucene_passes.pl, and should
# be moved in to a package and shared ####
sub compress_xml_doc
{
    my ($full_textdir, $output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir, $output_filename);

    # Greenstone ships with gzip for Windows.
    #
    # The list form of system() passes the file name to gzip as a single
    # argument without going through a shell, so names containing spaces or
    # shell metacharacters are handled correctly.
    my $status = system('gzip', $full_output_filename);
    if ($status != 0) {
        print STDERR "Warning: gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
121
122
# Read Greenstone document XML from STDIN and save each document to its own
# file under $full_textdir.
#
#   $full_textdir - directory handed on to save_xml_doc()
#
# Lines are accumulated until a closing </Doc> line is seen; the file name
# comes from the file="..." attribute of the most recent opening <Doc ...>
# line.  Based on lucene's monitor_xml_stream, but simplified as it is now
# only used when in "text" mode.
sub monitor_xml_stream
{
    my ($full_textdir) = @_;

    my $pending_xml = "";
    my $target_file = "";

    while (defined(my $input_line = <STDIN>)) {

        $pending_xml .= $input_line;

        # Opening tag: remember where this document should be written
        if ($input_line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $target_file = $1;
        }

        # Keep accumulating until the document is complete
        next unless ($input_line =~ m/^<\/Doc>$/);

        save_xml_doc($full_textdir, $target_file, $pending_xml);

        # Compress file
        #
        # The compress option was taken out for efficiency
        # reasons.  Consider putting it back in but making it a
        # switch so a collection builder can decide for themselves on a
        # case by case basis if they want to save on diskspace, but have
        # the overhead of uncompressing at runtime

###     compress_xml_doc($full_textdir,$target_file);

        # Start afresh for the next document
        ($pending_xml, $target_file) = ("", "");
    }
}
160
161
# Copy STDIN, one line at a time, to the solr-post pipe until end of input.
# Each line is handed to solrutil::print_to_post_pipe() as read.
sub pass_on_xml_stream
{
    while (defined(my $xml_line = <STDIN>)) {
        &solrutil::print_to_post_pipe($xml_line);
    }
}
169
170
171
172
# /** This checks the arguments on the command line, filters out the
#  *  minus options, validates the requested mode and then either stores
#  *  the incoming documents as text files ("text") or streams them to
#  *  the Solr server ("index").
#  *
#  *  Expected positional arguments, after option filtering:
#  *    core "text"|"index" build-dir index-name
#  *
#  *  Prints a usage message to STDERR and exits with status 1 when fewer
#  *  than four positional arguments are given or the mode is not one of
#  *  "text" / "index".
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $usage = "Usage: solr_passes.pl [-verbosity num] core \"text\"|\"index\" build-dir index-name\n";

    my @filtered_argv = ();

    my $i = 0;
    while ($i<$argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -verbosity <num>
            if ($option eq "verbosity") {
                # solr indexing has no support for verbosity
                # => parse to be compatible with calling program, but
                #    suppress it for solr-post.jar by skipping its value
                $i++;
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR $usage;
        exit 1;
    }

    my $core          = $filtered_argv[0];
    my $mode          = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir      = $filtered_argv[3];

    # Reject unknown modes up front.  Previously anything other than "text"
    # fell through to the streaming branch, even though the post pipe is
    # only opened for "index".
    if ($mode ne "text" && $mode ne "index") {
        print STDERR "Unrecognised mode: $mode\n";
        print STDERR $usage;
        exit 1;
    }

    if ($mode eq "text") {
        # Just storing the text: no Solr handle needed
        print STDERR "Monitoring for input!\n";
        my $full_textdir = &util::filename_cat($full_builddir,"text");
        monitor_xml_stream($full_textdir);
    }
    else {
        # We only need the Solr handle opened if we are indexing the
        # documents
        open_java_solr($core, $full_builddir, $indexdir);

        print STDERR "Streaming document input onto Solr server!\n";
        pass_on_xml_stream();

        close_java_solr();
    }
}
243
244
# Script entry point: hand the command-line arguments to main()
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.