source: trunk/gsdl/bin/script/lucene_passes.pl@ 10163

Last change on this file since 10163 was 10163, checked in by davidb, 19 years ago

lucene_passes.pl upgraded to support incremental building. Changes mostly
involve supporting minus options (i.e. -create to start a new index, nothing
for incremental) and passing this down into the indexer code.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
BEGIN {
    # Both environment variables must be present before any Greenstone
    # module can be located; check them in a fixed order.
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Prepend the Greenstone library directories to the module search path.
    # A single unshift keeps the list order, so "classify" ends up being
    # searched first and "perllib" last of the four.
    my $gsdlhome = $ENV{'GSDLHOME'};
    unshift (@INC,
             "$gsdlhome/perllib/classify",
             "$gsdlhome/perllib/plugins",
             "$gsdlhome/perllib/cpan",
             "$gsdlhome/perllib");
}
37
38use util;
39use ghtml;
40
41
# Start the Java Lucene indexer and connect it to the PIPEOUT filehandle.
#
# Arguments:
#   $doc_tag_level  - passed through to GS2LuceneIndexer unchanged
#   $full_builddir  - build directory, passed as a quoted argument
#   $indexdir       - index name/directory, passed as a quoted argument
#   $create         - either "-create" or the empty string
#   $verbosity      - value handed to the indexer's -verbosity option
#
# Dies if the pipe to the indexer cannot be opened.  On success the global
# PIPEOUT handle is open for writing; close_java_lucene() closes it.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_;

    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    my $classpath = &util::filename_cat($bin_java,"LuceneWrap.jar");

    my $java_lucene = "java -classpath \"$classpath\" GS2LuceneIndexer";
    my $cmd_options = "$create -verbosity $verbosity";

    # Quote both path arguments: the command goes through the shell, so an
    # unquoted path containing spaces would be split into several arguments.
    my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # Three-argument form keeps the open mode separate from the command text.
    if (!open (PIPEOUT, "|-", $java_cmd)) {
        die "$PROGNAME - couldn't run $java_cmd: $!\n";
    }
}
57
# Close the pipe to the Java Lucene indexer (the global PIPEOUT handle).
#
# Closing a piped handle waits for the child process.  close() returns false
# either when a system call failed ($! is set) or when the child exited with
# a non-zero status ($! is 0 and $? holds the status).  A failure is reported
# on STDERR rather than being silently dropped.
#
# Returns the result of close(), as before.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        my $reason = $! ? "error closing pipe: $!"
                        : "exit status " . ($? >> 8);
        print STDERR "$PROGNAME - Lucene indexer did not finish cleanly ($reason)\n";
    }

    return $closed_ok;
}
62
# Write a complete document, and each of its numbered sections, to disk.
#
# Arguments:
#   $full_textdir    - directory under which all files are written
#   $output_filename - file name (relative to $full_textdir) for the whole
#                      document; intermediate directories are created
#   $doc_xml         - the XML text of the document
#
# The whole of $doc_xml is written to $full_textdir/$output_filename.  In
# addition, for every <Sec gs2:id="N">...</Sec> element found, the section
# body is passed through ghtml::unescape_html and written to
# $full_textdir/N.xml.
#
# Dies if any file cannot be opened or fully written.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    # Lexical, so the separator no longer leaks into a package global.
    # NOTE(review): the value is interpolated into the regex below as-is;
    # presumably util::get_os_dirsep returns it in regex-safe form - confirm.
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last separator is the directory.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &util::mk_all_dir($full_output_dir) if (defined $full_output_dir);

    open(my $doc_out, ">", $full_output_filename)
        or die "Unable to open $full_output_filename: $!";
    print $doc_out $doc_xml;
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($doc_out)
        or die "Unable to write $full_output_filename: $!";

    # Walk the sections in a single pass, capturing the id and body of each.
    while ($doc_xml =~ m/<Sec\s+gs2:id="(\d+)"\s*>(.*?)<\/Sec>/sg) {
        my ($docnum,$sec_text) = ($1,$2);

        my $docnum_filename
            = &util::filename_cat($full_textdir,"$docnum.xml");

        open(my $sec_out, ">", $docnum_filename)
            or die "Unable to open $docnum_filename: $!";
        print $sec_out &ghtml::unescape_html($sec_text);
        close($sec_out)
            or die "Unable to write $docnum_filename: $!";
    }

    return;
}
94
# Compress a previously saved document with the external "gzip" program.
#
# Arguments:
#   $full_textdir    - directory containing the file
#   $output_filename - file name relative to $full_textdir
#
# gzip is invoked with the list form of system(), so the file name is passed
# as a single argument without going through the shell; names containing
# spaces or shell metacharacters are therefore handled safely.  A non-zero
# status is reported on STDERR.  No meaningful value is returned.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    my $status = system("gzip", $full_output_filename);
    if ($status != 0) {
        print STDERR "$PROGNAME - gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
104
# Read XML from STDIN line by line, accumulating one document at a time, and
# hand each completed document to the handler for the given mode.
#
# Arguments:
#   $mode         - "text" saves each document via save_xml_doc();
#                   "index" writes each document to the PIPEOUT handle;
#                   any other value causes documents to be discarded
#   $full_textdir - directory passed to save_xml_doc() in "text" mode
#
# A document ends at a line consisting of "</Doc>".  Its output file name is
# taken from the file="..." attribute of a line starting with "<Doc".
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml = "";
    my $current_file = "";

    while (defined (my $input = <STDIN>)) {
        # Every &nbsp; becomes &amp;nbsp; before the line is stored.
        $input =~ s/&nbsp;/&amp;nbsp;/g;
        $pending_xml .= $input;

        # Remember the file name announced by the opening <Doc ...> tag.
        if ($input =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $current_file = $1;
        }

        # Keep accumulating until the closing tag of the document arrives.
        next unless ($input =~ m/^<\/Doc>$/);

        if ($mode eq "index") {
            # Feed the whole document to the Lucene indexer.
            print PIPEOUT "$pending_xml";
        }
        elsif ($mode eq "text") {
            save_xml_doc($full_textdir,$current_file,$pending_xml);
        }

        # (Compressing the saved file via compress_xml_doc is disabled.)

        # Start afresh for the next document.
        $pending_xml = "";
        $current_file = "";
    }
}
139
# Entry point: parse the command line and run the requested pass.
#
# Minus options (may appear anywhere among the arguments):
#   -create         pass "-create" on to the indexer; without it nothing is
#                   passed, which the indexer treats as its default
#   -verbosity num  verbosity value passed to the indexer (default 1)
# Any other minus option is reported on STDERR and otherwise ignored.
#
# Positional arguments (at least four are required):
#   mode ("text" or "index"), doc-tag-level, build-dir, index-name
#
# Prints a usage message and exits with status 1 if too few positional
# arguments are given.
sub main
{
    my @pending = @_;

    my $create = "";
    my $verbosity = 1;
    my @positional = ();

    while (@pending) {
        my $arg = shift(@pending);

        # Anything not starting with "-" is a positional argument.
        my ($option) = ($arg =~ m/^-(.*)$/);
        if (!defined $option) {
            push(@positional,$arg);
            next;
        }

        if ($option eq "create") {
            $create = "-create";
        }
        elsif ($option eq "verbosity") {
            # The value is the next argument, if there is one.
            $verbosity = shift(@pending) if (@pending);
        }
        else {
            print STDERR "Unrecognised minus option: -$option\n";
        }
    }

    if (scalar(@positional) < 4) {
        print STDERR "Usage: $PROGNAME [-create|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my ($mode,$doc_tag_level,$full_builddir,$indexdir) = @positional;

    my $full_textdir = &util::filename_cat($full_builddir,"text");

    # The Java indexer is only needed when indexing; in "text" mode the
    # documents are simply stored.
    my $indexing = ($mode eq "index");

    open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity)
        if ($indexing);

    monitor_xml_stream($mode, $full_textdir);

    close_java_lucene() if ($indexing);
}
202
# Program name for messages: the script path with everything up to the last
# "/" removed.
($PROGNAME = $0) =~ s/^.*\/(.*)$/$1/;

main(@ARGV);
207
Note: See TracBrowser for help on using the repository browser.