source: trunk/gsdl/bin/script/lucene_passes.pl@ 12484

Last change on this file since 12484 was 12258, checked in by mdewsnip, 18 years ago

Now references the GS2Lucene classes in the org.nzdl.gsdl.LuceneWrap package.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
BEGIN {
    # Both environment variables must be defined before anything else runs;
    # GSDLHOME is also the root of the library directories added below.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'} or die "GSDLOS not set\n";

    # Each directory is pushed onto the FRONT of @INC in turn, so the last
    # one listed here ends up being searched first.
    foreach my $lib_dir ("$ENV{'GSDLHOME'}/perllib",
                         "$ENV{'GSDLHOME'}/perllib/cpan",
                         "$ENV{'GSDLHOME'}/perllib/plugins",
                         "$ENV{'GSDLHOME'}/perllib/classify") {
        unshift (@INC, $lib_dir);
    }
}
37
38use util;
39use ghtml;
40
41
# open_java_lucene -- start the Java GS2LuceneIndexer and attach its
# standard input to the global PIPEOUT filehandle.
#
# Arguments:
#   $doc_tag_level - first positional argument given to the indexer
#   $full_builddir - second positional argument given to the indexer
#   $indexdir      - third positional argument given to the indexer
#   $create        - option string placed first on the command line
#                    (main() passes either "" or "-create")
#   $verbosity     - value for the indexer's -verbosity option
#
# PIPEOUT is deliberately a package-level handle: monitor_xml_stream()
# prints to it and close_java_lucene() closes it.
#
# Dies if the pipe to the java command cannot be opened.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_;

    # The wrapper jar lives in $GSDLHOME/bin/java
    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    my $classpath = &util::filename_cat($bin_java,"LuceneWrap.jar");

    my $java_lucene = "java -classpath \"$classpath\" org.nzdl.gsdl.LuceneWrap.GS2LuceneIndexer";
    my $cmd_options = "$create -verbosity $verbosity";

    # The command goes through the shell, so the index name is quoted in
    # the same way as the build directory; otherwise a name containing
    # whitespace would be split into several arguments.
    my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    if (!open (PIPEOUT, "| $java_cmd")) {
        # Include the OS error so the reason for the failure is visible
        die "$PROGNAME - couldn't run $java_cmd: $!\n";
    }
}
57
# close_java_lucene -- close the PIPEOUT pipe opened by open_java_lucene().
#
# Closing a piped handle waits for the child process.  close() returns
# false if the close itself failed or if the child exited with a non-zero
# status, so a failure is reported on STDERR rather than silently dropped.
#
# Returns the result of close() (true on success, false otherwise).
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        if ($!) {
            # A system call involved in closing the pipe failed
            print STDERR "$PROGNAME - error closing pipe to java: $!\n";
        }
        else {
            # $! is 0 when the only problem was the child's exit status
            print STDERR "$PROGNAME - java indexer exited with status ", ($? >> 8), "\n";
        }
    }

    return $closed_ok;
}
62
# save_xml_doc -- write one document's XML to a file below the text directory.
#
# Arguments:
#   $full_textdir    - directory the output filename is relative to
#   $output_filename - file name (may include sub-directories)
#   $doc_xml         - the XML text to write
#
# Any missing parent directories are created first via util::mk_all_dir.
# Dies if the file cannot be opened or if closing it reports a write error.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    # NOTE(review): $dir_sep is interpolated into the pattern below as-is,
    # so util::get_os_dirsep() is presumably returning a regex-safe
    # separator -- confirm before adding any escaping here.
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last separator is the directory
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    if (defined $full_output_dir) {
        &util::mk_all_dir($full_output_dir);
    }

    open(my $doc_out, '>', $full_output_filename)
        || die "Unable to open $full_output_filename: $!";

    print $doc_out $doc_xml;

    # Buffered write errors (e.g. a full disk) only surface at close time
    close($doc_out)
        || die "Unable to finish writing $full_output_filename: $!";

# Currently not used, but consult with DB before removing
#    my @secs = ($doc_xml =~ m/<Sec\s+gs2:id="\d+"\s*>.*?<\/Sec>/sg);
#    foreach my $sec (@secs) {
#	my ($docnum,$sec_text) = ($sec =~ m/<Sec\s+gs2:id="(\d+)"\s*>(.*?)<\/Sec>/s);
#	my $docnum_filename
#	    = &util::filename_cat($full_textdir,"$docnum.xml");
#
#	open(SECOUT,">$docnum_filename")
#	    || die "Unable to open $docnum_filename";
#
#	print SECOUT &ghtml::unescape_html($sec_text);
#	close(SECOUT);
#    }

}
97
# compress_xml_doc -- gzip a previously saved document file in place.
#
# Arguments:
#   $full_textdir    - directory the output filename is relative to
#   $output_filename - name of the file to compress
#
# A failure to run gzip, or a non-zero exit from it, is reported on STDERR.
# Returns nothing.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # The list form of system() passes the filename to gzip as a single
    # argument without going through the shell, so spaces or shell
    # metacharacters in the path cannot break (or alter) the command.
    my $status = system("gzip", $full_output_filename);
    if ($status != 0) {
        print STDERR "$PROGNAME - gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}
107
# monitor_xml_stream -- read XML from STDIN line by line, collecting the
# lines of each document and handing it on once its closing </Doc> line
# has been seen.
#
# Arguments:
#   $mode         - "text":  save the document with save_xml_doc()
#                   "index": print the document to the PIPEOUT handle
#                   (any other value: the document is simply discarded)
#   $full_textdir - directory passed to save_xml_doc() in "text" mode
#
# The file name for a document is taken from the file="..." attribute of
# its opening <Doc ...> line.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml = "";
    my $target_file = "";

    while (my $input = <STDIN>) {
        # Escape the ampersand of every &nbsp; before accumulating the line
        $input =~ s/&nbsp;/&amp;nbsp;/g;
        $pending_xml .= $input;

        # Opening tag: remember which file this document belongs to
        $target_file = $1 if ($input =~ m/^<Doc.+file=\"(.*?)\".*>$/);

        # Keep accumulating until the end of the document is reached
        next unless ($input =~ m/^<\/Doc>$/);

        if ($mode eq "index") {
            # notify lucene indexer

            # SAX parser seems to be sensitive to blank lines
            # => collapse every run of newlines into a single one
            $pending_xml =~ s/\n+/\n/g;

            print PIPEOUT $pending_xml;
        }
        elsif ($mode eq "text") {
            save_xml_doc($full_textdir,$target_file,$pending_xml);
        }

        # (compression of the saved file is currently disabled)
###     compress_xml_doc($full_textdir,$target_file);

        # Start afresh for the next document
        $pending_xml = "";
        $target_file = "";
    }
}
151
# main -- parse the command line, then run the requested pass over STDIN.
#
# Arguments: the script's command-line arguments.
#
# Recognised options (may appear anywhere among the arguments):
#   -create          forwarded to the Java indexer as "-create"
#   -verbosity num   forwarded to the Java indexer (defaults to 1)
# Any other argument starting with "-" is reported on STDERR and skipped.
#
# The remaining arguments must be, in order:
#   mode ("text" or "index"), doc-tag-level, build-dir, index-name
# With fewer than four of them a usage message is printed and the script
# exits with status 1.
sub main
{
    my (@argv) = @_;

    my $create = "";
    my $verbosity = 1;

    my @positional = ();

    # Work through a copy of the arguments, consuming them from the front
    my @pending = @argv;
    while (@pending) {
        my $arg = shift(@pending);

        unless ($arg =~ m/^-(.*)$/) {
            push(@positional,$arg);
            next;
        }

        my $option = $1;

        if ($option eq "create") {
            $create = "-create";
        }
        elsif ($option eq "verbosity") {
            # -verbosity num: the value is the following argument, if any
            $verbosity = shift(@pending) if (@pending);
        }
        else {
            print STDERR "Unrecognised minus option: -$option\n";
        }
    }

    if (scalar(@positional) < 4) {
        print STDERR "Usage: $PROGNAME [-create|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my ($mode,$doc_tag_level,$full_builddir,$indexdir) = @positional[0..3];

    my $full_textdir = &util::filename_cat($full_builddir,"text");

    # don't need the lucene stuff if we are just storing the docs
    my $indexing = ($mode eq "index");

    open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity)
        if ($indexing);

    monitor_xml_stream($mode, $full_textdir);

    close_java_lucene() if ($indexing);
}
214
# $PROGNAME is the script name with everything up to and including the
# last "/" removed; it is a package global because the subs above use it
# in their messages.
($PROGNAME = $0) =~ s/^.*\/(.*)$/$1/;

# Run the script with the command-line arguments
main(@ARGV);
219
Note: See TracBrowser for help on using the repository browser.