source: tags/gsdl-2_70-distribution-branch-merged/gsdl/bin/script/lucene_passes.pl@ 11818

Last change on this file since 11818 was 10165, checked in by davidb, 19 years ago

Extra section of code (currently not needed) commented out for the meantime.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
# Compile-time setup: insist on the Greenstone environment variables and
# put the Greenstone Perl library directories at the front of @INC so the
# 'use' lines that follow can be resolved.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    my $perllib = "$ENV{'GSDLHOME'}/perllib";

    # A single unshift, listed in final search order (classify is searched
    # first, the base perllib directory last of the four).
    unshift (@INC,
             "$perllib/classify",
             "$perllib/plugins",
             "$perllib/cpan",
             $perllib);
}
37
38use util;
39use ghtml;
40
41
# Start the Java GS2LuceneIndexer program and attach its standard input to
# the package-level PIPEOUT filehandle.
#
# Arguments:
#   $doc_tag_level - passed to the indexer as its first positional argument
#   $full_builddir - build directory; passed quoted on the command line
#   $indexdir      - index name/directory; passed quoted on the command line
#   $create        - option string placed before -verbosity (main() passes
#                    either "-create" or the empty string)
#   $verbosity     - value given to the indexer's -verbosity option
#
# Dies (including the operating system error) if the command cannot be
# started.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_;

    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    my $classpath = &util::filename_cat($bin_java,"LuceneWrap.jar");

    my $java_lucene = "java -classpath \"$classpath\" GS2LuceneIndexer";
    my $cmd_options = "$create -verbosity $verbosity";

    # Both path arguments are quoted so that a build directory or an index
    # directory containing spaces reaches the indexer as a single argument.
    my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # PIPEOUT is intentionally a bareword (package) handle: the other subs
    # in this file print to it and close it by that name.
    if (!open (PIPEOUT, "| $java_cmd")) {
        die "$PROGNAME - couldn't run $java_cmd: $!\n";
    }
}
57
# Close the PIPEOUT pipe to the Lucene indexer.
#
# Closing a piped handle waits for the child process, and close() returns
# false if the child exited with a non-zero status or if a system call
# failed.  Such failures are reported on STDERR instead of being silently
# discarded; they are not fatal.
#
# Returns the result of close() (true on success).
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        # Capture both values before any further I/O can overwrite them.
        my ($os_error, $child_status) = ($!, $?);

        if ($os_error != 0) {
            print STDERR "$PROGNAME - error closing pipe to Lucene indexer: $os_error\n";
        }
        else {
            # $! is 0 when the only problem was the child's exit status.
            print STDERR "$PROGNAME - Lucene indexer exited with status "
                . ($child_status >> 8) . "\n";
        }
    }

    return $closed_ok;
}
62
# Write one document's XML to a file below the text directory, creating
# the containing directory first.
#
# Arguments:
#   $full_textdir    - directory under which the document is stored
#   $output_filename - file name (may include sub-directories) relative to
#                      $full_textdir
#   $doc_xml         - the complete XML text of the document
#
# Dies if the output file cannot be opened or cannot be completely written.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    # NOTE(review): the separator is interpolated into the pattern below
    # without escaping, so this relies on util::get_os_dirsep() returning a
    # regex-ready string - confirm against util.pm before changing.
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last separator is the directory.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    &util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    open(my $doc_out, '>', $full_output_filename)
        || die "Unable to open $full_output_filename: $!";

    print $doc_out $doc_xml;

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($doc_out)
        || die "Unable to finish writing $full_output_filename: $!";

# Currently not used, but consult with DB before removing.
# (The @secs extraction is kept here with the loop that consumes it, so the
# per-section regex scan is not run for every document while it is unused.)
#    my @secs = ($doc_xml =~ m/<Sec\s+gs2:id="\d+"\s*>.*?<\/Sec>/sg);
#
#    foreach my $sec (@secs) {
#        my ($docnum,$sec_text) = ($sec =~ m/<Sec\s+gs2:id="(\d+)"\s*>(.*?)<\/Sec>/s);
#        my $docnum_filename
#            = &util::filename_cat($full_textdir,"$docnum.xml");
#
#        open(SECOUT,">$docnum_filename")
#            || die "Unable to open $docnum_filename";
#
#        print SECOUT &ghtml::unescape_html($sec_text);
#        close(SECOUT);
#    }

}
97
# Compress a stored document file in place with the external gzip program.
#
# Arguments:
#   $full_textdir    - directory under which the document is stored
#   $output_filename - file name relative to $full_textdir
#
# gzip is run with list-form system(), so the file name is handed to gzip
# as a single argument without passing through a shell (spaces and shell
# metacharacters in the name are therefore harmless).  A failure is
# reported on STDERR.
#
# Returns true if gzip ran and exited with status 0, false otherwise.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    my $status = system("gzip", $full_output_filename);

    if ($status == -1) {
        # system() returns -1 when the program could not be started at all.
        print STDERR "$PROGNAME - couldn't run gzip on $full_output_filename: $!\n";
    }
    elsif ($status != 0) {
        print STDERR "$PROGNAME - gzip failed on $full_output_filename (exit status "
            . ($status >> 8) . ")\n";
    }

    return ($status == 0);
}
107
# Read the XML stream on STDIN line by line, gathering lines until a
# closing </Doc> line is seen, then hand the gathered document on:
#   mode "text"  - store it on disk via save_xml_doc()
#   mode "index" - print it to the PIPEOUT handle
# Any other mode just discards the document.
#
# Arguments:
#   $mode         - "text" or "index"
#   $full_textdir - directory handed to save_xml_doc() in "text" mode
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml  = "";   # lines of the document gathered so far
    my $doc_filename = "";   # value of the file="..." attribute of its <Doc> line

    while (defined (my $xml_line = <STDIN>)) {
        # Escape the ampersand of every &nbsp; entity.
        $xml_line =~ s/&nbsp;/&amp;nbsp;/g;
        $pending_xml .= $xml_line;

        # The opening <Doc ...> line carries the output file name.
        if ($xml_line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $doc_filename = $1;
        }

        # Keep gathering until the document is complete.
        next unless ($xml_line =~ m/^<\/Doc>$/);

        if ($mode eq "text") {
            save_xml_doc($full_textdir,$doc_filename,$pending_xml);
        }
        elsif ($mode eq "index") {
            # Pass the whole document to whatever is attached to PIPEOUT.
            print PIPEOUT $pending_xml;
        }

        # (Compressing the stored file with compress_xml_doc() is disabled.)

        # Start afresh for the next document.
        $pending_xml  = "";
        $doc_filename = "";
    }
}
142
# Entry point: parse the command line, then process the XML stream on STDIN.
#
# Minus options:
#   -create         forwarded to open_java_lucene() as "-create"
#   -verbosity num  forwarded to open_java_lucene(); defaults to 1
# Any other minus option is reported on STDERR and otherwise ignored.
#
# Positional arguments (at least four are required, otherwise the usage
# message is printed and the script exits with status 1):
#   mode ("text" or "index"), doc-tag-level, build-dir, index-name
sub main
{
    my @pending_args = @_;

    my $create    = "";
    my $verbosity = 1;

    my @positional = ();

    while (@pending_args) {
        my $arg = shift(@pending_args);

        if ($arg =~ m/^-(.*)$/) {
            my $option = $1;

            if ($option eq "create") {
                $create = "-create";
            }
            elsif ($option eq "verbosity") {
                # Consume the following argument as the value, if there is one.
                $verbosity = shift(@pending_args) if (@pending_args);
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@positional,$arg);
        }
    }

    if (scalar(@positional) < 4) {
        print STDERR "Usage: $PROGNAME [-create|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my ($mode,$doc_tag_level,$full_builddir,$indexdir) = @positional[0..3];

    my $full_textdir = &util::filename_cat($full_builddir,"text");

    # The Java indexer is only needed when indexing; in "text" mode the
    # documents are just stored on disk.
    my $indexing = ($mode eq "index");

    if ($indexing) {
        open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity);
    }

    monitor_xml_stream($mode, $full_textdir);

    if ($indexing) {
        close_java_lucene();
    }
}
205
# Script start-up: work out the bare program name used in diagnostic
# messages, then run main() on the command-line arguments.
#
# File::Basename (core module) is used instead of a hand-written
# substitution so that the directory part of $0 is stripped using the path
# rules of the current platform, not only when it is written with
# forward slashes.
use File::Basename;

$PROGNAME = basename($0);

&main(@ARGV);
210
Note: See TracBrowser for help on using the repository browser.