source: trunk/cstr/perllib/plugins/PrePlug.pm@ 891

Last change on this file since 891 was 891, checked in by sjboddie, 21 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1###########################################################################
2#
3# PrePlug.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin that processes simple html files like those output
27# by prescript when converting postscript to html.
28# prescript output has <p> tags separating paragraphs
29# and <!--End Of Page--> tags separating pages.
30# it may also have <!--Page No--> tags at top or bottom of
31# some or all pages.
32
33# if a .html1 version of a .html file exists then that version
34# will be used so that the author name can be extracted
35# (.html1 files were generated by software which automatically
36# extracts author names from .html files).
37
38# also looks for a .info file containing metadata (as used by
39# the cstr collection)
40
41package PrePlug;
42
43use BasPlug;
44use sorttools;
45
46sub BEGIN {
47 @ISA = ('BasPlug');
48}
49
50sub new {
51 my ($class) = @_;
52 $self = new BasPlug ();
53
54 return bless $self, $class;
55}
56
57sub is_recursive {
58 my $self = shift (@_);
59
60 return 0; # this is not a recursive plugin
61}
62
63
64# return number of files processed, undef if can't process
65# Note that $base_dir might be "" and that $file might
66# include directories
67sub read {
68 my $self = shift (@_);
69 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
70
71 my $filename = &util::filename_cat($base_dir, $file);
72
73 return 0 if ($filename =~ /\.(html1|html1\.gz|info|gif)/);
74
75 return undef unless ($filename =~ /\.(html(\.gz)?)$/i && (-e $filename));
76
77 my $gz = 0;
78 $gz = 1 if (defined $2 && $2 eq ".gz");
79
80 my $filename1 = $filename;
81 if ($gz) {
82 $filename1 =~ s/\.html\.gz$/\.html1\.gz/;
83 } else {
84 $filename1 .= "1";
85 }
86
87 $filename = $filename1 if (-e $filename1);
88
89 print STDERR "PrePlug: processing $filename\n" if $processor->{'verbosity'};
90
91 # create a new document
92 my $doc_obj = new doc ($file, "indexed_doc");
93
94 if ($gz) {
95 open (FILE, "zcat $filename |") || die "PrePlug::read - zcat can't open $filename\n";
96 } else {
97 open (FILE, $filename) || die "PrePlug::read - can't open $filename\n";
98 }
99 my $cursection = $doc_obj->get_top_section();
100
101 my $text = "";
102 my $line = "";
103 my $pagenumber = 1;
104 my @creators = ();
105 my $numlines = 0;
106 while (defined ($line = <FILE>)) {
107 if ($filename =~ /html1$/) {
108 while ($text =~ s/<_author_search_\([^\)]*\)>([^<]*)<\/a>/$1/i) {
109 push (@creators, $1);
110 }
111 }
112
113 # numlines sorts out documents that just have a single <!-- End of Page--> tag at the end
114 if ($line =~ /<!--End\s+Of\s+Page-->/i && $numlines < 200) {
115
116 $numlines = -1000;
117 $cursection =
118 $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
119
120 $text =~ s/<!--Page No-->\s*<p>\s*<center>\s*<b>\s*(\d+)<\/b>\s*<\/center>\s*//i;
121# if (defined $1 && $1 != $pagenumber) {
122# print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
123# }
124 $doc_obj->add_text ($cursection, $text);
125 $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
126 $pagenumber ++;
127 $text = "";
128 } else {
129 $text .= $line;
130 $numlines ++;
131 }
132 }
133 close FILE;
134
135 if ($cursection eq $doc_obj->get_top_section()) {
136 # there weren't any <!--End Of Page--> tags
137
138 # see if there were any <!--Page No--> tags to split on
139 if ($text =~ /<!--Page No-->/i) {
140 while ($text =~ s/^(.*?)<!--Page No-->\s*<p>\s*<center>\s*<b>\s*\d+<\/b>\s*<\/center>\s*//i) {
141 $cursection =
142 $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
143 $doc_obj->add_text ($cursection, $text);
144 $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
145 $pagenumber ++;
146 }
147 }
148 # otherwise we'll just have to split pages on set number of lines
149 else {
150 my $pagetext = "";
151 my $line = "";
152 my $count = 0;
153 while (length ($text) && $text =~ s/^(.*?)(\n|$)//) {
154 $line = $1;
155 $line = "" unless defined $line;
156 $pagetext .= $line . "\n";
157 if (($count >= 50 && $line !~ /\w/) || $count == 80) {
158 $cursection =
159 $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
160 $doc_obj->add_text ($cursection, $pagetext);
161 $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
162 $pagetext = "";
163 $pagenumber ++;
164 $count = 0;
165 } else {
166 $count ++;
167 }
168 }
169 }
170
171 }
172
173 # final section
174 if ($text =~ /\w/) {
175 $cursection =
176 $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
177
178 $text =~ s/<!--Page No-->\s*<p>\s*<center>\s*<b>\s*(\d+)<\/b>\s*<\/center>\s*//;
179# if (defined $1 && $1 != $pagenumber) {
180# print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
181# }
182 $doc_obj->add_text ($cursection, $text);
183 $doc_obj->add_metadata ($cursection, "Title", $pagenumber);
184 }
185
186 # add meta data for top level of document
187 map {$doc_obj->add_metadata ($doc_obj->get_top_section(), "Creator", $_)} @creators;
188
189 my ($filesuff) = $filename =~ /^(.*?)\.html1?/;
190 if (!-e $filesuff . ".info") {
191 print STDERR "Preplug Warning: $filename has no corresponding .info file\n";
192 } else {
193 my ($dir) = $filesuff =~ /^(.*?)~?[^~]*$/;
194 open (INFO, $filesuff . ".info") || die "PrePlug::read - can't open $filesuff.info\n";
195 my $line = "";
196 while (defined ($line = <INFO>)) {
197 chop $line;
198 my ($key, $value) = $line =~ /^<([^>]*)>(.*)$/;
199 next if $key =~ /^(pages|compressedsize|\/?info|size)$/i;
200 next if $value !~ /\w/;
201 if ($key =~ /^abstract$/i) {
202 $key = "Description";
203 } elsif ($key =~ /^url$/i) {
204 $key = "Source";
205 } elsif ($key =~ /^filedate$/i) {
206 $key = "Date";
207 my ($day, $month, $year) = split /\//, $value;
208 $value = &sorttools::format_date ($day, $month, $year);
209 } elsif ($key =~ /^transferdate$/i) {
210 my ($day, $month, $year) = split /\//, $value;
211 $value = &sorttools::format_date ($day, $month, $year);
212 } elsif ($key =~ /^facsimiles$/i) {
213 $value =~ s/^\s+//;
214 $value =~ s/\s+$//;
215 my @facsimiles = split /\s+/, $value;
216 $value = join ",", @facsimiles;
217 foreach $facsimile (@facsimiles) {
218 # assume images are in the same directory
219 my $imagefile = $dir . "~" . $facsimile;
220 if (-e $imagefile) {
221 my ($imagetype) = $facsimile =~ /\.([^\.]*)$/;
222 $doc_obj->associate_file($imagefile, $facsimile, "image/" . $imagetype);
223 } else {
224 print STDERR "PrePlug: Warning - facsimile file $imagefile doesn't exist\n";
225 next;
226 }
227 }
228 } elsif ($key =~ /^figures$/i) {
229 $value =~ s/^\s+//;
230 $value =~ s/\s+$//;
231 my @figures = split /\s+/, $value;
232 foreach $figure (@figures) {
233 # assume images are in the same directory
234 my $imagefile = $dir . "~" . $figure;
235 if (-e $imagefile) {
236 my ($imagetype) = $figure =~ /\.([^\.]*)$/;
237 $doc_obj->associate_file($imagefile, $figure, "image/" . $imagetype);
238 } else {
239 print STDERR "PrePlug: Warning - figure file $imagefile doesn't exist\n";
240 next;
241 }
242 $value = join ",", @figures;
243 }
244 }
245 $doc_obj->add_metadata ($doc_obj->get_top_section(), $key, $value);
246 }
247 close INFO;
248 }
249
250 foreach $field (keys(%$metadata)) {
251 # $metadata->{$field} may be an array reference
252 if (ref ($metadata->{$field}) eq "ARRAY") {
253 map {
254 $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_);
255 } @{$metadata->{$field}};
256 } else {
257 $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field});
258 }
259 }
260
261 # add OID
262 $doc_obj->set_OID ();
263
264 # process the document
265 $processor->process($doc_obj);
266
267 return 1; # processed the file
268}
269
2701;
271
272
273
274
275
276
277
278
279
280
281
Note: See TracBrowser for help on using the repository browser.