Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/cstr/perllib/plugins/PrePlug.pm@ 891

Last change on this file since 891 was 891, checked in by sjboddie, 24 years ago
Initial revision
Property svn:keywords set to `Author Date Id Revision`
File size: 8.3 KB

Line
1	###########################################################################
2	#
3	# PrePlug.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# plugin that processes simple html files like those output
27	# by prescript when converting postscript to html.
28	# prescript output has <p> tags separating paragraphs
29	# and <!--End Of Page--> tags separating pages.
30	# it may also have <!--Page No--> tags at top or bottom of
31	# some or all pages.
32
33	# if a .html1 version of a .html file exists then that version
34	# will be used so that the author name can be extracted
35	# (.html1 files were generated by software which automatically
36	# extracts author names from .html files).
37
38	# also looks for a .info file containing metadata (as used by
39	# the cstr collection)
40
41	package PrePlug;
42
43	use BasPlug;
44	use sorttools;
45
# Compile-time setup: declare BasPlug as the parent class of this package.
sub BEGIN {
    @ISA = qw(BasPlug);
}
49
# Constructor: builds a BasPlug object and re-blesses it into $class.
#
# The object is held in a lexical (the original assigned to an undeclared
# $self, which silently created a package global shared by every call).
sub new {
    my ($class) = @_;

    my $self = BasPlug->new ();

    return bless $self, $class;
}
56
# Tells the caller whether this plugin is recursive.
# PrePlug is not a recursive plugin, so the answer is always 0.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
62
63
# Read one prescript-style .html (or .html.gz) file, split it into page
# sections, attach metadata and hand the resulting document to $processor.
#
# Arguments (after $self):
#   $pluginfo  - not used by this plugin
#   $base_dir  - directory the file lives under; might be ""
#   $file      - file name relative to $base_dir; might include directories
#   $metadata  - hash ref of extra metadata; a value may be an array ref,
#                in which case every element is added
#   $processor - object whose process() method receives the document; its
#                'verbosity' entry switches on the progress message
#
# Returns the number of files processed: 1 on success, 0 for the auxiliary
# files (.html1, .html1.gz, .info, .gif), and undef if the file is not an
# existing .html/.html.gz file.  Dies if a file cannot be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;

    my $filename = &util::filename_cat($base_dir, $file);

    # auxiliary files are never processed as documents in their own right
    return 0 if ($filename =~ /\.(html1|html1\.gz|info|gif)/);

    return undef unless ($filename =~ /\.html(\.gz)?$/i && (-e $filename));
    my $gz = defined $1 ? 1 : 0;

    # prefer the .html1 (or .html1.gz) version if it exists; the
    # substitution keeps whatever case the original suffix was written in
    my $filename1 = $filename;
    $filename1 =~ s/(\.html)((?:\.gz)?)$/${1}1$2/i;
    $filename = $filename1 if (-e $filename1);

    # only the .html1 variants carry author-search anchors
    my $has_author_tags = ($filename =~ /html1(\.gz)?$/i) ? 1 : 0;

    print STDERR "PrePlug: processing $filename\n" if $processor->{'verbosity'};

    # create a new document
    my $doc_obj = doc->new ($file, "indexed_doc");

    # list-form pipe open / 3-arg open: the file name is never seen by a
    # shell and cannot be mistaken for an open mode
    my $fh;
    if ($gz) {
        open ($fh, "-|", "zcat", $filename) || die "PrePlug::read - zcat can't open $filename\n";
    } else {
        open ($fh, "<", $filename) || die "PrePlug::read - can't open $filename\n";
    }

    # the page-number banner prescript may put at the top or bottom of a page
    my $page_tag = qr/<!--Page No-->\s*<p>\s*<center>\s*<b>\s*\d+<\/b>\s*<\/center>\s*/i;

    my $cursection = $doc_obj->get_top_section();
    my $text = "";
    my $pagenumber = 1;
    my @creators = ();
    my $numlines = 0;
    while (defined (my $line = <$fh>)) {
        # numlines sorts out documents that just have a single
        # <!--End Of Page--> tag at the end: the first tag only counts as
        # a page break if it turns up within 200 lines.  Once one has been
        # accepted the counter is pushed far below zero so that later tags
        # keep being accepted.
        if ($line =~ /<!--End\s+Of\s+Page-->/i && $numlines < 200) {
            $numlines = -1000;

            _extract_creators (\$text, \@creators) if $has_author_tags;
            $text =~ s/$page_tag//;
            $cursection = _add_page ($doc_obj, $text, $pagenumber);
            $pagenumber ++;
            $text = "";
        } else {
            $text .= $line;
            $numlines ++;
        }
    }
    close $fh;

    # covers whatever was read after the last page break, including the
    # very last line of the file
    _extract_creators (\$text, \@creators) if $has_author_tags;

    if ($cursection eq $doc_obj->get_top_section()) {
        # there weren't any <!--End Of Page--> tags

        # see if there were any <!--Page No--> tags to split on
        if ($text =~ /<!--Page No-->/i) {
            # /s lets the captured page span several lines
            while ($text =~ s/^(.*?)$page_tag//s) {
                my $pagetext = $1;
                # a banner at the very top of a page leaves an empty chunk
                next unless $pagetext =~ /\w/;
                $cursection = _add_page ($doc_obj, $pagetext, $pagenumber);
                $pagenumber ++;
            }
        }
        # otherwise we'll just have to split pages on set number of lines:
        # break at the first line without word characters after 50 lines,
        # or unconditionally once the count reaches 80
        else {
            my $pagetext = "";
            my $count = 0;
            while (length ($text) && $text =~ s/^(.*?)(\n|$)//) {
                my $pageline = defined $1 ? $1 : "";
                $pagetext .= $pageline . "\n";
                if (($count >= 50 && $pageline !~ /\w/) || $count == 80) {
                    $cursection = _add_page ($doc_obj, $pagetext, $pagenumber);
                    $pagetext = "";
                    $pagenumber ++;
                    $count = 0;
                } else {
                    $count ++;
                }
            }
            # hand the unfinished last page to the final-section code
            # below instead of dropping it
            $text = $pagetext;
        }
    }

    # final section
    if ($text =~ /\w/) {
        $text =~ s/$page_tag//;
        $cursection = _add_page ($doc_obj, $text, $pagenumber);
    }

    # add meta data for top level of document
    foreach my $creator (@creators) {
        $doc_obj->add_metadata ($doc_obj->get_top_section(), "Creator", $creator);
    }

    # the .info file sits next to the document and shares its base name
    my ($filesuff) = $filename =~ /^(.*?)\.html1?(?:\.gz)?$/i;
    my $infofile = $filesuff . ".info";
    if (!-e $infofile) {
        print STDERR "Preplug Warning: $filename has no corresponding .info file\n";
    } else {
        # NOTE(review): the prefix up to the last "~" is treated as the
        # location of associated images - confirm against the collection's
        # file naming scheme
        my ($dir) = $filesuff =~ /^(.*?)~?[^~]*$/;
        open (my $info, "<", $infofile) || die "PrePlug::read - can't open $filesuff.info\n";
        while (defined (my $infoline = <$info>)) {
            # chomp, not chop: the last line may lack a newline
            chomp $infoline;
            my ($key, $value) = $infoline =~ /^<([^>]*)>(.*)$/;
            next unless defined $key;    # not a <key>value line
            next if $key =~ /^(pages|compressedsize|\/?info|size)$/i;
            next if $value !~ /\w/;
            if ($key =~ /^abstract$/i) {
                $key = "Description";
            } elsif ($key =~ /^url$/i) {
                $key = "Source";
            } elsif ($key =~ /^filedate$/i) {
                $key = "Date";
                my ($day, $month, $year) = split /\//, $value;
                $value = &sorttools::format_date ($day, $month, $year);
            } elsif ($key =~ /^transferdate$/i) {
                my ($day, $month, $year) = split /\//, $value;
                $value = &sorttools::format_date ($day, $month, $year);
            } elsif ($key =~ /^facsimiles$/i) {
                # split " " also discards leading/trailing whitespace
                my @facsimiles = split (" ", $value);
                $value = join ",", @facsimiles;
                _associate_images ($doc_obj, $dir, "facsimile", @facsimiles);
            } elsif ($key =~ /^figures$/i) {
                my @figures = split (" ", $value);
                $value = join ",", @figures;
                _associate_images ($doc_obj, $dir, "figure", @figures);
            }
            $doc_obj->add_metadata ($doc_obj->get_top_section(), $key, $value);
        }
        close $info;
    }

    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        if (ref ($metadata->{$field}) eq "ARRAY") {
            foreach my $value (@{$metadata->{$field}}) {
                $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $value);
            }
        } else {
            $doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field});
        }
    }

    # add OID
    $doc_obj->set_OID ();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}

# Append a new section after the last child of the top section, store $text
# in it and use $pagenumber as its Title.  Returns the new section.
sub _add_page {
    my ($doc_obj, $text, $pagenumber) = @_;

    my $section =
        $doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
    $doc_obj->add_text ($section, $text);
    $doc_obj->add_metadata ($section, "Title", $pagenumber);

    return $section;
}

# Replace every author-search anchor in $$textref by its anchor text and
# append that text to @$creators, in order of appearance.
sub _extract_creators {
    my ($textref, $creators) = @_;

    while ($$textref =~ s/<_author_search_\([^\)]*\)>([^<]*)<\/a>/$1/i) {
        push (@$creators, $1);
    }
}

# Associate each existing image file named in @images with the document,
# using the file extension as the image subtype; warn on STDERR about the
# ones that don't exist.  $what ("facsimile" or "figure") only labels the
# warning.
sub _associate_images {
    my ($doc_obj, $dir, $what, @images) = @_;

    foreach my $image (@images) {
        # assume images are in the same directory
        my $imagefile = $dir . "~" . $image;
        if (-e $imagefile) {
            my ($imagetype) = $image =~ /\.([^\.]*)$/;
            $doc_obj->associate_file($imagefile, $image, "image/" . $imagetype);
        } else {
            print STDERR "PrePlug: Warning - $what file $imagefile doesn't exist\n";
        }
    }
}
269
270	1;
271
272
273
274
275
276
277
278
279
280
281

Note: See TracBrowser for help on using the repository browser.

Download in other formats: