Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/cstr/perllib/plugins/PrePlug.pm@ 891

Last change on this file since 891 was 891, checked in by sjboddie, 24 years ago
Initial revision
Property svn:keywords set to `Author Date Id Revision`
File size: 8.3 KB

Rev	Line
[891]	1	###########################################################################
	2	#
	3	# PrePlug.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# plugin that processes simple html files like those output
	27	# by prescript when converting postscript to html.
	28	# prescript output has <p> tags separating paragraphs
	29	# and <!--End Of Page--> tags separating pages.
	30	# it may also have <!--Page No--> tags at top or bottom of
	31	# some or all pages.
	32
	33	# if a .html1 version of a .html file exists then that version
	34	# will be used so that the author name can be extracted
	35	# (.html1 files were generated by software which automatically
	36	# extracts author names from .html files).
	37
	38	# also looks for a .info file containing metadata (as used by
	39	# the cstr collection)
	40
	41	package PrePlug;
	42
	43	use BasPlug;
	44	use sorttools;
	45
	46	sub BEGIN {
	47	@ISA = ('BasPlug');
	48	}
	49
	50	sub new {
	51	my ($class) = @_;
	52	$self = new BasPlug ();
	53
	54	return bless $self, $class;
	55	}
	56
	57	sub is_recursive {
	58	my $self = shift (@_);
	59
	60	return 0; # this is not a recursive plugin
	61	}
	62
	63
	64	# return number of files processed, undef if can't process
	65	# Note that $base_dir might be "" and that $file might
	66	# include directories
	67	sub read {
	68	my $self = shift (@_);
	69	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
	70
	71	my $filename = &util::filename_cat($base_dir, $file);
	72
	73	return 0 if ($filename =~ /\.(html1\|html1\.gz\|info\|gif)/);
	74
	75	return undef unless ($filename =~ /\.(html(\.gz)?)$/i && (-e $filename));
	76
	77	my $gz = 0;
	78	$gz = 1 if (defined $2 && $2 eq ".gz");
	79
	80	my $filename1 = $filename;
	81	if ($gz) {
	82	$filename1 =~ s/\.html\.gz$/\.html1\.gz/;
	83	} else {
	84	$filename1 .= "1";
	85	}
	86
	87	$filename = $filename1 if (-e $filename1);
	88
	89	print STDERR "PrePlug: processing $filename\n" if $processor->{'verbosity'};
	90
	91	# create a new document
	92	my $doc_obj = new doc ($file, "indexed_doc");
	93
	94	if ($gz) {
	95	open (FILE, "zcat $filename \|") \|\| die "PrePlug::read - zcat can't open $filename\n";
	96	} else {
	97	open (FILE, $filename) \|\| die "PrePlug::read - can't open $filename\n";
	98	}
	99	my $cursection = $doc_obj->get_top_section();
	100
	101	my $text = "";
	102	my $line = "";
	103	my $pagenumber = 1;
	104	my @creators = ();
	105	my $numlines = 0;
	106	while (defined ($line = <FILE>)) {
	107	if ($filename =~ /html1$/) {
	108	while ($text =~ s/<_author_search_\([^\)]\)>([^<])<\/a>/$1/i) {
	109	push (@creators, $1);
	110	}
	111	}
	112
	113	# numlines sorts out documents that just have a single <!-- End of Page--> tag at the end
	114	if ($line =~ /<!--End\s+Of\s+Page-->/i && $numlines < 200) {
	115
	116	$numlines = -1000;
	117	$cursection =
	118	$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
	119
	120	$text =~ s/<!--Page No-->\s<p>\s<center>\s<b>\s(\d+)<\/b>\s<\/center>\s//i;
	121	# if (defined $1 && $1 != $pagenumber) {
	122	# print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
	123	# }
	124	$doc_obj->add_text ($cursection, $text);
	125	$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
	126	$pagenumber ++;
	127	$text = "";
	128	} else {
	129	$text .= $line;
	130	$numlines ++;
	131	}
	132	}
	133	close FILE;
	134
	135	if ($cursection eq $doc_obj->get_top_section()) {
	136	# there weren't any <!--End Of Page--> tags
	137
	138	# see if there were any <!--Page No--> tags to split on
	139	if ($text =~ /<!--Page No-->/i) {
	140	while ($text =~ s/^(.?)<!--Page No-->\s<p>\s<center>\s<b>\s\d+<\/b>\s<\/center>\s*//i) {
	141	$cursection =
	142	$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
	143	$doc_obj->add_text ($cursection, $text);
	144	$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
	145	$pagenumber ++;
	146	}
	147	}
	148	# otherwise we'll just have to split pages on set number of lines
	149	else {
	150	my $pagetext = "";
	151	my $line = "";
	152	my $count = 0;
	153	while (length ($text) && $text =~ s/^(.*?)(\n\|$)//) {
	154	$line = $1;
	155	$line = "" unless defined $line;
	156	$pagetext .= $line . "\n";
	157	if (($count >= 50 && $line !~ /\w/) \|\| $count == 80) {
	158	$cursection =
	159	$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
	160	$doc_obj->add_text ($cursection, $pagetext);
	161	$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
	162	$pagetext = "";
	163	$pagenumber ++;
	164	$count = 0;
	165	} else {
	166	$count ++;
	167	}
	168	}
	169	}
	170
	171	}
	172
	173	# final section
	174	if ($text =~ /\w/) {
	175	$cursection =
	176	$doc_obj->insert_section($doc_obj->get_end_child($doc_obj->get_top_section()));
	177
	178	$text =~ s/<!--Page No-->\s<p>\s<center>\s<b>\s(\d+)<\/b>\s<\/center>\s//;
	179	# if (defined $1 && $1 != $pagenumber) {
	180	# print STDERR "PrePlug Warning: Pagenumbers don't line up in $filename\n";
	181	# }
	182	$doc_obj->add_text ($cursection, $text);
	183	$doc_obj->add_metadata ($cursection, "Title", $pagenumber);
	184	}
	185
	186	# add meta data for top level of document
	187	map {$doc_obj->add_metadata ($doc_obj->get_top_section(), "Creator", $_)} @creators;
	188
	189	my ($filesuff) = $filename =~ /^(.*?)\.html1?/;
	190	if (!-e $filesuff . ".info") {
	191	print STDERR "Preplug Warning: $filename has no corresponding .info file\n";
	192	} else {
	193	my ($dir) = $filesuff =~ /^(.?)~?[^~]$/;
	194	open (INFO, $filesuff . ".info") \|\| die "PrePlug::read - can't open $filesuff.info\n";
	195	my $line = "";
	196	while (defined ($line = <INFO>)) {
	197	chop $line;
	198	my ($key, $value) = $line =~ /^<([^>])>(.)$/;
	199	next if $key =~ /^(pages\|compressedsize\|\/?info\|size)$/i;
	200	next if $value !~ /\w/;
	201	if ($key =~ /^abstract$/i) {
	202	$key = "Description";
	203	} elsif ($key =~ /^url$/i) {
	204	$key = "Source";
	205	} elsif ($key =~ /^filedate$/i) {
	206	$key = "Date";
	207	my ($day, $month, $year) = split /\//, $value;
	208	$value = &sorttools::format_date ($day, $month, $year);
	209	} elsif ($key =~ /^transferdate$/i) {
	210	my ($day, $month, $year) = split /\//, $value;
	211	$value = &sorttools::format_date ($day, $month, $year);
	212	} elsif ($key =~ /^facsimiles$/i) {
	213	$value =~ s/^\s+//;
	214	$value =~ s/\s+$//;
	215	my @facsimiles = split /\s+/, $value;
	216	$value = join ",", @facsimiles;
	217	foreach $facsimile (@facsimiles) {
	218	# assume images are in the same directory
	219	my $imagefile = $dir . "~" . $facsimile;
	220	if (-e $imagefile) {
	221	my ($imagetype) = $facsimile =~ /\.([^\.]*)$/;
	222	$doc_obj->associate_file($imagefile, $facsimile, "image/" . $imagetype);
	223	} else {
	224	print STDERR "PrePlug: Warning - facsimile file $imagefile doesn't exist\n";
	225	next;
	226	}
	227	}
	228	} elsif ($key =~ /^figures$/i) {
	229	$value =~ s/^\s+//;
	230	$value =~ s/\s+$//;
	231	my @figures = split /\s+/, $value;
	232	foreach $figure (@figures) {
	233	# assume images are in the same directory
	234	my $imagefile = $dir . "~" . $figure;
	235	if (-e $imagefile) {
	236	my ($imagetype) = $figure =~ /\.([^\.]*)$/;
	237	$doc_obj->associate_file($imagefile, $figure, "image/" . $imagetype);
	238	} else {
	239	print STDERR "PrePlug: Warning - figure file $imagefile doesn't exist\n";
	240	next;
	241	}
	242	$value = join ",", @figures;
	243	}
	244	}
	245	$doc_obj->add_metadata ($doc_obj->get_top_section(), $key, $value);
	246	}
	247	close INFO;
	248	}
	249
	250	foreach $field (keys(%$metadata)) {
	251	# $metadata->{$field} may be an array reference
	252	if (ref ($metadata->{$field}) eq "ARRAY") {
	253	map {
	254	$doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $_);
	255	} @{$metadata->{$field}};
	256	} else {
	257	$doc_obj->add_metadata ($doc_obj->get_top_section(), $field, $metadata->{$field});
	258	}
	259	}
	260
	261	# add OID
	262	$doc_obj->set_OID ();
	263
	264	# process the document
	265	$processor->process($doc_obj);
	266
	267	return 1; # processed the file
	268	}
	269
	270	1;
	271
	272
	273
	274
	275
	276
	277
	278
	279
	280
	281

Note: See TracBrowser for help on using the repository browser.

Download in other formats: