Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 229

Last change on this file since 229 was 4, checked in by sjboddie, 26 years ago
Initial revision
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.7 KB

Line
1	# plugin which process an HTML book directory
2
3	package HBPlug;
4
5	use html;
6	use BasPlug;
7
8
9	sub BEGIN {
10	@ISA = ('BasPlug');
11	}
12
13	sub new {
14	my ($class) = @_;
15	$self = new BasPlug ();
16
17	return bless $self, $class;
18	}
19
20	sub is_recursive {
21	my $self = shift (@_);
22
23	return 0; # this is not a recursive plugin
24	}
25
26	sub HB_read_html_file {
27	my $self = shift (@_);
28	my ($htmlfile) = @_;
29
30	# load in the file
31	if (!open (FILE, $htmlfile)) {
32	print STDERR "ERROR - could not open $htmlfile\n";
33	return "";
34	}
35
36	my $line = "";
37	my $file = "";
38	my $foundbody = 0;
39	while (defined ($line = <FILE>)) {
40	# look for body tag
41	if (!$foundbody) {
42	if ($line =~ s/^.<body[^>]>//i) {
43	$foundbody = 1;
44	} else {
45	next;
46	}
47	}
48
49	# check for symbol fonts
50	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
51	my $font = $1;
52	print STDERR "HBPlug::HB_read_html_file - warning font $font used\n"
53	if ($font !~ /^arial$/i);
54	}
55
56
57	$line =~ s/<\/p>//ig; # remove </p> tags
58	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
59
60	# convert any alphanumeric character entities to their extended
61	# ascii equivalent for indexing purposes
62	&html::convertcharentities ($line);
63
64	$file .= $line;
65	}
66	close FILE;
67
68	$file =~ s/\s+/ /g; # remove \n's
69
70	return $file;
71	}
72
73	sub HB_clean_section {
74	my $self = shift (@_);
75	my ($section) = @_;
76
77	# remove tags without a starting tag from the section
78	my ($tag, $tagstart);
79	while ($section =~ /<\/([^>]{1,10})>/) {
80	$tag = $1;
81	$tagstart = index($section, "<$tag");
82	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
83	$section =~ s/<\/$tag>//;
84	}
85
86	# remove extra paragraph tags
87	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
88
89	# remove extra stuff at the end of the section
90	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
91
92	# add a newline at the beginning of each paragraph
93	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
94
95	# add a newline every 80 characters at a word boundary
96	# Note: this regular expression puts a line feed before
97	# the last word in each section, even when it is not
98	# needed.
99	$section =~ s/(.{1,80})\s/$1\n/g;
100
101	# fix up the image links -- not sure what Rodger intended here - Stefan.
102	# $section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/<img src=\"_linkOID_(_thisOID_\/$1)\"><br>/ig;
103	# $section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/<img src=\"_linkOID_(_thisOID_\/$1)\"><br>/ig;
104
105	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
106	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
107
108	return $section;
109	}
110
111
112	sub shorten {
113	my $self = shift (@_);
114	my ($text) = @_;
115
116	return "\"$text\"" if (length($text) < 100);
117
118	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
119	substr ($text, length($text)-50) . "\"";
120	}
121
122
123	# return 1 if processed, 0 if not processed
124	# Note that $base_dir might be "" and that $file might
125	# include directories
126	sub read {
127	my $self = shift (@_);
128	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
129
130	# get the html filename and see if this is a HTML Book...
131	my ($jobnumber);
132	if ($file =~ /[\\\/]/) {
133	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
134	} else {
135	($jobnumber) = $file =~ /^([^\\\/]+)$/;
136	}
137	return 0 unless defined $jobnumber;
138	my $htmlfile = "$base_dir$file/$jobnumber.htm";
139	return 0 unless -e $htmlfile;
140
141	print STDERR "HBPlug: processing $file\n";
142
143	# read in the file and do basic html cleaning (removing header etc)
144	my $html = $self->HB_read_html_file ($htmlfile);
145
146	# create a new document
147	my $doc_obj = new doc ($file, "indexed_doc");
148
149	# copy the book cover if it exists
150	my $bookcover = "$base_dir$file/$jobnumber.jpg";
151	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg")
152	if -e $bookcover;
153
154	# process the file one section at a time
155	my $curtoclevel = 1;
156	my $cursection = $doc_obj->get_top_section();
157	my $firstsection = 1;
158	while (length ($html) > 0) {
159	if ($html =~ s/^.?<p\b[^>]>((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
160	my $toclevel = $3;
161	my $title = $4;
162	my $sectiontext = "";
163	if ($html =~ s/^(.?)(<p\b[^>]>((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
164	$sectiontext = $1;
165	} else {
166	$sectiontext = $html;
167	$html = "";
168	}
169
170	# remove tags and extra spaces from the title
171	$title =~ s/<\/?[^>]+>//g;
172	$title =~ s/^\s+\|\s+$//g;
173
174	# close any sections below the current level and
175	# create a new section (special case for the firstsection)
176	while (($curtoclevel > $toclevel) \|\|
177	(!$firstsection && $curtoclevel == $toclevel)) {
178	$cursection = $doc_obj->get_parent_section ($cursection);
179	$curtoclevel--;
180	}
181	if ($curtoclevel+1 < $toclevel) {
182	print STDERR "WARNING - jump in toc levels in $htmlfile " .
183	"from $curtoclevel to $toclevel\n";
184	}
185	while ($curtoclevel < $toclevel) {
186	$curtoclevel++;
187	$cursection =
188	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
189	}
190
191	# add the metadata to this section
192	if (!$firstsection \|\| !defined($metadata->{'Title'})) {
193	$doc_obj->add_metadata ($cursection, "Title", $title);
194	}
195	if ($firstsection) {
196	foreach $field (keys(%$metadata)) {
197	# Subjects may be a colon separated list
198	if ($field eq "Subject") {
199	my @subjects = split /:/, $metadata->{'Subject'};
200	foreach $subject (@subjects) {
201	$doc_obj->add_metadata ($cursection, 'Subject', $subject);
202	}
203	} else {
204	$doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
205	}
206	}
207	$firstsection = 0;
208	}
209
210	# clean up the section html
211	$sectiontext = $self->HB_clean_section($sectiontext);
212
213	# associate any files
214	# map { $doc_obj->associate_file("$base_dir$file/$1", $1)
215	# if /_linkOID_\(_thisOID_\/([^\)]+)\)/; 0; }
216	# split (/(_linkOID_\(_thisOID_\/[^\)]+\))/, $sectiontext);
217
218	map { $doc_obj->associate_file("$base_dir$file/$1", $1)
219	if /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/; 0; }
220	split (/(_httpcollection_\/archives\/_thisOID_\/[^\"]+\")/, $sectiontext);
221
222	# add the text for this section
223	$doc_obj->add_text ($cursection, $sectiontext);
224
225	} else {
226	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
227	"\nin $htmlfile\n";
228	last;
229	}
230	}
231
232	# add a OID
233	$doc_obj->set_OID ();
234
235	print STDERR "OID: ", $doc_obj->get_OID(), "\n";
236
237	# process the document
238	$processor->process($doc_obj);
239
240	return 1; # processed the file
241	}
242
243
244	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: