Context Navigation

source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 433

Last change on this file since 433 was 339, checked in by sjboddie, 25 years ago
Fixed a bug that I created last time
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.8 KB

Line
1	# plugin which process an HTML book directory
2
3	package HBPlug;
4
5	use plugin;
6	use html;
7	use BasPlug;
8	use util;
9	use lang;
10	use doc;
11	use cfgread;
12
13
14	sub BEGIN {
15	@ISA = ('BasPlug');
16	}
17
18	sub new {
19	my ($class) = @_;
20	$self = new BasPlug ();
21
22	return bless $self, $class;
23	}
24
25	sub is_recursive {
26	my $self = shift (@_);
27
28	return 0; # this is not a recursive plugin
29	}
30
31	sub HB_read_html_file {
32	my $self = shift (@_);
33	my ($htmlfile, $text) = @_;
34
35	# load in the file
36	if (!open (FILE, $htmlfile)) {
37	print STDERR "ERROR - could not open $htmlfile\n";
38	return;
39	}
40
41	my $foundbody = 0;
42	$self->HB_gettext (\$foundbody, $text, FILE);
43	close FILE;
44
45	# just in case there was no <body> tag
46	if (!$foundbody) {
47	$foundbody = 1;
48	open (FILE, $htmlfile) \|\| return;
49	$self->HB_gettext (\$foundbody, $text, FILE);
50	close FILE;
51	}
52	}
53
54	sub HB_gettext {
55	my $self = shift (@_);
56	my ($foundbody, $text, $handle) = @_;
57
58	my $line = "";
59	while (defined ($line = <$handle>)) {
60	# look for body tag
61	if (!$$foundbody) {
62	if ($line =~ s/^.<body[^>]>//i) {
63	$$foundbody = 1;
64	} else {
65	next;
66	}
67	}
68
69	# check for symbol fonts
70	if ($line =~ /<font [^>]?face\s=\s*\"?(\w+)\"?/i) {
71	my $font = $1;
72	print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
73	if ($font !~ /^arial$/i);
74	}
75
76	$line =~ s/<\/p>//ig; # remove </p> tags
77	$line =~ s/<\/?(body\|html\|font)\b[^>]*>//ig; # remove any unwanted tags
78
79	# convert any alphanumeric character entities to their extended
80	# ascii equivalent for indexing purposes
81	&html::convertcharentities ($line);
82
83	$$text .= $line;
84	}
85	$$text =~ s/\s+/ /g; # remove \n's
86	}
87
88	sub HB_clean_section {
89	my $self = shift (@_);
90	my ($section) = @_;
91
92	# remove tags without a starting tag from the section
93	my ($tag, $tagstart);
94	while ($section =~ /<\/([^>]{1,10})>/) {
95	$tag = $1;
96	$tagstart = index($section, "<$tag");
97	last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
98	$section =~ s/<\/$tag>//;
99	}
100
101	# remove extra paragraph tags
102	while ($section =~ s/<p\b[^>]>\s<p\b/<p/ig) {}
103
104	# remove extra stuff at the end of the section
105	while ($section =~ s/(<u>\|<i>\|<b>\|<p\b[^>]*>\| \|\s)$//i) {}
106
107	# add a newline at the beginning of each paragraph
108	$section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
109
110	# add a newline every 80 characters at a word boundary
111	# Note: this regular expression puts a line feed before
112	# the last word in each section, even when it is not
113	# needed.
114	$section =~ s/(.{1,80})\s/$1\n/g;
115
116	# fix up the image links
117	$section =~ s/<img[^>]?src=\"?([^\">]+)\"?[^>]>/
118	<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
119	$section =~ s/<<I>>\s*([^\.]+\.(png\|jpg\|gif))/
120	<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
121
122	return $section;
123	}
124
125
126	sub shorten {
127	my $self = shift (@_);
128	my ($text) = @_;
129
130	return "\"$text\"" if (length($text) < 100);
131
132	return "\"" . substr ($text, 0, 50) . "\" ... \"" .
133	substr ($text, length($text)-50) . "\"";
134	}
135
136
137	# return number of files processed, undef if can't process
138	# Note that $base_dir might be "" and that $file might
139	# include directories
140	sub read {
141	my $self = shift (@_);
142	my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
143
144	# get the html filename and see if this is an HTML Book...
145	my $jobnumber = $file;
146	if ($file =~ /[\\\/]/) {
147	($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
148	}
149	return undef unless defined $jobnumber;
150	my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
151	return undef unless -e $htmlfile;
152
153	print STDERR "HBPlug: processing $file\n";
154
155	# read in the file and do basic html cleaning (removing header etc)
156	my $html = "";
157	$self->HB_read_html_file ($htmlfile, \$html);
158
159	# create a new document
160	my $doc_obj = new doc ($file, "indexed_doc");
161
162	# copy the book cover if it exists
163	my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
164	$doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg")
165	if -e $bookcover;
166
167	my $cursection = $doc_obj->get_top_section();
168
169	# add metadata for top level of document
170	foreach $field (keys(%$metadata)) {
171	# $metadata->{$field} may be an array reference
172	if (ref ($metadata->{$field}) eq "ARRAY") {
173	map {
174	$doc_obj->add_metadata ($cursection, $field, $_);
175	} @{$metadata->{$field}};
176	} else {
177	$doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
178	}
179	}
180	# need to add <classifytype> to each book as we'll be using 'Hierarchy'
181	# for HB collections rather than the default ('Book')
182	$doc_obj->add_metadata ($cursection, 'classifytype', 'Hierarchy');
183
184
185	# process the file one section at a time
186	my $curtoclevel = 1;
187	my $firstsection = 1;
188	while (length ($html) > 0) {
189	if ($html =~ s/^.?(?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s))<<TOC(\d+)>>\s(.*?)<p\b/<p/i) {
190	my $toclevel = $3;
191	my $title = $4;
192	my $sectiontext = "";
193	if ($html =~ s/^(.?)((?:<p\b[^>]>)?((<b>\|<i>\|<u>\|\s)*)<<TOC\d+>>)/$2/i) {
194	$sectiontext = $1;
195	} else {
196	$sectiontext = $html;
197	$html = "";
198	}
199
200	# remove tags and extra spaces from the title
201	$title =~ s/<\/?[^>]+>//g;
202	$title =~ s/^\s+\|\s+$//g;
203
204	# close any sections below the current level and
205	# create a new section (special case for the firstsection)
206	while (($curtoclevel > $toclevel) \|\|
207	(!$firstsection && $curtoclevel == $toclevel)) {
208	$cursection = $doc_obj->get_parent_section ($cursection);
209	$curtoclevel--;
210	}
211	if ($curtoclevel+1 < $toclevel) {
212	print STDERR "WARNING - jump in toc levels in $htmlfile " .
213	"from $curtoclevel to $toclevel\n";
214	}
215	while ($curtoclevel < $toclevel) {
216	$curtoclevel++;
217	$cursection =
218	$doc_obj->insert_section($doc_obj->get_end_child($cursection));
219	}
220
221	# add the metadata to this section
222	$doc_obj->add_metadata ($cursection, "Title", $title);
223
224	# clean up the section html
225	$sectiontext = $self->HB_clean_section($sectiontext);
226
227	# associate any files
228	map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
229	if /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/; 0; }
230	split (/(_httpcollection_\/archives\/_thisOID_\/[^\"]+\")/, $sectiontext);
231
232	# add the text for this section
233	$doc_obj->add_text ($cursection, $sectiontext);
234
235	} else {
236	print STDERR "WARNING - leftover text\n" , $self->shorten($html),
237	"\nin $htmlfile\n";
238	last;
239	}
240	$firstsection = 0;
241	}
242
243	# add a OID
244	$doc_obj->set_OID ();
245
246	# process the document
247	$processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
248
249	return 1; # processed the file
250	}
251
252
253	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: