source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 433

Last change on this file since 433 was 339, checked in by sjboddie, 25 years ago

Fixed a bug that I created last time

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1# plugin which process an HTML book directory
2
3package HBPlug;
4
5use plugin;
6use html;
7use BasPlug;
8use util;
9use lang;
10use doc;
11use cfgread;
12
13
14sub BEGIN {
15 @ISA = ('BasPlug');
16}
17
18sub new {
19 my ($class) = @_;
20 $self = new BasPlug ();
21
22 return bless $self, $class;
23}
24
25sub is_recursive {
26 my $self = shift (@_);
27
28 return 0; # this is not a recursive plugin
29}
30
31sub HB_read_html_file {
32 my $self = shift (@_);
33 my ($htmlfile, $text) = @_;
34
35 # load in the file
36 if (!open (FILE, $htmlfile)) {
37 print STDERR "ERROR - could not open $htmlfile\n";
38 return;
39 }
40
41 my $foundbody = 0;
42 $self->HB_gettext (\$foundbody, $text, FILE);
43 close FILE;
44
45 # just in case there was no <body> tag
46 if (!$foundbody) {
47 $foundbody = 1;
48 open (FILE, $htmlfile) || return;
49 $self->HB_gettext (\$foundbody, $text, FILE);
50 close FILE;
51 }
52}
53
54sub HB_gettext {
55 my $self = shift (@_);
56 my ($foundbody, $text, $handle) = @_;
57
58 my $line = "";
59 while (defined ($line = <$handle>)) {
60 # look for body tag
61 if (!$$foundbody) {
62 if ($line =~ s/^.*<body[^>]*>//i) {
63 $$foundbody = 1;
64 } else {
65 next;
66 }
67 }
68
69 # check for symbol fonts
70 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
71 my $font = $1;
72 print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
73 if ($font !~ /^arial$/i);
74 }
75
76 $line =~ s/<\/p>//ig; # remove </p> tags
77 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
78
79 # convert any alphanumeric character entities to their extended
80 # ascii equivalent for indexing purposes
81 &html::convertcharentities ($line);
82
83 $$text .= $line;
84 }
85 $$text =~ s/\s+/ /g; # remove \n's
86}
87
88sub HB_clean_section {
89 my $self = shift (@_);
90 my ($section) = @_;
91
92 # remove tags without a starting tag from the section
93 my ($tag, $tagstart);
94 while ($section =~ /<\/([^>]{1,10})>/) {
95 $tag = $1;
96 $tagstart = index($section, "<$tag");
97 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
98 $section =~ s/<\/$tag>//;
99 }
100
101 # remove extra paragraph tags
102 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
103
104 # remove extra stuff at the end of the section
105 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
106
107 # add a newline at the beginning of each paragraph
108 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
109
110 # add a newline every 80 characters at a word boundary
111 # Note: this regular expression puts a line feed before
112 # the last word in each section, even when it is not
113 # needed.
114 $section =~ s/(.{1,80})\s/$1\n/g;
115
116 # fix up the image links
117 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/
118 <center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
119 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/
120 <center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
121
122 return $section;
123}
124
125
126sub shorten {
127 my $self = shift (@_);
128 my ($text) = @_;
129
130 return "\"$text\"" if (length($text) < 100);
131
132 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
133 substr ($text, length($text)-50) . "\"";
134}
135
136
137# return number of files processed, undef if can't process
138# Note that $base_dir might be "" and that $file might
139# include directories
140sub read {
141 my $self = shift (@_);
142 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
143
144 # get the html filename and see if this is an HTML Book...
145 my $jobnumber = $file;
146 if ($file =~ /[\\\/]/) {
147 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
148 }
149 return undef unless defined $jobnumber;
150 my $htmlfile = &util::filename_cat($base_dir, $file, "$jobnumber.htm");
151 return undef unless -e $htmlfile;
152
153 print STDERR "HBPlug: processing $file\n";
154
155 # read in the file and do basic html cleaning (removing header etc)
156 my $html = "";
157 $self->HB_read_html_file ($htmlfile, \$html);
158
159 # create a new document
160 my $doc_obj = new doc ($file, "indexed_doc");
161
162 # copy the book cover if it exists
163 my $bookcover = &util::filename_cat($base_dir, $file, "$jobnumber.jpg");
164 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg")
165 if -e $bookcover;
166
167 my $cursection = $doc_obj->get_top_section();
168
169 # add metadata for top level of document
170 foreach $field (keys(%$metadata)) {
171 # $metadata->{$field} may be an array reference
172 if (ref ($metadata->{$field}) eq "ARRAY") {
173 map {
174 $doc_obj->add_metadata ($cursection, $field, $_);
175 } @{$metadata->{$field}};
176 } else {
177 $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
178 }
179 }
180 # need to add <classifytype> to each book as we'll be using 'Hierarchy'
181 # for HB collections rather than the default ('Book')
182 $doc_obj->add_metadata ($cursection, 'classifytype', 'Hierarchy');
183
184
185 # process the file one section at a time
186 my $curtoclevel = 1;
187 my $firstsection = 1;
188 while (length ($html) > 0) {
189 if ($html =~ s/^.*?(?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
190 my $toclevel = $3;
191 my $title = $4;
192 my $sectiontext = "";
193 if ($html =~ s/^(.*?)((?:<p\b[^>]*>)?((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
194 $sectiontext = $1;
195 } else {
196 $sectiontext = $html;
197 $html = "";
198 }
199
200 # remove tags and extra spaces from the title
201 $title =~ s/<\/?[^>]+>//g;
202 $title =~ s/^\s+|\s+$//g;
203
204 # close any sections below the current level and
205 # create a new section (special case for the firstsection)
206 while (($curtoclevel > $toclevel) ||
207 (!$firstsection && $curtoclevel == $toclevel)) {
208 $cursection = $doc_obj->get_parent_section ($cursection);
209 $curtoclevel--;
210 }
211 if ($curtoclevel+1 < $toclevel) {
212 print STDERR "WARNING - jump in toc levels in $htmlfile " .
213 "from $curtoclevel to $toclevel\n";
214 }
215 while ($curtoclevel < $toclevel) {
216 $curtoclevel++;
217 $cursection =
218 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
219 }
220
221 # add the metadata to this section
222 $doc_obj->add_metadata ($cursection, "Title", $title);
223
224 # clean up the section html
225 $sectiontext = $self->HB_clean_section($sectiontext);
226
227 # associate any files
228 map { $doc_obj->associate_file(&util::filename_cat ($base_dir, $file, $1), $1)
229 if /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/; 0; }
230 split (/(_httpcollection_\/archives\/_thisOID_\/[^\"]+\")/, $sectiontext);
231
232 # add the text for this section
233 $doc_obj->add_text ($cursection, $sectiontext);
234
235 } else {
236 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
237 "\nin $htmlfile\n";
238 last;
239 }
240 $firstsection = 0;
241 }
242
243 # add a OID
244 $doc_obj->set_OID ();
245
246 # process the document
247 $processor->process($doc_obj, &util::filename_cat($file, "$jobnumber.htm"));
248
249 return 1; # processed the file
250}
251
252
2531;
Note: See TracBrowser for help on using the repository browser.