source: trunk/gsdl/perllib/plugins/HBPlug.pm@ 229

Last change on this file since 229 was 4, checked in by sjboddie, 26 years ago

Initial revision

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.7 KB
Line 
1# plugin which process an HTML book directory
2
3package HBPlug;
4
5use html;
6use BasPlug;
7
8
9sub BEGIN {
10 @ISA = ('BasPlug');
11}
12
13sub new {
14 my ($class) = @_;
15 $self = new BasPlug ();
16
17 return bless $self, $class;
18}
19
20sub is_recursive {
21 my $self = shift (@_);
22
23 return 0; # this is not a recursive plugin
24}
25
26sub HB_read_html_file {
27 my $self = shift (@_);
28 my ($htmlfile) = @_;
29
30 # load in the file
31 if (!open (FILE, $htmlfile)) {
32 print STDERR "ERROR - could not open $htmlfile\n";
33 return "";
34 }
35
36 my $line = "";
37 my $file = "";
38 my $foundbody = 0;
39 while (defined ($line = <FILE>)) {
40 # look for body tag
41 if (!$foundbody) {
42 if ($line =~ s/^.*<body[^>]*>//i) {
43 $foundbody = 1;
44 } else {
45 next;
46 }
47 }
48
49 # check for symbol fonts
50 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
51 my $font = $1;
52 print STDERR "HBPlug::HB_read_html_file - warning font $font used\n"
53 if ($font !~ /^arial$/i);
54 }
55
56
57 $line =~ s/<\/p>//ig; # remove </p> tags
58 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
59
60 # convert any alphanumeric character entities to their extended
61 # ascii equivalent for indexing purposes
62 &html::convertcharentities ($line);
63
64 $file .= $line;
65 }
66 close FILE;
67
68 $file =~ s/\s+/ /g; # remove \n's
69
70 return $file;
71}
72
73sub HB_clean_section {
74 my $self = shift (@_);
75 my ($section) = @_;
76
77 # remove tags without a starting tag from the section
78 my ($tag, $tagstart);
79 while ($section =~ /<\/([^>]{1,10})>/) {
80 $tag = $1;
81 $tagstart = index($section, "<$tag");
82 last if (($tagstart >= 0) && ($tagstart < index($section, "<\/$tag")));
83 $section =~ s/<\/$tag>//;
84 }
85
86 # remove extra paragraph tags
87 while ($section =~ s/<p\b[^>]*>\s*<p\b/<p/ig) {}
88
89 # remove extra stuff at the end of the section
90 while ($section =~ s/(<u>|<i>|<b>|<p\b[^>]*>|&nbsp;|\s)$//i) {}
91
92 # add a newline at the beginning of each paragraph
93 $section =~ s/(.)\s*<p\b/$1\n\n<p/gi;
94
95 # add a newline every 80 characters at a word boundary
96 # Note: this regular expression puts a line feed before
97 # the last word in each section, even when it is not
98 # needed.
99 $section =~ s/(.{1,80})\s/$1\n/g;
100
101 # fix up the image links -- not sure what Rodger intended here - Stefan.
102# $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/<img src=\"_linkOID_(_thisOID_\/$1)\"><br>/ig;
103# $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/<img src=\"_linkOID_(_thisOID_\/$1)\"><br>/ig;
104
105 $section =~ s/<img[^>]*?src=\"?([^\">]+)\"?[^>]*>/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
106 $section =~ s/&lt;&lt;I&gt;&gt;\s*([^\.]+\.(png|jpg|gif))/<center><img src=\"_httpcollection_\/archives\/_thisOID_\/$1\"><\/center><br>/ig;
107
108 return $section;
109}
110
111
112sub shorten {
113 my $self = shift (@_);
114 my ($text) = @_;
115
116 return "\"$text\"" if (length($text) < 100);
117
118 return "\"" . substr ($text, 0, 50) . "\" ... \"" .
119 substr ($text, length($text)-50) . "\"";
120}
121
122
123# return 1 if processed, 0 if not processed
124# Note that $base_dir might be "" and that $file might
125# include directories
126sub read {
127 my $self = shift (@_);
128 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
129
130 # get the html filename and see if this is a HTML Book...
131 my ($jobnumber);
132 if ($file =~ /[\\\/]/) {
133 ($jobnumber) = $file =~ /[\\\/]([^\\\/]+)$/;
134 } else {
135 ($jobnumber) = $file =~ /^([^\\\/]+)$/;
136 }
137 return 0 unless defined $jobnumber;
138 my $htmlfile = "$base_dir$file/$jobnumber.htm";
139 return 0 unless -e $htmlfile;
140
141 print STDERR "HBPlug: processing $file\n";
142
143 # read in the file and do basic html cleaning (removing header etc)
144 my $html = $self->HB_read_html_file ($htmlfile);
145
146 # create a new document
147 my $doc_obj = new doc ($file, "indexed_doc");
148
149 # copy the book cover if it exists
150 my $bookcover = "$base_dir$file/$jobnumber.jpg";
151 $doc_obj->associate_file($bookcover, "cover.jpg", "image/jpeg")
152 if -e $bookcover;
153
154 # process the file one section at a time
155 my $curtoclevel = 1;
156 my $cursection = $doc_obj->get_top_section();
157 my $firstsection = 1;
158 while (length ($html) > 0) {
159 if ($html =~ s/^.*?<p\b[^>]*>((<b>|<i>|<u>|\s)*)&lt;&lt;TOC(\d+)&gt;&gt;\s*(.*?)<p\b/<p/i) {
160 my $toclevel = $3;
161 my $title = $4;
162 my $sectiontext = "";
163 if ($html =~ s/^(.*?)(<p\b[^>]*>((<b>|<i>|<u>|\s)*)&lt;&lt;TOC\d+&gt;&gt;)/$2/i) {
164 $sectiontext = $1;
165 } else {
166 $sectiontext = $html;
167 $html = "";
168 }
169
170 # remove tags and extra spaces from the title
171 $title =~ s/<\/?[^>]+>//g;
172 $title =~ s/^\s+|\s+$//g;
173
174 # close any sections below the current level and
175 # create a new section (special case for the firstsection)
176 while (($curtoclevel > $toclevel) ||
177 (!$firstsection && $curtoclevel == $toclevel)) {
178 $cursection = $doc_obj->get_parent_section ($cursection);
179 $curtoclevel--;
180 }
181 if ($curtoclevel+1 < $toclevel) {
182 print STDERR "WARNING - jump in toc levels in $htmlfile " .
183 "from $curtoclevel to $toclevel\n";
184 }
185 while ($curtoclevel < $toclevel) {
186 $curtoclevel++;
187 $cursection =
188 $doc_obj->insert_section($doc_obj->get_end_child($cursection));
189 }
190
191 # add the metadata to this section
192 if (!$firstsection || !defined($metadata->{'Title'})) {
193 $doc_obj->add_metadata ($cursection, "Title", $title);
194 }
195 if ($firstsection) {
196 foreach $field (keys(%$metadata)) {
197 # Subjects may be a colon separated list
198 if ($field eq "Subject") {
199 my @subjects = split /:/, $metadata->{'Subject'};
200 foreach $subject (@subjects) {
201 $doc_obj->add_metadata ($cursection, 'Subject', $subject);
202 }
203 } else {
204 $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
205 }
206 }
207 $firstsection = 0;
208 }
209
210 # clean up the section html
211 $sectiontext = $self->HB_clean_section($sectiontext);
212
213 # associate any files
214# map { $doc_obj->associate_file("$base_dir$file/$1", $1)
215# if /_linkOID_\(_thisOID_\/([^\)]+)\)/; 0; }
216# split (/(_linkOID_\(_thisOID_\/[^\)]+\))/, $sectiontext);
217
218 map { $doc_obj->associate_file("$base_dir$file/$1", $1)
219 if /_httpcollection_\/archives\/_thisOID_\/([^\"]+)\"/; 0; }
220 split (/(_httpcollection_\/archives\/_thisOID_\/[^\"]+\")/, $sectiontext);
221
222 # add the text for this section
223 $doc_obj->add_text ($cursection, $sectiontext);
224
225 } else {
226 print STDERR "WARNING - leftover text\n" , $self->shorten($html),
227 "\nin $htmlfile\n";
228 last;
229 }
230 }
231
232 # add a OID
233 $doc_obj->set_OID ();
234
235 print STDERR "OID: ", $doc_obj->get_OID(), "\n";
236
237 # process the document
238 $processor->process($doc_obj);
239
240 return 1; # processed the file
241}
242
243
2441;
Note: See TracBrowser for help on using the repository browser.