Changeset 13968
- Timestamp:
- 2007-03-12T16:17:48+13:00 (17 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r12970 r13968 1591 1591 1592 1592 if (-e $filename) { 1593 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");1593 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg"); 1594 1594 $doc_obj->add_utf8_metadata($top_section, "hascover", 1); 1595 1595 } else { -
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r13241 r13968 42 42 use XMLParser; 43 43 44 use HTML::TokeParser::Simple; 45 use Image::Size; 46 44 47 sub BEGIN { 45 48 @HTMLPlug::ISA = ('BasPlug'); … … 63 66 { 'name' => "keep_head", 64 67 'desc' => "{HTMLPlug.keep_head}", 65 'type' => "flag" },66 { 'name' => "extract_style",67 'desc' => "{HTMLPlug.extract_style}",68 68 'type' => "flag" }, 69 69 { 'name' => "no_metadata", … … 107 107 { 'name' => "sectionalise_using_h_tags", 108 108 'desc' => "{HTMLPlug.sectionalise_using_h_tags}", 109 'type' => "flag" } 109 'type' => "flag" }, 110 { 'name' => "tidy_html", 111 'desc' => "{HTMLPlug.tidy_html}", 112 'type' => "flag"}, 110 113 ]; 111 114 … … 116 119 'args' => $arguments }; 117 120 121 # Will make the html input file as a proper XML file with removed font tag and 122 # image size added to the img tag. 123 # The tidying process takes place in a collection specific 'tmp' directory so 124 # that we don't accidentally damage the input. 125 sub tmp_tidy_file 126 { 127 my $self = shift (@_); 128 my ($file) = @_; 129 my $input_filename = $file; 130 131 if (-d $input_filename) 132 { 133 return $input_filename; 134 } 135 136 # get the input filename 137 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 138 my $base_dirname = $dirname; 139 $suffix = lc($suffix); 140 141 # derive tmp filename from input filename 142 # Remove any white space from filename -- no risk of name collision, and 143 # makes later conversion by utils simpler. Leave spaces in path... 144 # tidy up the filename with space, dot, hyphen between 145 $tailname =~ s/\s+//g; 146 $tailname =~ s/\.+//g; 147 $tailname =~ s/\-+//g; 148 # convert to utf-8 otherwise we have problems with the doc.xml file 149 # later on 150 &unicode::ensure_utf8(\$tailname); 151 152 # softlink to collection tmp dir 153 my $tmp_dirname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tidytmp"); 154 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 155 156 # remove trailing slashes 157 $dirname =~ s/[\\\/]+$//; 158 # create folder for this file 159 my $folderdirname = &File::Basename::basename($dirname); 160 $tmp_dirname = &util::filename_cat($tmp_dirname,$folderdirname); 161 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 162 163 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 164 165 # tidy the input file if it is a HTML-like file 166 if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) 167 { 168 # create HTML parser to decode the input file 169 my $parser = HTML::TokeParser::Simple->new($input_filename); 170 171 # write HTML tmp file without the font tag and image size are added to the img tag 172 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 173 while (my $token = $parser->get_token()) 174 { 175 # is it an img tag 176 if ($token->is_start_tag('img')) 177 { 178 # get the attributes 179 my $attr = $token->return_attr; 180 181 # get the full path to the image 182 my $img_file = &util::filename_cat($dirname,$attr->{src}); 183 184 # set the width and height attribute 185 ($attr->{width}, $attr->{height}) = imgsize($img_file); 186 187 # recreate the tag 188 print PROD "<img"; 189 print PROD map { qq { $_="$attr->{$_}"} } keys %$attr; 190 print PROD ">"; 191 } 192 # is it a font tag 193 else 194 { 195 if (($token->is_start_tag('font')) || ($token->is_end_tag('font'))) 196 { 197 # remove font tag 198 print PROD ""; 199 } 200 else 201 { 202 # print without changes 203 print PROD $token->as_is; 204 } 205 } 206 } 207 close (PROD) || die("Error Closing File: $tmp_filename $!"); 208 209 # run html-tidy on the tmp file to make it a proper XML file 210 my $tidyfile = `tidy -wrap 0 -asxml $tmp_filename`; 211 212 # write result back to the tmp file 213 open (PROD, ">$tmp_filename") || die("Error Writing to File: $tmp_filename $!"); 214 print PROD $tidyfile; 215 close (PROD) || die("Error Closing File: $tmp_filename $!"); 216 217 # just for checking copy all other file from the base dir to tmp dir if it is not exists 218 opendir(DIR,$base_dirname) or die "Can't open base directory : $base_dirname!"; 219 my @files = grep {!/^\.+$/} readdir(DIR); 220 close(DIR); 221 222 foreach my $file (@files) 223 { 224 my $src_file = &util::filename_cat($base_dirname,$file); 225 my $dest_file = &util::filename_cat($tmp_dirname,$file); 226 if ((!-e $dest_file) && (!-d $src_file)) 227 { 228 # just copy the original file back to the tmp directory 229 open (TIDYIN, "< $src_file") or die "Can't open $src_file : $!"; 230 open (TIDYOUT, "> $dest_file") or die "Can't open $dest_file : $!"; 231 print TIDYOUT <TIDYIN>; 232 close TIDYIN; 233 close TIDYOUT; 234 } 235 } 236 } 237 else 238 { 239 if (!-e $tmp_filename) 240 { 241 # just copy the original file back to the tmp directory 242 open (TIDYIN, "< $input_filename") or die "Can't open $input_filename : $!"; 243 open (TIDYOUT, "> $tmp_filename") or die "Can't open $tmp_filename : $!"; 244 print TIDYOUT <TIDYIN>; 245 close TIDYIN; 246 close TIDYOUT; 247 } 248 } 249 250 # return the output filename 251 return $tmp_filename; 252 } 253 254 sub read_into_doc_obj 255 { 256 my $self = shift (@_); 257 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 258 259 # get the input file 260 my $input_filename = $file; 261 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 262 $suffix = lc($suffix); 263 264 if ($self->{'tidy_html'}) 265 { 266 # tidy the input file if it is a HTML-like file 267 #if (($suffix eq ".htm") || ($suffix eq ".html") || ($suffix eq ".shtml")) 268 #{ 269 # set the file to be tidied 270 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/; 271 272 # get the tidied file 273 my $tidy_filename = $self->tmp_tidy_file($input_filename); 274 275 # derive tmp filename from input filename 276 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($tidy_filename, "\\.[^\\.]+\$"); 277 278 # set the new input file and base_dir to be from the tidied file 279 $file = "$tailname$suffix"; 280 $base_dir = $dirname; 281 #} 282 } 283 284 # call the parent read_into_doc_obj 285 my ($process_status,$doc_obj) = &BasPlug::read_into_doc_obj($self,$pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli); 286 287 return ($process_status,$doc_obj); 288 } 289 118 290 sub new { 119 291 my ($class) = shift (@_); … … 124 296 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 125 297 126 my $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists); 298 299 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs); 127 300 128 301 if ($self->{'w3mir'}) { … … 148 321 149 322 # the last option is an attempt to encode the concept of an html query ... 150 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+ [\?\@].+=.*)$^;323 return q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php\d?|\.cgi|.+\?.+=.*)$^; 151 324 } 152 325 … … 174 347 my @embed_matches = ($$textref =~ m/<embed[^>]*?src\s*=\s*($attval)[^>]*>/igs); 175 348 my @tabbg_matches = ($$textref =~ m/<(?:table|tr|td)[^>]*?background\s*=\s*($attval)[^>]*>/igs); 176 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 177 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches , @script_matches) {349 350 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches) { 178 351 179 352 # remove quotes from link at start and end if necessary … … 227 400 #--></hX> 228 401 if ($self->{'sectionalise_using_h_tags'}) { 229 # description_tags should al ways be activated because we convert headings to description tags402 # description_tags should allways be activated because we convert headings to description tags 230 403 $self->{'description_tags'} = 1; 231 404 … … 245 418 246 419 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection) 247 unless $self->{'no_metadata'}; 248 249 # extract style info as DocumentHeader metadata 250 $self->extract_style ($textref, $doc_obj, $cursection, $base_dir, $file) 251 if ($self->{'extract_style'} == 1); 420 unless $self->{'no_metadata'} || $self->{'description_tags'}; 252 421 253 422 # Store URL for page as metadata - this can be used for an … … 355 524 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 356 525 526 # if document contains no Section tags we'll go ahead 527 # and extract metadata (this won't have been done 528 # above as the -description_tags option prevents it) 529 my $complete_text = $head_keep.$doc_obj->get_text($cursection); 530 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection) 531 unless $self->{'no_metadata'}; 532 357 533 } else { 358 534 print $outhandle "HTMLPlug: WARNING: $file contains the following text outside\n"; … … 379 555 print $outhandle " is blank or empty. Metadata will be assigned if present.\n"; 380 556 } 381 } 382 } # if $self->{'description_tags'} 383 else { 557 558 my $complete_text = $head_keep.$doc_obj->get_text($cursection); 559 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection) 560 unless $self->{'no_metadata'}; 561 } 562 563 } else { 564 384 565 # remove header and footer 385 if (!$self->{'keep_head'} ) {566 if (!$self->{'keep_head'} || $self->{'description_tags'}) { 386 567 $$textref =~ s/^.*?<body[^>]*>//is; 387 568 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; … … 509 690 my $anchor_name = $img_file; 510 691 $anchor_name =~ s/^.*\///; 511 $anchor_name = "<a name=\"$anchor_name\" ></a>";692 $anchor_name = "<a name=\"$anchor_name\" />"; 512 693 513 694 return $front . $img_file . $back . $anchor_name; … … 738 919 my $outhandle = $self->{'outhandle'}; 739 920 # if we don't want metadata, we may as well not be here ... 740 return if (!defined $self->{'metadata_fields'} && $self->{'hunt_creator_metadata'} == 0);921 return if (!defined $self->{'metadata_fields'}); 741 922 742 923 # metadata fields to extract/save. 'key' is the (lowercase) name of the … … 758 939 } 759 940 760 if ($self->{'hunt_creator_metadata'} == 1 ) { 941 if (defined $self->{'hunt_creator_metadata'} && 942 $self->{'hunt_creator_metadata'} == 1 ) { 761 943 my @extra_fields = 762 944 ( … … 913 1095 914 1096 915 sub extract_style {916 my $self = shift (@_);917 my ($textref, $doc_obj, $section, $base_dir, $file) = @_;918 my $outhandle = $self->{'outhandle'};919 920 # find the header in the html file, which has the style info921 $$textref =~ m@<head>(.*?)</head>@si;922 923 my $html_header=$1;924 my $style_contents = "";925 926 # look for style tags927 $html_header =~ /^/; # match the start of the string, for \G assertion928 while ($html_header =~ m/\G.*?<(style|script|link)/sig) {929 my $tag_name = $1;930 if ($tag_name eq "style") {931 if ($html_header =~ m/\G([^>]*>[^<]+<\/style[^>]*>)/is) {932 $style_contents .= "\n<style";933 $style_contents .= $1;934 }935 }936 elsif ($tag_name eq "link") {937 $style_contents .= "\n<link";938 $html_header =~ m/\G(.*?>)/is;939 $style_contents .= $1;940 }941 elsif ($tag_name eq "script") {942 # bit more tricky cos it may or may not have content943 if ($html_header =~ m/\G([^>]*?src=[^>]*>)/is) {944 $style_contents .= "\n<script";945 $style_contents .= $1;946 } elsif ($html_header =~ m/\G([^>]*>[^<]+<\/script[^>]*>)/is) {947 $style_contents .= "\n<script";948 $style_contents .= $1;949 }950 }951 }952 953 # now we need to do something with any links found in the style thing954 $style_contents =~ s/(<(?:link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/955 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $section)/isge;956 957 $doc_obj->add_utf8_metadata($section, "DocumentHeader", $style_contents);958 959 }960 961 1097 # evaluate any "../" to next directory up 962 1098 # evaluate any "./" as here -
trunk/gsdl/perllib/strings.properties
r13901 r13968 817 817 HTMLPlug.title_sub:Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove "Page 1", etc from text used as the title. 818 818 819 HTMLPlug.tidy_html:If set, converts a HTML document to a well-formed XHTML. It enable users to view the document in the book format. 820 819 821 ImagePlug.converttotype:Convert main image to format 's'. 820 822
Note:
See TracChangeset
for help on using the changeset viewer.