Changeset 721
- Timestamp:
- 1999-10-19T16:21:35+13:00 (25 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 1 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r617 r721 40 40 use BasPlug; 41 41 use sorttools; 42 use html; 42 43 use util; 44 45 use File::Basename; 43 46 44 47 sub BEGIN { … … 50 53 $self = new BasPlug (); 51 54 55 $self->{'f2d_mapping'} = undef; 56 $self->{'f2d_partitioned_dirs'} = []; 57 52 58 return bless $self, $class; 53 59 } … … 59 65 } 60 66 67 #-- 68 # Convert filename to lookup filename 69 #-- 70 sub f2d_lookup_filename 71 { 72 my ($self,$filename) = @_; 73 74 my $coldir = $ENV{'GSDLCOLLECTDIR'}; 75 my $dirsep_re = &util::get_re_dirsep(); 76 my $supress_dirs = "(import|building)"; 77 78 my @part_dirs = @{$self->{'f2d_partitioned_dirs'}}; 79 80 if (scalar(@part_dirs)>0) 81 { 82 my $joined_part_dirs = join('|',@part_dirs); 83 $supress_dirs .= "($dirsep_re)($joined_part_dirs)"; 84 } 85 86 my $lookup_filename = $filename; 87 $lookup_filename =~ s/^$coldir($dirsep_re)//; 88 $lookup_filename =~ s/^$supress_dirs($dirsep_re)//; 89 90 return $lookup_filename; 91 } 92 93 94 #-- 95 # file to document mapping 96 #-- 97 98 sub rec_f2d_mapping 99 { 100 my ($self,$dirname) = @_; 101 102 # read all the files in the directory 103 if (!opendir(DIR, $dirname)) 104 { 105 print STDERR "HTMLPlug: WARNING - couldn't read directory $dirname"; 106 print STDERR " during file to doc mapping\n"; 107 return; 108 } 109 my @dir = readdir (DIR); 110 closedir (DIR); 111 112 # process each file 113 my $subfile; 114 foreach $subfile (@dir) 115 { 116 if ($subfile !~ /^\.\.?$/) 117 { 118 my $filename = &util::filename_cat($dirname, $subfile); 119 120 if (-d $filename) 121 { 122 my $dirname = $filename; 123 $self->rec_f2d_mapping($dirname); 124 } 125 else 126 { 127 if ($subfile =~ m/\.(html?(\.gz)?)$/i && (-e $filename)) 128 { 129 # add mapping 130 131 print STDERR "HTMLPlug: Precalculating OID for $subfile\n" 132 if (defined($self->{'verbosity'})); 133 134 my $oid = doc::_calc_OID(undef,$filename); 135 my $lookup_filename 136 = $self->f2d_lookup_filename($filename); 137 138 $self->{'f2d_mapping'}->{$lookup_filename} = $oid; 139 140 if ($subfile =~ m/index\.(html?(\.gz)?)$/i) 141 { 142 # Cater for links such as "/paper/" mapping to 143 # "/paper/index.html" 144 my $lookup_dirname 145 = $self->f2d_lookup_filename($dirname); 146 147 $self->{'f2d_mapping'}->{$lookup_dirname} = $oid; 148 } 149 } 150 elsif ($subfile =~ /\.(gif|jpg|jpeg|png)$/i) 151 { 152 # convert to png ? 153 154 # Hard link in build directory 155 #-- 156 my $coldir = $ENV{'GSDLCOLLECTDIR'}; 157 my $dirsep_re = &util::get_re_dirsep(); 158 my $iorb_re = "import|building"; 159 160 my ($dirsep,$iorb) 161 = ($filename =~ m/^$coldir($dirsep_re)($iorb_re)/); 162 163 my $copyname = $filename; 164 $copyname 165 =~ s/^${coldir}${dirsep}${iorb} 166 /${coldir}${dirsep}building_images${dirsep}imgsrc/x; 167 168 if (!-e $copyname) 169 { 170 print STDERR "HTMLPlug: Hard linking $subfile\n" 171 if (defined($self->{'verbosity'})); 172 &util::hard_link($filename,$copyname); 173 } 174 175 } 176 } 177 178 } 179 } 180 } 181 182 sub build_file_to_doc_mapping 183 { 184 my ($self,$base_dir) = @_; 185 186 $self->{'f2d_mapping'} = {}; 187 188 my $bimages_dir 189 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"building_images","imgsrc"); 190 if (!-e $bimages_dir && !mkdir ($bimages_dir, 0775)) 191 { 192 print STDERR "HTMLPlug:: Could not create directory $bimages_dir\n"; 193 return; 194 } 195 196 $self->rec_f2d_mapping($base_dir); 197 } 198 199 #-- 200 #-- 61 201 62 202 # return number of files processed, undef if can't process … … 67 207 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 68 208 209 my $verbosity = $processor->{'verbosity'}; 210 $self->{'verbosity'} = $verbosity; 211 212 if (!defined($self->{'f2d_mapping'})) 213 { 214 print STDERR "HTMLPlug: building OID mapping\n" 215 if $processor->{'verbosity'}; 216 217 $self->build_file_to_doc_mapping($base_dir,$verbosity); 218 } 69 219 my $filename = &util::filename_cat($base_dir, $file); 70 220 my $absdir = $filename; 71 221 $absdir =~ s/[^\/\\]*$//; 72 222 223 if ($file =~ /\.(gif|jpg|jpeg|png)$/i) 224 { 225 return 0; 226 } 227 73 228 return undef unless ($filename =~ /\.(html?(\.gz)?)$/i && (-e $filename)); 229 return undef if (-d $filename); 74 230 75 231 my $gz = 0; … … 79 235 } 80 236 81 print STDERR "HTMLPlug: processing $file name\n" if $processor->{'verbosity'};237 print STDERR "HTMLPlug: processing $file\n" if $processor->{'verbosity'}; 82 238 83 239 # create a new document 84 240 my $doc_obj = new doc ($file, "indexed_doc"); 85 241 86 242 if ($gz) { 87 243 open (FILE, "zcat $filename |") || die "HTMLPlug::read - zcat can't open $filename\n"; … … 91 247 my $cursection = $doc_obj->get_top_section(); 92 248 249 # look up precalculated OID 250 my $lookup_filename = $self->f2d_lookup_filename($filename); 251 my $precalc_oid = $self->{'f2d_mapping'}->{$lookup_filename}; 252 253 # read in HTML file 93 254 my $text = ""; 94 255 my $line = ""; 256 my $donehead = 0; 95 257 my $title = ""; 96 258 while (defined ($line = <FILE>)) { … … 98 260 } 99 261 100 # we'll use the worthless alarm thingy to temporarily replace101 # '\n' so we'd better check it doesn't occur naturally102 if ($text =~ /\a/) {103 print STDERR "HTMLPlug::read - 'WARNING '\a' character occurs in text!!\n";104 }105 106 262 # remove line breaks 107 $text =~ s/\ n/\a/g;263 $text =~ s/\s+/ /g; 108 264 109 265 # see if there's a <title> tag … … 129 285 $text =~ s/^.*?<body[^>]*>//i; 130 286 131 # and any other unwanted tags 132 $text =~ s/<(\/p|\/html|\/body)>//g; 133 134 # fix up the image links 135 $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/ 136 &replace_image_links($absdir, $doc_obj, $1, $2, $3)/ige; 137 138 # put line breaks back in 139 $text =~ s/\a/\n/g; 140 287 # usemap="./#index" not handled correctly => change to "#index" 288 $text =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/ 289 &replace_usemap_links($1, $2, $3)/ige; 290 291 # fix up the href links 292 $f2d_mapping = $self->{'f2d_mapping'}; 293 294 $text =~ s/(<(a|area)\s+[^>]*?href\s*=\s*\"?)([^\">]+)(\"?[^>]*>)/ 295 &replace_href_links($filename,$precalc_oid,$f2d_mapping,$1,$3,$4)/ige; 296 297 # add a newline at the beginning of each paragraph 298 $text =~ s/(.)\s*<p\b/$1\n\n<p/gi; 299 300 # add a newline every 80 characters at a word boundary 301 # Note: this regular expression puts a line feed before 302 # the last word in each section, even when it is not 303 # needed. 304 $text =~ s/(.{1,80})\s/$1\n/g; 305 306 307 # Store URL for page as metadata 308 my $web_url = "http://$lookup_filename"; 309 $doc_obj->add_metadata($cursection, "URL", $web_url); 310 311 my $import_url = $filename; 312 $import_url =~ s/^$ENV{'GSDLCOLLECTDIR'}/_httpcollection_/; 313 314 # Add base tag so images can find correct location 315 my $index_url = $import_url; 316 $index_url =~ s/_httpcollection_\/import/_httpcollection_\/index\/imgsrc/; 317 318 my $dirsep_re = &util::get_re_dirsep(); 319 my @base_url_split = split(/$dirsep_re/,$index_url); 320 pop(@base_url_split); 321 my $base_url = join('/',@base_url_split,""); # force / at end 322 323 $text = "<base href=\"$base_url\">\n$text"; 141 324 $doc_obj->add_text ($cursection, $text); 142 325 326 # Add metadata that has been provided externally 327 #-- 143 328 foreach $field (keys(%$metadata)) { 144 329 # $metadata->{$field} may be an array reference … … 152 337 } 153 338 154 # add OID 155 $doc_obj->set_OID (); 339 # Fix OID so it is the same as the pre-calculated OID 340 $doc_obj->set_OID($precalc_oid); 341 342 my $set_oid = $doc_obj->get_OID(); 343 if ($precalc_oid ne $set_oid) # check (for super safety!) 344 { 345 print STDERR "Warning: pre-calculated OID and current OID differnt:"; 346 print STDERR " $filename\n"; 347 } 348 156 349 157 350 # process the document … … 161 354 } 162 355 163 sub replace_image_links { 164 165 my ($dir, $doc_obj, $front, $link, $back) = @_; 166 167 my ($filename, $error); 168 my $foundimage = 0; 169 170 $link =~ s/\/\///; 171 my ($imagetype) = $link =~ /([^\.]*)$/; 172 $imagetype =~ tr/[A-Z]/[a-z]/; 173 if ($imagetype eq "jpg") {$imagetype = "jpeg";} 174 if ($imagetype !~ /^(jpg|gif|png)$/) { 175 print STDERR "HTMLPlug: Warning - unknown image type ($imagetype)\n"; 176 } 177 my ($imagefile) = $link =~ /([^\/]*)$/; 178 my ($imagepath) = $link =~ /^[^\/]*(.*)$/; 179 180 if (defined $imagepath && $imagepath =~ /\w/) { 181 # relative link 182 $filename = &util::filename_cat ($dir, $imagepath); 183 if (-e $filename) { 184 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype"); 185 $foundimage = 1; 186 } else { 187 $error = "HTMLPlug: Warning - couldn't find image file $imagefile in either $filename or"; 188 } 189 } 190 191 if (!$foundimage) { 192 $filename = &util::filename_cat ($dir, $imagefile); 193 if (-e $filename) { 194 $doc_obj->associate_file ($filename, $imagefile, "image/$imagetype"); 195 $foundimage = 1; 196 } elsif (defined $error) { 197 print STDERR "$error $filename\n"; 198 } else { 199 print STDERR "HTMLPlug: Warning - couldn't find image file $imagefile in $filename\n"; 200 } 201 } 202 203 if ($foundimage) { 204 return "${front}_httpcollection_/archives/_thisOID_/${imagefile}${back}"; 205 } else { 206 return ""; 207 } 208 } 356 # support for fixing up links to work as a GSDL collection 357 #-- 358 359 sub eval_dir_dots 360 { 361 # evaluate any "../" to next directory up 362 # evaluate any "./" as here 363 #-- 364 my ($self,$filename) = @_; 365 366 my $dirsep_os = &util::get_os_dirsep(); 367 my @dirsep = split(/$dirsep_os/,$filename); 368 369 my @eval_dirs = (); 370 foreach $d (@dirsep) 371 { 372 if ($d eq "..") 373 { 374 pop(@eval_dirs); 375 } 376 elsif ($d eq ".") 377 { 378 # do nothing! 379 } 380 else 381 { 382 push(@eval_dirs,$d); 383 } 384 } 385 386 return &util::filename_cat(@eval_dirs); 387 } 388 389 sub replace_href_links 390 { 391 my ($this_filename,$this_oid, $f2d_mapping, $front,$link,$back) = @_; 392 393 return $front.$link.$back if ($link =~ m/\.(gif|jpg|jpeg|png)$/i); 394 395 if ($link =~ m/^(http|ftp|file):/i) 396 { 397 # this should really check that the link ends in .htm* 398 #**** 399 my $http_as_filename = $link; 400 $http_as_filename =~ s/^(http|ftp|file):\/\///i; 401 402 my ($before_hash,$after_hash) 403 = ($http_as_filename =~ m/^([^\#]*)\#?(.*)$/); 404 405 if ($link =~ m/^(http|ftp):/i) 406 { 407 # Turn url (using /) into file name (possibly using \ on windows) 408 my @http_dir_split = split('/',$before_hash); 409 $http_as_filename = &util::filename_cat(@http_dir_split); 410 } 411 412 $http_as_filename = $self->eval_dir_dots($http_as_filename); 413 414 if (defined($f2d_mapping->{$http_as_filename})) 415 { 416 # transform link into a local link (and then let it be 417 # processed by later code) 418 $link = $http_as_filename; 419 420 $link = "_httpdocument_&cl=_cgiargcl_&d=$oid"; 421 $link .= "#$after_hash" if ($after_hash ne ""); 422 } 423 else 424 { 425 # external link => set it up to pass through off-site page 426 427 my $link_safe = $link; 428 &html::urlsafe($link_safe); 429 430 $link = "_httpextlink_&href=$link_safe&d=$this_oid"; 431 432 } 433 } 434 elsif ($link !~ m/^(mailto|news):/i) 435 { 436 my ($before_hash,$after_hash) = ($link =~ m/^([^\#]*)\#?(.*)$/); 437 my $link_filename; 438 439 if ($before_hash =~ m/^\//) 440 { 441 my $dirsep_re = &util::get_re_dirsep(); 442 my $lookup = $self->f2d_lookup_filename($this_filename); 443 my @lookup_split = split(/$dirsep_re/,$lookup); 444 my $domname = shift(@lookup_split); 445 $link_filename = &util::filename_cat($domname,$before_hash); 446 } 447 else 448 { 449 # Turn relative file path into full path 450 if ($before_hash eq "") # handle links such as <a href="#x"> 451 { 452 $link_filename = $this_filename; 453 } 454 else 455 { 456 my $dirname = &File::Basename::dirname($this_filename); 457 $link_filename = &util::filename_cat($dirname,$before_hash); 458 } 459 } 460 461 $link_filename = $self->eval_dir_dots($link_filename); 462 my $lookup_filename = $self->f2d_lookup_filename($link_filename); 463 464 my $oid = $f2d_mapping->{$lookup_filename}; 465 if (defined($oid)) 466 { 467 $link = "_httpdocument_&cl=_cgiargcl_&d=$oid"; 468 $link .= "#$after_hash" if ($after_hash ne ""); 469 } 470 else 471 { 472 print STDERR "HTMLPlug WARNING:"; 473 print STDERR " Could not find link: \"$lookup_filename\". "; 474 print STDERR " Deactivating link\n"; 475 476 $link = "_httpextlink_&d=$this_oid"; 477 } 478 } 479 else 480 { 481 if ($link !~ m/^(mailto|news):/i) 482 { 483 print STDERR "HTMLPlug WARNING: Unhandled type of link, \"$link\"\n"; 484 } 485 else 486 { 487 my $link_safe = $link; 488 &html::urlsafe($link_safe); 489 490 $link = "_httpextlink_&href=$link_safe&d=$this_oid"; 491 } 492 } 493 494 my $fixed_tag = "${front}${link}${back}"; 495 496 return $fixed_tag; 497 } 498 499 500 sub replace_usemap_links 501 { 502 my ($front, $link, $back) = @_; 503 504 $link =~ s/^\.\///; 505 506 return "${front}${link}${back}"; 507 } 508 509 209 510 210 511 1; -
trunk/gsdl/perllib/util.pm
r619 r721 40 40 if (!-e $file) { 41 41 print STDERR "util::rm $file does not exist\n"; 42 } elsif ( !-f $file) {43 print STDERR "util::rm $file is not a regular file\n";42 } elsif ((!-f $file) && (!-l $file)) { 43 print STDERR "util::rm $file is not a regular (or symbolic) file\n"; 44 44 } else { 45 45 push (@filefiles, $file); … … 68 68 print STDERR "util::rm_r $file does not exist\n"; 69 69 70 } elsif ( -d $file) {70 } elsif ((-d $file) && (!-l $file)) { # don't recurse down symbolic link 71 71 # get the contents of this directory 72 72 if (!opendir (INDIR, $file)) { … … 92 92 } 93 93 94 # moves a file or a group of files 95 sub mv { 96 my $dest = pop (@_); 97 my (@srcfiles) = @_; 98 99 # remove trailing slashes from source and destination files 100 $dest =~ s/[\\\/]+$//; 101 map {$_ =~ s/[\\\/]+$//;} @srcfiles; 102 103 # a few sanity checks 104 if (scalar (@srcfiles) == 0) { 105 print STDERR "util::mv no destination directory given\n"; 106 return; 107 } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) { 108 print STDERR "util::mv if multiple source files are given the ". 109 "destination must be a directory\n"; 110 return; 111 } 112 113 # move the files 114 foreach $file (@srcfiles) { 115 my $tempdest = $dest; 116 if (-d $tempdest) { 117 my ($filename) = $file =~ /([^\\\/]+)$/; 118 $tempdest .= "/$filename"; 119 } 120 if (!-e $file) { 121 print STDERR "util::mv $file does not exist\n"; 122 } else { 123 rename ($file, $tempdest); 124 } 125 } 126 } 127 94 128 95 129 # copies a file or a group of files … … 128 162 } 129 163 } 164 130 165 131 166 … … 186 221 187 222 223 sub mk_dir { 224 my ($dir) = @_; 225 226 if (!mkdir ($dir, 0775)) { 227 print STDERR "util::mk_dir could not create directory $dir\n"; 228 return; 229 } 230 } 231 188 232 sub mk_all_dir { 189 233 my ($dir) = @_; … … 239 283 } 240 284 285 } 286 287 # make soft link to file if supported by OS, otherwise return error 288 sub soft_link { 289 my ($src,$dest) = @_; 290 291 # remove trailing slashes from source and destination files 292 $src =~ s/[\\\/]+$//; 293 $dest =~ s/[\\\/]+$//; 294 295 # a few sanity checks 296 if (!-e $src) { 297 print STDERR "util::soft_link source file $src does not exist\n"; 298 return 0; 299 } 300 301 my $dest_dir = &File::Basename::dirname($dest); 302 mk_all_dir($dest_dir) if (!-e $dest_dir); 303 304 if (!symlink($src,$dest)) 305 { 306 print STDERR "util::soft_link: unable to create soft link."; 307 return 0; 308 } 309 310 return 1; 241 311 } 242 312
Note:
See TracChangeset
for help on using the changeset viewer.