Changeset 15120 for gsdl/trunk/bin/script/gsConvert.pl
- Timestamp:
- 2008-03-20T21:08:15+13:00 (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/bin/script/gsConvert.pl
r12704 r15120 226 226 } 227 227 } 228 229 228 return &convertAnything($input_filename, $output_filestem, $output_type); 230 229 } … … 506 505 "packages", "wv", "wvHtml.xml"); 507 506 507 # Added the following to work with replace_srcdoc_with_html.pl: 508 # Make wvWare put any associated (image) files of the word doc into 509 # folder docname-without-extention_files. This folder should be at 510 # the same level as the html file generated from the doc. 511 # wvWare will take care of proper interlinking. 512 513 # This step is necessary for replace_srcdoc_with_html.pl which will 514 # move the html and associated files into the import folder. We 515 # want to ensure that the associated files won't overwrite similarly 516 # named items already in import. Hence we put them in a folder first 517 # (to which the html links properly) and that will allow 518 # replace_srcdoc_with_html.pl to move them safely to /import. 519 520 # To do all this, we need to use wvWare's --dir and --basename options 521 # where dir is the full path to the image folder directory and 522 # basename is the full path to the image folder appended to the name 523 # which is to be prepended to every image file: 524 # eg. if the images were to have names like sample0.jpg to sampleN.jpg, 525 # then the basename is "/full/path/to/imgdir/sample". 526 # In this case, basename is the full path to and name of the document. 527 # HOWEVER: basename always takes full path, not relative url, so 528 # the greenstone browser is unable to display the images (absolute paths 529 # cause it to give an "external link" message) 530 # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html 531 # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html 532 # "added --dir option to wvHtml so that pictures can be placed in 533 # a seperate directory" 534 # "running wvWare through IMP to view word documents as html. 
It gets 535 # invoked like this: 536 # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output" 537 538 # toppath is the folder where html is generated 539 # docname is the name (without extension) of the html to be generated 540 # suffix (extension) is thrown away 541 my ($docname, $toppath) 542 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 543 544 # We want the image folder generated to have the same name as windows 545 # would generate ($windows_scripting) when it converts from word to html. 546 # That is, foldername=docname_files 547 my $assoc_dir = &util::filename_cat($toppath, $docname."_files"); 548 #print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files" 549 550 # ensure this image directory exists 551 # if it exists already, just delete and recreate 552 if(-e $assoc_dir) { 553 &util::rm_r($assoc_dir); 554 } 555 &util::mk_dir($assoc_dir); 556 557 # the images are all going to be called image0, image1,..., imageN 558 my $img_basenames = &util::filename_cat($assoc_dir, $docname); 559 560 #print STDERR "****toppath: $toppath\n****docname: $docname\n; 561 #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames); 562 #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir); 563 508 564 my $cmd = ""; 509 565 if ($timeout) {$cmd = "ulimit -t $timeout;";} 510 $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\""; 566 # wvWare's --dir and --basename options for image directory. 567 # Replaced the next line with the *2 lines* following it: 568 # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\""; 569 $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\""; 570 $cmd .= " --charset utf-8 --config \"$wv_conf\""; 511 571 $cmd .= " \"$input_filename\" > \"$output_filestem.html\""; 512 572 513 573 # redirecting STDERR is a bad idea on windows 95/98 514 574 $cmd .= " 2> \"$output_filestem.err\"" … … 545 605 # Was the conversion successful? 
546 606 547 if (-s "$output_filestem.html") { 607 if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents) 548 608 open(TMP, "$output_filestem.html"); 549 609 $line = <TMP>; 550 610 close(TMP); 551 611 if ($line && $line =~ /DOCTYPE HTML/) { 552 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 612 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 613 614 # Inserted this code to remove the images directory if it was still empty after 615 # the html was generated (in case there were no images in the word document) 616 if(is_dir_empty($assoc_dir)) { 617 print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 618 &util::rm_r($assoc_dir); 619 } else { # there was an image folder (it was generated) 620 # Therefore, the html file generated contains absolute links to the images 621 # If the folder contains images 622 # Replace them with relative links instead, so it can be moved elsewhere 623 make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files"); 624 } 553 625 return 1; 554 626 } … … 568 640 569 641 return 0; 642 } 643 644 # Method to work with doc_to_html - Word docs might contain images. 645 # When such word docs are converted with wvWare, we make it generate a 646 # <filename>_files folder with the associated images, while the html file 647 # <filename> refers to the images using absolute paths to <filename>_files. 648 # This method reads in that html file and replaces all the absolute paths to 649 # the images in <filename>_files with the relative paths to the images from 650 # that folder. (I.e. with <filename>_files/<imagename.ext>). 
sub make_links_to_assocdir_relative{
    # Rewrites, in place, the html file generated from a word document so that
    # links into its associated-files folder use the folder's bare directory
    # name (docname_files/...) instead of the absolute path to that folder.
    #
    # Parameters:
    #   toppath        - top-level folder in which the html file resides
    #                    (accepted for interface compatibility; not used here)
    #   docname        - name (without extension) of the html file
    #                    (accepted for interface compatibility; not used here)
    #   html_file      - full path to the html file: /full/path/docname.html
    #   assoc_dir_path - toppath/docname_files, the absolute path to be replaced
    #   assoc_dirname  - the directory name of the folder with associated
    #                    imgs (docname_files), used as the replacement
    #
    # Returns 1 once the updated file has been written. Returns nothing
    # (after printing a message to STDERR) if the file could not be read or
    # written.
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string.
    # Three-argument open is used so that unusual characters in the filename
    # can never be mistaken for an open mode.
    my $fin;
    unless(open($fin, '<', $html_file)) {
	print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR\n";
	return;
    }
    # Slurp the file: with $/ undefined, <...> reads the entire file at once.
    # $/ MUST be localised. Assigning to it globally would leave the whole
    # script in slurp mode, so every later line-by-line read elsewhere would
    # silently return entire files.
    my $html_contents = do { local $/; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace (substitute) *all* occurrences of the assoc_dir_path in a hrefs
    # and img src values with assoc_dirname.
    #
    # The pattern looks for <a or <img, followed by anything up to the first
    # href= or src= that is immediately followed by an optional " or ' and then
    # the associated folder's pathname. It then takes the rest of the value
    # (the img filename), the optional closing quote and any other attributes
    # up to the first > that ends the tag.
    # The substitution keeps everything before the folder's pathname ($1),
    # replaces the pathname itself with the folder's directory name, and keeps
    # the rest up to and including the closing > ($2).
    # Modifiers: g replaces every occurrence; s lets . match newlines as well,
    # so a tag that is split over several lines is still matched.
    # \Q...\E quotes the path so that characters such as . + ( ) or the
    # backslashes of a windows path are matched literally instead of being
    # interpreted as regular expression syntax.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)\Q$assoc_dir_path\E(.*?(?:\"|\')?.*?>)/$1$assoc_dirname$2/sg;
    #$html_contents =~ s/\Q$assoc_dir_path\E/$assoc_dirname/gs; # this works, used as fall-back

    # 3. Write the updated contents back to the same file.
    # Opening with '>' truncates the existing file, so it does not need to be
    # deleted first. Deleting it beforehand would lose the document entirely
    # if the file could not then be reopened.
    my $fout;
    unless(open($fout, '>', $html_file)) { # truncate and open for writing
	print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR\n";
	return;
    }
    print $fout $html_contents;
    # buffered write errors (e.g. a full disk) only show up when closing
    unless(close($fout)) {
	print STDERR "gsConvert.pl: Error closing $html_file after writing relative links: $!\n";
	return;
    }

    return 1;
}

# A method to check if directory is empty (note that an empty directory still has non-zero size!!!)
# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
#
# is_dir_empty($path)
#   Returns 1 if the directory $path contains no entries other than "." and
#   "..", and 0 otherwise.
#   A directory that cannot be opened (missing, not a directory, no read
#   permission) also returns 0, after a message on STDERR. We cannot know that
#   it is empty, and the caller in this file removes directories that are
#   reported as empty.
sub is_dir_empty
{
    my ($path) = @_;

    # a lexical handle, so no global DIR handle is shared with other code
    my $dh;
    unless(opendir($dh, $path)) {
	print STDERR "gsConvert.pl: Unable to open directory $path to check if it is empty: $!\n";
	return 0;
    }

    my $is_empty = 1;
    # The explicit defined() matters: an entry whose name is "0" is a false
    # value, and must not be mistaken for the end of the directory listing.
    while(defined(my $entry = readdir($dh))) {
	# exact comparison, so only the two special entries are skipped
	next if($entry eq "." || $entry eq "..");
	$is_empty = 0; # found a real entry, no need to look any further
	last;
    }
    closedir($dh);

    return $is_empty;
}

# ... (the unchanged lines between hunks are omitted by the changeset viewer) ...
#
# Last hunk of this changeset (old lines 1250-1254 / new lines 1402-1407).
# The sub it belongs to starts outside this view, so the fragment is kept
# below for reference only; the one added line is marked with "+".
#
#	return 0;
#    }
#
# +  print STDERR "\n**** In any to text****\n\n";
#    open(IN, "<$input_filename") || return 0;
#    binmode(IN);
Note:
See TracChangeset
for help on using the changeset viewer.