Changeset 24371 for main/trunk/greenstone2
- Timestamp:
- 2011-08-08T21:10:36+12:00 (13 years ago)
- Location:
- main/trunk/greenstone2/bin/script
- Files:
-
- 1 added
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r24362 r24371 520 520 my ($input_filename, $output_filestem) = @_; 521 521 522 my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare"); 523 524 if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) { 525 $ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}"; 526 $ENV{'LD_LIBRARY_PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib:$ENV{'LD_LIBRARY_PATH'}"; 527 $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare"); 528 } 529 530 # don't include path on windows (to avoid having to play about 531 # with quoting when GSDLHOME might contain spaces) but assume 532 # that the PATH is set up correctly 533 $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 534 535 my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc", 536 "packages", "wv", "wvHtml.xml"); 522 my $wvware_status = 0; 537 523 538 # Added the following to work with replace_srcdoc_with_html.pl: 539 # Make wvWare put any associated (image) files of the word doc into 540 # folder docname-without-extention_files. This folder should be at 541 # the same level as the html file generated from the doc. 542 # wvWare will take care of proper interlinking. 543 544 # This step is necessary for replace_srcdoc_with_html.pl which will 545 # move the html and associated files into the import folder. We 546 # want to ensure that the associated files won't overwrite similarly 547 # named items already in import. Hence we put them in a folder first 548 # (to which the html links properly) and that will allow 549 # replace_srcdoc_with_html.pl to move them safely to /import. 550 551 # To do all this, we need to use wvWare's --dir and --basename options 552 # where dir is the full path to the image folder directory and 553 # basename is the full path to the image folder appended to the name 554 # which is to be prepended to every image file: 555 # eg. if the images were to have names like sample0.jpg to sampleN.jpg, 556 # then the basename is "/full/path/to/imgdir/sample". 557 # In this case, basename is the full path to and name of the document. 558 # HOWEVER: basename always takes full path, not relative url, so 559 # the greenstone browser is unable to display the images (absolute paths 560 # cause it to give an "external link" message) 561 # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html 562 # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html 563 # "added --dir option to wvHtml so that pictures can be placed in 564 # a seperate directory" 565 # "running wvWare through IMP to view word documents as html. It gets 566 # invoked like this: 567 # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output" 568 569 # toppath is the folder where html is generated 570 # docname is the name (without extension) of the html to be generated 571 # suffix (extension) is thrown away 572 my ($docname, $toppath) 573 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 574 575 # We want the image folder generated to have the same name as windows 576 # would generate ($windows_scripting) when it converts from word to html. 577 # That is, foldername=docname_files 578 my $assoc_dir = &util::filename_cat($toppath, $docname."_files"); 579 #print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files" 580 581 # ensure this image directory exists 582 # if it exists already, just delete and recreate 583 if(-e $assoc_dir) { 584 &util::rm_r($assoc_dir); 585 } 586 &util::mk_dir($assoc_dir); 587 588 # the images are all going to be called image0, image1,..., imageN 589 my $img_basenames = &util::filename_cat($assoc_dir, $docname); 590 591 #print STDERR "****toppath: $toppath\n****docname: $docname\n; 592 #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames); 593 #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir); 594 595 my $cmd = ""; 596 if ($timeout) {$cmd = "ulimit -t $timeout;";} 597 # wvWare's --dir and --basename options for image directory. 598 # Replaced the next line with the *2 lines* following it: 599 # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\""; 600 $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\""; 601 $cmd .= " --charset utf-8 --config \"$wv_conf\""; 602 $cmd .= " \"$input_filename\" > \"$output_filestem.html\""; 603 604 # redirecting STDERR is a bad idea on windows 95/98 605 $cmd .= " 2> \"$output_filestem.err\"" 606 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 607 # execute the command 608 $!=0; 609 if (system($cmd)!=0) 610 { 611 print STDERR "Error executing wv converter:$!\n"; 612 if (-s "$output_filestem.err") { 613 open (ERRFILE, "<$output_filestem.err"); 614 615 my $write_to_fail_log=0; 616 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 617 {$write_to_fail_log=1;} 618 619 my $line; 620 while ($line=<ERRFILE>) { 621 if ($line =~ m/\w/) { 622 print STDERR "$line"; 623 print FAILLOG "$line" if ($write_to_fail_log); 624 } 625 if ($line !~ m/startup error/) {next;} 626 print STDERR " (given an invalid .DOC file?)\n"; 627 print FAILLOG " (given an invalid .DOC file?)\n" 628 if ($write_to_fail_log); 629 630 } # while ERRFILE 631 close FAILLOG if ($write_to_fail_log); 632 } 633 return 0; # we can try any_to_text 634 } 635 636 # Was the conversion successful? 637 638 if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents) 639 open(TMP, "$output_filestem.html"); 640 my $line = <TMP>; 641 close(TMP); 642 if ($line && $line =~ m/DOCTYPE HTML/) { 643 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 644 645 # Inserted this code to remove the images directory if it was still empty after 646 # the html was generated (in case there were no images in the word document) 647 if (&util::is_dir_empty($assoc_dir)) { 648 #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 649 &util::rm_r($assoc_dir); 650 } else { # there was an image folder (it was generated) 651 # Therefore, the html file generated contains absolute links to the images 652 # Replace them with relative links instead, so the folder can be moved elsewhere 653 &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files"); 654 } 655 return 1; 656 } 657 } 658 659 # If here, an error of some sort occurred 660 &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 661 if (-e "$output_filestem.err") { 662 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) { 663 open (ERRLOG,"$output_filestem.err"); 664 while (<ERRLOG>) {print FAILLOG $_;} 665 close FAILLOG; 666 close ERRLOG; 667 } 668 &util::rm("$output_filestem.err"); 669 } 670 671 return 0; 672 } 673 674 # Method to work with doc_to_html - Word docs might contain images. 675 # When such word docs are converted with wvWare, we make it generate a 676 # <filename>_files folder with the associated images, while the html file 677 # <filename> refers to the images using absolute paths to <filename>_files. 678 # This method reads in that html file and replaces all the absolute paths to 679 # the images in <filename>_files with the relative paths to the images from 680 # that folder. (I.e. with <filename>_files/<imagename.ext>). 681 sub make_links_to_assocdir_relative{ 682 # toppath is the top-level folder in which the html file we're going to be fixing resides 683 # docname is just the name (without extension) of the html file 684 # html_file is the full path to the html file: /full/path/docname.html 685 # assoc_dir_path is toppath/docname_files 686 # assoc_dirname is the directory name of the folder with associated imgs: docname_files 687 my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_; 688 689 # 1. Read all the contents of the html into a string 690 # open the original file for reading 691 unless(open(FIN, "<$html_file")) { 692 print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n"; 693 return 0; 694 } 695 # From http://perl.plover.com/local.html 696 # "It's cheaper to read the file all at once, without all the splitting and reassembling. 697 # (Some people call this slurping the file.) Perl has a special feature to support this: 698 # If the $/ variable is undefined, the <...> operator will read the entire file all at once" 699 my $html_contents; 700 { 701 local $/ = undef; # Read entire file at once 702 $html_contents = <FIN>; # Now file is read in as one single 'line' 703 } 704 close(FIN); # close the file 705 #print STDERR $html_contents; 706 707 # 2. Replace (substitute) *all* ocurrences of the assoc_dir_path in a hrefs and img src 708 # values with assoc_dirname 709 # At the end: g means substitute all occurrences (global), while s at the end means treat 710 # all new lines as a regular space. This interacts with g to consider all the lines 711 # together as a single line so that multi-occurrences can be replaced. 712 713 # we can't just replace $assoc_dir_path with $assoc_dir 714 # $assoc_dir_path represents a regular expression that needs to be replaced 715 # if it contains ., -, [, ], or Windows style backslashes in paths -- which all have special 716 # meaning in Perl regular expressions -- we need to escape these first 717 my $safe_reg_expression = $assoc_dir_path; 718 $safe_reg_expression =~ s/\\/\\\\/g; 719 $safe_reg_expression =~ s/\./\\./g; 720 $safe_reg_expression =~ s/\-/\\-/g; 721 $safe_reg_expression =~ s/\[/\\[/g; 722 $safe_reg_expression =~ s/\]/\\]/g; 723 $safe_reg_expression =~ s/ /%20/g; # wvWare put %20 in place of space, so we need to change our prefix to match 724 725 # The following regular expression substitution looks for <a or <image, followed by any other 726 # attributes and values until it comes to the FIRST (indicated by ?) href= or src= 727 # followed by " or ' no quotes at all around path, followed by the associated folder's pathname 728 # followed by characters (for the img filename), then finally the optional closing quotes 729 # in " or ' form, followed by any other attributes and values until the first > to end the tag. 730 # The substitution: all the parts preceding associated folder's pathname are retained, 731 # the associated folder path name is replaced by associated folder directory name 732 # and the rest upto and including the closing > tag is retained. 733 # The sg at the end of the pattern match treats all of html_contents as a single line (s) 734 # and performs a global replace (g) meaning that all occurrences that match in that single line 735 # are substituted. 736 $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg; 737 #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back 738 # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this 739 $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge; 740 741 #print STDERR "****assoc_dirname: $assoc_dirname***\n"; 742 #print STDERR "****safe_reg_expression: $safe_reg_expression***\n"; 743 744 # delete the original file and recreate it 745 my $copy_of_filename = $html_file; 746 &util::rm($copy_of_filename); # deleted the file 747 748 # Recreate the original file for writing the updated contents 749 unless(open(FOUT, ">$html_file")) { # open it as a new file for writing 750 print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n"; 751 return 0; 752 } 753 754 # write out the updated contents and close the file 755 print FOUT $html_contents; 756 close(FOUT); 757 return 1; 758 } 759 760 # Utility routine to make sure HTML plugin gets img src/href link pathnames that contain 761 # url slashes (/) instead of windows-style backwards slashes, and to convert all %20 762 # introduced in link pathnames by wvWare into space again. Converts all percent signs 763 # introduced by URL encoding filenames generated into %25 in these url links referencing them 764 sub post_process_assocfile_urls 765 { 766 my ($pre, $text, $post) = @_; 767 768 $text =~ s/%20/ /g; # Convert %20s to space and not underscore since underscores mess with incremental rebuild 769 # $text =~ s/%20/_/g; # reinstated this line, since we no longer replace spaces with %20. We replace them with underscores 770 $text =~ s/\\/\//g; 771 $text =~ s/%/%25/g; 772 773 return "$pre$text$post"; 524 # need to ensure that the path to perl is quoted (in case there's spaces in it) 525 my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl $input_filename $output_filestem $faillogfile $timeout"; 526 527 # print STDERR "***** wvware launch cmd = $launch_cmd\n"; 528 529 $wvware_status = system($launch_cmd)/256; 530 return $wvware_status; 774 531 } 775 532
Note:
See TracChangeset
for help on using the changeset viewer.