Changeset 16435
- Timestamp:
- 2008-07-16T17:03:13+12:00 (15 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/bin/script/gsConvert.pl
r15168 r16435 215 215 216 216 my $success = 0; 217 if (!$output_type || ($output_type =~ /html/i)){217 if (!$output_type || ($output_type =~ m/html/i)){ 218 218 if ($windows_scripting) { 219 219 $success = &native_doc_to_html($input_filename, $output_filestem); … … 238 238 239 239 # Attempt specialised conversion to HTML 240 if (!$output_type || ($output_type =~ /html/i)) {240 if (!$output_type || ($output_type =~ m/html/i)) { 241 241 242 242 if ($windows_scripting) { … … 266 266 267 267 # Attempt simple conversion to HTML 268 if (!$output_type || ($output_type =~ /html/i)) {268 if (!$output_type || ($output_type =~ m/html/i)) { 269 269 $success = &any_to_html($input_filename, $output_filestem); 270 270 if ($success) { … … 274 274 275 275 # Convert to text 276 if (!$output_type || ($output_type =~ /text/i)) {276 if (!$output_type || ($output_type =~ m/text/i)) { 277 277 $success = &any_to_text($input_filename, $output_filestem); 278 278 if ($success) { … … 293 293 $output_type =~ s/.*\-(.*)/$1/i; 294 294 # Attempt coversion to Image 295 if ($output_type =~ /jp?g|gif|png/i) {295 if ($output_type =~ m/jp?g|gif|png/i) { 296 296 $success = &pdf_to_img($dirname, $input_filename, $output_filestem, $output_type); 297 297 if ($success){ … … 301 301 302 302 # Attempt conversion to HTML 303 if (!$output_type || ($output_type =~ /html/i)) {303 if (!$output_type || ($output_type =~ m/html/i)) { 304 304 $success = &pdf_to_html($dirname, $input_filename, $output_filestem); 305 305 if ($success) { … … 309 309 310 310 # Attempt conversion to TEXT 311 if (!$output_type || ($output_type =~ /text/i)) {311 if (!$output_type || ($output_type =~ m/text/i)) { 312 312 $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 313 313 if ($success) { … … 329 329 $output_type =~ s/.*\-(.*)/$1/i; 330 330 # Attempt coversion to Image 331 if ($output_type =~ /jp?g|gif|png/i) {331 if ($output_type =~ m/jp?g|gif|png/i) { 332 332 $success = &ps_to_img($dirname, $input_filename, 
$output_filestem, $output_type); 333 333 if ($success){ … … 337 337 338 338 # Attempt conversion to TEXT 339 if (!$output_type || ($output_type =~ /text/i)) {339 if (!$output_type || ($output_type =~ m/text/i)) { 340 340 $success = &ps_to_text($input_filename, $output_filestem); 341 341 if ($success) { … … 352 352 353 353 my $ppt_convert_type = ""; 354 #if (!$output_type || $windows_scripting || ($output_type !~ /html/i) ||($output_type !~/text/i)){355 if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~/text/i)){356 if ($output_type =~ /gif/i) {354 #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){ 355 if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){ 356 if ($output_type =~ m/gif/i) { 357 357 $ppt_convert_type = "-g"; 358 } elsif ($output_type =~ /jp?g/i){358 } elsif ($output_type =~ m/jp?g/i){ 359 359 $ppt_convert_type = "-j"; 360 } elsif ($output_type =~ /png/i){360 } elsif ($output_type =~ m/png/i){ 361 361 $ppt_convert_type = "-p"; 362 362 } 363 363 my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", 364 364 $ENV{'GSDLOS'}, "pptextract"); 365 $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);365 $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 366 366 367 367 $cmd = ""; … … 374 374 $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\""; 375 375 $cmd .= " 2>\"$output_filestem.err\"" 376 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);376 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 377 377 if (system($cmd) !=0) { 378 378 print STDERR "Powerpoint VB Scripting convert failed\n"; … … 381 381 } 382 382 } 383 } elsif (!$output_type || ($output_type =~ /html/i)) {383 } elsif (!$output_type || ($output_type =~ m/html/i)) { 384 384 # Attempt conversion to HTML 385 #if (!$output_type || ($output_type =~ /html/i)) {385 #if (!$output_type || ($output_type =~ m/html/i)) { 386 386 # formulate the 
command 387 387 $cmd = ""; … … 389 389 $cmd .= " \"$input_filename\" \"$output_filestem.html\""; 390 390 $cmd .= " 2>\"$output_filestem.err\"" 391 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);391 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 392 392 393 393 # execute the command … … 416 416 417 417 # Attempt conversion to HTML 418 if (!$output_type || ($output_type =~ /html/i)) {418 if (!$output_type || ($output_type =~ m/html/i)) { 419 419 # formulate the command 420 420 $cmd = ""; … … 422 422 $cmd .= " \"$input_filename\" \"$output_filestem.html\""; 423 423 $cmd .= " 2>\"$output_filestem.err\"" 424 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);424 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 425 425 426 426 … … 463 463 if ($first) { 464 464 # check to see if this is an rtf file 465 if ($line =~ /^\{\\rtf/) {465 if ($line =~ m/^\{\\rtf/) { 466 466 close(CHK); 467 467 return "rtf"; … … 471 471 472 472 # is this is a word 6/7/8 document? 473 if ($line =~ /Word\.Document\.([678])/) {473 if ($line =~ m/Word\.Document\.([678])/) { 474 474 close(CHK); 475 475 return "word$1"; … … 500 500 # with quoting when GSDLHOME might contain spaces) but assume 501 501 # that the PATH is set up correctly 502 $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);502 $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 503 503 504 504 my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc", … … 573 573 # redirecting STDERR is a bad idea on windows 95/98 574 574 $cmd .= " 2> \"$output_filestem.err\"" 575 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);575 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 576 576 # execute the command 577 577 $!=0; … … 588 588 my $line; 589 589 while ($line=<ERRFILE>) { 590 if ($line =~ /\w/) {590 if ($line =~ m/\w/) { 591 591 print STDERR "$line"; 592 592 print FAILLOG "$line" if ($write_to_fail_log); … … 609 609 $line = <TMP>; 610 610 close(TMP); 611 if ($line && $line =~ /DOCTYPE HTML/) {611 if 
($line && $line =~ m/DOCTYPE HTML/) { 612 612 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 613 613 614 614 # Inserted this code to remove the images directory if it was still empty after 615 615 # the html was generated (in case there were no images in the word document) 616 if (&is_dir_empty($assoc_dir)) {616 if (&util::is_dir_empty($assoc_dir)) { 617 617 #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 618 618 &util::rm_r($assoc_dir); 619 619 } else { # there was an image folder (it was generated) 620 620 # Therefore, the html file generated contains absolute links to the images 621 # If the folder contains images 622 # Replace them with relative links instead, so it can be moved elsewhere 621 # Replace them with relative links instead, so the folder can be moved elsewhere 623 622 &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files"); 624 623 } … … 640 639 641 640 return 0; 642 }643 644 645 # A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)646 # Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831647 sub is_dir_empty648 {649 my ($path) = @_;650 opendir DIR, $path;651 while(my $entry = readdir DIR) {652 next if($entry =~ /^\.\.?$/);653 closedir DIR;654 return 0;655 }656 closedir DIR;657 return 1;658 641 } 659 642 … … 699 682 # we can't just replace $assoc_dir_path with $assoc_dir 700 683 # $assoc_dir_path represents a regular expression that needs to be replaced 701 # if it contains ., -, [ or ] -- which all have special meaning in Perl regular expressions --702 # we need to escape these first684 # if it contains ., -, [, ], or Windows style backslashes in paths -- which all have special 685 # meaning in Perl regular expressions -- we need to escape these first 703 686 my $safe_reg_expression = $assoc_dir_path; 687 $safe_reg_expression =~ s/\\/\\\\/g; 704 688 
$safe_reg_expression =~ s/\./\\./g; 705 689 $safe_reg_expression =~ s/\-/\\-/g; … … 722 706 #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back 723 707 # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this 724 $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&percent_twenty_to_space($1, $5, $6)/sge;725 726 #print STDERR "assoc_dirname: ****$assoc_dirname***\n";727 #print STDERR "safe_reg_expression: ****$safe_reg_expression***\n";708 $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge; 709 710 print STDERR "****assoc_dirname: $assoc_dirname***\n"; 711 print STDERR "****safe_reg_expression: $safe_reg_expression***\n"; 728 712 729 713 # delete the original file and recreate it … … 736 720 return 0; 737 721 } 722 738 723 # write out the updated contents and close the file 739 724 print FOUT $html_contents; … … 742 727 } 743 728 744 # Utility routine to convert all %20 introduced by wvWare in link pathnames into space again 745 sub percent_twenty_to_space 729 # Utility routine to make sure HTML plugin gets img src/href link pathnames that contain 730 # url slashes (/) instead of windows-style backwards slashes, and to convert all %20 731 # introduced in link pathnames by wvWare into space again 732 sub post_process_assocfile_urls 746 733 { 747 734 my ($pre, $text, $post) = @_; 748 735 749 736 $text =~ s/%20/ /g; 737 $text =~ s/\\/\//g; 750 738 751 739 return "$pre$text$post"; … … 759 747 $ENV{'GSDLOS'}, "word2html"); 760 748 761 $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);749 $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 762 750 if (-e "$output_filestem.html") { 763 751 print STDERR "*** The conversion file has existed\n"; … … 773 761 # redirecting STDERR 774 762 $cmd .= " 2> \"$output_filestem.err\"" 775 if ($ENV {'GSDLOS'} !~ /^windows$/i 
|| $is_winnt_2000);763 if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 776 764 777 765 # execute the command … … 789 777 my $line; 790 778 while ($line=<ERRFILE>) { 791 if ($line =~ /\w/) {779 if ($line =~ m/\w/) { 792 780 print STDERR "$line"; 793 781 print FAILLOG "$line" if ($write_to_fail_log); … … 809 797 $line = <TMP>; 810 798 close(TMP); 811 if ($line && $line =~ /html/) {799 if ($line && $line =~ m/html/) { 812 800 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 813 801 return 1; … … 843 831 844 832 $cmd .= " 2>\"$output_filestem.err\"" 845 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);833 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 846 834 847 835 … … 865 853 866 854 if ($past_header == 0) { 867 if ($line =~ /<body>/) {$past_header=1;}855 if ($line =~ m/<body>/) {$past_header=1;} 868 856 next; 869 857 } 870 858 871 859 $line =~ s/<[^>]+>//g; 872 if ($line =~ /\w/ && $past_header) { # we found some content...860 if ($line =~ m/\w/ && $past_header) { # we found some content... 873 861 $was_successful=1; 874 862 last; … … 898 886 899 887 # print out header info from src html. 
900 while (defined($_ = <HTMLSRC>) && $_ =~ /\w/) {888 while (defined($_ = <HTMLSRC>) && $_ =~ m/\w/) { 901 889 print HTML "$_"; 902 890 } … … 962 950 $cmd .= " \"$input_filename\" \"$output_filestem\""; 963 951 964 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {952 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) { 965 953 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 966 954 } else { … … 1029 1017 $output_type =~ s/.*\_(.*)/$1/i; 1030 1018 $cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\""; 1031 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {1019 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) { 1032 1020 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 1033 1021 } else { … … 1086 1074 my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\""; 1087 1075 1088 if ($ENV{'GSDLOS'} !~ /^windows$/i) {1076 if ($ENV{'GSDLOS'} !~ m/^windows$/i) { 1089 1077 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 1090 1078 } else { … … 1105 1093 my $seen_text=0; 1106 1094 while (($seen_text==0) && ($line=<EXTR_TEXT>)) { 1107 if ($line=~ /\w/) {$seen_text=1;}1095 if ($line=~ m/\w/) {$seen_text=1;} 1108 1096 } 1109 1097 close EXTR_TEXT; … … 1157 1145 # if we're on windows we'll fall straight through without attempting 1158 1146 # to use gs 1159 if ($ENV{'GSDLOS'} =~ /^windows$/i) {1147 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 1160 1148 $error = "Windows does not support gs"; 1161 1149 … … 1181 1169 # possible for the actual text to start with this, but.... 1182 1170 open PSOUT, "$output_filestem.text"; 1183 if (<PSOUT> =~ /^Error: (.*)/) {1171 if (<PSOUT> =~ m/^Error: (.*)/) { 1184 1172 $error="interpreter error - \"$1\""; 1185 1173 } … … 1223 1211 1224 1212 # Make sure this is a ps file... 
1225 if ($text !~ /^%!/) {1213 if ($text !~ m/^%!/) { 1226 1214 print STDERR "Bad postscript header: not '%!'\n"; 1227 1215 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) … … 1334 1322 $output_type =~ s/.*\_(.*)/$1/i; 1335 1323 $cmd .= "perl -S pstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\""; 1336 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {1324 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) { 1337 1325 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 1338 1326 } else { … … 1406 1394 $line =~ s/</&lt;/g; 1407 1395 $line =~ s/>/&gt;/g; 1408 if ($line =~ /^\s*$/) {1396 if ($line =~ m/^\s*$/) { 1409 1397 print HTML "<p>"; 1410 1398 } else { … … 1447 1435 # delete any string less than 10 characters long 1448 1436 $line =~ s/^.{0,9}$/\n/mg; 1449 while ($line =~ /^.{1,9}$/m) {1437 while ($line =~ m/^.{1,9}$/m) { 1450 1438 $line =~ s/^.{0,9}$/\n/mg; 1451 1439 $line =~ s/\n+/\n/sg; … … 1457 1445 1458 1446 # output whatever is left 1459 if ($line =~ /[^\n ]/) {1447 if ($line =~ m/[^\n ]/) { 1460 1448 print OUT $line; 1461 1449 ++$output_line_count;
Note:
See TracChangeset
for help on using the changeset viewer.