Changeset 1734
- Timestamp: 2000-12-01T16:36:33+13:00 (23 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/gsConvert.pl
r1705 r1734 84 84 # Deduce filenames 85 85 my ($tailname,$dirname,$suffix) 86 = File::Basename::fileparse($input_filename,'\. [^\.]+');86 = File::Basename::fileparse($input_filename,'\..+'); 87 87 my $output_filestem = &util::filename_cat($dirname,"$tailname"); 88 88 … … 102 102 } 103 103 elsif ($input_type eq "doc") { 104 print STDERR "I recognise this to be a Word document...\n"; # remove 104 105 print &convertDOC($input_filename, $output_filestem, $output_type); 105 106 print "\n"; … … 148 149 my $realtype = &find_docfile_type($input_filename); 149 150 150 if ($realtype eq "word678") { 151 print STDERR "The real type of this Word document is $realtype\n"; # remove 152 153 if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") { 154 print STDERR "I recognise this to be a word678 document...\n"; # remove 151 155 return &convertWord678($input_filename, $output_filestem, $output_type); 152 156 } elsif ($realtype eq "rtf") { … … 166 170 # Attempt specialised conversion to HTML 167 171 if (!$output_type || ($output_type =~ /html/i)) { 172 print STDERR "I am about to call doc_to_html...\n"; 168 173 $success = &doc_to_html($input_filename, $output_filestem); 169 174 if ($success) { … … 278 283 ($input_filename) = @_; 279 284 285 open(TMP, ">temp.txt"); 286 binmode(TMP); 280 287 open(CHK, "<$input_filename"); 288 binmode(CHK); 281 289 my $line = ""; 282 290 my $first = 1; … … 285 293 286 294 $line = $_; 287 295 print TMP "$line\n\n"; 288 296 if ($first) { 289 297 # check to see if this is an rtf file … … 294 302 } 295 303 296 # is th eis a word 6/7/8 document?297 if ($line =~ /Word\.Document\. [678]/) {304 # is this is a word 6/7/8 document? 
305 if ($line =~ /Word\.Document\.([678])/) { 298 306 close(CHK); 299 return "word 678";307 return "word$1"; 300 308 } 301 309 … … 309 317 310 318 311 # Specific type-to-type c ponversions319 # Specific type-to-type conversions 312 320 # 313 321 # Each of the following functions attempts to convert a document from … … 320 328 321 329 sub doc_to_html { 330 print STDERR "/;-DG I am in doc_to_html...\n"; # remove 322 331 ($input_filename, $output_filestem) = @_; 323 332 324 # formulate the command 325 my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv"); 326 my $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml"); 327 my $wvWare = &util::filename_cat($wv_home, "bin", "wvWare"); 333 my $wvWare = ""; 334 my $wv_conf = ""; 335 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 336 $wvWare = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe"; 337 $wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml"; 338 339 } else { 340 # formulate the command 341 my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv"); 342 $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml"); 343 $wvWare = &util::filename_cat($wv_home, "bin", "wvWare"); 344 } 345 print STDERR "I am about to test if your file exists...\n"; 328 346 return 0 unless (-e "$wvWare"); 329 347 $cmd = ""; … … 332 350 $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\""; 333 351 352 print STDERR "$cmd\n"; #remove 353 334 354 # execute the command 355 print STDERR system($cmd); 356 print STDERR "\n"; 335 357 if (system($cmd)>0) 336 358 { … … 352 374 } 353 375 } 376 print STDERR "/;-DG I am leaving doc_to_html...\n"; 354 377 return 0; 355 378 } … … 488 511 my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\""; 489 512 $cmd .= " 2> $output_filestem.err"; 490 491 513 if (system($cmd)>0) 492 514 { … … 494 516 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 495 517 &util::rm("$output_filestem.err") if (-e 
"$output_filestem.err"); 496 return 0; 497 } 498 518 519 # Fine then. We'll just do a lousy job by ourselves... 520 # Based on code nicked from: 521 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html 522 # 523 print STDERR "Attempting to strip text from postscript.\n"; 524 my $errorcode=0; 525 open (IN, "$input_filename") 526 || ($errorcode=1, warn "Couldn't read file: $!"); 527 open (OUT, ">$output_filestem.text") 528 || ($errorcode=1, warn "Couldn't write file: $!"); 529 if ($errorcode) {print STDERR "errors\n";return 0;} 530 531 my $in_a_sentence=0; 532 while (<IN>) { 533 if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line 534 # attempt to add whitespace between different lines... 535 s/F.?\(/\( /g; # this might break up some other words though... 536 ### remove all postscript control data 537 if (!$in_a_sentence) { 538 s/^[^\(\)]*?\(//;} # rm start of line up to first open bracket 539 s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces 540 s/\)([^\(\)])*?\(//g ; # close bracket up to next open unquoted bracket 541 if (s/\)[^\(\)]*?$//g) # last close bracket to end of line 542 {$in_a_sentence=0;chomp;} 543 if (s/\\$//) # if line is a continuation 544 {$in_a_sentence=1;chomp;} 545 s/^$//g ; # remove empty lines 546 ### ligatures have special characters... 
547 s/\\214/fi/g; 548 s/\\215/fl/g; 549 print OUT "$_"; 550 } 551 close IN; close OUT; 552 } 499 553 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 500 554 return 1; … … 506 560 507 561 sub any_to_html { 562 print STDERR "/;-Dg I am in any_to_html!\n"; 508 563 ($input_filename, $output_filestem) = @_; 509 564 … … 518 573 <META HTTP-EQUIV="Content-Type" CONTENT="text/html"> 519 574 <META NAME="GENERATOR" CONTENT="Greenstone any_to_html"> 520 </head><body>\n\n'; 575 </head><body>'; 576 print HTML "\n\n"; 577 521 578 while (<TEXT>) { 522 579 print HTML "<p> ", $_; 523 580 524 581 } 525 print HTML "\n</body></html> ]\n";582 print HTML "\n</body></html>\n"; 526 583 527 584 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 585 print STDERR "/;-Dg I am getting out of any_to_html!\n"; 528 586 return 1; 529 587 } … … 535 593 ($input_filename, $output_filestem) = @_; 536 594 595 #open(TEMP, ">temp.txt"); 537 596 open(IN, "<$input_filename"); 597 binmode(IN); 538 598 open(OUT, ">$output_filestem.text"); 539 599 540 600 my ($line); 601 my $dgcount = 0; 541 602 while (<IN>) { 542 603 $line = $_; 543 604 544 605 # delete anything that isn't a printable character 606 #print TEMP $line; 545 607 $line =~ s/[^\040-\176]+/\n/sg; 546 608 547 609 # delete any string less than 10 characters long 548 $line =~ s/^ [^\n]{0,9}$/\n/mg;549 while ($line =~ /^ [^\n]{1,9}$/m) {550 $line =~ s/^ [^\n]{0,9}$/\n/mg;610 $line =~ s/^.{0,9}$/\n/mg; 611 while ($line =~ /^.{1,9}$/m) { 612 $line =~ s/^.{0,9}$/\n/mg; 551 613 $line =~ s/\n+/\n/sg; 552 614 }
Note: See TracChangeset for help on using the changeset viewer.