Changeset 2241
- Timestamp:
- 2001-04-01T21:19:25+12:00 (23 years ago)
- Location:
- trunk/gsdl
- Files:
- 3 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/bin/script/gsConvert.pl
r2117 r2241 91 91 # Deduce filenames 92 92 my ($tailname,$dirname,$suffix) 93 = File::Basename::fileparse($input_filename, '\..+');94 my $output_filestem = &util::filename_cat($dirname, "$tailname");93 = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 94 my $output_filestem = &util::filename_cat($dirname, "$tailname"); 95 95 96 96 if ($input_type eq "") 97 97 { 98 $input_type = substr($suffix,1,length($suffix)-1);98 $input_type = lc (substr($suffix,1,length($suffix)-1)); 99 99 } 100 100 … … 138 138 139 139 140 # Document-type conversion fu cntions140 # Document-type conversion functions 141 141 # 142 142 # The following functions attempt to convert documents from their … … 219 219 # Convert to text 220 220 if (!$output_type || ($output_type =~ /text/i)) { 221 $success = any_to_text($input_filename, $output_filestem);221 $success = &any_to_text($input_filename, $output_filestem); 222 222 if ($success) { 223 223 return "text"; … … 332 332 my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", 333 333 $ENV{'GSDLOS'}, "wvWare"); 334 $wvWare .= ".exe" if ($ENV{'GSDLOS'} =~ /^windows$/i); 335 return 0 unless (-e "$wvWare"); 334 335 # don't include path on windows (to avoid having to play about 336 # with quoting when GSDLHOME might contain spaces) but assume 337 # that the PATH is set up correctly 338 $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i); 336 339 337 340 my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "packages", 338 341 "wv", "wvHtml.xml"); 339 342 343 my $cmd = ""; 344 if ($timeout) {$cmd = "ulimit -t $timeout;";} 345 $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\""; 346 $cmd .= " \"$input_filename\" > \"$output_filestem.html\""; 347 348 # redirecting STDERR is a bad idea on windows 95/98 349 $cmd .= " 2> \"$output_filestem.err\"" 350 if $ENV{'GSDLOS'} !~ /^windows$/i; 351 352 # execute the command 353 if (system($cmd)!=0) 354 { 355 print STDERR "Error executing wv converter: $!. 
Continuing...\n"; 356 } 357 358 # Was the conversion successful? 359 360 if (-e "$output_filestem.html") { 361 open(TMP, "$output_filestem.html"); 362 $line = <TMP>; 363 close(TMP); 364 if ($line && $line =~ /DOCTYPE HTML/) { 365 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 366 return 1; 367 } else { 368 # An error of some sort occurred 369 &util::rm("$output_filestem.html"); 370 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 371 } 372 } 373 374 return 0; 375 } 376 377 378 # Attempt to convert an RTF document to html with rtftohtml 379 # 380 # rtf2html isn't distributed with Greenstone because it is not 381 # distributed under the GPL. If you know of a better solution, 382 # please let me know. 383 384 sub rtf_to_html { 385 my ($input_filename, $output_filestem) = @_; 386 387 # we'll give up already if using Windows 388 return 0 if $ENV{'GSDLOS'} =~ /^windows$/i; 389 390 # formulate the command 391 my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", 392 "rtf2html", "rtf2html", "rtf2html"); 393 $r_cmd = "rtf2html" unless (-e "$r_cmd"); 394 return 0 unless (-e "$r_cmd"); 340 395 $cmd = ""; 341 396 if ($timeout) {$cmd = "ulimit -t $timeout;";} 342 $cmd .= "$ wvWare --charset utf-8 --config $wv_conf";397 $cmd .= "$r_cmd"; 343 398 $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\""; 344 399 … … 346 401 if (system($cmd)!=0) 347 402 { 348 print STDERR "Error executing wvconverter: $!. Continuing...\n";403 print STDERR "Error executing rtf converter: $!. Continuing...\n"; 349 404 } 350 405 … … 363 418 } 364 419 } 365 366 420 return 0; 367 421 } 368 422 369 423 370 # Attempt to convert an RTF document to html with rtftohtml 371 # 372 # rtf2html isn't distributed with Greenstone because it is not 373 # distributed under teh GPL. If you know of a better solution, 374 # please let me know. 
375 376 sub rtf_to_html { 377 ($input_filename, $output_filestem) = @_; 378 379 # formulate the command 380 my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", 381 "rtf2html", "rtf2html", "rtf2html"); 382 $r_cmd = "rtf2html" unless (-e "$r_cmd"); 383 return 0 unless (-e "$r_cmd"); 424 # Convert a pdf file to html with the pdftohtml command 425 426 sub pdf_to_html { 427 ($dirname, $input_filename, $output_filestem) = @_; 428 384 429 $cmd = ""; 385 430 if ($timeout) {$cmd = "ulimit -t $timeout;";} 386 $cmd .= "$r_cmd"; 387 $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\""; 388 389 # execute the command 390 if (system($cmd)!=0) 391 { 392 print STDERR "Error executing rtf converter: $!. Continuing...\n"; 393 } 394 395 # Was the conversion successful? 396 if (-e "$output_filestem.html") { 397 open(TMP, "$output_filestem.html"); 398 $line = <TMP>; 399 close(TMP); 400 if ($line && $line =~ /DOCTYPE HTML/) { 401 &util::rm("$output_filestem.err"); 402 return 1; 403 } else { 404 # An error of some sort occurred 405 &util::rm("$output_filestem.html"); 406 &util::rm("$output_filestem.err"); 407 } 408 } 409 return 0; 410 } 411 412 413 # Convert a pdf file to html with the pdftohtml command 414 415 sub pdf_to_html { 416 ($dirname, $input_filename, $output_filestem) = @_; 417 418 $cmd = ""; 419 if ($timeout) {$cmd = "ulimit -t $timeout;";} 420 $cmd .= "pdftohtml.pl -F "; 431 $cmd .= "perl -S pdftohtml.pl -F "; 421 432 $cmd .= " \"$input_filename\" \"$output_filestem\""; 422 433 $!=0; 434 423 435 if (system($cmd)!=0) 424 436 { … … 491 503 492 504 sub ps_to_text { 493 ($input_filename, $output_filestem) = @_; 494 495 my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save "; 496 $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\""; 497 $cmd .= " 2> $output_filestem.err"; 498 $!=0; 499 my $retcode=system($cmd); 500 $retcode = $? >> 8; # see man perlfunc - system for this... 
501 # if system returns -1 | 127 (couldn't start program), look at $! for message 502 my $error=""; 503 if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}} 504 elsif (! -e "$output_filestem.text") { 505 $error="did not create output file.\n"; 506 } 507 else 508 { # make sure the interpreter didn't get an error. It is technically 509 # possible for the actual text to start with this, but.... 510 open PSOUT, "$output_filestem.text"; 511 if (<PSOUT> =~ /^Error: (.*)/) { 512 $error="interpreter error - \"$1\""; 513 } 514 close PSOUT; 515 } 505 my ($input_filename, $output_filestem) = @_; 506 507 my $error = ""; 508 509 # if we're on windows we'll fall straight through without attempting 510 # to use gs 511 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 512 $error = "Windows does not support gs"; 513 514 } else { 515 my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save "; 516 $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\""; 517 $cmd .= " 2> $output_filestem.err"; 518 $!=0; 519 my $retcode=system($cmd); 520 $retcode = $? >> 8; # see man perlfunc - system for this... 521 # if system returns -1 | 127 (couldn't start program), look at $! for message 522 523 if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}} 524 elsif (! -e "$output_filestem.text") { 525 $error="did not create output file.\n"; 526 } 527 else 528 { # make sure the interpreter didn't get an error. It is technically 529 # possible for the actual text to start with this, but.... 
530 open PSOUT, "$output_filestem.text"; 531 if (<PSOUT> =~ /^Error: (.*)/) { 532 $error="interpreter error - \"$1\""; 533 } 534 close PSOUT; 535 } 536 } 537 516 538 if ($error ne "") 517 539 { … … 611 633 open(HTML, ">$output_filestem.html"); 612 634 613 print HTML '<html><head> 614 <META HTTP-EQUIV="Content-Type" CONTENT="text/html"> 615 <META NAME="GENERATOR" CONTENT="Greenstone any_to_html"> 616 </head><body>'; 617 print HTML "\n\n"; 635 print HTML "<html><head>\n"; 636 print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n"; 637 print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n"; 638 print HTML "</head><body>\n\n"; 618 639 619 640 while (<TEXT>) { 620 641 print HTML "<p> ", $_; 621 622 642 } 623 643 print HTML "\n</body></html>\n"; 644 645 close HTML; 646 close TEXT; 624 647 625 648 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); … … 661 684 } 662 685 } 686 687 close OUT; 688 close IN; 689 663 690 return 1; 664 691 } -
trunk/gsdl/bin/script/pdftohtml.pl
r2118 r2241 177 177 print STDERR "pdftohtml.pl: $input_filename appears to have no "; 178 178 print STDERR "textual data. Aborting.\n"; 179 print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n";179 # print STDERR "num: $unenc_stream_objects and $non_text_objects from $num_objects\n"; 180 180 exit(1); 181 181 } 182 182 183 183 # formulate the command 184 my $pdftohtml = &util::filename_cat($ENV{'GSDLHOME'}, "bin", 185 $ENV{'GSDLOS'}, "pdftohtml.bin"); 186 return 0 unless (-e "$pdftohtml"); 187 188 $cmd = ""; 184 my $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml.bin"); 185 186 # don't include path on windows (to avoid having to play about 187 # with quoting when GSDLHOME might contain spaces) but assume 188 # that the PATH is set up correctly - note also that on windows 189 # we use pdftohtml.exe not pdftohtml.bin 190 $cmd = "pdftohtml" if ($ENV{'GSDLOS'} !~ /^windows$/) { 191 189 192 if ($timeout) {$cmd = "ulimit -t $timeout;";} 190 $cmd .= "$pdftohtml -noframes"; 191 $cmd .= " \"$input_filename\" \"$output_filestem.html\""; 192 $cmd .= " >\"$output_filestem.out\" 2>\"$output_filestem.err\""; 193 193 $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\""; 194 $cmd .= " > \"$output_filestem.out\""; 195 196 # attempting to redirect STDERR on windows 95/98 is a bad idea 197 $cmd .= " 2> \"$output_filestem.err\"" 198 if $ENV{'GSDLOS'} !~ /^windows$/i; 199 194 200 if (system($cmd)>0) { 195 201 print STDERR "Error executing $cmd: $!\n"; … … 201 207 # Need to convert images from PPM format to PNG format 202 208 my @images; 209 203 210 204 211 open (IMAGES, "images.log"); … … 212 219 my $cmd = ""; 213 220 if ($ENV{'GSDLOS'} =~ /^windows/i) { 214 $cmd = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows", "pnmtopng.exe"); 215 $cmd .= " $image"; 221 $cmd = "pnmtopng $image"; 216 222 if (system($cmd)!=0) { 217 223 print STDERR "Error executing $cmd\n"; -
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r2086 r2241 94 94 if ($class eq "ConvertToPlug") {$class = shift (@_);} 95 95 my $self; 96 # parsargv::parse might modify the list, so we do this by creating a copy97 # of the argument list.96 # parsargv::parse might modify the list, so we do this by creating a copy 97 # of the argument list. 98 98 my @arglist = @_; 99 my ($plugin_name,$generate_format, $kea_arg) = $class->parse_args(\@_); 99 my ($plugin_name, $generate_format, $kea_arg) = $class->parse_args(\@_); 100 101 if ($class eq "PDFPlug" && $generate_format eq "text" && 102 $ENV{'GSDLOS'} =~ /^windows$/i) { 103 print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 104 $generate_format = "html"; 105 } 100 106 101 107 if ($generate_format eq "text") … … 135 141 sub tmp_area_convert_file { 136 142 my $self = shift (@_); 137 my ($output_ext, $input_filename, $textref) = @_;143 my ($output_ext, $input_filename, $textref) = @_; 138 144 139 145 my $convert_to = $self->{'convert_to'}; … … 142 148 my $colname = &util::use_collection(); 143 149 my $tmp_dirname 144 = &util::filename_cat($ENV{'GSDLHOME'}, "collect",$colname,"tmp");150 = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $colname, "tmp"); 145 151 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname); 146 152 147 153 # derive tmp filename from input filename 148 my ($tailname, $dirname,$suffix)149 = File::Basename::fileparse($input_filename,'\.[^\.]+$');154 my ($tailname, $dirname, $suffix) 155 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 150 156 151 157 # Remove any white space from filename -- no risk of name collision, and … … 153 159 $tailname =~ s/\s+//g; 154 160 155 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");156 157 &util::soft_link($input_filename, $tmp_filename);161 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 162 163 &util::soft_link($input_filename, $tmp_filename); 158 164 159 165 my $verbosity = $self->{'verbosity'}; 160 if 
($verbosity>0) 161 { 166 if ($verbosity > 0) { 162 167 print STDERR "Converting $tailname$suffix to $convert_to format\n"; 163 168 } … … 166 171 # making sure the converter gives us the appropriate output type 167 172 my $output_type = lc($convert_to); 168 my $cmd = " gsConvert.pl -verbose $verbosity -output $output_type \"$tmp_filename\"";173 my $cmd = "perl -S gsConvert.pl -verbose $verbosity -output $output_type \"$tmp_filename\""; 169 174 $output_type = `$cmd`; 170 171 # Check STDERR here172 175 173 176 chomp $output_type; … … 175 178 print STDERR "Could not convert $tailname$suffix to $convert_to format\n"; 176 179 return ""; 177 ### exit 1;178 180 } 179 181 … … 184 186 $self->{'convert_to_ext'} = $output_type; 185 187 my $output_filename = $tmp_filename; 188 186 189 $output_filename =~ s/$suffix$/.$output_type/; 187 190 … … 197 200 my $colname = &util::use_collection(); 198 201 my $tmp_dirname 199 = &util::filename_cat($ENV{'GSDLHOME'}, "collect",$colname,"tmp");202 = &util::filename_cat($ENV{'GSDLHOME'}, "collect", $colname, "tmp"); 200 203 &util::rm_r($tmp_dirname); 201 204 &util::mk_dir($tmp_dirname); … … 229 232 230 233 my $output_ext = $self->{'convert_to_ext'}; 231 my $conv_filename = $self->tmp_area_convert_file($output_ext,$filename); 234 my $conv_filename = $self->tmp_area_convert_file($output_ext, $filename); 235 232 236 if ("$conv_filename" eq "") {return 0;} # allows continue on errors 233 237 if (! 
-e "$conv_filename") {return 0;} # allows continue on errors 234 238 $self->{'conv_filename'} = $conv_filename; 235 239 236 # Do encoding stuff240 # Do encoding stuff 237 241 my ($language, $encoding); 238 242 if ($self->{'input_encoding'} eq "auto") { … … 253 257 } 254 258 255 BasPlug::read_file($self,$conv_filename, $encoding, \$text);259 &BasPlug::read_file($self, $conv_filename, $encoding, \$text); 256 260 if (!length ($text)) { 257 261 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; … … 301 305 { 302 306 303 $ret_val = TEXTPlug::process($self,$textref,$pluginfo,304 $tmp_dirname,$tmp_tailname,305 $metadata,$doc_obj);307 $ret_val = &TEXTPlug::process($self, $textref, $pluginfo, 308 $tmp_dirname, $tmp_tailname, 309 $metadata, $doc_obj); 306 310 } 307 311 else 308 312 { 309 $ret_val = HTMLPlug::process($self,$textref,$pluginfo,310 $tmp_dirname,$tmp_tailname,311 $metadata,$doc_obj);313 $ret_val = &HTMLPlug::process($self, $textref, $pluginfo, 314 $tmp_dirname, $tmp_tailname, 315 $metadata, $doc_obj); 312 316 } 313 317 314 318 # associate original file with doc object 315 319 my $cursection = $doc_obj->get_top_section(); 316 my $filename = &util::filename_cat($base_dir, $file);320 my $filename = &util::filename_cat($base_dir, $file); 317 321 $doc_obj->associate_file($filename, "doc.$doc_ext", undef, $cursection); 318 322
Note:
See TracChangeset
for help on using the changeset viewer.