Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32221

Last change on this file since 32221 was 32221, checked in by ak19, 6 years ago
Cosmetic changes: some cleanup.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 41.7 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
[22429]	51	use strict;
	52
[1445]	53	use parsargv;
	54	use util;
[27509]	55	use FileUtils;
[1445]	56	use Cwd;
	57
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; filled in by parsargv::parse() inside main().
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
[3350]	69
# Print the command-line synopsis on STDERR and terminate the script
# with exit status 1. Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
	91
# Shared settings read by the conversion routines below:
#   $faillogfile - file that converter error output is appended to ("" = none)
#   $timeout     - value handed to "ulimit -t" when non-zero
#   $verbosity   - copied from -verbose in main(), passed on to wvware.pl
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
[1445]	95
	96	sub main
	97	{
	98	my (@ARGV) = @_;
[3538]	99	my ($input_type,$output_type,$verbose);
[1960]	100
[23473]	101	# Dynamically figure out what the --type option can support, based on whether -windows_scripting
	102	# is in use or not
	103	my $default_type_re = "(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)";
	104	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	105	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	106	# Currently only have VBA for Word and PPT(but no XLS)
	107	my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xls)";
	108
	109	my $type_re = $default_type_re;
	110
	111	foreach my $a (@ARGV) {
	112	if ($a =~ m/^windows_scripting$/i) {
	113	$type_re = $enhanced_type_re;
	114	}
	115	}
	116
[1445]	117	# read command-line arguments
	118	if (!parsargv::parse(\@ARGV,
[23473]	119	"type/$type_re/", \$input_type,
[2755]	120	'/errlog/.*/', \$faillogfile,
[22596]	121	'output/(auto\|html\|text\|pagedimg).*/', \$output_type,
[1692]	122	'timeout/\d+/0',\$timeout,
[10282]	123	'verbose/\d+/0', \$verbose,
[22429]	124	'windows_scripting',\$windows_scripting,
[3720]	125	'use_strings', \$use_strings,
	126	'pdf_complex', \$pdf_complex,
[9482]	127	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	128	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	129	'pdf_nohidden', \$pdf_nohidden,
[3720]	130	'pdf_zoom/\d+/2', \$pdf_zoom
	131	))
[1445]	132	{
	133	print_usage();
	134	}
[24375]	135
	136	$verbosity=$verbose if defined $verbose;
	137
[1445]	138	# Make sure the input file exists and can be opened for reading
	139	if (scalar(@ARGV!=1)) {
	140	print_usage();
	141	}
[1928]	142
[1445]	143	my $input_filename = $ARGV[0];
	144	if (!-r $input_filename) {
	145	print STDERR "Error: unable to open $input_filename for reading\n";
	146	exit(1);
	147	}
	148
	149	# Deduce filenames
	150	my ($tailname,$dirname,$suffix)
[2241]	151	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
[27509]	152	my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");
[1445]	153
	154	if ($input_type eq "")
	155	{
[2241]	156	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	157	}
	158
	159	# Change to temporary working directory
	160	my $stored_dir = cwd();
	161	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	162
[1445]	163	# Select convert utility
	164	if (!defined $input_type) {
	165	print STDERR "Error: No filename extension or input type defined\n";
	166	exit(1);
	167	}
[23473]	168	elsif ($input_type =~ m/^docx?$/ \|\| $input_type eq "dot") {
[1445]	169	print &convertDOC($input_filename, $output_filestem, $output_type);
	170	print "\n";
	171	}
[1684]	172	elsif ($input_type eq "rtf") {
	173	print &convertRTF($input_filename, $output_filestem, $output_type);
	174	print "\n";
	175	}
[1445]	176	elsif ($input_type eq "pdf") {
	177	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	178	print "\n";
	179	}
	180	elsif ($input_type eq "ps") {
[22429]	181	print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
[1445]	182	print "\n";
	183	}
[23473]	184	elsif ($input_type =~ m/pptx?$/) {
[2977]	185	print &convertPPT($input_filename, $output_filestem, $output_type);
	186	print "\n";
	187	}
[23473]	188	elsif ($input_type =~ m/xlsx?$/) {
[2991]	189	print &convertXLS($input_filename, $output_filestem, $output_type);
	190	print "\n";
	191	}
[1445]	192	else {
	193	print STDERR "Error: Unable to convert type '$input_type'\n";
	194	exit(1);
	195	}
	196
	197	# restore to original working directory
	198	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	199
	200	}
	201
	202	&main(@ARGV);
	203
	204
	205
[2241]	206	# Document-type conversion functions
[1445]	207	#
	208	# The following functions attempt to convert documents from their
	209	# input type to the specified output type. If no output type was
	210	# given, then they first attempt HTML, and then TEXT.
	211	#
	212	# Each returns the output type ("html" or "text") or "fail" if no
	213	# conversion is possible.
	214
	215	# Convert a Microsoft word document
	216
	217	sub convertDOC {
[22429]	218	my ($input_filename, $output_filestem, $output_type) = @_;
[1445]	219
[1654]	220	# Many .doc files are not in fact word documents!
	221	my $realtype = &find_docfile_type($input_filename);
	222
[23473]	223	if ($realtype eq "word6" \|\| $realtype eq "word7"
	224	\|\| $realtype eq "word8" \|\| $realtype eq "docx") {
[1654]	225	return &convertWord678($input_filename, $output_filestem, $output_type);
	226	} elsif ($realtype eq "rtf") {
	227	return &convertRTF($input_filename, $output_filestem, $output_type);
	228	} else {
	229	return &convertAnything($input_filename, $output_filestem, $output_type);
	230	}
	231	}
	232
	233	# Convert a Microsoft word 6/7/8 document
	234
	235	sub convertWord678 {
[22429]	236	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	237
[1445]	238	my $success = 0;
[16435]	239	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	240	if ($windows_scripting) {
	241	$success = &native_doc_to_html($input_filename, $output_filestem);
	242	}
	243	else {
	244	$success = &doc_to_html($input_filename, $output_filestem);
	245	}
[1445]	246	if ($success) {
[10282]	247	return "html";
[1445]	248	}
	249	}
[1654]	250	return &convertAnything($input_filename, $output_filestem, $output_type);
	251	}
	252
	253
	254	# Convert a Rich Text Format (RTF) file
	255
	256	sub convertRTF {
[22429]	257	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	258
	259	my $success = 0;
	260
	261	# Attempt specialised conversion to HTML
[16435]	262	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	263
	264	if ($windows_scripting) {
	265	$success = &native_doc_to_html($input_filename, $output_filestem);
	266	}
	267	else {
	268	$success = &rtf_to_html($input_filename, $output_filestem);
	269	}
[1654]	270	if ($success) {
	271	return "html";
	272	}
	273	}
	274
[2755]	275	# rtf is so ugly that's it's not worth running strings over.
	276	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	277	# return &convertAnything($input_filename, $output_filestem, $output_type);
	278	return "fail";
[1654]	279	}
	280
	281
	282	# Convert an unidentified file
	283
	284	sub convertAnything {
[22429]	285	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	286
	287	my $success = 0;
[10464]	288
[1445]	289	# Attempt simple conversion to HTML
[16435]	290	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	291	$success = &any_to_html($input_filename, $output_filestem);
	292	if ($success) {
	293	return "html";
	294	}
	295	}
	296
	297	# Convert to text
[16435]	298	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	299	$success = &any_to_text($input_filename, $output_filestem);
[1445]	300	if ($success) {
	301	return "text";
	302	}
	303	}
	304	return "fail";
	305	}
	306
	307
[1654]	308
# Convert an Adobe PDF document
#
# Depending on $output_type the attempts are, in order: page images
# (pdfps_to_img), HTML via the old pdftohtml (pdf_to_html), paged HTML via
# Xpdftools (xpdf_to_html) and plain text (pdf_to_text). The first attempt
# that succeeds decides the return value: "item", "html", "paged_html" or
# "text"; "fail" if none did.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drops the first "<any char>-" pair found in the output type, keeping
    # the character that follows the hyphen.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if (!$output_type || ($output_type =~ m/^html/i)) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools. This will be the new default for PDFs when output_type for
    # PDF docs is not specified (once our use of xpdftools' pdftohtml has
    # been implemented on win and mac).
    if ($output_type =~ m/paged_html/i) {
        return "paged_html"
            if xpdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
	356
	357
	358	# Convert an Adobe PostScript document
	359
	360	sub convertPS {
[22429]	361	my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
[1445]	362
	363	my $success = 0;
[10534]	364	$output_type =~ s/.\-(.)/$1/i;
	365	# Attempt coversion to Image
[16435]	366	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	367	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10534]	368	if ($success){
	369	return "item";
	370	}
	371	}
[1445]	372
	373	# Attempt conversion to TEXT
[16435]	374	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	375	$success = &ps_to_text($input_filename, $output_filestem);
	376	if ($success) {
	377	return "text";
	378	}
	379	}
	380	return "fail";
	381	}
	382
	383
# Convert a PowerPoint document
#
# With -windows_scripting and an output type that mentions neither "html"
# nor "text", the pptextract script is run to produce images ("-g" gif,
# "-j" jpg, "-p" png, or no flag) into a directory named $output_filestem;
# "item" is returned if that works or if the directory already exists.
# Otherwise, when HTML is acceptable, ppttohtml.pl is run and "html" is
# returned on success. If nothing has succeeded by then, any_to_text() is
# the last resort: "text" on success, else "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Pick the image-format flag for pptextract
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        }
        elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        }
        elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                      $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $vbScript = "CScript //Nologo \"" . $vbScript . ".vbs\"";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	449
	450
# Convert an Excel spreadsheet
#
# When HTML is acceptable, xlstohtml.pl is run and "html" is returned if
# it exits with 0. Otherwise (or when that fails) any_to_text() is tried:
# "text" on success, else "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	484
	485
	486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: path of the file to inspect.
# Returns:
#   "docx"    - -windows_scripting is on and the name ends in ".docx"
#               (decided from the name alone; the file is not opened)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7" or "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open with a lexical handle: the name is taken
    # literally, and an unreadable file simply counts as "unknown".
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # Single exit point, so the handle is closed on every path
    close($chk);
    return $realtype;
}
	527
	528
[1734]	529	# Specific type-to-type conversions
[1445]	530	#
	531	# Each of the following functions attempts to convert a document from
[2755]	532	# a specific format to another. If they succeed they return 1 and leave
[1445]	533	# the output document(s) in the appropriate place; if they fail they
	534	# return 0 and delete any working files.
	535
	536
	537	# Attempt to convert a word document to html with the wv program
	538	sub doc_to_html {
[22429]	539	my ($input_filename, $output_filestem) = @_;
[1445]	540
[24371]	541	my $wvware_status = 0;
[24375]	542
[24371]	543	# need to ensure that the path to perl is quoted (in case there's spaces in it)
[24513]	544	my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";
[15120]	545
[30683]	546	# print STDERR "***** wvware launch cmd = $launch_cmd\n";
[15120]	547
[24371]	548	$wvware_status = system($launch_cmd)/256;
	549	return $wvware_status;
[1445]	550	}
	551
# Attempt to convert a word document to html with the word2html scripting program
#
# Tool selection:
#   - GSDLOS is windows and the input name ends in "docx": docx2html.vbs
#     from $GSDLHOME/bin/$GSDLOS, launched through CScript
#   - GSDLOS is windows otherwise: "word2html" (found via PATH)
#   - other platforms: the quoted full path to word2html in
#     $GSDLHOME/bin/$GSDLOS
#
# Returns 1 when "$output_filestem.html" already exists or was produced and
# its first line mentions "html"; returns 0 otherwise. On failure the
# converter's stderr output is copied to $faillogfile when one is set.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . FileUtils::filenameConcatenate($vbScript, "word2html") . "\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";

        if (-s "$output_filestem.err") {
            # Optional fail log; a log that cannot be opened is skipped
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # Echo the converter's error output. A log that cannot be
            # opened is treated as empty.
            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful? The first line must mention "html".
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
	652
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml, writing "$output_filestem.html". The conversion counts as
# successful when that file has some word characters after its <body> line
# once tags are stripped. If rtftohtml also produced
# "${output_filestem}_ToC.html", that table of contents is spliced into the
# HTML after its header lines, with its links made relative.
#
# Returns 1 on success. On failure returns 0, appends the converter's
# stderr output to $faillogfile (when set) and removes the .err and .html
# files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; an unreadable
        # file counts as having none
        if (open(my $html_in, '<', "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            FileUtils::moveFiles("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own result. (An "||" written after
            # the filename would bind to the filename string, not to open,
            # and a failed open would go unnoticed.)
            my $src_ok = open(my $htmlsrc, '<', "$output_filestem.src");
            my $toc_ok = open(my $toc, '<', "${output_filestem}_ToC.html");
            my $out_ok = open(my $html_out, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # Give up on the table of contents and put the converter's
                # own HTML back in place
                close($htmlsrc) if $src_ok;
                close($toc) if $toc_ok;
                close($html_out) if $out_ok;
                FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to the first
            # line without a word character (that line itself is dropped)
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ m/\w/);
                print {$html_out} "$header_line";
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $list_open = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_open if defined $list_open;
            while (my $toc_line = <$toc>) {
                $toc_line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $toc_line;
            }
            close($toc);

            # rest of html src
            while (my $body_line = <$htmlsrc>) {
                print {$html_out} $body_line;
            }
            close($htmlsrc);
            close($html_out);

            FileUtils::removeFiles("${output_filestem}_ToC.html");
            FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles("$output_filestem.err");
    }

    FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
	769
	770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with -zoom $pdf_zoom plus -c / -i / -a / -hidden as
# selected by the -pdf_* options, writing to $output_filestem. stdout goes
# to "$output_filestem.out" and stderr to "$output_filestem.err" (on
# platforms where the redirect is not used, stdout goes to the .err file).
#
# Returns 1 if the command exited with 0 and "$output_filestem.html" is
# non-empty; the .err and .out files are then removed. Otherwise returns 0
# after echoing the error log to STDERR, appending it to $faillogfile (when
# set) and removing the .out, .html and .err files.
# $dirname is accepted but not used by this routine.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    my $full_perl_path = util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) { print STDERR ": $!"; }
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || !-s "$output_filestem.html") {
        FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # Read the converter's std err (if any) once; it is echoed here and
        # reused for the fail log below.
        my @err_lines;
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            @err_lines = <$errlog>;
            close($errlog);

            print STDERR "pdftohtml error log:\n";
            print STDERR "$_" foreach (@err_lines);
        }

        #print STDERR "***********output filestem $output_filestem.html\n";
        FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} $_ foreach (@err_lines);
                close($faillog);
            }
            FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
	834
[32205]	835
	836	# Convert a pdf file to html with the newer Xpdftools' pdftohtml
	837	# This generates "paged HTML" where extracted, selectable text is positioned
	838	# over screenshots of each page.
	839	# Since xpdf's pdftohtml fails if the output dir already exists and for easier
	840	# naming, the output files are created in a "pages" subdirectory of the tmp
	841	# location parent of $output_filestem instead
	842	sub xpdf_to_html {
	843	my ($dirname, $input_filename, $output_filestem) = @_;
	844
	845	my $cmd = "";
	846
	847	# build up the path to the doc-to-html conversion tool we're going to use
	848	my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");
[32209]	849
[32221]	850	if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit
	851	$xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
[32220]	852	} else { # unix (linux\|darwin), use the bin32/bin64 folder depending on the BITNESS env var
[32207]	853
[32221]	854	# Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
	855	# $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
	856	# specific subdirectories exist in a greenstone installation.
	857	# None of those locations need exist when xpdf-tools is installed with GS.
	858	# So don't depend on GSDLARCH as forcing that to be exported has side-effects
	859	if($ENV{'BITNESS'}) {
	860	$xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
	861	} else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
	862	$xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
	863	}
[32205]	864	}
[32207]	865
[32205]	866	# We'll create the file by name $output_filestem during post-conversion processing.
	867	# Note that Xpdf tools will only create its conversion products in a dir that does
	868	# not yet exist. So we'll create this location as a subdir of the output_filestem's
	869	# parent directory. The parent dir is the already generated tmp area for conversion. So:
	870	# - tmpdir gs2build/tmp/<random-num> already exists at this stage
	871	# - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
	872	# - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
	873	my ($tailname, $tmp_dirname, $suffix)
	874	= &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
	875	$tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");
	876
	877	$xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");
	878	# xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
	879	$cmd .= "\"$xpdf_pdftohtml\"";
	880	$cmd .= " -z $pdf_zoom" if ($pdf_zoom);
	881	# $cmd .= " -c" if ($pdf_complex);
	882	# $cmd .= " -i" if ($pdf_ignore_images);
	883	# $cmd .= " -a" if ($pdf_allow_images_only);
	884	# $cmd .= " -hidden" unless ($pdf_nohidden);
	885	$cmd .= " \"$input_filename\" \"$tmp_dirname\"";
	886	#$cmd .= " \"$input_filename\" \"$output_filestem\"";
	887
	888	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
	889	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	890	} else {
	891	$cmd .= " > \"$output_filestem.err\"";
	892	}
	893
	894	#print STDERR "@@@@ Running command: $cmd\n";
	895
	896	$!=0;
	897	my $retval=system($cmd);
	898	if ($retval!=0)
	899	{
	900	print STDERR "Error executing xpdf's pdftohtml tool";
	901	if ($!) {print STDERR ": $!";}
	902	print STDERR "\n";
	903	}
	904
	905	# make sure the converter made something
	906	if ($retval!=0 \|\| ! -s &FileUtils::filenameConcatenate($tmp_dirname,"index.html"))
	907	{
	908	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	909	# print out the converter's std err, if any
	910	if (-s "$output_filestem.err") {
	911	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
	912	print STDERR "pdftohtml error log:\n";
	913	while (<ERRLOG>) {
	914	print STDERR "$_";
	915	}
	916	close ERRLOG;
	917	}
	918	#print STDERR "***********output filestem $output_filestem.html\n";
	919	&FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");
	920	if (-e "$output_filestem.err") {
	921	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	922	{
	923	open (ERRLOG, "$output_filestem.err");
	924	while (<ERRLOG>) {print FAILLOG $_;}
	925	close ERRLOG;
	926	close FAILLOG;
	927	}
	928	&FileUtils::removeFiles("$output_filestem.err");
	929	}
	930	return 0;
	931	}
	932
	933	&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
	934	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	935	return 1;
	936	}
	937
	938
	939
[10357]	940	# Convert a pdf file to various types of image with the convert command
	941
sub pdfps_to_img {
    # Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
    #
    # Parameters:
    #   $dirname         - not used by this routine (kept for a uniform signature)
    #   $input_filename  - path of the document to convert
    #   $output_filestem - output path without extension; "<stem>.out" and
    #                      "<stem>.err" capture the script's stdout/stderr
    #   $output_type     - requested output type; only the part after the last
    #                      underscore is handed to pdfpstoimg.pl
    #
    # Returns 1 when pdfpstoimg.pl exited with status 0, 0 otherwise (also
    # when ImageMagick is found to be missing).
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path
    # (except for Windows 95/98).
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found":
        # -1 on Linux and 256 on Windows, which become -1 and 1 once shifted
        # by 8 and converted from unsigned to signed. gs-magick.pl already
        # does this conversion before exiting, but it has to be repeated on
        # the $? seen here.
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore of the requested type
    # (presumably values such as "pagedimg_jpg" -> "jpg"; confirm against the
    # callers). The quantifiers matter: without them only the characters
    # directly around the underscore would be removed.
    $output_type =~ s/.*_(.*)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    # pdfpstoimg.pl is located through perl's -S switch, i.e. it is assumed
    # that the PATH is set up correctly.
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # Make sure the converter succeeded.
    if ($retval != 0) {
        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # Print out the converter's std err, if any.
        if (-s $err_file) {
            open(my $err_fh, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$err_fh>) {
                print STDERR $err_line;
            }
            close $err_fh;
        }

        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                # Best-effort copy of the error log into the fail log.
                if (open(my $err_fh, '<', $err_file)) {
                    while (my $err_line = <$err_fh>) {
                        print {$fail_fh} $err_line;
                    }
                    close $err_fh;
                }
                close $fail_fh;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
	1023
	1024	# Convert a PDF file to text with the pdftotext command
	1025
	1026	sub pdf_to_text {
[2755]	1027	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1028
[2248]	1029	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1030
[16435]	1031	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1032	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1033	} else {
	1034	$cmd .= " > \"$output_filestem.err\"";
	1035	}
[1445]	1036
[2060]	1037	if (system($cmd)!=0)
[1445]	1038	{
	1039	print STDERR "Error executing $cmd: $!\n";
[27509]	1040	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[1445]	1041	}
	1042
[2755]	1043	# make sure there is some extracted text.
	1044	if (-e "$output_filestem.text") {
	1045	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1046	binmode(EXTR_TEXT); # just in case...
	1047	my $line="";
	1048	my $seen_text=0;
	1049	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1050	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1051	}
	1052	close EXTR_TEXT;
	1053	if ($seen_text==0) { # no text was extracted
	1054	print STDERR "Error: pdftotext found no text\n";
[27509]	1055	&FileUtils::removeFiles("$output_filestem.text");
[2755]	1056	}
	1057	}
	1058
[1692]	1059	# make sure the converter made something
[2656]	1060	if (! -s "$output_filestem.text")
[1692]	1061	{
	1062	# print out the converters std err, if any
[2656]	1063	if (-s "$output_filestem.err") {
[1692]	1064	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1065	print STDERR "pdftotext error log:\n";
[1692]	1066	while (<ERRLOG>) {
	1067	print STDERR "$_";
	1068	}
	1069	close ERRLOG;
	1070	}
[2656]	1071	# does this converter create a .out file?
[27509]	1072	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	1073	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1074	if (-e "$output_filestem.err") {
	1075	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1076	{
	1077	open (ERRLOG,"$output_filestem.err");
	1078	while (<ERRLOG>) {print FAILLOG $_;}
	1079	close ERRLOG;
	1080	close FAILLOG;
	1081	}
[27509]	1082	&FileUtils::removeFiles("$output_filestem.err");
[2755]	1083	}
[1692]	1084	return 0;
	1085	}
[27509]	1086	&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1087	return 1;
	1088	}
	1089
[2012]	1090	# Convert a PostScript document to text
	1091	# note - just using "ps2ascii" isn't good enough, as it
	1092	# returns 0 for a postscript interpreter error. ps2ascii is just
	1093	# a wrapper to "gs" anyway, so we use that cmd here.
[1445]	1094
	1095	sub ps_to_text {
[2241]	1096	my ($input_filename, $output_filestem) = @_;
[1445]	1097
[2241]	1098	my $error = "";
	1099
	1100	# if we're on windows we'll fall straight through without attempting
	1101	# to use gs
[16435]	1102	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
[2241]	1103	$error = "Windows does not support gs";
	1104
	1105	} else {
[3538]	1106	my $cmd = "";
	1107	if ($timeout) {$cmd = "ulimit -t $timeout; ";}
	1108	$cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
[2241]	1109	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
[10357]	1110	#$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
[2241]	1111	$cmd .= " 2> $output_filestem.err";
	1112	$!=0;
[10357]	1113
[2241]	1114	my $retcode=system($cmd);
	1115	$retcode = $? >> 8; # see man perlfunc - system for this...
	1116	# if system returns -1 \| 127 (couldn't start program), look at $! for message
	1117
	1118	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
	1119	elsif (! -e "$output_filestem.text") {
	1120	$error="did not create output file.\n";
[2012]	1121	}
[2241]	1122	else
	1123	{ # make sure the interpreter didn't get an error. It is technically
	1124	# possible for the actual text to start with this, but....
	1125	open PSOUT, "$output_filestem.text";
[16435]	1126	if (<PSOUT> =~ m/^Error: (.*)/) {
[2241]	1127	$error="interpreter error - \"$1\"";
	1128	}
	1129	close PSOUT;
	1130	}
[2012]	1131	}
[2241]	1132
[2012]	1133	if ($error ne "")
[1445]	1134	{
[2755]	1135	print STDERR "Warning: Error executing gs: $error\n";
[30724]	1136	print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
[27509]	1137	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1138
	1139	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
	1140	{
	1141	print FAILLOG "gs - $error\n";
	1142	if (-e "$output_filestem.err") {
	1143	open(ERRLOG, "$output_filestem.err");
	1144	while (<ERRLOG>) {print FAILLOG $_;}
	1145	close ERRLOG;
	1146	}
	1147	close FAILLOG;
	1148	}
[27509]	1149	&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
[2012]	1150
[2755]	1151
[2012]	1152	# Fine then. We'll just do a lousy job by ourselves...
[2031]	1153	# Based on 5-line regexp sed script found at:
[2012]	1154	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
	1155	#
[2755]	1156	print STDERR "Stripping text from postscript\n";
[2012]	1157	my $errorcode=0;
	1158	open (IN, "$input_filename")
	1159	\|\| ($errorcode=1, warn "Couldn't read file: $!");
	1160	open (OUT, ">$output_filestem.text")
	1161	\|\| ($errorcode=1, warn "Couldn't write file: $!");
	1162	if ($errorcode) {print STDERR "errors\n";return 0;}
	1163
[2031]	1164	my $text=""; # this is for whole .ps file...
[2755]	1165	$text = join('', <IN>); # see man perlport, under "System Resources"
[2031]	1166	close IN;
	1167
[2447]	1168	# Make sure this is a ps file...
[16435]	1169	if ($text !~ m/^%!/) {
[2755]	1170	print STDERR "Bad postscript header: not '%!'\n";
	1171	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
	1172	{
	1173	print FAILLOG "Bad postscript header: not '%!'\n";
	1174	close FAILLOG;
	1175	}
[2447]	1176	return 0;
	1177	}
	1178
[2031]	1179	# if ps has Page data, then use it to delete all stuff before it.
	1180	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
	1181
	1182	# remove all leading non-data stuff
	1183	$text =~ s/^.*?\(//s;
	1184
	1185	# remove all newline chars for easier processing
	1186	$text =~ s/\n//g;
	1187
	1188	# Big assumption here - assume that if any co-ordinates are
	1189	# given, then we are at the end of a sentence.
	1190	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
	1191
	1192	# special characters--
	1193	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
	1194
	1195	# ? ps text formatting (eg italics?) ?
	1196	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
	1197	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
	1198	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
	1199	# default - remove the rest
	1200	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
	1201
	1202	# attempt to add whitespace between words...
	1203	# this is based purely on observation, and may be completely wrong...
	1204	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
	1205	# eg I notice "b(" is sometimes NOT a space if preceded by a
	1206	# negative number.
	1207	$text =~ s/\)\d+ ?b\(/\) \( /g;
	1208
	1209	# change quoted braces to brackets
	1210	$text =~ s/([^\\])\\\(/$1\{/g;
	1211	$text =~ s/([^\\])\\\)/$1\}/g ;
	1212
	1213	# remove everything that is not between braces
	1214	$text =~ s/\)([^\(\)])+?\(//sg ;
	1215
	1216	# remove any Trailer eof stuff.
	1217	$text =~ s/\)[^\)]*$//sg;
	1218
	1219	### ligatures have special characters...
	1220	$text =~ s/\\013/ff/g;
	1221	$text =~ s/\\014/fi/g;
	1222	$text =~ s/\\015/fl/g;
	1223	$text =~ s/\\016/ffi/g;
	1224	$text =~ s/\\214/fi/g;
	1225	$text =~ s/\\215/fl/g;
	1226	$text =~ s/\\017/\n\* /g; # asterisk?
	1227	$text =~ s/\\023/\023/g; # e acute ('e)
	1228	$text =~ s/\\177/\252/g; # u"
	1229	# $text =~ s/ ?? /\344/g; # a"
	1230
	1231	print OUT "$text";
	1232	close OUT;
[1960]	1233	}
[2600]	1234	# wrap the text - use a minimum length. ie, first space after this length.
	1235	my $wrap_length=72;
[27509]	1236	&FileUtils::moveFiles("$output_filestem.text", "$output_filestem.text.tmp");
[2600]	1237	open INFILE, "$output_filestem.text.tmp" \|\|
	1238	die "Couldn't open file: $!";
	1239	open OUTFILE, ">$output_filestem.text" \|\|
	1240	die "Couldn't open file for writing: $!";
	1241	my $line="";
	1242	while ($line=<INFILE>) {
	1243	while (length($line)>0) {
	1244	if (length($line)>$wrap_length) {
	1245	$line =~ s/^(.{$wrap_length}[^\s])\s//;
	1246	print OUTFILE "$1\n";
	1247	} else {
	1248	print OUTFILE "$line";
	1249	$line="";
	1250	}
	1251	}
	1252	}
	1253	close INFILE;
	1254	close OUTFILE;
[27509]	1255	&FileUtils::removeFiles("$output_filestem.text.tmp");
[2600]	1256
[27509]	1257	&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1258	return 1;
	1259	}
	1260
	1261
	1262	# Convert any file to HTML with a crude perl implementation of the
	1263	# UNIX strings command.
	1264
	1265	sub any_to_html {
[22429]	1266	my ($input_filename, $output_filestem) = @_;
[1445]	1267
	1268	# First generate a text file
	1269	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1270
	1271	# create an HTML file from the text file
	1272	open(TEXT, "<$output_filestem.text");
	1273	open(HTML, ">$output_filestem.html");
	1274
[2241]	1275	print HTML "<html><head>\n";
	1276	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1277	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1278	print HTML "</head><body>\n\n";
[1734]	1279
[2755]	1280	my $line;
	1281	while ($line=<TEXT>) {
	1282	$line =~ s/</</g;
	1283	$line =~ s/>/>/g;
[16435]	1284	if ($line =~ m/^\s*$/) {
[2755]	1285	print HTML "<p>";
	1286	} else {
	1287	print HTML "<br> ", $line;
	1288	}
[1445]	1289	}
[1734]	1290	print HTML "\n</body></html>\n";
[1445]	1291
[2241]	1292	close HTML;
	1293	close TEXT;
	1294
[27509]	1295	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[1445]	1296	return 1;
	1297	}
	1298
	1299	# Convert any file to TEXT with a crude perl implementation of the
	1300	# UNIX strings command.
[2755]	1301	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1302
	1303	sub any_to_text {
[22429]	1304	my ($input_filename, $output_filestem) = @_;
[1445]	1305
[3350]	1306	if (!$use_strings) {
	1307	return 0;
	1308	}
[15120]	1309
	1310	print STDERR "\n** In any to text**\n\n";
[2755]	1311	open(IN, "<$input_filename") \|\| return 0;
[1734]	1312	binmode(IN);
[2755]	1313	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1314
	1315	my ($line);
[2755]	1316	my $output_line_count = 0;
[1445]	1317	while (<IN>) {
	1318	$line = $_;
[1734]	1319
[1445]	1320	# delete anything that isn't a printable character
	1321	$line =~ s/[^\040-\176]+/\n/sg;
	1322
	1323	# delete any string less than 10 characters long
[1734]	1324	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1325	while ($line =~ m/^.{1,9}$/m) {
[1734]	1326	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1327	$line =~ s/\n+/\n/sg;
	1328	}
	1329
	1330	# remove extraneous whitespace
	1331	$line =~ s/\n+/\n/gs;
	1332	$line =~ s/^\n//gs;
[1578]	1333
[1445]	1334	# output whatever is left
[16435]	1335	if ($line =~ m/[^\n ]/) {
[1445]	1336	print OUT $line;
[2755]	1337	++$output_line_count;
[1445]	1338	}
	1339	}
[2241]	1340
	1341	close OUT;
	1342	close IN;
	1343
[2755]	1344	if ($output_line_count) { # try to protect against binary only formats
	1345	return 1;
	1346	}
	1347
[27509]	1348	&FileUtils::removeFiles("$output_filestem.text");
[2755]	1349	return 0;
	1350
[1445]	1351	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: