Context Navigation

source: gsdl/trunk/bin/script/gsConvert.pl@ 16552

Last change on this file since 16552 was 16552, checked in by ak19, 16 years ago
Commented out a couple of debug output statements
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 45.9 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
	51	use parsargv;
	52	use util;
	53	use Cwd;
	54	use File::Basename;
	55
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module only loads on Windows; when the require fails the eval
# yields undef, which is treated as "no".
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
[3350]	67
# Print the command-line help to STDERR and terminate with exit status 1.
# Never returns.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this message
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
	89
# File-wide settings filled in from the command line by main():
#   $faillogfile - file that converter error output is appended to ("" = none)
#   $timeout     - CPU-seconds limit placed in front of commands via ulimit (0 = none)
my ($faillogfile, $timeout) = ("", 0);
[1445]	92
	93	sub main
	94	{
	95	my (@ARGV) = @_;
[3538]	96	my ($input_type,$output_type,$verbose);
[1960]	97
[1445]	98	# read command-line arguments
	99	if (!parsargv::parse(\@ARGV,
[3400]	100	'type/(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)/', \$input_type,
[2755]	101	'/errlog/.*/', \$faillogfile,
[10282]	102	'output/(auto\|html\|text\|pagedimg).*/', \$output_type,
[1692]	103	'timeout/\d+/0',\$timeout,
[10282]	104	'verbose/\d+/0', \$verbose,
[3720]	105	'use_strings', \$use_strings,
[10282]	106	'windows_scripting',\$windows_scripting,
[3720]	107	'pdf_complex', \$pdf_complex,
[9482]	108	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	109	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	110	'pdf_nohidden', \$pdf_nohidden,
[3720]	111	'pdf_zoom/\d+/2', \$pdf_zoom
	112	))
[1445]	113	{
	114	print_usage();
	115	}
[12704]	116
[1445]	117	# Make sure the input file exists and can be opened for reading
	118	if (scalar(@ARGV!=1)) {
	119	print_usage();
	120	}
[1928]	121
[1445]	122	my $input_filename = $ARGV[0];
	123	if (!-r $input_filename) {
	124	print STDERR "Error: unable to open $input_filename for reading\n";
	125	exit(1);
	126	}
	127
	128	# Deduce filenames
	129	my ($tailname,$dirname,$suffix)
[2241]	130	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	131	my $output_filestem = &util::filename_cat($dirname, "$tailname");
[1445]	132
	133	if ($input_type eq "")
	134	{
[2241]	135	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	136	}
	137
	138	# Change to temporary working directory
	139	my $stored_dir = cwd();
	140	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	141
[1445]	142	# Select convert utility
	143	if (!defined $input_type) {
	144	print STDERR "Error: No filename extension or input type defined\n";
	145	exit(1);
	146	}
[3400]	147	elsif ($input_type eq "doc" \|\| $input_type eq "dot") {
[1445]	148	print &convertDOC($input_filename, $output_filestem, $output_type);
	149	print "\n";
	150	}
[1684]	151	elsif ($input_type eq "rtf") {
	152	print &convertRTF($input_filename, $output_filestem, $output_type);
	153	print "\n";
	154	}
[1445]	155	elsif ($input_type eq "pdf") {
	156	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	157	print "\n";
	158	}
	159	elsif ($input_type eq "ps") {
	160	print &convertPS($input_filename, $output_filestem, $output_type);
	161	print "\n";
	162	}
[2977]	163	elsif ($input_type eq "ppt") {
	164	print &convertPPT($input_filename, $output_filestem, $output_type);
	165	print "\n";
	166	}
[2991]	167	elsif ($input_type eq "xls") {
	168	print &convertXLS($input_filename, $output_filestem, $output_type);
	169	print "\n";
	170	}
[1445]	171	else {
	172	print STDERR "Error: Unable to convert type '$input_type'\n";
	173	exit(1);
	174	}
	175
	176	# restore to original working directory
	177	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	178
	179	}
	180
	181	&main(@ARGV);
	182
	183
	184
[2241]	185	# Document-type conversion functions
[1445]	186	#
	187	# The following functions attempt to convert documents from their
	188	# input type to the specified output type. If no output type was
	189	# given, then they first attempt HTML, and then TEXT.
	190	#
	191	# Each returns the output type ("html" or "text") or "fail" if no
	192	# conversion is possible.
	193
	194	# Convert a Microsoft word document
	195
	196	sub convertDOC {
	197	($input_filename, $output_filestem, $output_type) = @_;
	198
[1654]	199	# Many .doc files are not in fact word documents!
	200	my $realtype = &find_docfile_type($input_filename);
	201
[1734]	202	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
[1654]	203	return &convertWord678($input_filename, $output_filestem, $output_type);
	204	} elsif ($realtype eq "rtf") {
	205	return &convertRTF($input_filename, $output_filestem, $output_type);
	206	} else {
	207	return &convertAnything($input_filename, $output_filestem, $output_type);
	208	}
	209	}
	210
	211	# Convert a Microsoft word 6/7/8 document
	212
	213	sub convertWord678 {
	214	($input_filename, $output_filestem, $output_type) = @_;
	215
[1445]	216	my $success = 0;
[16435]	217	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	218	if ($windows_scripting) {
	219	$success = &native_doc_to_html($input_filename, $output_filestem);
	220	}
	221	else {
	222	$success = &doc_to_html($input_filename, $output_filestem);
	223	}
[1445]	224	if ($success) {
[10282]	225	return "html";
[1445]	226	}
	227	}
[1654]	228	return &convertAnything($input_filename, $output_filestem, $output_type);
	229	}
	230
	231
	232	# Convert a Rich Text Format (RTF) file
	233
	234	sub convertRTF {
	235	($input_filename, $output_filestem, $output_type) = @_;
	236
	237	my $success = 0;
	238
	239	# Attempt specialised conversion to HTML
[16435]	240	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	241
	242	if ($windows_scripting) {
	243	$success = &native_doc_to_html($input_filename, $output_filestem);
	244	}
	245	else {
	246	$success = &rtf_to_html($input_filename, $output_filestem);
	247	}
[1654]	248	if ($success) {
	249	return "html";
	250	}
	251	}
	252
[2755]	253	# rtf is so ugly that's it's not worth running strings over.
	254	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	255	# return &convertAnything($input_filename, $output_filestem, $output_type);
	256	return "fail";
[1654]	257	}
	258
	259
	260	# Convert an unidentified file
	261
	262	sub convertAnything {
	263	($input_filename, $output_filestem, $output_type) = @_;
	264
	265	my $success = 0;
[10464]	266
[1445]	267	# Attempt simple conversion to HTML
[16435]	268	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	269	$success = &any_to_html($input_filename, $output_filestem);
	270	if ($success) {
	271	return "html";
	272	}
	273	}
	274
	275	# Convert to text
[16435]	276	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	277	$success = &any_to_text($input_filename, $output_filestem);
[1445]	278	if ($success) {
	279	return "text";
	280	}
	281	}
	282	return "fail";
	283	}
	284
	285
[1654]	286
[1445]	287	# Convert an Adobe PDF document
	288
	289	sub convertPDF {
[2755]	290	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[1445]	291
	292	my $success = 0;
[10357]	293	$output_type =~ s/.\-(.)/$1/i;
	294	# Attempt coversion to Image
[16435]	295	if ($output_type =~ m/jp?g\|gif\|png/i) {
[10357]	296	$success = &pdf_to_img($dirname, $input_filename, $output_filestem, $output_type);
	297	if ($success){
	298	return "item";
	299	}
	300	}
[1445]	301
	302	# Attempt conversion to HTML
[16435]	303	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	304	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
	305	if ($success) {
	306	return "html";
	307	}
	308	}
	309
	310	# Attempt conversion to TEXT
[16435]	311	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2117]	312	$success = &pdf_to_text($dirname, $input_filename, $output_filestem);
[1445]	313	if ($success) {
	314	return "text";
	315	}
	316	}
	317
	318	return "fail";
	319
	320	}
	321
	322
	323	# Convert an Adobe PostScript document
	324
	325	sub convertPS {
	326	($input_filename, $output_filestem, $output_type) = @_;
	327
	328	my $success = 0;
[10534]	329	$output_type =~ s/.\-(.)/$1/i;
	330	# Attempt coversion to Image
[16435]	331	if ($output_type =~ m/jp?g\|gif\|png/i) {
[10534]	332	$success = &ps_to_img($dirname, $input_filename, $output_filestem, $output_type);
	333	if ($success){
	334	return "item";
	335	}
	336	}
[1445]	337
	338	# Attempt conversion to TEXT
[16435]	339	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	340	$success = &ps_to_text($input_filename, $output_filestem);
	341	if ($success) {
	342	return "text";
	343	}
	344	}
	345	return "fail";
	346	}
	347
	348
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract script is run to produce slide images ("item").
# Otherwise, when HTML is acceptable, ppttohtml.pl is run ("html").
# If the chosen converter fails (or none applies) the file is passed to
# any_to_text.  Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # avoid "uninitialized value" warnings when no output type was given
    $output_type = "" unless defined $output_type;

    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pick the pptextract switch for the requested image format
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        # $cmd is lexical now; it used to leak into a package global
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	410
	411
# Convert a Microsoft Excel document.
#
# When HTML is acceptable (no output type given, or one containing "html")
# xlstohtml.pl is run; if that is not attempted or fails, the file is
# passed to any_to_text.  Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        # ($cmd is lexical now; it used to leak into a package global)
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	444
	445
	446
[1654]	447	# Find the real type of a .doc file
	448	#
[2012]	449	# We seem to have a lot of files with a .doc extension that are .rtf
[1654]	450	# files or Word 5 files. This function attempts to tell the difference.
	451	sub find_docfile_type {
	452	($input_filename) = @_;
	453
	454	open(CHK, "<$input_filename");
[1734]	455	binmode(CHK);
[1654]	456	my $line = "";
	457	my $first = 1;
	458
	459	while (<CHK>) {
	460
	461	$line = $_;
[1960]	462
[1654]	463	if ($first) {
	464	# check to see if this is an rtf file
[16435]	465	if ($line =~ m/^\{\\rtf/) {
[1654]	466	close(CHK);
	467	return "rtf";
	468	}
[2755]	469	$first = 0;
[1654]	470	}
	471
[1734]	472	# is this is a word 6/7/8 document?
[16435]	473	if ($line =~ m/Word\.Document\.([678])/) {
[1654]	474	close(CHK);
[1734]	475	return "word$1";
[1654]	476	}
	477
	478	}
	479
	480	return "unknown";
	481	}
	482
	483
[1734]	484	# Specific type-to-type conversions
[1445]	485	#
	486	# Each of the following functions attempts to convert a document from
[2755]	487	# a specific format to another. If they succeed they return 1 and leave
[1445]	488	# the output document(s) in the appropriate place; if they fail they
	489	# return 0 and delete any working files.
	490
	491
	492	# Attempt to convert a word document to html with the wv program
	493	sub doc_to_html {
	494	($input_filename, $output_filestem) = @_;
	495
[2023]	496	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
	497	$ENV{'GSDLOS'}, "wvWare");
[1928]	498
[2241]	499	# don't include path on windows (to avoid having to play about
	500	# with quoting when GSDLHOME might contain spaces) but assume
	501	# that the PATH is set up correctly
[16435]	502	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
[2241]	503
[2512]	504	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
[2574]	505	"packages", "wv", "wvHtml.xml");
[1928]	506
[15120]	507	# Added the following to work with replace_srcdoc_with_html.pl:
	508	# Make wvWare put any associated (image) files of the word doc into
	509	# folder docname-without-extention_files. This folder should be at
	510	# the same level as the html file generated from the doc.
	511	# wvWare will take care of proper interlinking.
	512
	513	# This step is necessary for replace_srcdoc_with_html.pl which will
	514	# move the html and associated files into the import folder. We
	515	# want to ensure that the associated files won't overwrite similarly
	516	# named items already in import. Hence we put them in a folder first
	517	# (to which the html links properly) and that will allow
	518	# replace_srcdoc_with_html.pl to move them safely to /import.
	519
	520	# To do all this, we need to use wvWare's --dir and --basename options
	521	# where dir is the full path to the image folder directory and
	522	# basename is the full path to the image folder appended to the name
	523	# which is to be prepended to every image file:
	524	# eg. if the images were to have names like sample0.jpg to sampleN.jpg,
	525	# then the basename is "/full/path/to/imgdir/sample".
	526	# In this case, basename is the full path to and name of the document.
	527	# HOWEVER: basename always takes full path, not relative url, so
	528	# the greenstone browser is unable to display the images (absolute paths
	529	# cause it to give an "external link" message)
	530	# See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
	531	# and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
	532	# "added --dir option to wvHtml so that pictures can be placed in
	533	# a seperate directory"
	534	# "running wvWare through IMP to view word documents as html. It gets
	535	# invoked like this:
	536	# wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"
	537
	538	# toppath is the folder where html is generated
	539	# docname is the name (without extension) of the html to be generated
	540	# suffix (extension) is thrown away
	541	my ($docname, $toppath)
	542	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	543
	544	# We want the image folder generated to have the same name as windows
	545	# would generate ($windows_scripting) when it converts from word to html.
	546	# That is, foldername=docname_files
	547	my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
	548	#print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files"
	549
	550	# ensure this image directory exists
	551	# if it exists already, just delete and recreate
	552	if(-e $assoc_dir) {
	553	&util::rm_r($assoc_dir);
	554	}
	555	&util::mk_dir($assoc_dir);
	556
	557	# the images are all going to be called image0, image1,..., imageN
	558	my $img_basenames = &util::filename_cat($assoc_dir, $docname);
	559
	560	#print STDERR "**toppath: $toppath\n**docname: $docname\n;
	561	#print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
	562	#print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);
	563
[2241]	564	my $cmd = "";
[1692]	565	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[15120]	566	# wvWare's --dir and --basename options for image directory.
	567	# Replaced the next line with the 2 lines following it:
	568	# $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
	569	$cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
	570	$cmd .= " --charset utf-8 --config \"$wv_conf\"";
[2241]	571	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
[15120]	572
[2241]	573	# redirecting STDERR is a bad idea on windows 95/98
	574	$cmd .= " 2> \"$output_filestem.err\""
[16435]	575	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
[1445]	576	# execute the command
[2755]	577	$!=0;
[2060]	578	if (system($cmd)!=0)
[1445]	579	{
[2755]	580	print STDERR "Error executing wv converter:$!\n";
	581	if (-s "$output_filestem.err") {
	582	open (ERRFILE, "<$output_filestem.err");
	583
	584	my $write_to_fail_log=0;
	585	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	586	{$write_to_fail_log=1;}
	587
	588	my $line;
	589	while ($line=<ERRFILE>) {
[16435]	590	if ($line =~ m/\w/) {
[2755]	591	print STDERR "$line";
	592	print FAILLOG "$line" if ($write_to_fail_log);
	593	}
	594	if ($line !~ m/startup error/) {next;}
	595	print STDERR " (given an invalid .DOC file?)\n";
	596	print FAILLOG " (given an invalid .DOC file?)\n"
	597	if ($write_to_fail_log);
	598
	599	} # while ERRFILE
	600	close FAILLOG if ($write_to_fail_log);
	601	}
	602	return 0; # we can try any_to_text
[1445]	603	}
[1578]	604
[1445]	605	# Was the conversion successful?
[2241]	606
[15120]	607	if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
[1445]	608	open(TMP, "$output_filestem.html");
	609	$line = <TMP>;
	610	close(TMP);
[16435]	611	if ($line && $line =~ m/DOCTYPE HTML/) {
[15120]	612	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
	613
	614	# Inserted this code to remove the images directory if it was still empty after
	615	# the html was generated (in case there were no images in the word document)
[16435]	616	if (&util::is_dir_empty($assoc_dir)) {
[15152]	617	#print STDERR "*gsConvert.pl: Image dir $assoc_dir is empty, removing*\n";
[15120]	618	&util::rm_r($assoc_dir);
	619	} else { # there was an image folder (it was generated)
	620	# Therefore, the html file generated contains absolute links to the images
[16435]	621	# Replace them with relative links instead, so the folder can be moved elsewhere
[15152]	622	&make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
[15120]	623	}
[1445]	624	return 1;
	625	}
	626	}
[2755]	627
	628	# If here, an error of some sort occurred
	629	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
	630	if (-e "$output_filestem.err") {
	631	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	632	open (ERRLOG,"$output_filestem.err");
	633	while (<ERRLOG>) {print FAILLOG $_;}
	634	close FAILLOG;
	635	close ERRLOG;
	636	}
	637	&util::rm("$output_filestem.err");
	638	}
	639
[1445]	640	return 0;
	641	}
	642
[15120]	643	# Method to work with doc_to_html - Word docs might contain images.
	644	# When such word docs are converted with wvWare, we make it generate a
	645	# <filename>_files folder with the associated images, while the html file
	646	# <filename> refers to the images using absolute paths to <filename>_files.
	647	# This method reads in that html file and replaces all the absolute paths to
	648	# the images in <filename>_files with the relative paths to the images from
	649	# that folder. (I.e. with <filename>_files/<imagename.ext>).
	650	sub make_links_to_assocdir_relative{
	651	# toppath is the top-level folder in which the html file we're going to be fixing resides
	652	# docname is just the name (without extension) of the html file
	653	# html_file is the full path to the html file: /full/path/docname.html
	654	# assoc_dir_path is toppath/docname_files
	655	# assoc_dirname is the directory name of the folder with associated imgs: docname_files
	656	my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;
[10357]	657
[15120]	658	# 1. Read all the contents of the html into a string
	659	# open the original file for reading
	660	unless(open(FIN, "<$html_file")) {
[15168]	661	print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
[15152]	662	return 0;
[15120]	663	}
	664	# From http://perl.plover.com/local.html
	665	# "It's cheaper to read the file all at once, without all the splitting and reassembling.
	666	# (Some people call this slurping the file.) Perl has a special feature to support this:
	667	# If the $/ variable is undefined, the <...> operator will read the entire file all at once"
[15152]	668	my $html_contents;
	669	{
	670	local $/ = undef; # Read entire file at once
	671	$html_contents = <FIN>; # Now file is read in as one single 'line'
	672	}
[15120]	673	close(FIN); # close the file
[15152]	674	#print STDERR $html_contents;
[15120]	675
	676	# 2. Replace (substitute) all ocurrences of the assoc_dir_path in a hrefs and img src
	677	# values with assoc_dirname
	678	# At the end: g means substitute all occurrences (global), while s at the end means treat
	679	# all new lines as a regular space. This interacts with g to consider all the lines
	680	# together as a single line so that multi-occurrences can be replaced.
[15152]	681
	682	# we can't just replace $assoc_dir_path with $assoc_dir
	683	# $assoc_dir_path represents a regular expression that needs to be replaced
[16435]	684	# if it contains ., -, [, ], or Windows style backslashes in paths -- which all have special
	685	# meaning in Perl regular expressions -- we need to escape these first
[15152]	686	my $safe_reg_expression = $assoc_dir_path;
[16435]	687	$safe_reg_expression =~ s/\\/\\\\/g;
[15152]	688	$safe_reg_expression =~ s/\./\\./g;
	689	$safe_reg_expression =~ s/\-/\\-/g;
	690	$safe_reg_expression =~ s/\[/\\[/g;
	691	$safe_reg_expression =~ s/\]/\\]/g;
	692	$safe_reg_expression =~ s/ /%20/g; # wvWare put %20 in place of space, so we need to change our prefix to match
	693
[15120]	694	# The following regular expression substitution looks for <a or <image, followed by any other
	695	# attributes and values until it comes to the FIRST (indicated by ?) href= or src=
	696	# followed by " or ' no quotes at all around path, followed by the associated folder's pathname
	697	# followed by characters (for the img filename), then finally the optional closing quotes
	698	# in " or ' form, followed by any other attributes and values until the first > to end the tag.
	699	# The substitution: all the parts preceding associated folder's pathname are retained,
	700	# the associated folder path name is replaced by associated folder directory name
	701	# and the rest upto and including the closing > tag is retained.
	702	# The sg at the end of the pattern match treats all of html_contents as a single line (s)
	703	# and performs a global replace (g) meaning that all occurrences that match in that single line
	704	# are substituted.
[15152]	705	$html_contents =~ s/(<(a\|img).?(href\|src)=(\"\|\')?)$safe_reg_expression(.?(\"\|\')?.*?>)/$1$assoc_dirname$5/sg;
	706	#$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back
	707	# now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this
[16435]	708	$html_contents =~ s/(<(a\|img).?(href\|src)=(\"\|\')?)(.)(.?(\"\|\')?.?>)/&post_process_assocfile_urls($1, $5, $6)/sge;
	709
[16552]	710	#print STDERR "**assoc_dirname: $assoc_dirname*\n";
	711	#print STDERR "**safe_reg_expression: $safe_reg_expression*\n";
[15152]	712
[15120]	713	# delete the original file and recreate it
	714	my $copy_of_filename = $html_file;
	715	&util::rm($copy_of_filename); # deleted the file
	716
	717	# Recreate the original file for writing the updated contents
	718	unless(open(FOUT, ">$html_file")) { # open it as a new file for writing
[15168]	719	print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
[15152]	720	return 0;
[15120]	721	}
[16435]	722
[15120]	723	# write out the updated contents and close the file
	724	print FOUT $html_contents;
	725	close(FOUT);
[15152]	726	return 1;
[15120]	727	}
	728
[16435]	729	# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
	730	# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
	731	# introduced in link pathnames by wvWare into space again
	732	sub post_process_assocfile_urls
[15120]	733	{
[15152]	734	my ($pre, $text, $post) = @_;
	735
	736	$text =~ s/%20/ /g;
[16435]	737	$text =~ s/\\/\//g;
[15152]	738
	739	return "$pre$text$post";
[15120]	740	}
	741
[10282]	742	# Attempt to convert a word document to html with the word2html scripting program
	743	sub native_doc_to_html {
	744	($input_filename, $output_filestem) = @_;
[1445]	745
[10282]	746	my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
	747	$ENV{'GSDLOS'}, "word2html");
	748
[16435]	749	$vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
[10445]	750	if (-e "$output_filestem.html") {
	751	print STDERR "*** The conversion file has existed\n";
	752	return 1;
	753	}
[10282]	754
	755	my $cmd = "";
	756	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	757	#$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
[10445]	758	#$cmd .= "$vbScript $input_filename $output_filestem.html";
[10521]	759	$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
[10445]	760
[10282]	761	# redirecting STDERR
	762	$cmd .= " 2> \"$output_filestem.err\""
[16435]	763	if ($ENV {'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
[10282]	764
	765	# execute the command
	766	$!=0;
	767	if (system($cmd)!=0)
	768	{
	769	print STDERR "Error executing word2Html converter:$!\n";
	770	if (-s "$output_filestem.err") {
	771	open (ERRFILE, "<$output_filestem.err");
	772
	773	my $write_to_fail_log=0;
	774	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	775	{$write_to_fail_log=1;}
	776
	777	my $line;
	778	while ($line=<ERRFILE>) {
[16435]	779	if ($line =~ m/\w/) {
[10282]	780	print STDERR "$line";
	781	print FAILLOG "$line" if ($write_to_fail_log);
	782	}
	783	if ($line !~ m/startup error/) {next;}
	784	print STDERR " (given an invalid .DOC file?)\n";
	785	print FAILLOG " (given an invalid .DOC file?)\n"
	786	if ($write_to_fail_log);
	787
	788	} # while ERRFILE
	789	close FAILLOG if ($write_to_fail_log);
	790	}
	791	return 0; # we can try any_to_text
	792	}
	793
	794	# Was the conversion successful?
	795	if (-s "$output_filestem.html") {
	796	open(TMP, "$output_filestem.html");
	797	$line = <TMP>;
	798	close(TMP);
[16435]	799	if ($line && $line =~ m/html/) {
[10282]	800	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
	801	return 1;
	802	}
	803	}
	804
	805	# If here, an error of some sort occurred
	806	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
	807	if (-e "$output_filestem.err") {
	808	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	809	open (ERRLOG,"$output_filestem.err");
	810	while (<ERRLOG>) {print FAILLOG $_;}
	811	close FAILLOG;
	812	close ERRLOG;
	813	}
	814	&util::rm("$output_filestem.err");
	815	}
	816	return 0;
	817	}
	818
# Attempt to convert an RTF document to HTML with the external
# "rtftohtml" program.
#
# Arguments:
#   $input_filename  - path of the RTF file to convert
#   $output_filestem - output path without extension; the converter is
#                      asked to write "$output_filestem.html"
#
# Returns 1 if the converter produced an HTML file that has some text
# after its <body> line, 0 otherwise.  On success a
# "${output_filestem}_ToC.html" file, if one exists, is merged into the
# HTML and then removed.  On failure the captured stderr of the converter
# is appended to $faillogfile (when that is non-empty) and the .html and
# .err files are removed.
#
# $cmd, $timeout, $is_winnt_2000 and $faillogfile are not declared in this
# sub; they come from the enclosing file.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    # stderr is only redirected where the shell supports "2>"
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header
        if (open(my $html_in, '<', "$output_filestem.html")) {
            my $past_header=0;
            while (defined(my $line = <$html_in>)) {
                if (!$past_header) {
                    # everything up to and including the <body> line is header
                    $past_header=1 if ($line =~ m/<body>/);
                    next;
                }

                # strip tags; any remaining word character counts as content
                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {
                    $was_successful=1;
                    last;
                }
            }
            close $html_in;
        } else {
            # an unreadable result is treated as a failed conversion
            print STDERR "Couldn't read $output_filestem.html: $!\n";
        }
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            # 'open(...) && open(...)' rather than 'open FH, "file" || ...':
            # the latter applies || to the filename string, so failures
            # would go unnoticed.
            my ($html_src, $toc, $html_out);
            my $opened_all =
                open($html_src, '<', "$output_filestem.src")
                && open($toc, '<', "${output_filestem}_ToC.html")
                && open($html_out, '>', "$output_filestem.html");

            if (!$opened_all) {
                # give up on the ToC, but keep the converted document
                foreach my $fh ($html_src, $toc, $html_out) {
                    close $fh if (defined $fh && defined fileno($fh));
                }
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: copy lines up to the
            # first line without a word character (that line is dropped).
            while (defined(my $src_line = <$html_src>)) {
                last if ($src_line !~ m/\w/);
                print $html_out $src_line;
            }

            # print out table of contents, making links relative
            my $toc_line = <$toc>;      # ignore first 2 lines
            $toc_line = <$toc>;
            $toc_line = <$toc>;         # line 3 = "<ol>\n"
            print $html_out $toc_line if (defined $toc_line);
            while (defined($toc_line = <$toc>)) {
                $toc_line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@;
                print $html_out $toc_line;
            }
            close $toc;

            # rest of html src
            while (defined(my $src_line = <$html_src>)) {
                print $html_out $src_line;
            }
            close $html_src;
            close $html_out;

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            #print $faillog "Error - rtf-converter - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (defined(my $err_line = <$errlog>)) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
	936
	937
# Convert a pdf file to html with the pdftohtml command (run through the
# "pdftohtml.pl" wrapper script found via "perl -S").
#
# Arguments:
#   $dirname         - accepted for the caller's benefit; not used here
#   $input_filename  - path of the PDF file to convert
#   $output_filestem - output path without extension
#
# Returns 1 if the command exited with status 0 and left a non-empty
# "$output_filestem.html", 0 otherwise.  On failure the converter's error
# output is echoed to STDERR, appended to $faillogfile (when non-empty) and
# the .out/.err/.html files are removed.
#
# $cmd, $timeout, $is_winnt_2000, $faillogfile and the $pdf_* option
# variables are not declared in this sub; they come from the enclosing file.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no "2>" available: capture stdout in the .err file instead
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;

    my $retval=system($cmd);
    if ($retval!=0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval!=0 || ! -s "$output_filestem.html")
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR $err_line;
            }
            close $errlog;
        }

        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        # keep a copy of the error output in the fail log, then tidy up
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (defined(my $err_line = <$errlog>)) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
	999
	1000	# Convert a pdf file to various types of image with the convert command
	1001
	1002	sub pdf_to_img {
	1003	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[10401]	1004
	1005	# Check that ImageMagick is installed and available on the path (except for Windows 95/98)
	1006	if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
	1007	my $result = `identify 2>&1`;
	1008	if ($? == -1 \|\| $? == 256) { # Linux and Windows return different values for "program not found"
	1009	#ImageMagick is not installed, thus the convert utility is not available.
	1010	print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
	1011	return 0;
	1012	}
	1013	}
	1014
[10357]	1015	$cmd = "";
	1016	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	1017	$output_type =~ s/.\_(.)/$1/i;
[10521]	1018	$cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
[16435]	1019	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[10357]	1020	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1021	} else {
	1022	$cmd .= " > \"$output_filestem.err\"";
	1023	}
	1024
	1025	# don't include path on windows (to avoid having to play about
	1026	# with quoting when GSDLHOME might contain spaces) but assume
	1027	# that the PATH is set up correctly
	1028	$!=0;
	1029	my $retval=system($cmd);
	1030	if ($retval!=0)
	1031	{
[10401]	1032	print STDERR "Error executing pdftoimg.pl";
[10357]	1033	if ($!) {print STDERR ": $!";}
	1034	print STDERR "\n";
	1035	}
	1036
	1037	#make sure the converter made something
	1038	#if ($retval !=0) \|\| ! -s "$output_filestem")
	1039	if ($retval !=0)
	1040	{
	1041	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1042	#print out the converter's std err, if any
	1043	if (-s "$output_filestem.err") {
	1044	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[10401]	1045	print STDERR "pdftoimg error log:\n";
[10357]	1046	while (<ERRLOG>) {
	1047	print STDERR "$_";
	1048	}
	1049	close ERRLOG;
	1050	}
[10534]	1051	#&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[10357]	1052	if (-e "$output_filestem.err") {
	1053	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1054	{
	1055	open (ERRLOG, "$output_filestem.err");
	1056	while (<ERRLOG>) {print FAILLOG $_;}
	1057	close ERRLOG;
	1058	close FAILLOG;
	1059	}
	1060	&util::rm("$output_filestem.err");
	1061	}
	1062	return 0;
	1063	}
[2656]	1064	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1065	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1066	return 1;
	1067	}
	1068
	1069	# Convert a PDF file to text with the pdftotext command
	1070
	1071	sub pdf_to_text {
[2755]	1072	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1073
[2248]	1074	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1075
[16435]	1076	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1077	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1078	} else {
	1079	$cmd .= " > \"$output_filestem.err\"";
	1080	}
[1445]	1081
[2060]	1082	if (system($cmd)!=0)
[1445]	1083	{
	1084	print STDERR "Error executing $cmd: $!\n";
	1085	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1086	}
	1087
[2755]	1088	# make sure there is some extracted text.
	1089	if (-e "$output_filestem.text") {
	1090	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1091	binmode(EXTR_TEXT); # just in case...
	1092	my $line="";
	1093	my $seen_text=0;
	1094	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1095	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1096	}
	1097	close EXTR_TEXT;
	1098	if ($seen_text==0) { # no text was extracted
	1099	print STDERR "Error: pdftotext found no text\n";
	1100	&util::rm("$output_filestem.text");
	1101	}
	1102	}
	1103
[1692]	1104	# make sure the converter made something
[2656]	1105	if (! -s "$output_filestem.text")
[1692]	1106	{
	1107	# print out the converters std err, if any
[2656]	1108	if (-s "$output_filestem.err") {
[1692]	1109	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1110	print STDERR "pdftotext error log:\n";
[1692]	1111	while (<ERRLOG>) {
	1112	print STDERR "$_";
	1113	}
	1114	close ERRLOG;
	1115	}
[2656]	1116	# does this converter create a .out file?
	1117	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1118	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1119	if (-e "$output_filestem.err") {
	1120	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1121	{
	1122	open (ERRLOG,"$output_filestem.err");
	1123	while (<ERRLOG>) {print FAILLOG $_;}
	1124	close ERRLOG;
	1125	close FAILLOG;
	1126	}
	1127	&util::rm("$output_filestem.err");
	1128	}
[1692]	1129	return 0;
	1130	}
[1445]	1131	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	1132	return 1;
	1133	}
	1134
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Steps:
#   1. Except on Windows, run gs with ps2ascii.ps.
#   2. If that is not possible or fails, log the problem (also to
#      $faillogfile when non-empty) and fall back to a crude regexp-based
#      extraction of the strings in the PostScript source.
#   3. Re-wrap the resulting text so that lines break at the first
#      whitespace after 72 characters.
#
# Returns 1 on success.  Returns 0 if the fallback cannot read the input,
# cannot write the output, or the input does not start with "%!".  Dies if
# the text cannot be reopened for wrapping.
#
# $timeout and $faillogfile are not declared in this sub; they come from
# the enclosing file.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the path like the others so that spaces don't split it
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        # The whole wait status is tested (not just the exit code in the
        # high byte) so that death by signal is seen as a failure too.
        # if system returns -1 | 127 (couldn't start program), look at $! for message
        my $status=system($cmd);

        if ($status!=0) {
            if ($!) {$error="$!";} else {$error="couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error="did not create output file.\n";
        }
        else
        { # make sure the interpreter didn't get an error. It is technically
          # possible for the actual text to start with this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;   # undef for an empty file
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error="interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $errlog, '<', "$output_filestem.err")) {
                while (defined(my $err_line = <$errlog>)) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        if (!open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close $ps_in;

        # Make sure this is a ps file...  (checked before the output file
        # is created, so a rejected input leaves no empty .text behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $ps_out;
        if (!open($ps_out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print $ps_out $text;
        close $ps_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length=72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # 'or', not '||': with || the die would be attached to the filename
    # string and could never fire.
    open(my $unwrapped, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $wrapped, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$unwrapped>)) {
        while (length($line)>0) {
            # Cut after the first run of non-space characters that reaches
            # past $wrap_length.  If the pattern cannot match, the rest of
            # the line is written unchanged, so the loop always ends.
            if (length($line)>$wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $wrapped "$1\n";
            } else {
                print $wrapped $line;
                $line="";
            }
        }
    }
    close $unwrapped;
    close $wrapped;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
	1304
	1305
# Convert a PS file to various types of image with the convert utility
# (run through the "pstoimg.pl" wrapper script found via "perl -S").
#
# Arguments:
#   $dirname         - accepted for the caller's benefit; not used here
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path handed to pstoimg.pl
#   $output_type     - requested output type; only the part after its last
#                      underscore is passed on as -convert_to
#                      (presumably an image format such as "jpg" -
#                      confirm against the callers)
#
# Returns 1 if pstoimg.pl exited with status 0, otherwise 0.  Also returns
# 0 without running anything when ImageMagick's "identify" cannot be run.
#
# $cmd, $timeout, $is_winnt_2000 and $faillogfile are not declared in this
# sub; they come from the enclosing file.
sub ps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # only the exit status matters; the output itself is discarded
        my $result = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # keep only what follows the last underscore of the output type
    $output_type =~ s/.*\_(.*)/$1/i;

    $cmd .= "perl -S pstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no "2>" available: capture stdout in the .err file instead
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $!=0;
    my $retval=system($cmd);
    if ($retval!=0)
    {
        print STDERR "Error executing pstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval !=0)
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        #print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pstoimg error log:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR $err_line;
            }
            close $errlog;
        }

        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        # keep a copy of the error output in the fail log, then tidy up
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (defined(my $err_line = <$errlog>)) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
	1373
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text() into
# "$output_filestem.text", then wrapped in a minimal HTML page: blank lines
# become <p>, other lines are prefixed with <br>.  The intermediate text
# file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text() fails or one of the two files
# cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_in, $html_out);
    if (!open($text_in, '<', "$output_filestem.text")) {
        print STDERR "Couldn't read $output_filestem.text: $!\n";
        return 0;
    }
    if (!open($html_out, '>', "$output_filestem.html")) {
        print STDERR "Couldn't write $output_filestem.html: $!\n";
        close $text_in;
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html_out "<html><head>\n";
    print $html_out "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_out "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_out "</head><body>\n\n";

    while (defined(my $line = <$text_in>)) {
        # escape markup characters so the extracted text is shown literally;
        # '&' has to go first so the entities added below stay intact
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_out "<p>";
        } else {
            print $html_out "<br> ", $line;
        }
    }
    print $html_out "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
	1410
	1411	# Convert any file to TEXT with a crude perl implementation of the
	1412	# UNIX strings command.
[2755]	1413	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1414
	1415	sub any_to_text {
	1416	($input_filename, $output_filestem) = @_;
	1417
[3350]	1418	if (!$use_strings) {
	1419	return 0;
	1420	}
[15120]	1421
	1422	print STDERR "\n** In any to text**\n\n";
[2755]	1423	open(IN, "<$input_filename") \|\| return 0;
[1734]	1424	binmode(IN);
[2755]	1425	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1426
	1427	my ($line);
[2755]	1428	my $output_line_count = 0;
[1445]	1429	while (<IN>) {
	1430	$line = $_;
[1734]	1431
[1445]	1432	# delete anything that isn't a printable character
	1433	$line =~ s/[^\040-\176]+/\n/sg;
	1434
	1435	# delete any string less than 10 characters long
[1734]	1436	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1437	while ($line =~ m/^.{1,9}$/m) {
[1734]	1438	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1439	$line =~ s/\n+/\n/sg;
	1440	}
	1441
	1442	# remove extraneous whitespace
	1443	$line =~ s/\n+/\n/gs;
	1444	$line =~ s/^\n//gs;
[1578]	1445
[1445]	1446	# output whatever is left
[16435]	1447	if ($line =~ m/[^\n ]/) {
[1445]	1448	print OUT $line;
[2755]	1449	++$output_line_count;
[1445]	1450	}
	1451	}
[2241]	1452
	1453	close OUT;
	1454	close IN;
	1455
[2755]	1456	if ($output_line_count) { # try to protect against binary only formats
	1457	return 1;
	1458	}
	1459
	1460	&util::rm("$output_filestem.text");
	1461	return 0;
	1462
[1445]	1463	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: