Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 22642

Last change on this file since 22642 was 22642, checked in by kjdon, 14 years ago
removed all open office stuff. Haven't tested it properly as am at home
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 44.6 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	# Put $GSDLHOME/perllib at the front of @INC at compile time, so it is
	# already searched when the "use" statements that follow are processed.
	# Aborts immediately if GSDLHOME is not set in the environment.
	BEGIN {
	    my $gsdl_home = $ENV{'GSDLHOME'};
	    defined $gsdl_home or die "GSDLHOME not set\n";
	    unshift @INC, "$gsdl_home/perllib";
	}
	50
[22429]	51	use strict;
	52
[1445]	53	use parsargv;
	54	use util;
	55	use Cwd;
	56	use File::Basename;
	57
# Are we running on WinNT or Win2000 (or later)?  The eval yields undef when
# the Win32 module cannot be loaded; that case is treated as "no" (0).
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Option flags; main() passes references to these to parsargv::parse.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
[3350]	69
# Print the command-line synopsis on STDERR and terminate the script with
# exit status 1.  Never returns.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc\|dot\|pdf\|ps\|ppt\|rtf\|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto\|html\|text\|pagedimg_jpg\|pagedimg_gif\|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n";
    exit(1);
}
	91
# -errlog target (file that error messages are appended to; "" = none) and
# -timeout value (max cpu seconds handed to ulimit; 0 = no limit).
# Both are filled in by main() from the command line.
my ($faillogfile, $timeout) = ("", 0);
[1445]	94
	# Entry point.
	#
	# Parses the command line (print_usage() exits on any error), checks that
	# exactly one readable input file was named, works out the input type
	# (the -type option, else the lower-cased filename extension), changes
	# into the file's directory and calls the matching convertXXX routine.
	# Whatever that routine returns is printed on STDOUT followed by a
	# newline.  Exits with status 1 when the file is unreadable or the type
	# is missing or unsupported.
	sub main
	{
	    # work on a private copy; parsargv::parse is given a reference to it
	    my @args = @_;
	    my ($input_type, $output_type, $verbose);

	    # read command-line arguments
	    if (!parsargv::parse(\@args,
	            'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
	            '/errlog/.*/', \$faillogfile,
	            'output/(auto|html|text|pagedimg).*/', \$output_type,
	            'timeout/\d+/0', \$timeout,
	            'verbose/\d+/0', \$verbose,
	            'windows_scripting', \$windows_scripting,
	            'use_strings', \$use_strings,
	            'pdf_complex', \$pdf_complex,
	            'pdf_ignore_images', \$pdf_ignore_images,
	            'pdf_allow_images_only', \$pdf_allow_images_only,
	            'pdf_nohidden', \$pdf_nohidden,
	            'pdf_zoom/\d+/2', \$pdf_zoom))
	    {
	        print_usage();
	    }

	    # Exactly one argument (the input file) must be left over
	    print_usage() if scalar(@args) != 1;

	    # Make sure the input file exists and can be opened for reading
	    my $input_filename = $args[0];
	    if (!-r $input_filename) {
	        print STDERR "Error: unable to open $input_filename for reading\n";
	        exit(1);
	    }

	    # Deduce filenames
	    my ($tailname, $dirname, $suffix)
	        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	    my $output_filestem = util::filename_cat($dirname, "$tailname");

	    # No -type given: fall back on the extension without its leading dot.
	    # The defined-ness test comes first so an unset option is never
	    # compared as a string, and an empty suffix is never passed to substr.
	    if (!defined $input_type || $input_type eq "") {
	        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : undef;
	    }
	    if (!defined $input_type || $input_type eq "") {
	        print STDERR "Error: No filename extension or input type defined\n";
	        exit(1);
	    }

	    # Change to temporary working directory
	    my $stored_dir = cwd();
	    chdir($dirname) || die "Unable to change to directory $dirname";

	    # Select convert utility
	    my $result;
	    if ($input_type eq "doc" || $input_type eq "dot") {
	        $result = convertDOC($input_filename, $output_filestem, $output_type);
	    }
	    elsif ($input_type eq "rtf") {
	        $result = convertRTF($input_filename, $output_filestem, $output_type);
	    }
	    elsif ($input_type eq "pdf") {
	        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	    }
	    elsif ($input_type eq "ps") {
	        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
	    }
	    elsif ($input_type eq "ppt") {
	        $result = convertPPT($input_filename, $output_filestem, $output_type);
	    }
	    elsif ($input_type eq "xls") {
	        $result = convertXLS($input_filename, $output_filestem, $output_type);
	    }
	    else {
	        print STDERR "Error: Unable to convert type '$input_type'\n";
	        exit(1);
	    }

	    # report the outcome to the caller on STDOUT
	    print $result;
	    print "\n";

	    # restore to original working directory
	    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
	}

	&main(@ARGV);
	185
	186
	187
[2241]	188	# Document-type conversion functions
[1445]	189	#
	190	# The following functions attempt to convert documents from their
	191	# input type to the specified output type. If no output type was
	192	# given, then they first attempt HTML, and then TEXT.
	193	#
	194	# Each returns the output type ("html" or "text") or "fail" if no
	195	# conversion is possible.
	196
	# Convert a Microsoft Word document.
	#
	# The file content is sniffed first because many .doc files are not in
	# fact Word documents.  The result of whichever converter is chosen is
	# returned unchanged.
	sub convertDOC {
	    my ($input_filename, $output_filestem, $output_type) = @_;

	    my $detected = find_docfile_type($input_filename);
	    my %is_word678 = map { $_ => 1 } qw(word6 word7 word8);

	    return convertWord678($input_filename, $output_filestem, $output_type)
	        if $is_word678{$detected};

	    return convertRTF($input_filename, $output_filestem, $output_type)
	        if $detected eq "rtf";

	    # anything else gets the generic treatment
	    return convertAnything($input_filename, $output_filestem, $output_type);
	}
	213
	# Convert a Microsoft Word 6/7/8 document.
	#
	# When HTML is acceptable (no output type given, or one containing
	# "html") a dedicated Word converter is tried: the Windows scripting one
	# if -windows_scripting was given, otherwise doc_to_html.  If that is
	# skipped or fails, the generic convertAnything path decides the result.
	sub convertWord678 {
	    my ($input_filename, $output_filestem, $output_type) = @_;

	    if (!$output_type || ($output_type =~ m/html/i)) {
	        my $converted = $windows_scripting
	            ? native_doc_to_html($input_filename, $output_filestem)
	            : doc_to_html($input_filename, $output_filestem);
	        return "html" if $converted;
	    }

	    return convertAnything($input_filename, $output_filestem, $output_type);
	}
	233
	234
	# Convert a Rich Text Format (RTF) file.
	#
	# Only a specialised conversion to HTML is attempted (Windows scripting
	# if -windows_scripting was given, otherwise rtf_to_html).  Returns "html"
	# on success and "fail" in every other case.
	sub convertRTF {
	    my ($input_filename, $output_filestem, $output_type) = @_;

	    if (!$output_type || ($output_type =~ m/html/i)) {
	        my $converted = $windows_scripting
	            ? native_doc_to_html($input_filename, $output_filestem)
	            : rtf_to_html($input_filename, $output_filestem);
	        return "html" if $converted;
	    }

	    # RTF is so ugly that it is not worth running strings over it, so
	    # there is deliberately no convertAnything fallback here.
	    # One day I'll write some quick'n'dirty regexps to try to extract
	    # text - jrm21
	    return "fail";
	}
	261
	262
	# Convert an unidentified file.
	#
	# Tries the generic HTML converter and then the generic text converter,
	# each only if the requested output type permits it (an empty output
	# type permits both).  Returns "html", "text" or "fail".
	sub convertAnything {
	    my ($input_filename, $output_filestem, $output_type) = @_;

	    my $no_preference = !$output_type;

	    # Attempt simple conversion to HTML
	    if ($no_preference || ($output_type =~ m/html/i)) {
	        return "html" if any_to_html($input_filename, $output_filestem);
	    }

	    # Convert to text
	    if ($no_preference || ($output_type =~ m/text/i)) {
	        return "text" if any_to_text($input_filename, $output_filestem);
	    }

	    return "fail";
	}
	287
	288
[1654]	289
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page images
# (returns "item"), HTML (returns "html") and plain text (returns "text").
# Returns "fail" if nothing that was attempted succeeded.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # NOTE(review): as written this only drops the single character before
    # a '-' together with the '-' itself; it looks like it was meant to keep
    # just the part after the hyphen (e.g. ".*\-(.*)") -- confirm against
    # the repository copy before changing it.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
	324
	325
	# Convert an Adobe PostScript document.
	#
	# Tries page images (returns "item") when the output type names an image
	# format, then plain text (returns "text") when the output type is empty
	# or contains "text".  No HTML conversion is attempted.  Returns "fail"
	# otherwise.
	sub convertPS {
	    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

	    # NOTE(review): as written this only drops the single character
	    # before a '-' together with the '-' itself; it may have been meant
	    # to keep just the part after the hyphen -- confirm against the
	    # repository copy before changing it.
	    $output_type =~ s/.\-(.)/$1/i;

	    # Attempt conversion to Image
	    if ($output_type =~ m/jp?g|gif|png/i) {
	        return "item"
	            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
	    }

	    # Attempt conversion to TEXT
	    if (!$output_type || ($output_type =~ m/text/i)) {
	        return "text" if ps_to_text($input_filename, $output_filestem);
	    }

	    return "fail";
	}
	350
	351
# Convert a Microsoft PowerPoint file.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run (returns "item"; also returned straight away
# if the output directory already exists).  Otherwise, when HTML is
# acceptable, ppttohtml.pl is run (returns "html").  If the chosen converter
# fails, or neither applies, any_to_text is the last resort (returns "text"),
# and "fail" is returned when that does not succeed either.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # image format switch understood by pptextract
        my $ppt_convert_type =
              ($output_type =~ m/gif/i)  ? "-g"
            : ($output_type =~ m/jp?g/i) ? "-j"
            : ($output_type =~ m/png/i)  ? "-p"
            :                              "";

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        # on windows rely on the PATH instead of a full path
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	414
	415
# Convert a Microsoft Excel file.
#
# Runs xlstohtml.pl when HTML is acceptable (returns "html" on success);
# otherwise, or if that fails, falls back on any_to_text (returns "text").
# Returns "fail" when nothing succeeded.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	448
	449
	450
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Parameter: $input_filename - path of the file to inspect.
# Returns:   "rtf" if the first line starts with "{\rtf";
#            "word6", "word7" or "word8" if any line contains
#            "Word.Document.<6|7|8>";
#            "unknown" otherwise, including when the file cannot be opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    # Three-argument open on a lexical handle: the filename is never
    # interpreted as a mode, and the handle is closed on every path.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file (first line only)
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    close($chk);
    return $type;
}
	486
	487
[1734]	488	# Specific type-to-type conversions
[1445]	489	#
	490	# Each of the following functions attempts to convert a document from
[2755]	491	# a specific format to another. If they succeed they return 1 and leave
[1445]	492	# the output document(s) in the appropriate place; if they fail they
	493	# return 0 and delete any working files.
	494
	495
	# Attempt to convert a word document to html with the wv program (wvWare).
	#
	# Parameters: $input_filename  - the Word document to convert
	#             $output_filestem - output path without extension; the html
	#                                goes to "$output_filestem.html" and
	#                                wvWare's stderr to "$output_filestem.err"
	# Returns:    1 if "$output_filestem.html" was produced and its first
	#             line contains "DOCTYPE HTML"; 0 otherwise (the html and
	#             err files are then removed, and the error output is
	#             appended to the -errlog file when one was given).
	sub doc_to_html {
	    my ($input_filename, $output_filestem) = @_;

	    my $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

	    # A self-contained wv tree under bin/linux/wv takes precedence
	    if (-d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux") {
	        my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
	        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";

	        # LD_LIBRARY_PATH is often unset: do not interpolate undef, and do
	        # not leave a trailing ':' (an empty entry) behind in that case.
	        my $old_ld_path = $ENV{'LD_LIBRARY_PATH'};
	        $ENV{'LD_LIBRARY_PATH'} = (defined $old_ld_path && $old_ld_path ne "")
	            ? "$wv_home/lib:$old_ld_path"
	            : "$wv_home/lib";

	        $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
	    }

	    # don't include path on windows (to avoid having to play about
	    # with quoting when GSDLHOME might contain spaces) but assume
	    # that the PATH is set up correctly
	    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

	    my $wv_conf = util::filename_cat($ENV{'GSDLHOME'}, "etc",
	                                     "packages", "wv", "wvHtml.xml");

	    # Added to work with replace_srcdoc_with_html.pl:
	    # wvWare is made to put any associated (image) files of the word doc
	    # into a folder <docname-without-extension>_files next to the
	    # generated html, the same name Windows ($windows_scripting) uses.
	    # replace_srcdoc_with_html.pl later moves the html and that folder
	    # into import; keeping the images in their own folder stops them
	    # overwriting similarly named items already there.
	    #
	    # This uses wvWare's --dir and --basename options: dir is the full
	    # path to the image folder, basename is that path plus the name to be
	    # prepended to every image file (eg. "/full/path/to/imgdir/sample"
	    # for sample0.jpg .. sampleN.jpg).  HOWEVER basename always takes a
	    # full path, not a relative url, so the greenstone browser cannot
	    # display the images (absolute paths give an "external link"
	    # message); the links are made relative again further down.
	    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
	    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

	    # toppath is the folder where html is generated
	    # docname is the name (without extension) of the html to be generated
	    # suffix (extension) is thrown away
	    my ($docname, $toppath)
	        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

	    my $assoc_dir = util::filename_cat($toppath, $docname."_files");

	    # ensure this image directory exists
	    # if it exists already, just delete and recreate
	    util::rm_r($assoc_dir) if -e $assoc_dir;
	    util::mk_dir($assoc_dir);

	    # prefix handed to wvWare for naming the image files
	    my $img_basenames = util::filename_cat($assoc_dir, $docname);

	    my $cmd = "";
	    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
	    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
	    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
	    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

	    # redirecting STDERR is a bad idea on windows 95/98
	    $cmd .= " 2> \"$output_filestem.err\""
	        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

	    # execute the command
	    $! = 0;
	    if (system($cmd) != 0) {
	        print STDERR "Error executing wv converter:$!\n";

	        # echo the converter's error output to STDERR and the fail log
	        if (-s "$output_filestem.err"
	            && open(my $errfile, '<', "$output_filestem.err"))
	        {
	            my $faillog;
	            if ($faillogfile ne "") {
	                open($faillog, '>>', $faillogfile) or undef $faillog;
	            }

	            while (my $line = <$errfile>) {
	                if ($line =~ m/\w/) {
	                    print STDERR "$line";
	                    print {$faillog} "$line" if $faillog;
	                }
	                next if $line !~ m/startup error/;
	                print STDERR " (given an invalid .DOC file?)\n";
	                print {$faillog} " (given an invalid .DOC file?)\n" if $faillog;
	            }

	            close($faillog) if $faillog;
	            close($errfile);
	        }
	        return 0; # we can try any_to_text
	    }

	    # Was the conversion successful?
	    if (-s "$output_filestem.html") { # file has non-zero size
	        my $first_line;
	        if (open(my $html, '<', "$output_filestem.html")) {
	            $first_line = <$html>;
	            close($html);
	        }
	        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
	            util::rm("$output_filestem.err") if -e "$output_filestem.err";

	            if (util::is_dir_empty($assoc_dir)) {
	                # no images in the word document: drop the empty folder
	                util::rm_r($assoc_dir);
	            } else {
	                # The html contains absolute links into the image folder.
	                # Replace them with relative links, so the folder can be
	                # moved elsewhere.
	                make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
	            }
	            return 1;
	        }
	    }

	    # If here, an error of some sort occurred
	    util::rm("$output_filestem.html") if -e "$output_filestem.html";
	    if (-e "$output_filestem.err") {
	        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
	            if (open(my $errlog, '<', "$output_filestem.err")) {
	                while (my $line = <$errlog>) {
	                    print {$faillog} $line;
	                }
	                close($errlog);
	            }
	            close($faillog);
	        }
	        util::rm("$output_filestem.err");
	    }

	    return 0;
	}
	651
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Parameters:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
# Returns 1 on success, 0 (after a message on STDERR) if the html file cannot
# be read or rewritten.
sub make_links_to_assocdir_relative {
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the <...> operator reads the entire file at once
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);

    # 2. Replace all occurrences of assoc_dir_path in a-href and img-src
    #    values with assoc_dirname.
    #
    # wvWare puts %20 in place of a space, so the prefix we look for is
    # changed to match.  The prefix is then escaped as a whole with
    # quotemeta, so every regex metacharacter a path may contain (".", "-",
    # "[", "]", Windows backslashes, but also "(", ")", "+", ...) is matched
    # literally.
    my $url_prefix = $assoc_dir_path;
    $url_prefix =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($url_prefix);

    # Look for <a or <img, followed by any other attributes and values up to
    # the FIRST (non-greedy) href= or src=, an optional opening quote, the
    # associated folder's pathname, then the characters of the image
    # filename, the optional closing quote and anything else up to the first
    # > that ends the tag.  Everything before the folder path is kept ($1),
    # the folder path becomes the bare folder name, and the rest of the tag
    # is kept ($2).  /s lets . match newlines so tags may span lines; /g
    # replaces every occurrence.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)$safe_reg_expression(.*?(?:\"|\')?.*?>)/${1}${assoc_dirname}${2}/sg;

    # Now hand link text to post_process_assocfile_urls, which turns %20
    # back into a literal space and backslashes into url slashes.
    # NOTE(review): the middle group is greedy; together with /s it runs from
    # the first matching tag to the last '>' in the document, so the clean-up
    # is applied to everything in between, not just to the attribute values.
    # Confirm that this is intended before narrowing it.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)(.*)(.*?(?:\"|\')?.*?>)/post_process_assocfile_urls($1, $2, $3)/sge;

    # 3. Delete the original file and recreate it with the updated contents
    my $copy_of_filename = $html_file;
    util::rm($copy_of_filename);

    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print {$fout} $html_contents;

    # buffered write errors only surface when the handle is closed
    unless (close($fout)) {
        print STDERR "gsConvert.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
	737
# Utility routine that cleans up the text of an img src/href link.
#
# Given the text before the link ($pre), the link text itself ($text) and
# the text after it ($post), it rewrites $text in three steps, in this
# order, and returns the three pieces joined back together:
#   1. every %20 becomes a literal space (not an underscore, since
#      underscores mess with incremental rebuild);
#   2. every windows-style backslash becomes a url slash (/);
#   3. every remaining percent sign becomes %25.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;
        tr{\\}{/};
        s/%/%25/g;
    }

    return join("", $pre, $text, $post);
}
	753
# Attempt to convert a word document to html with the word2html scripting program
#
# Parameters: $input_filename  - the document to convert
#             $output_filestem - output path without extension; the html goes
#                                to "$output_filestem.html" and the script's
#                                stderr to "$output_filestem.err"
# Returns:    1 if "$output_filestem.html" already exists (conversion is
#             skipped) or was produced with "html" in its first line;
#             0 otherwise (the html and err files are then removed, and the
#             error output is appended to the -errlog file when one was given).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                      $ENV{'GSDLOS'}, "word2html");

    # on windows rely on the PATH instead of a full path
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";

        # echo the converter's error output to STDERR and the fail log
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err"))
        {
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef $faillog;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if $faillog;
                }
                next if $line !~ m/startup error/;
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n" if $faillog;
            }

            close($faillog) if $faillog;
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }
    return 0;
}
	832
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments:
#   $input_filename  - the RTF document to convert
#   $output_filestem - output path without extension
#
# Returns 1 if "$output_filestem.html" ends up containing some text after
# its <body> line, 0 otherwise. On success, if rtftohtml also wrote
# "${output_filestem}_ToC.html", that table of contents is spliced into the
# html after its header, with the links made relative.
#
# Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if ((-s "$output_filestem.html")
        && open(my $html, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close $html;
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own. The former
            # 'open H, "file" || ++$open_failed' bound the || to the
            # filename string, so a failed open was never noticed.
            my $src_ok  = open(my $htmlsrc, '<', "$output_filestem.src");
            my $toc_ok  = open(my $toc,     '<', "${output_filestem}_ToC.html");
            my $html_ok = open(my $html,    '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $html_ok)) {
                # Give up on the ToC and put the converted html back.
                close $htmlsrc if ($src_ok);
                close $toc     if ($toc_ok);
                close $html    if ($html_ok);
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character; that line is dropped).
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc>;   # ignore first 2 lines
            $ignored    = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html} $list_start if (defined($list_start));
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $line;
            }
            close $toc;

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close $htmlsrc;
            close $html;

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $errline = <$errlog>) {
                    print {$faillog} $errline;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
	949
	950
# Convert a pdf file to html with the pdftohtml command
#
# Runs "perl -S pdftohtml.pl" on $input_filename, writing to
# $output_filestem. Returns 1 when the command exits with status 0 and
# "$output_filestem.html" is non-empty; otherwise reports the converter's
# error output, removes the partial results and returns 0.
# ($dirname is accepted but not used here.)

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # Collect the optional switches first, then assemble the command line.
    my @switches;
    push(@switches, "-c")      if ($pdf_complex);
    push(@switches, "-i")      if ($pdf_ignore_images);
    push(@switches, "-a")      if ($pdf_allow_images_only);
    push(@switches, "-hidden") unless ($pdf_nohidden);

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    foreach my $switch (@switches) {
        $cmd .= " $switch";
    }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stdout and stderr get separate files except on windows versions
    # other than NT/2000, where only stdout is redirected (to the .err file).
    my $separate_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $separate_streams
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason = $! ? ": $!" : "";
        print STDERR "Error executing pdftohtml.pl" . $reason . "\n";
    }

    # Success: the converter made something, so just tidy up.
    if ($retval == 0 && (-s $html_file)) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on.
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $errline = <$errlog>) {
            print STDERR "$errline";
        }
        close $errlog;
    }

    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        # Keep a copy of the error output in the fail log, when one is set.
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open(my $errlog, $err_file);
            while (my $errline = <$errlog>) {
                print {$faillog} $errline;
            }
            close $errlog;
            close $faillog;
        }
        &util::rm($err_file);
    }

    return 0;
}
	1013
	1014	# Convert a pdf file to various types of image with the convert command
	1015
[17329]	1016	sub pdfps_to_img {
[10357]	1017	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[10401]	1018
	1019	# Check that ImageMagick is installed and available on the path (except for Windows 95/98)
	1020	if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
	1021	my $result = `identify 2>&1`;
	1022	if ($? == -1 \|\| $? == 256) { # Linux and Windows return different values for "program not found"
	1023	#ImageMagick is not installed, thus the convert utility is not available.
[17329]	1024	print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
[10401]	1025	return 0;
	1026	}
	1027	}
	1028
[22429]	1029	my $cmd = "";
[10357]	1030	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	1031	$output_type =~ s/.\_(.)/$1/i;
[17329]	1032	$cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
[16435]	1033	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[10357]	1034	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1035	} else {
	1036	$cmd .= " > \"$output_filestem.err\"";
	1037	}
	1038
	1039	# don't include path on windows (to avoid having to play about
	1040	# with quoting when GSDLHOME might contain spaces) but assume
	1041	# that the PATH is set up correctly
	1042	$!=0;
	1043	my $retval=system($cmd);
	1044	if ($retval!=0)
	1045	{
[10401]	1046	print STDERR "Error executing pdftoimg.pl";
[10357]	1047	if ($!) {print STDERR ": $!";}
	1048	print STDERR "\n";
	1049	}
	1050
	1051	#make sure the converter made something
	1052	#if ($retval !=0) \|\| ! -s "$output_filestem")
	1053	if ($retval !=0)
	1054	{
	1055	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1056	#print out the converter's std err, if any
	1057	if (-s "$output_filestem.err") {
	1058	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[17329]	1059	print STDERR "pdfpstoimg error log:\n";
[10357]	1060	while (<ERRLOG>) {
	1061	print STDERR "$_";
	1062	}
	1063	close ERRLOG;
	1064	}
[10534]	1065	#&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[10357]	1066	if (-e "$output_filestem.err") {
	1067	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1068	{
	1069	open (ERRLOG, "$output_filestem.err");
	1070	while (<ERRLOG>) {print FAILLOG $_;}
	1071	close ERRLOG;
	1072	close FAILLOG;
	1073	}
	1074	&util::rm("$output_filestem.err");
	1075	}
	1076	return 0;
	1077	}
[2656]	1078	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1079	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1080	return 1;
	1081	}
	1082
	1083	# Convert a PDF file to text with the pdftotext command
	1084
	1085	sub pdf_to_text {
[2755]	1086	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1087
[2248]	1088	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1089
[16435]	1090	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1091	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1092	} else {
	1093	$cmd .= " > \"$output_filestem.err\"";
	1094	}
[1445]	1095
[2060]	1096	if (system($cmd)!=0)
[1445]	1097	{
	1098	print STDERR "Error executing $cmd: $!\n";
	1099	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1100	}
	1101
[2755]	1102	# make sure there is some extracted text.
	1103	if (-e "$output_filestem.text") {
	1104	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1105	binmode(EXTR_TEXT); # just in case...
	1106	my $line="";
	1107	my $seen_text=0;
	1108	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1109	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1110	}
	1111	close EXTR_TEXT;
	1112	if ($seen_text==0) { # no text was extracted
	1113	print STDERR "Error: pdftotext found no text\n";
	1114	&util::rm("$output_filestem.text");
	1115	}
	1116	}
	1117
[1692]	1118	# make sure the converter made something
[2656]	1119	if (! -s "$output_filestem.text")
[1692]	1120	{
	1121	# print out the converters std err, if any
[2656]	1122	if (-s "$output_filestem.err") {
[1692]	1123	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1124	print STDERR "pdftotext error log:\n";
[1692]	1125	while (<ERRLOG>) {
	1126	print STDERR "$_";
	1127	}
	1128	close ERRLOG;
	1129	}
[2656]	1130	# does this converter create a .out file?
	1131	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1132	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1133	if (-e "$output_filestem.err") {
	1134	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1135	{
	1136	open (ERRLOG,"$output_filestem.err");
	1137	while (<ERRLOG>) {print FAILLOG $_;}
	1138	close ERRLOG;
	1139	close FAILLOG;
	1140	}
	1141	&util::rm("$output_filestem.err");
	1142	}
[1692]	1143	return 0;
	1144	}
[1445]	1145	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	1146	return 1;
	1147	}
	1148
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript document to convert
#   $output_filestem - output path without extension; the result is
#                      written to "$output_filestem.text"
#
# If gs cannot be used (windows) or fails, falls back to a crude
# regexp-based extraction of the strings in the file. Either way the text
# is then re-wrapped at the first whitespace after 72 characters.
#
# Returns 1 on success, 0 if the fallback cannot read the input, cannot
# write the output, or the input does not start with "%!".
# Dies if the temporary files used for wrapping cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # Quote the redirect target like every other path in the command,
        # so that a filestem containing spaces still works.
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8; # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', "$output_filestem.text")) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $psout;
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if ((-e "$output_filestem.err")
                && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $errline = <$errlog>) {
                    print {$faillog} $errline;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        # Read and validate the input before creating the output file, so
        # a failure here leaves no empty .text file or open handle behind.
        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = join('', <$in>); # this is for whole .ps file...
                                    # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $out;
        if (!open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$out} "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # 'or' (not '||') so that the die really is tied to the open failing.
    open(my $infile, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Cut after the first whitespace past the minimum length. If
            # no cut point is found, emit the remainder as-is; otherwise
            # $line would never shrink and this loop would not terminate.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
	1318
	1319
	1320	# Convert any file to HTML with a crude perl implementation of the
	1321	# UNIX strings command.
	1322
	1323	sub any_to_html {
[22429]	1324	my ($input_filename, $output_filestem) = @_;
[1445]	1325
	1326	# First generate a text file
	1327	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1328
	1329	# create an HTML file from the text file
	1330	open(TEXT, "<$output_filestem.text");
	1331	open(HTML, ">$output_filestem.html");
	1332
[2241]	1333	print HTML "<html><head>\n";
	1334	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1335	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1336	print HTML "</head><body>\n\n";
[1734]	1337
[2755]	1338	my $line;
	1339	while ($line=<TEXT>) {
	1340	$line =~ s/</</g;
	1341	$line =~ s/>/>/g;
[16435]	1342	if ($line =~ m/^\s*$/) {
[2755]	1343	print HTML "<p>";
	1344	} else {
	1345	print HTML "<br> ", $line;
	1346	}
[1445]	1347	}
[1734]	1348	print HTML "\n</body></html>\n";
[1445]	1349
[2241]	1350	close HTML;
	1351	close TEXT;
	1352
[1445]	1353	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1354	return 1;
	1355	}
	1356
	1357	# Convert any file to TEXT with a crude perl implementation of the
	1358	# UNIX strings command.
[2755]	1359	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1360
	1361	sub any_to_text {
[22429]	1362	my ($input_filename, $output_filestem) = @_;
[1445]	1363
[3350]	1364	if (!$use_strings) {
	1365	return 0;
	1366	}
[15120]	1367
	1368	print STDERR "\n** In any to text**\n\n";
[2755]	1369	open(IN, "<$input_filename") \|\| return 0;
[1734]	1370	binmode(IN);
[2755]	1371	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1372
	1373	my ($line);
[2755]	1374	my $output_line_count = 0;
[1445]	1375	while (<IN>) {
	1376	$line = $_;
[1734]	1377
[1445]	1378	# delete anything that isn't a printable character
	1379	$line =~ s/[^\040-\176]+/\n/sg;
	1380
	1381	# delete any string less than 10 characters long
[1734]	1382	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1383	while ($line =~ m/^.{1,9}$/m) {
[1734]	1384	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1385	$line =~ s/\n+/\n/sg;
	1386	}
	1387
	1388	# remove extraneous whitespace
	1389	$line =~ s/\n+/\n/gs;
	1390	$line =~ s/^\n//gs;
[1578]	1391
[1445]	1392	# output whatever is left
[16435]	1393	if ($line =~ m/[^\n ]/) {
[1445]	1394	print OUT $line;
[2755]	1395	++$output_line_count;
[1445]	1396	}
	1397	}
[2241]	1398
	1399	close OUT;
	1400	close IN;
	1401
[2755]	1402	if ($output_line_count) { # try to protect against binary only formats
	1403	return 1;
	1404	}
	1405
	1406	&util::rm("$output_filestem.text");
	1407	return 0;
	1408
[1445]	1409	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: