Context Navigation

source: gsdl/trunk/bin/script/gsConvert.pl@ 19763

Last change on this file since 19763 was 19763, checked in by ak19, 15 years ago
No longer convert spaces to underscores in the rename_file subroutine, since underscores mess up incremental build (file renaming forces incremental building to rebuild everything again since incr building thinks the file with the original file has been deleted and new files have been added).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 44.0 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
	51	use parsargv;
	52	use util;
	53	use Cwd;
	54	use File::Basename;
	55
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module is loaded inside an eval: if the require dies, the eval
# yields undef and the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT() };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;
[1445]	59
[3350]	60	my $use_strings;
[3720]	61	my $pdf_complex;
[4103]	62	my $pdf_nohidden;
[3720]	63	my $pdf_zoom;
	64	my $pdf_ignore_images;
[10451]	65	my $pdf_allow_images_only;
[10282]	66	my $windows_scripting;
[3350]	67
# print_usage()
#
# Writes the command-line help to STDERR and terminates the script with
# exit status 1. Never returns.
sub print_usage
{
    # The help text is collected as a list of fragments and emitted with a
    # single print; the fragments are concatenated unchanged.
    my @usage = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    print STDERR @usage;
    exit(1);
}
	89
[2755]	90	my $faillogfile="";
[3538]	91	my $timeout=0;
[1445]	92
	93	sub main
	94	{
	95	my (@ARGV) = @_;
[3538]	96	my ($input_type,$output_type,$verbose);
[1960]	97
[1445]	98	# read command-line arguments
	99	if (!parsargv::parse(\@ARGV,
[3400]	100	'type/(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)/', \$input_type,
[2755]	101	'/errlog/.*/', \$faillogfile,
[17194]	102	'output/(auto\|html\|text\|pagedimage).*/', \$output_type,
[1692]	103	'timeout/\d+/0',\$timeout,
[10282]	104	'verbose/\d+/0', \$verbose,
[3720]	105	'use_strings', \$use_strings,
[10282]	106	'windows_scripting',\$windows_scripting,
[3720]	107	'pdf_complex', \$pdf_complex,
[9482]	108	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	109	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	110	'pdf_nohidden', \$pdf_nohidden,
[3720]	111	'pdf_zoom/\d+/2', \$pdf_zoom
	112	))
[1445]	113	{
	114	print_usage();
	115	}
[12704]	116
[1445]	117	# Make sure the input file exists and can be opened for reading
	118	if (scalar(@ARGV!=1)) {
	119	print_usage();
	120	}
[1928]	121
[1445]	122	my $input_filename = $ARGV[0];
	123	if (!-r $input_filename) {
	124	print STDERR "Error: unable to open $input_filename for reading\n";
	125	exit(1);
	126	}
	127
	128	# Deduce filenames
	129	my ($tailname,$dirname,$suffix)
[2241]	130	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	131	my $output_filestem = &util::filename_cat($dirname, "$tailname");
[1445]	132
	133	if ($input_type eq "")
	134	{
[2241]	135	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	136	}
	137
	138	# Change to temporary working directory
	139	my $stored_dir = cwd();
	140	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	141
[1445]	142	# Select convert utility
	143	if (!defined $input_type) {
	144	print STDERR "Error: No filename extension or input type defined\n";
	145	exit(1);
	146	}
[3400]	147	elsif ($input_type eq "doc" \|\| $input_type eq "dot") {
[1445]	148	print &convertDOC($input_filename, $output_filestem, $output_type);
	149	print "\n";
	150	}
[1684]	151	elsif ($input_type eq "rtf") {
	152	print &convertRTF($input_filename, $output_filestem, $output_type);
	153	print "\n";
	154	}
[1445]	155	elsif ($input_type eq "pdf") {
	156	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	157	print "\n";
	158	}
	159	elsif ($input_type eq "ps") {
	160	print &convertPS($input_filename, $output_filestem, $output_type);
	161	print "\n";
	162	}
[2977]	163	elsif ($input_type eq "ppt") {
	164	print &convertPPT($input_filename, $output_filestem, $output_type);
	165	print "\n";
	166	}
[2991]	167	elsif ($input_type eq "xls") {
	168	print &convertXLS($input_filename, $output_filestem, $output_type);
	169	print "\n";
	170	}
[1445]	171	else {
	172	print STDERR "Error: Unable to convert type '$input_type'\n";
	173	exit(1);
	174	}
	175
	176	# restore to original working directory
	177	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	178
	179	}
	180
	181	&main(@ARGV);
	182
	183
	184
[2241]	185	# Document-type conversion functions
[1445]	186	#
	187	# The following functions attempt to convert documents from their
	188	# input type to the specified output type. If no output type was
	189	# given, then they first attempt HTML, and then TEXT.
	190	#
	191	# Each returns the output type ("html" or "text") or "fail" if no
	192	# conversion is possible.
	193
	194	# Convert a Microsoft word document
	195
	196	sub convertDOC {
	197	($input_filename, $output_filestem, $output_type) = @_;
	198
[1654]	199	# Many .doc files are not in fact word documents!
	200	my $realtype = &find_docfile_type($input_filename);
	201
[1734]	202	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
[1654]	203	return &convertWord678($input_filename, $output_filestem, $output_type);
	204	} elsif ($realtype eq "rtf") {
	205	return &convertRTF($input_filename, $output_filestem, $output_type);
	206	} else {
	207	return &convertAnything($input_filename, $output_filestem, $output_type);
	208	}
	209	}
	210
	211	# Convert a Microsoft word 6/7/8 document
	212
	213	sub convertWord678 {
	214	($input_filename, $output_filestem, $output_type) = @_;
	215
[1445]	216	my $success = 0;
[16435]	217	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	218	if ($windows_scripting) {
	219	$success = &native_doc_to_html($input_filename, $output_filestem);
	220	}
	221	else {
	222	$success = &doc_to_html($input_filename, $output_filestem);
	223	}
[1445]	224	if ($success) {
[10282]	225	return "html";
[1445]	226	}
	227	}
[1654]	228	return &convertAnything($input_filename, $output_filestem, $output_type);
	229	}
	230
	231
	232	# Convert a Rich Text Format (RTF) file
	233
	234	sub convertRTF {
	235	($input_filename, $output_filestem, $output_type) = @_;
	236
	237	my $success = 0;
	238
	239	# Attempt specialised conversion to HTML
[16435]	240	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	241
	242	if ($windows_scripting) {
	243	$success = &native_doc_to_html($input_filename, $output_filestem);
	244	}
	245	else {
	246	$success = &rtf_to_html($input_filename, $output_filestem);
	247	}
[1654]	248	if ($success) {
	249	return "html";
	250	}
	251	}
	252
[2755]	253	# rtf is so ugly that's it's not worth running strings over.
	254	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	255	# return &convertAnything($input_filename, $output_filestem, $output_type);
	256	return "fail";
[1654]	257	}
	258
	259
	260	# Convert an unidentified file
	261
	262	sub convertAnything {
	263	($input_filename, $output_filestem, $output_type) = @_;
	264
	265	my $success = 0;
[10464]	266
[1445]	267	# Attempt simple conversion to HTML
[16435]	268	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	269	$success = &any_to_html($input_filename, $output_filestem);
	270	if ($success) {
	271	return "html";
	272	}
	273	}
	274
	275	# Convert to text
[16435]	276	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	277	$success = &any_to_text($input_filename, $output_filestem);
[1445]	278	if ($success) {
	279	return "text";
	280	}
	281	}
	282	return "fail";
	283	}
	284
	285
[1654]	286
# Convert an Adobe PDF document
#
# Arguments: directory of the input file, input filename, output file
# stem, requested output type (may be empty/undefined).
# Tries, in this order and depending on the requested type: page images
# (returns "item"), HTML (returns "html"), text (returns "text").
# Returns "fail" if nothing succeeded.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # The output type is optional. Normalise "not given" to the empty
    # string so the substitution and matches below never touch undef;
    # the "!$output_type" tests treat both the same way.
    $output_type = "" unless defined $output_type;

    # NOTE(review): this substitution is reproduced exactly as listed. It
    # only rewrites a single character on either side of a hyphen, which
    # looks like a pattern that lost its "*" quantifiers -- confirm
    # against the repository copy.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        if (&pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
	321
	322
	323	# Convert an Adobe PostScript document
	324
	325	sub convertPS {
	326	($input_filename, $output_filestem, $output_type) = @_;
	327
	328	my $success = 0;
[10534]	329	$output_type =~ s/.\-(.)/$1/i;
	330	# Attempt coversion to Image
[16435]	331	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	332	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10534]	333	if ($success){
	334	return "item";
	335	}
	336	}
[1445]	337
	338	# Attempt conversion to TEXT
[16435]	339	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	340	$success = &ps_to_text($input_filename, $output_filestem);
	341	if ($success) {
	342	return "text";
	343	}
	344	}
	345	return "fail";
	346	}
	347
	348
# Convert a Microsoft PowerPoint document
#
# Arguments: input filename, output file stem, requested output type (may
# be empty/undefined).
# - With -windows_scripting and an output type that is neither html nor
#   text, runs the pptextract script (gif/jpg/png selected from the output
#   type) and returns "item" on success, or immediately if the output
#   directory already exists.
# - Otherwise, when HTML is acceptable, runs ppttohtml.pl and returns
#   "html" on success.
# - If neither applied or the converter failed, falls back to any_to_text
#   and returns "text" on success.
# Returns "fail" when nothing worked.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # The output type is optional; never run pattern matches on undef
    $output_type = "" unless defined $output_type;

    # stderr is only redirected where the shell supports it
    my $redirect_stderr
        = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # Command line handed to the shell; kept lexical so nothing leaks out
    my $cmd = "";

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        if ($timeout) {$cmd = "ulimit -t $timeout;";}

        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $redirect_stderr;

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $redirect_stderr;

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: plain text extraction
    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
	410
	411
# Convert a Microsoft Excel document
#
# Arguments: input filename, output file stem, requested output type (may
# be empty/undefined).
# When HTML is acceptable, runs xlstohtml.pl and returns "html" on
# success. Otherwise, or if that converter failed, falls back to
# any_to_text and returns "text" on success. Returns "fail" when nothing
# worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command (lexical, so nothing leaks into a global)
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";

        # stderr is only redirected where the shell supports it
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: plain text extraction
    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
	444
	445
	446
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the filename to inspect.
# Returns "rtf" if the first line starts with "{\rtf", "word6", "word7"
# or "word8" if any line contains "Word.Document.<6|7|8>", and "unknown"
# otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    my $chk;
    unless (open($chk, '<', $input_filename)) {
        print STDERR "gsConvert.pl: Unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
	482
	483
[1734]	484	# Specific type-to-type conversions
[1445]	485	#
	486	# Each of the following functions attempts to convert a document from
[2755]	487	# a specific format to another. If they succeed they return 1 and leave
[1445]	488	# the output document(s) in the appropriate place; if they fail they
	489	# return 0 and delete any working files.
	490
	491
	492	# Attempt to convert a word document to html with the wv program
	493	sub doc_to_html {
	494	($input_filename, $output_filestem) = @_;
	495
[2023]	496	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
	497	$ENV{'GSDLOS'}, "wvWare");
[1928]	498
[2241]	499	# don't include path on windows (to avoid having to play about
	500	# with quoting when GSDLHOME might contain spaces) but assume
	501	# that the PATH is set up correctly
[16435]	502	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
[2241]	503
[2512]	504	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
[2574]	505	"packages", "wv", "wvHtml.xml");
[1928]	506
[15120]	507	# Added the following to work with replace_srcdoc_with_html.pl:
	508	# Make wvWare put any associated (image) files of the word doc into
	509	# folder docname-without-extention_files. This folder should be at
	510	# the same level as the html file generated from the doc.
	511	# wvWare will take care of proper interlinking.
	512
	513	# This step is necessary for replace_srcdoc_with_html.pl which will
	514	# move the html and associated files into the import folder. We
	515	# want to ensure that the associated files won't overwrite similarly
	516	# named items already in import. Hence we put them in a folder first
	517	# (to which the html links properly) and that will allow
	518	# replace_srcdoc_with_html.pl to move them safely to /import.
	519
	520	# To do all this, we need to use wvWare's --dir and --basename options
	521	# where dir is the full path to the image folder directory and
	522	# basename is the full path to the image folder appended to the name
	523	# which is to be prepended to every image file:
	524	# eg. if the images were to have names like sample0.jpg to sampleN.jpg,
	525	# then the basename is "/full/path/to/imgdir/sample".
	526	# In this case, basename is the full path to and name of the document.
	527	# HOWEVER: basename always takes full path, not relative url, so
	528	# the greenstone browser is unable to display the images (absolute paths
	529	# cause it to give an "external link" message)
	530	# See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
	531	# and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
	532	# "added --dir option to wvHtml so that pictures can be placed in
	533	# a seperate directory"
	534	# "running wvWare through IMP to view word documents as html. It gets
	535	# invoked like this:
	536	# wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"
	537
	538	# toppath is the folder where html is generated
	539	# docname is the name (without extension) of the html to be generated
	540	# suffix (extension) is thrown away
	541	my ($docname, $toppath)
	542	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	543
	544	# We want the image folder generated to have the same name as windows
	545	# would generate ($windows_scripting) when it converts from word to html.
	546	# That is, foldername=docname_files
	547	my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
	548	#print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files"
	549
	550	# ensure this image directory exists
	551	# if it exists already, just delete and recreate
	552	if(-e $assoc_dir) {
	553	&util::rm_r($assoc_dir);
	554	}
	555	&util::mk_dir($assoc_dir);
	556
	557	# the images are all going to be called image0, image1,..., imageN
	558	my $img_basenames = &util::filename_cat($assoc_dir, $docname);
	559
	560	#print STDERR "**toppath: $toppath\n**docname: $docname\n;
	561	#print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
	562	#print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);
	563
[2241]	564	my $cmd = "";
[1692]	565	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[15120]	566	# wvWare's --dir and --basename options for image directory.
	567	# Replaced the next line with the 2 lines following it:
	568	# $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
	569	$cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
	570	$cmd .= " --charset utf-8 --config \"$wv_conf\"";
[2241]	571	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
[15120]	572
[2241]	573	# redirecting STDERR is a bad idea on windows 95/98
	574	$cmd .= " 2> \"$output_filestem.err\""
[16435]	575	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
[1445]	576	# execute the command
[2755]	577	$!=0;
[2060]	578	if (system($cmd)!=0)
[1445]	579	{
[2755]	580	print STDERR "Error executing wv converter:$!\n";
	581	if (-s "$output_filestem.err") {
	582	open (ERRFILE, "<$output_filestem.err");
	583
	584	my $write_to_fail_log=0;
	585	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	586	{$write_to_fail_log=1;}
	587
	588	my $line;
	589	while ($line=<ERRFILE>) {
[16435]	590	if ($line =~ m/\w/) {
[2755]	591	print STDERR "$line";
	592	print FAILLOG "$line" if ($write_to_fail_log);
	593	}
	594	if ($line !~ m/startup error/) {next;}
	595	print STDERR " (given an invalid .DOC file?)\n";
	596	print FAILLOG " (given an invalid .DOC file?)\n"
	597	if ($write_to_fail_log);
	598
	599	} # while ERRFILE
	600	close FAILLOG if ($write_to_fail_log);
	601	}
	602	return 0; # we can try any_to_text
[1445]	603	}
[1578]	604
[1445]	605	# Was the conversion successful?
[2241]	606
[15120]	607	if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
[1445]	608	open(TMP, "$output_filestem.html");
	609	$line = <TMP>;
	610	close(TMP);
[16435]	611	if ($line && $line =~ m/DOCTYPE HTML/) {
[15120]	612	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
	613
	614	# Inserted this code to remove the images directory if it was still empty after
	615	# the html was generated (in case there were no images in the word document)
[16435]	616	if (&util::is_dir_empty($assoc_dir)) {
[15152]	617	#print STDERR "*gsConvert.pl: Image dir $assoc_dir is empty, removing*\n";
[15120]	618	&util::rm_r($assoc_dir);
	619	} else { # there was an image folder (it was generated)
	620	# Therefore, the html file generated contains absolute links to the images
[16435]	621	# Replace them with relative links instead, so the folder can be moved elsewhere
[15152]	622	&make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
[15120]	623	}
[1445]	624	return 1;
	625	}
	626	}
[2755]	627
	628	# If here, an error of some sort occurred
	629	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
	630	if (-e "$output_filestem.err") {
	631	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	632	open (ERRLOG,"$output_filestem.err");
	633	while (<ERRLOG>) {print FAILLOG $_;}
	634	close FAILLOG;
	635	close ERRLOG;
	636	}
	637	&util::rm("$output_filestem.err");
	638	}
	639
[1445]	640	return 0;
	641	}
	642
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Arguments:
#   toppath        - the top-level folder in which the html file resides
#   docname        - the name (without extension) of the html file
#   html_file      - the full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - the directory name of the folder with associated
#                    images: docname_files
# Returns 1 after rewriting the file, 0 if it could not be opened for
# reading or for writing.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef;        # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin);

    # 2. Replace all occurrences of the assoc_dir_path in a hrefs and img
    #    src values with assoc_dirname.

    # wvWare puts %20 in place of space, so the prefix we search for must
    # use %20 as well. The prefix is then escaped with quotemeta so that
    # EVERY regex metacharacter in the path is taken literally (backslashes
    # of Windows paths, ".", "-", "[", "]", but also "(", ")", "+", "?" ...).
    (my $url_prefix = $assoc_dir_path) =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($url_prefix);

    # The following substitution looks for <a or <img, followed by any other
    # attributes and values until it comes to the FIRST (non-greedy) href=
    # or src=, followed by an optional " or ', followed by the associated
    # folder's pathname, followed by characters (for the img filename), the
    # optional closing quote and any other attributes up to the first > that
    # ends the tag. Everything before the folder's pathname is retained, the
    # pathname is replaced by the folder's directory name, and the rest up
    # to and including the closing > is retained. /s lets . match newlines,
    # /g replaces every occurrence.
    # NOTE(review): the listing this was taken from showed ".?" where the
    # description above requires ".*?"; the quantifiers are restored here
    # -- confirm against the repository copy.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;

    # Now replace any %20 chars in filenames of href or src attributes to
    # use a literal space ' '. Calls a function for this.
    # NOTE(review): the middle group is greedy, so with /s it runs up to
    # the last ">" of the document and the post-processing is applied to
    # everything after the first link -- presumably not intended; confirm
    # before tightening it.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;

    # delete the original file and recreate it
    &util::rm($html_file);

    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print {$fout} $html_contents;
    close($fout);
    return 1;
}
	728
# Utility routine used while rewriting img src/href link pathnames.
#
# Arguments: the text preceding the pathname, the pathname text itself and
# the text following it. Only the middle part is altered:
#   1. every %20 becomes a literal space (not an underscore, since
#      underscores mess with incremental rebuild),
#   2. every backslash becomes a forward slash,
#   3. every remaining percent sign becomes %25.
# The three parts are returned joined back together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    # Step 1 must precede step 3, otherwise %20 would turn into %2520
    $text =~ s{%20}{ }g;
    $text =~ tr{\\}{/};
    $text =~ s{%}{%25}g;

    return $pre . $text . $post;
}
	744
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: input filename, output file stem (no extension).
# Returns 1 straight away if "<stem>.html" already exists. Otherwise runs
# word2html, writing "<stem>.html" and, where stderr can be redirected,
# "<stem>.err", and returns 1 if a non-empty html file whose first line
# contains "html" was produced. Returns 0 otherwise; on failure the html
# file is removed and converter messages are appended to the -errlog file
# when one was given.
sub native_doc_to_html {
    # Lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows rely on the PATH rather than on a full path
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            # optional fail log, opened for appending
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # echo the converter's messages to STDERR and the fail log
            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # copy the converter's messages into the fail log, if there is one
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
	821
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments:
#   $input_filename  - path of the RTF document
#   $output_filestem - output path without extension; rtftohtml writes
#                      "$output_filestem.html" and, when it generates a table
#                      of contents, "${output_filestem}_ToC.html"
# Returns 1 when the generated HTML has some word content after its <body>
# tag, 0 otherwise (in which case the .html and .err files are removed).
# Reads the file-level settings $timeout, $faillogfile and $is_winnt_2000.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    # Make sure we have some content other than the header; an output file
    # that cannot be opened counts as a failed conversion.
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html_in, '<', "$output_filestem.html")) {
        my $past_header = 0;
        while (defined(my $line = <$html_in>)) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close $html_in;
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own.  The old form
            # open FH, "name" || ++$open_failed bound the || to the
            # filename string, so failures were never noticed.
            my ($src_fh, $toc_fh, $out_fh);
            my $all_open = open($src_fh, '<', "$output_filestem.src")
                && open($toc_fh, '<', "${output_filestem}_ToC.html")
                && open($out_fh, '>', "$output_filestem.html");

            if (!$all_open) {
                # Give up on the table of contents and restore the plain
                # converter output.
                foreach my $fh ($src_fh, $toc_fh, $out_fh) {
                    close $fh if (defined $fh && defined fileno($fh));
                }
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without a word character.
            while (defined(my $line = <$src_fh>)) {
                last if ($line !~ m/\w/);
                print {$out_fh} $line;
            }

            # print out table of contents, making links relative
            <$toc_fh>; <$toc_fh>; # ignore first 2 lines
            my $list_open = <$toc_fh>; # line 3 = "<ol>\n"
            print {$out_fh} $list_open if (defined $list_open);
            while (defined(my $line = <$toc_fh>)) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$out_fh} $line;
            }
            close $toc_fh;

            # rest of html src
            while (defined(my $line = <$src_fh>)) {
                print {$out_fh} $line;
            }
            close $src_fh;
            close $out_fh;

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Conversion failed: record the converter's error output, then clean up.
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print {$faillog_fh} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog_fh} " (rtf file might be too recent):\n";
            if (open(my $errlog_fh, '<', "$output_filestem.err")) {
                while (defined(my $line = <$errlog_fh>)) {
                    print {$faillog_fh} $line;
                }
                close $errlog_fh;
            }
            close $faillog_fh;
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
	939
	940
# Convert a pdf file to html with the pdftohtml command
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; the converter is
#                      expected to produce "$output_filestem.html"
# Returns 1 when the command succeeded and produced a non-empty HTML file,
# 0 otherwise.  The temporary .out/.err files are removed in both cases.
# Reads the file-level settings $timeout, $pdf_zoom, $pdf_complex,
# $pdf_ignore_images, $pdf_allow_images_only, $pdf_nohidden, $faillogfile
# and $is_winnt_2000.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Where stderr redirection is available keep stdout and stderr apart;
    # otherwise stdout is what lands in the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog_fh, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (defined(my $line = <$errlog_fh>)) {
                print STDERR "$line";
            }
            close $errlog_fh;
        }

        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        # Append the error output to the fail log (when configured) and
        # discard the temporary file.
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                if (open(my $errlog_fh, '<', "$output_filestem.err")) {
                    while (defined(my $line = <$errlog_fh>)) {
                        print {$faillog_fh} $line;
                    }
                    close $errlog_fh;
                }
                close $faillog_fh;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
	1002
	1003	# Convert a pdf file to various types of image with the convert command
	1004
[17329]	1005	sub pdfps_to_img {
[10357]	1006	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[10401]	1007
	1008	# Check that ImageMagick is installed and available on the path (except for Windows 95/98)
	1009	if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
	1010	my $result = `identify 2>&1`;
	1011	if ($? == -1 \|\| $? == 256) { # Linux and Windows return different values for "program not found"
	1012	#ImageMagick is not installed, thus the convert utility is not available.
[17329]	1013	print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
[10401]	1014	return 0;
	1015	}
	1016	}
	1017
[10357]	1018	$cmd = "";
	1019	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	1020	$output_type =~ s/.\_(.)/$1/i;
[17329]	1021	$cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
[16435]	1022	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[10357]	1023	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1024	} else {
	1025	$cmd .= " > \"$output_filestem.err\"";
	1026	}
	1027
	1028	# don't include path on windows (to avoid having to play about
	1029	# with quoting when GSDLHOME might contain spaces) but assume
	1030	# that the PATH is set up correctly
	1031	$!=0;
	1032	my $retval=system($cmd);
	1033	if ($retval!=0)
	1034	{
[10401]	1035	print STDERR "Error executing pdftoimg.pl";
[10357]	1036	if ($!) {print STDERR ": $!";}
	1037	print STDERR "\n";
	1038	}
	1039
	1040	#make sure the converter made something
	1041	#if ($retval !=0) \|\| ! -s "$output_filestem")
	1042	if ($retval !=0)
	1043	{
	1044	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1045	#print out the converter's std err, if any
	1046	if (-s "$output_filestem.err") {
	1047	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[17329]	1048	print STDERR "pdfpstoimg error log:\n";
[10357]	1049	while (<ERRLOG>) {
	1050	print STDERR "$_";
	1051	}
	1052	close ERRLOG;
	1053	}
[10534]	1054	#&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[10357]	1055	if (-e "$output_filestem.err") {
	1056	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1057	{
	1058	open (ERRLOG, "$output_filestem.err");
	1059	while (<ERRLOG>) {print FAILLOG $_;}
	1060	close ERRLOG;
	1061	close FAILLOG;
	1062	}
	1063	&util::rm("$output_filestem.err");
	1064	}
	1065	return 0;
	1066	}
[2656]	1067	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1068	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1069	return 1;
	1070	}
	1071
	1072	# Convert a PDF file to text with the pdftotext command
	1073
	1074	sub pdf_to_text {
[2755]	1075	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1076
[2248]	1077	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1078
[16435]	1079	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1080	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1081	} else {
	1082	$cmd .= " > \"$output_filestem.err\"";
	1083	}
[1445]	1084
[2060]	1085	if (system($cmd)!=0)
[1445]	1086	{
	1087	print STDERR "Error executing $cmd: $!\n";
	1088	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1089	}
	1090
[2755]	1091	# make sure there is some extracted text.
	1092	if (-e "$output_filestem.text") {
	1093	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1094	binmode(EXTR_TEXT); # just in case...
	1095	my $line="";
	1096	my $seen_text=0;
	1097	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1098	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1099	}
	1100	close EXTR_TEXT;
	1101	if ($seen_text==0) { # no text was extracted
	1102	print STDERR "Error: pdftotext found no text\n";
	1103	&util::rm("$output_filestem.text");
	1104	}
	1105	}
	1106
[1692]	1107	# make sure the converter made something
[2656]	1108	if (! -s "$output_filestem.text")
[1692]	1109	{
	1110	# print out the converters std err, if any
[2656]	1111	if (-s "$output_filestem.err") {
[1692]	1112	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1113	print STDERR "pdftotext error log:\n";
[1692]	1114	while (<ERRLOG>) {
	1115	print STDERR "$_";
	1116	}
	1117	close ERRLOG;
	1118	}
[2656]	1119	# does this converter create a .out file?
	1120	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1121	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1122	if (-e "$output_filestem.err") {
	1123	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1124	{
	1125	open (ERRLOG,"$output_filestem.err");
	1126	while (<ERRLOG>) {print FAILLOG $_;}
	1127	close ERRLOG;
	1128	close FAILLOG;
	1129	}
	1130	&util::rm("$output_filestem.err");
	1131	}
[1692]	1132	return 0;
	1133	}
[1445]	1134	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	1135	return 1;
	1136	}
	1137
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
# Returns 1 when a (re-wrapped) text file was produced, 0 when even the
# built-in fallback extraction could not be performed.  Dies if the
# intermediate files of the final wrapping step cannot be opened.
# Reads the file-level settings $timeout and $faillogfile.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        # Any non-zero wait status counts as failure: a non-zero exit code,
        # death by signal, or not being able to start the program at all
        # (in which case $! may carry the reason).
        if (system($cmd) != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout_fh, '<', "$output_filestem.text")) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout_fh>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $psout_fh;
        }
        else {
            $error = "could not read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print {$faillog_fh} "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $errlog_fh, '<', "$output_filestem.err")) {
                while (defined(my $line = <$errlog_fh>)) {
                    print {$faillog_fh} $line;
                }
                close $errlog_fh;
            }
            close $faillog_fh;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $ps_fh;
        if (!open($ps_fh, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$ps_fh>); # see man perlport, under "System Resources"
        close $ps_fh;

        # Make sure this is a ps file...  (checked before the output file is
        # created, so a rejected input leaves no empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                print {$faillog_fh} "Bad postscript header: not '%!'\n";
                close $faillog_fh;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it
        # (everything up to and including the first %%Page: line).
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $text_fh;
        if (!open($text_fh, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$text_fh} "$text";
        close $text_fh;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # "or" (not "||") so that the die really applies to the open call.
    open(my $wrap_in, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$wrap_in>)) {
        while (length($line) > 0) {
            # Cut after the first run of non-space characters that reaches
            # past $wrap_length and swallow the whitespace that follows.  If
            # the pattern should ever fail to match, the remainder is
            # printed as-is, so the loop always makes progress.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}\S*)\s*//) {
                print {$wrap_out} "$1\n";
            } else {
                print {$wrap_out} "$line";
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
	1307
	1308
	1309	# Convert any file to HTML with a crude perl implementation of the
	1310	# UNIX strings command.
	1311
	1312	sub any_to_html {
	1313	($input_filename, $output_filestem) = @_;
	1314
	1315	# First generate a text file
	1316	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1317
	1318	# create an HTML file from the text file
	1319	open(TEXT, "<$output_filestem.text");
	1320	open(HTML, ">$output_filestem.html");
	1321
[2241]	1322	print HTML "<html><head>\n";
	1323	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1324	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1325	print HTML "</head><body>\n\n";
[1734]	1326
[2755]	1327	my $line;
	1328	while ($line=<TEXT>) {
	1329	$line =~ s/</</g;
	1330	$line =~ s/>/>/g;
[16435]	1331	if ($line =~ m/^\s*$/) {
[2755]	1332	print HTML "<p>";
	1333	} else {
	1334	print HTML "<br> ", $line;
	1335	}
[1445]	1336	}
[1734]	1337	print HTML "\n</body></html>\n";
[1445]	1338
[2241]	1339	close HTML;
	1340	close TEXT;
	1341
[1445]	1342	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1343	return 1;
	1344	}
	1345
	1346	# Convert any file to TEXT with a crude perl implementation of the
	1347	# UNIX strings command.
[2755]	1348	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1349
	1350	sub any_to_text {
	1351	($input_filename, $output_filestem) = @_;
	1352
[3350]	1353	if (!$use_strings) {
	1354	return 0;
	1355	}
[15120]	1356
	1357	print STDERR "\n** In any to text**\n\n";
[2755]	1358	open(IN, "<$input_filename") \|\| return 0;
[1734]	1359	binmode(IN);
[2755]	1360	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1361
	1362	my ($line);
[2755]	1363	my $output_line_count = 0;
[1445]	1364	while (<IN>) {
	1365	$line = $_;
[1734]	1366
[1445]	1367	# delete anything that isn't a printable character
	1368	$line =~ s/[^\040-\176]+/\n/sg;
	1369
	1370	# delete any string less than 10 characters long
[1734]	1371	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1372	while ($line =~ m/^.{1,9}$/m) {
[1734]	1373	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1374	$line =~ s/\n+/\n/sg;
	1375	}
	1376
	1377	# remove extraneous whitespace
	1378	$line =~ s/\n+/\n/gs;
	1379	$line =~ s/^\n//gs;
[1578]	1380
[1445]	1381	# output whatever is left
[16435]	1382	if ($line =~ m/[^\n ]/) {
[1445]	1383	print OUT $line;
[2755]	1384	++$output_line_count;
[1445]	1385	}
	1386	}
[2241]	1387
	1388	close OUT;
	1389	close IN;
	1390
[2755]	1391	if ($output_line_count) { # try to protect against binary only formats
	1392	return 1;
	1393	}
	1394
	1395	&util::rm("$output_filestem.text");
	1396	return 0;
	1397
[1445]	1398	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: