Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 23473

Last change on this file since 23473 was 23473, checked in by ak19, 13 years ago
Provision for supporting .docx and .pptx files when Windows scripting is on.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 45.2 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
[22429]	51	use strict;
	52
[1445]	53	use parsargv;
	54	use util;
	55	use Cwd;
	56	use File::Basename;
	57
# Are we running on WinNT or Win2000 (or later)?
# The require is wrapped in eval so that a perl without Win32.pm simply
# leaves the result undefined, which is then normalised to 0.  (The former
# second "return 0" inside the eval could never be reached and is gone.)
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
if (!defined($is_winnt_2000)) { $is_winnt_2000 = 0; }
[1445]	61
[3350]	62	my $use_strings;
[3720]	63	my $pdf_complex;
[4103]	64	my $pdf_nohidden;
[3720]	65	my $pdf_zoom;
	66	my $pdf_ignore_images;
[10451]	67	my $pdf_allow_images_only;
[10282]	68	my $windows_scripting;
[3350]	69
# Write the command-line synopsis to STDERR, then terminate the script with
# exit status 1.  This routine never returns to its caller.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    print STDERR join("", @usage_lines);
    exit(1);
}
	91
[2755]	92	my $faillogfile="";
[3538]	93	my $timeout=0;
[1445]	94
	95	sub main
	96	{
	97	my (@ARGV) = @_;
[3538]	98	my ($input_type,$output_type,$verbose);
[1960]	99
[23473]	100	# Dynamically figure out what the --type option can support, based on whether -windows_scripting
	101	# is in use or not
	102	my $default_type_re = "(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)";
	103	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	104	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	105	# Currently only have VBA for Word and PPT(but no XLS)
	106	my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xls)";
	107
	108	my $type_re = $default_type_re;
	109
	110	foreach my $a (@ARGV) {
	111	if ($a =~ m/^windows_scripting$/i) {
	112	$type_re = $enhanced_type_re;
	113	}
	114	}
	115
[1445]	116	# read command-line arguments
	117	if (!parsargv::parse(\@ARGV,
[23473]	118	"type/$type_re/", \$input_type,
[2755]	119	'/errlog/.*/', \$faillogfile,
[22596]	120	'output/(auto\|html\|text\|pagedimg).*/', \$output_type,
[1692]	121	'timeout/\d+/0',\$timeout,
[10282]	122	'verbose/\d+/0', \$verbose,
[22429]	123	'windows_scripting',\$windows_scripting,
[3720]	124	'use_strings', \$use_strings,
	125	'pdf_complex', \$pdf_complex,
[9482]	126	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	127	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	128	'pdf_nohidden', \$pdf_nohidden,
[3720]	129	'pdf_zoom/\d+/2', \$pdf_zoom
	130	))
[1445]	131	{
	132	print_usage();
	133	}
[12704]	134
[1445]	135	# Make sure the input file exists and can be opened for reading
	136	if (scalar(@ARGV!=1)) {
	137	print_usage();
	138	}
[1928]	139
[1445]	140	my $input_filename = $ARGV[0];
	141	if (!-r $input_filename) {
	142	print STDERR "Error: unable to open $input_filename for reading\n";
	143	exit(1);
	144	}
	145
	146	# Deduce filenames
	147	my ($tailname,$dirname,$suffix)
[2241]	148	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	149	my $output_filestem = &util::filename_cat($dirname, "$tailname");
[1445]	150
	151	if ($input_type eq "")
	152	{
[2241]	153	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	154	}
	155
	156	# Change to temporary working directory
	157	my $stored_dir = cwd();
	158	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	159
[1445]	160	# Select convert utility
	161	if (!defined $input_type) {
	162	print STDERR "Error: No filename extension or input type defined\n";
	163	exit(1);
	164	}
[23473]	165	elsif ($input_type =~ m/^docx?$/ \|\| $input_type eq "dot") {
[1445]	166	print &convertDOC($input_filename, $output_filestem, $output_type);
	167	print "\n";
	168	}
[1684]	169	elsif ($input_type eq "rtf") {
	170	print &convertRTF($input_filename, $output_filestem, $output_type);
	171	print "\n";
	172	}
[1445]	173	elsif ($input_type eq "pdf") {
	174	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	175	print "\n";
	176	}
	177	elsif ($input_type eq "ps") {
[22429]	178	print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
[1445]	179	print "\n";
	180	}
[23473]	181	elsif ($input_type =~ m/pptx?$/) {
[2977]	182	print &convertPPT($input_filename, $output_filestem, $output_type);
	183	print "\n";
	184	}
[23473]	185	elsif ($input_type =~ m/xlsx?$/) {
[2991]	186	print &convertXLS($input_filename, $output_filestem, $output_type);
	187	print "\n";
	188	}
[1445]	189	else {
	190	print STDERR "Error: Unable to convert type '$input_type'\n";
	191	exit(1);
	192	}
	193
	194	# restore to original working directory
	195	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	196
	197	}
	198
	199	&main(@ARGV);
	200
	201
	202
[2241]	203	# Document-type conversion functions
[1445]	204	#
	205	# The following functions attempt to convert documents from their
	206	# input type to the specified output type. If no output type was
	207	# given, then they first attempt HTML, and then TEXT.
	208	#
	209	# Each returns the output type ("html" or "text") or "fail" if no
	210	# conversion is possible.
	211
	212	# Convert a Microsoft word document
	213
	214	sub convertDOC {
[22429]	215	my ($input_filename, $output_filestem, $output_type) = @_;
[1445]	216
[1654]	217	# Many .doc files are not in fact word documents!
	218	my $realtype = &find_docfile_type($input_filename);
	219
[23473]	220	if ($realtype eq "word6" \|\| $realtype eq "word7"
	221	\|\| $realtype eq "word8" \|\| $realtype eq "docx") {
[1654]	222	return &convertWord678($input_filename, $output_filestem, $output_type);
	223	} elsif ($realtype eq "rtf") {
	224	return &convertRTF($input_filename, $output_filestem, $output_type);
	225	} else {
	226	return &convertAnything($input_filename, $output_filestem, $output_type);
	227	}
	228	}
	229
	230	# Convert a Microsoft word 6/7/8 document
	231
	232	sub convertWord678 {
[22429]	233	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	234
[1445]	235	my $success = 0;
[16435]	236	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	237	if ($windows_scripting) {
	238	$success = &native_doc_to_html($input_filename, $output_filestem);
	239	}
	240	else {
	241	$success = &doc_to_html($input_filename, $output_filestem);
	242	}
[1445]	243	if ($success) {
[10282]	244	return "html";
[1445]	245	}
	246	}
[1654]	247	return &convertAnything($input_filename, $output_filestem, $output_type);
	248	}
	249
	250
	251	# Convert a Rich Text Format (RTF) file
	252
	253	sub convertRTF {
[22429]	254	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	255
	256	my $success = 0;
	257
	258	# Attempt specialised conversion to HTML
[16435]	259	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	260
	261	if ($windows_scripting) {
	262	$success = &native_doc_to_html($input_filename, $output_filestem);
	263	}
	264	else {
	265	$success = &rtf_to_html($input_filename, $output_filestem);
	266	}
[1654]	267	if ($success) {
	268	return "html";
	269	}
	270	}
	271
[2755]	272	# rtf is so ugly that's it's not worth running strings over.
	273	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	274	# return &convertAnything($input_filename, $output_filestem, $output_type);
	275	return "fail";
[1654]	276	}
	277
	278
	279	# Convert an unidentified file
	280
	281	sub convertAnything {
[22429]	282	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	283
	284	my $success = 0;
[10464]	285
[1445]	286	# Attempt simple conversion to HTML
[16435]	287	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	288	$success = &any_to_html($input_filename, $output_filestem);
	289	if ($success) {
	290	return "html";
	291	}
	292	}
	293
	294	# Convert to text
[16435]	295	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	296	$success = &any_to_text($input_filename, $output_filestem);
[1445]	297	if ($success) {
	298	return "text";
	299	}
	300	}
	301	return "fail";
	302	}
	303
	304
[1654]	305
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output file stem
# and the requested output type (may be empty/undefined = try HTML, then
# text).  Conversions are attempted in the order image, HTML, text.
# Returns "item" (page images), "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # The other converters accept a missing output type, so do the same
    # here rather than running a substitution on undef.
    $output_type = "" unless defined $output_type;

    # Reduce a dashed specification to the part after the last dash.  The
    # pattern needs its ".*" quantifiers: with a bare "." either side it
    # only deleted one character and the dash.
    $output_type =~ s/.*\-(.*)/$1/i;

    my $success = 0;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }

    return "fail";
}
	340
	341
	342	# Convert an Adobe PostScript document
	343
	344	sub convertPS {
[22429]	345	my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
[1445]	346
	347	my $success = 0;
[10534]	348	$output_type =~ s/.\-(.)/$1/i;
	349	# Attempt coversion to Image
[16435]	350	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	351	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10534]	352	if ($success){
	353	return "item";
	354	}
	355	}
[1445]	356
	357	# Attempt conversion to TEXT
[16435]	358	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	359	$success = &ps_to_text($input_filename, $output_filestem);
	360	if ($success) {
	361	return "text";
	362	}
	363	}
	364	return "fail";
	365	}
	366
	367
# Convert a PowerPoint document.
#
# With -windows_scripting on and an output type that mentions neither
# "html" nor "text", the pptextract script is run to write the slides into
# the directory $output_filestem ("item" is returned, also when that
# directory already exists).  Otherwise, when HTML is acceptable,
# ppttohtml.pl is run.  If the chosen route fails, or neither applies, text
# extraction via any_to_text() is the last resort.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $use_vb_script = ($windows_scripting
                         && ($output_type !~ m/html/i)
                         && ($output_type !~ m/text/i));

    if ($use_vb_script) {
        # pick the image format switch for pptextract
        my $ppt_convert_type = "";
        if    ($output_type =~ m/gif/i)  { $ppt_convert_type = "-g"; }
        elsif ($output_type =~ m/jp?g/i) { $ppt_convert_type = "-j"; }
        elsif ($output_type =~ m/png/i)  { $ppt_convert_type = "-p"; }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        if (system($cmd) == 0) {
            return "item";
        }
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
	430
	431
# Convert an Excel spreadsheet.
#
# When HTML is acceptable (no output type given, or one mentioning "html")
# xlstohtml.pl is run; if that is skipped or fails, text extraction via
# any_to_text() is tried.  Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # fall back on plain text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
	464
	465
	466
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the name of the file to inspect.
# Returns "docx" (only when -windows_scripting is on and the name ends in
# .docx, in any letter case), "rtf" when the first line starts with "{\rtf",
# "word6"/"word7"/"word8" when a "Word.Document.N" marker is found on any
# line, and "unknown" otherwise - including when the file cannot be opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "gsConvert.pl: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    # A lexical loop variable keeps the caller's $_ intact
    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $realtype;
}
	507
	508
[1734]	509	# Specific type-to-type conversions
[1445]	510	#
	511	# Each of the following functions attempts to convert a document from
[2755]	512	# a specific format to another. If they succeed they return 1 and leave
[1445]	513	# the output document(s) in the appropriate place; if they fail they
	514	# return 0 and delete any working files.
	515
	516
	517	# Attempt to convert a word document to html with the wv program
	518	sub doc_to_html {
[22429]	519	my ($input_filename, $output_filestem) = @_;
[1445]	520
[20933]	521	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");
[1928]	522
[20933]	523	if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
	524	$ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}";
	525	$ENV{'LD_LIBRARY_PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib:$ENV{'LD_LIBRARY_PATH'}";
	526	$wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
	527	}
	528
[2241]	529	# don't include path on windows (to avoid having to play about
	530	# with quoting when GSDLHOME might contain spaces) but assume
	531	# that the PATH is set up correctly
[16435]	532	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
[2241]	533
[2512]	534	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
[2574]	535	"packages", "wv", "wvHtml.xml");
[1928]	536
[15120]	537	# Added the following to work with replace_srcdoc_with_html.pl:
	538	# Make wvWare put any associated (image) files of the word doc into
	539	# folder docname-without-extention_files. This folder should be at
	540	# the same level as the html file generated from the doc.
	541	# wvWare will take care of proper interlinking.
	542
	543	# This step is necessary for replace_srcdoc_with_html.pl which will
	544	# move the html and associated files into the import folder. We
	545	# want to ensure that the associated files won't overwrite similarly
	546	# named items already in import. Hence we put them in a folder first
	547	# (to which the html links properly) and that will allow
	548	# replace_srcdoc_with_html.pl to move them safely to /import.
	549
	550	# To do all this, we need to use wvWare's --dir and --basename options
	551	# where dir is the full path to the image folder directory and
	552	# basename is the full path to the image folder appended to the name
	553	# which is to be prepended to every image file:
	554	# eg. if the images were to have names like sample0.jpg to sampleN.jpg,
	555	# then the basename is "/full/path/to/imgdir/sample".
	556	# In this case, basename is the full path to and name of the document.
	557	# HOWEVER: basename always takes full path, not relative url, so
	558	# the greenstone browser is unable to display the images (absolute paths
	559	# cause it to give an "external link" message)
	560	# See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
	561	# and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
	562	# "added --dir option to wvHtml so that pictures can be placed in
	563	# a seperate directory"
	564	# "running wvWare through IMP to view word documents as html. It gets
	565	# invoked like this:
	566	# wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"
	567
	568	# toppath is the folder where html is generated
	569	# docname is the name (without extension) of the html to be generated
	570	# suffix (extension) is thrown away
	571	my ($docname, $toppath)
	572	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	573
	574	# We want the image folder generated to have the same name as windows
	575	# would generate ($windows_scripting) when it converts from word to html.
	576	# That is, foldername=docname_files
	577	my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
	578	#print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files"
	579
	580	# ensure this image directory exists
	581	# if it exists already, just delete and recreate
	582	if(-e $assoc_dir) {
	583	&util::rm_r($assoc_dir);
	584	}
	585	&util::mk_dir($assoc_dir);
	586
	587	# the images are all going to be called image0, image1,..., imageN
	588	my $img_basenames = &util::filename_cat($assoc_dir, $docname);
	589
	590	#print STDERR "**toppath: $toppath\n**docname: $docname\n;
	591	#print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
	592	#print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);
	593
[2241]	594	my $cmd = "";
[1692]	595	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[15120]	596	# wvWare's --dir and --basename options for image directory.
	597	# Replaced the next line with the 2 lines following it:
	598	# $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
	599	$cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
	600	$cmd .= " --charset utf-8 --config \"$wv_conf\"";
[2241]	601	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
[15120]	602
[2241]	603	# redirecting STDERR is a bad idea on windows 95/98
	604	$cmd .= " 2> \"$output_filestem.err\""
[16435]	605	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
[1445]	606	# execute the command
[2755]	607	$!=0;
[2060]	608	if (system($cmd)!=0)
[1445]	609	{
[2755]	610	print STDERR "Error executing wv converter:$!\n";
	611	if (-s "$output_filestem.err") {
	612	open (ERRFILE, "<$output_filestem.err");
	613
	614	my $write_to_fail_log=0;
	615	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	616	{$write_to_fail_log=1;}
	617
	618	my $line;
	619	while ($line=<ERRFILE>) {
[16435]	620	if ($line =~ m/\w/) {
[2755]	621	print STDERR "$line";
	622	print FAILLOG "$line" if ($write_to_fail_log);
	623	}
	624	if ($line !~ m/startup error/) {next;}
	625	print STDERR " (given an invalid .DOC file?)\n";
	626	print FAILLOG " (given an invalid .DOC file?)\n"
	627	if ($write_to_fail_log);
	628
	629	} # while ERRFILE
	630	close FAILLOG if ($write_to_fail_log);
	631	}
	632	return 0; # we can try any_to_text
[1445]	633	}
[1578]	634
[1445]	635	# Was the conversion successful?
[2241]	636
[15120]	637	if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
[1445]	638	open(TMP, "$output_filestem.html");
[22429]	639	my $line = <TMP>;
[1445]	640	close(TMP);
[16435]	641	if ($line && $line =~ m/DOCTYPE HTML/) {
[15120]	642	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
	643
	644	# Inserted this code to remove the images directory if it was still empty after
	645	# the html was generated (in case there were no images in the word document)
[16435]	646	if (&util::is_dir_empty($assoc_dir)) {
[15152]	647	#print STDERR "*gsConvert.pl: Image dir $assoc_dir is empty, removing*\n";
[15120]	648	&util::rm_r($assoc_dir);
	649	} else { # there was an image folder (it was generated)
	650	# Therefore, the html file generated contains absolute links to the images
[16435]	651	# Replace them with relative links instead, so the folder can be moved elsewhere
[15152]	652	&make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
[15120]	653	}
[1445]	654	return 1;
	655	}
	656	}
[2755]	657
	658	# If here, an error of some sort occurred
	659	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
	660	if (-e "$output_filestem.err") {
	661	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	662	open (ERRLOG,"$output_filestem.err");
	663	while (<ERRLOG>) {print FAILLOG $_;}
	664	close FAILLOG;
	665	close ERRLOG;
	666	}
	667	&util::rm("$output_filestem.err");
	668	}
	669
[1445]	670	return 0;
	671	}
	672
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Arguments:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
# Returns 1 once the file has been rewritten, 0 if it could not be opened
# for reading or writing.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, "<", $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef;        # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace all occurrences of the assoc_dir_path in a href and img src
    # values with assoc_dirname.
    #
    # wvWare puts %20 in place of space, so the prefix we search for has to
    # do the same.  The result is then escaped as a whole with quotemeta:
    # escaping only a hand-picked set of characters (\ . - [ ]) let paths
    # containing e.g. parentheses alter the pattern and its capture groups.
    my $assoc_dir_url = $assoc_dir_path;
    $assoc_dir_url =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($assoc_dir_url);

    # Look for <a or <img, followed by any other attributes and values up to
    # the FIRST href= or src= (non-greedy), an optional opening quote, the
    # associated folder's pathname, then the rest of the value, the optional
    # closing quote and anything else up to the first > ending the tag.
    # Everything but the folder's pathname is retained.  /s lets . match
    # newlines, /g replaces every occurrence.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)$safe_reg_expression(.*?(?:\"|\')?.*?>)/$1$assoc_dirname$2/sg;

    # Now post-process the value of every href/src attribute (and only the
    # value, so text outside the attribute is left alone): %20 back to a
    # literal space and so on - see post_process_assocfile_urls.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)([^\"\'>]*)((?:\"|\')?.*?>)/&post_process_assocfile_urls($1, $2, $3)/sge;

    # delete the original file and recreate it
    &util::rm($html_file);

    my $fout;
    unless (open($fout, ">", $html_file)) { # open it as a new file for writing
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print $fout $html_contents;
    close($fout);
    return 1;
}
	758
# Utility routine for cleaning up one img src/href link pathname.
# Arguments: the text before the pathname, the pathname, the text after it.
# In the pathname only, and in this order:
#   - every %20 (introduced by wvWare) becomes a space again - not an
#     underscore, since underscores mess with incremental rebuild;
#   - every windows-style backslash becomes a url slash (/);
#   - every remaining percent sign becomes %25.
# Returns the three pieces joined back together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;
        s{\\}{/}g;
        s/%/%25/g;
    }

    return $pre . $text . $post;
}
	774
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: the input filename and the output file stem; the HTML goes to
# "$output_filestem.html" and converter errors to "$output_filestem.err".
# Returns 1 if that html file already exists (nothing is run), or if the
# script exits cleanly and the first line of its output mentions "html";
# returns 0 otherwise.  Converter error output is appended to the -errlog
# file when one was given.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err" && open(my $errfile, "<", "$output_filestem.err")) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print $faillog "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print $faillog " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($faillog) if ($write_to_fail_log);
            close($errfile);    # previously left open
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $errline = <$errlog>) { print $faillog $errline; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
	853
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments:
#   $input_filename  - path of the RTF document
#   $output_filestem - output path without extension; rtftohtml is asked to
#                      write "$output_filestem.html"
#
# The conversion counts as successful when the generated html has some word
# characters after its <body> line once tags are stripped. If a
# "${output_filestem}_ToC.html" file exists it is merged into the html, with
# its links made relative, right after the html header.
#
# Returns 1 on success; 0 otherwise (the html and error files are removed
# and the converter's error output is appended to the fail log, if any).
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";
    my $src_file  = "$output_filestem.src";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header
        if (open(my $html_in, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        } else {
            # an unreadable result is treated like an empty one
            print STDERR "gsConvert.pl: Unable to read $html_file: $!\n";
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # Open the three files one after another, stopping at the first
            # failure so the output file is not created when an input is
            # unreadable. "open(...)" is tested directly: writing
            # open FH, "name" || ... would test the filename string instead.
            my ($src_fh, $toc_fh, $out_fh);
            my $src_ok = open($src_fh, '<', $src_file);
            my $toc_ok = $src_ok ? open($toc_fh, '<', $toc_file) : 0;
            my $out_ok = $toc_ok ? open($out_fh, '>', $html_file) : 0;

            if (!$out_ok) {
                close($src_fh) if ($src_ok);
                close($toc_fh) if ($toc_ok);
                # give up on the merge and put the plain html back
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to the first
            # line without a word character (that line itself is dropped)
            while (defined(my $header_line = <$src_fh>)) {
                last if ($header_line !~ m/\w/);
                print {$out_fh} $header_line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc_fh>; } # ignore first 2 lines
            my $list_start = <$toc_fh>; # line 3 = "<ol>\n"
            print {$out_fh} $list_start if (defined $list_start);
            while (my $line = <$toc_fh>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$out_fh} $line;
            }
            close($toc_fh);

            # rest of html src
            while (my $line = <$src_fh>) {
                print {$out_fh} $line;
            }
            close($src_fh);
            close($out_fh);

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            print {$fail_fh} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$fail_fh} " (rtf file might be too recent):\n";
            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                close($err_fh);
            }
            close($fail_fh);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
	970
	971
# Convert a pdf file to html with the pdftohtml command (run through the
# pdftohtml.pl wrapper found on the search path).
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the pdf file
#   $output_filestem - output path without extension
#
# The converter options are taken from the file-level settings $pdf_zoom,
# $pdf_complex, $pdf_ignore_images, $pdf_allow_images_only and $pdf_nohidden.
#
# Returns 1 when the wrapper exits with status 0 and a non-empty
# "$output_filestem.html" exists; otherwise reports the converter's error
# output, cleans up the temporary files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # assemble the command line piece by piece
    my @options = ("-zoom $pdf_zoom");
    push(@options, "-c")      if ($pdf_complex);
    push(@options, "-i")      if ($pdf_ignore_images);
    push(@options, "-a")      if ($pdf_allow_images_only);
    push(@options, "-hidden") unless ($pdf_nohidden);

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $cmd .= join(" ", "perl -S pdftohtml.pl", @options,
                 "\"$input_filename\"", "\"$output_filestem\"");

    # stdout and stderr can only be captured separately off windows or on
    # winnt/2000; otherwise only stdout is captured, in the .err file
    my $separate_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $separate_streams
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason = $! ? ": $!" : "";
        print STDERR "Error executing pdftohtml.pl" . $reason . "\n";
    }

    # the happy path: the converter ran cleanly and made something
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # from here on the conversion has failed
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (defined(my $err_line = <ERRLOG>)) {
            print STDERR "$err_line";
        }
        close ERRLOG;
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);

    # keep a copy of the error output in the fail log, when one is in use
    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            while (defined(my $err_line = <ERRLOG>)) {
                print FAILLOG $err_line;
            }
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
	1034
	1035	# Convert a pdf file to various types of image with the convert command
	1036
[17329]	1037	sub pdfps_to_img {
[10357]	1038	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[10401]	1039
	1040	# Check that ImageMagick is installed and available on the path (except for Windows 95/98)
	1041	if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
	1042	my $result = `identify 2>&1`;
	1043	if ($? == -1 \|\| $? == 256) { # Linux and Windows return different values for "program not found"
	1044	#ImageMagick is not installed, thus the convert utility is not available.
[17329]	1045	print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
[10401]	1046	return 0;
	1047	}
	1048	}
	1049
[22429]	1050	my $cmd = "";
[10357]	1051	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	1052	$output_type =~ s/.\_(.)/$1/i;
[17329]	1053	$cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
[16435]	1054	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[10357]	1055	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1056	} else {
	1057	$cmd .= " > \"$output_filestem.err\"";
	1058	}
	1059
	1060	# don't include path on windows (to avoid having to play about
	1061	# with quoting when GSDLHOME might contain spaces) but assume
	1062	# that the PATH is set up correctly
	1063	$!=0;
	1064	my $retval=system($cmd);
	1065	if ($retval!=0)
	1066	{
[10401]	1067	print STDERR "Error executing pdftoimg.pl";
[10357]	1068	if ($!) {print STDERR ": $!";}
	1069	print STDERR "\n";
	1070	}
	1071
	1072	#make sure the converter made something
	1073	#if ($retval !=0) \|\| ! -s "$output_filestem")
	1074	if ($retval !=0)
	1075	{
	1076	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1077	#print out the converter's std err, if any
	1078	if (-s "$output_filestem.err") {
	1079	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[17329]	1080	print STDERR "pdfpstoimg error log:\n";
[10357]	1081	while (<ERRLOG>) {
	1082	print STDERR "$_";
	1083	}
	1084	close ERRLOG;
	1085	}
[10534]	1086	#&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[10357]	1087	if (-e "$output_filestem.err") {
	1088	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1089	{
	1090	open (ERRLOG, "$output_filestem.err");
	1091	while (<ERRLOG>) {print FAILLOG $_;}
	1092	close ERRLOG;
	1093	close FAILLOG;
	1094	}
	1095	&util::rm("$output_filestem.err");
	1096	}
	1097	return 0;
	1098	}
[2656]	1099	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1100	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1101	return 1;
	1102	}
	1103
	1104	# Convert a PDF file to text with the pdftotext command
	1105
	1106	sub pdf_to_text {
[2755]	1107	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1108
[2248]	1109	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1110
[16435]	1111	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1112	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1113	} else {
	1114	$cmd .= " > \"$output_filestem.err\"";
	1115	}
[1445]	1116
[2060]	1117	if (system($cmd)!=0)
[1445]	1118	{
	1119	print STDERR "Error executing $cmd: $!\n";
	1120	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1121	}
	1122
[2755]	1123	# make sure there is some extracted text.
	1124	if (-e "$output_filestem.text") {
	1125	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1126	binmode(EXTR_TEXT); # just in case...
	1127	my $line="";
	1128	my $seen_text=0;
	1129	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1130	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1131	}
	1132	close EXTR_TEXT;
	1133	if ($seen_text==0) { # no text was extracted
	1134	print STDERR "Error: pdftotext found no text\n";
	1135	&util::rm("$output_filestem.text");
	1136	}
	1137	}
	1138
[1692]	1139	# make sure the converter made something
[2656]	1140	if (! -s "$output_filestem.text")
[1692]	1141	{
	1142	# print out the converters std err, if any
[2656]	1143	if (-s "$output_filestem.err") {
[1692]	1144	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1145	print STDERR "pdftotext error log:\n";
[1692]	1146	while (<ERRLOG>) {
	1147	print STDERR "$_";
	1148	}
	1149	close ERRLOG;
	1150	}
[2656]	1151	# does this converter create a .out file?
	1152	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1153	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1154	if (-e "$output_filestem.err") {
	1155	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1156	{
	1157	open (ERRLOG,"$output_filestem.err");
	1158	while (<ERRLOG>) {print FAILLOG $_;}
	1159	close ERRLOG;
	1160	close FAILLOG;
	1161	}
	1162	&util::rm("$output_filestem.err");
	1163	}
[1692]	1164	return 0;
	1165	}
[1445]	1166	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	1167	return 1;
	1168	}
	1169
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the postscript file
#   $output_filestem - output path without extension; the text ends up in
#                      "$output_filestem.text"
#
# Off windows, gs is tried first. If gs is unavailable or fails (or on
# windows), the text is stripped out of the postscript with a series of
# regular expressions instead. Either way the resulting text is re-wrapped
# at the first whitespace after 72 characters.
#
# Returns 1 on success; 0 if the input cannot be read, the output cannot be
# written, or the input does not start with the postscript "%!" header.
# Dies if the temporary files used for wrapping cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other paths, so a filestem with spaces still works
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        # Any non-zero wait status counts as a failure: this covers a
        # non-zero exit code, a command that could not be started, and a
        # command that was killed by a signal.
        my $status = system($cmd);

        if ($status != 0) {
            # if the program couldn't be started, $! holds the reason
            if ($!) {$error = "$!";} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $ps_out, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$ps_out>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close($ps_out);
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            print {$fail_fh} "gs - $error\n";
            if (-e $err_file && open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                close($err_fh);
            }
            close($fail_fh);
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        if (!open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close($ps_in);

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                print {$fail_fh} "Bad postscript header: not '%!'\n";
                close($fail_fh);
            }
            return 0;
        }

        # the output file is only created once the input is known to be
        # postscript, so a rejected input leaves no empty file behind
        my $text_out;
        if (!open($text_out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print {$text_out} "$text";
        close($text_out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$output_filestem.text.tmp";
    &util::mv($text_file, $tmp_file);
    # "or" (not "||") so that the die really applies to a failed open
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$wrap_in>) {
        while (length($line) > 0) {
            # Cut after the first run of non-space characters that passes
            # $wrap_length. If nothing can be cut off, the remainder is
            # written as it is, so the loop always makes progress.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrap_out} "$1\n";
            } else {
                print {$wrap_out} "$line";
                $line = "";
            }
        }
    }
    close($wrap_in);
    close($wrap_out);
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
	1339
	1340
	1341	# Convert any file to HTML with a crude perl implementation of the
	1342	# UNIX strings command.
	1343
	1344	sub any_to_html {
[22429]	1345	my ($input_filename, $output_filestem) = @_;
[1445]	1346
	1347	# First generate a text file
	1348	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1349
	1350	# create an HTML file from the text file
	1351	open(TEXT, "<$output_filestem.text");
	1352	open(HTML, ">$output_filestem.html");
	1353
[2241]	1354	print HTML "<html><head>\n";
	1355	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1356	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1357	print HTML "</head><body>\n\n";
[1734]	1358
[2755]	1359	my $line;
	1360	while ($line=<TEXT>) {
	1361	$line =~ s/</</g;
	1362	$line =~ s/>/>/g;
[16435]	1363	if ($line =~ m/^\s*$/) {
[2755]	1364	print HTML "<p>";
	1365	} else {
	1366	print HTML "<br> ", $line;
	1367	}
[1445]	1368	}
[1734]	1369	print HTML "\n</body></html>\n";
[1445]	1370
[2241]	1371	close HTML;
	1372	close TEXT;
	1373
[1445]	1374	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1375	return 1;
	1376	}
	1377
	1378	# Convert any file to TEXT with a crude perl implementation of the
	1379	# UNIX strings command.
[2755]	1380	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1381
	1382	sub any_to_text {
[22429]	1383	my ($input_filename, $output_filestem) = @_;
[1445]	1384
[3350]	1385	if (!$use_strings) {
	1386	return 0;
	1387	}
[15120]	1388
	1389	print STDERR "\n** In any to text**\n\n";
[2755]	1390	open(IN, "<$input_filename") \|\| return 0;
[1734]	1391	binmode(IN);
[2755]	1392	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1393
	1394	my ($line);
[2755]	1395	my $output_line_count = 0;
[1445]	1396	while (<IN>) {
	1397	$line = $_;
[1734]	1398
[1445]	1399	# delete anything that isn't a printable character
	1400	$line =~ s/[^\040-\176]+/\n/sg;
	1401
	1402	# delete any string less than 10 characters long
[1734]	1403	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1404	while ($line =~ m/^.{1,9}$/m) {
[1734]	1405	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1406	$line =~ s/\n+/\n/sg;
	1407	}
	1408
	1409	# remove extraneous whitespace
	1410	$line =~ s/\n+/\n/gs;
	1411	$line =~ s/^\n//gs;
[1578]	1412
[1445]	1413	# output whatever is left
[16435]	1414	if ($line =~ m/[^\n ]/) {
[1445]	1415	print OUT $line;
[2755]	1416	++$output_line_count;
[1445]	1417	}
	1418	}
[2241]	1419
	1420	close OUT;
	1421	close IN;
	1422
[2755]	1423	if ($output_line_count) { # try to protect against binary only formats
	1424	return 1;
	1425	}
	1426
	1427	&util::rm("$output_filestem.text");
	1428	return 0;
	1429
[1445]	1430	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: