Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24223

Last change on this file since 24223 was 24223, checked in by ak19, 13 years ago
More changes to do with obtaining the perlpath using Config: moved the Use Config statement into the Begin block (it cannot be a Require statement for some reason).
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 47.2 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	BEGIN {
	    # Everything below relies on Greenstone's own perl modules, which
	    # are located through GSDLHOME.
	    if (!defined $ENV{'GSDLHOME'}) {
	        die "GSDLHOME not set\n";
	    }
	    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");

	    # When the environment does not supply PERLPATH, default it to the
	    # directory of the perl executable recorded in %Config.
	    unless ($ENV{'PERLPATH'}) {
	        # "use" rather than "require": %Config has to be imported into
	        # this package for the lookup below to see it.
	        use Config;
	        require File::Basename;
	        my $perl_executable = $Config{perlpath};    # alternative: $^X
	        $ENV{'PERLPATH'} = File::Basename::dirname($perl_executable);
	    }
	}
	59
[22429]	60	use strict;
	61
[1445]	62	use parsargv;
	63	use util;
	64	use Cwd;
[24223]	65	#use Config; # for getting the perlpath in the recommended way
[24192]	66	# Note: even though this (and other) use statement comes after its
	67	# usage in BEGIN, the use statement is in fact executed before BEGIN.
[24223]	68	# Doesn't work here for Greenstone 3.
[1445]	69
[24192]	70
# Are we running on WinNT or Win2000 (or later)?
# If Win32 cannot be loaded the eval yields undef, which is treated as "no".
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches: set by main() through parsargv, read by the
# conversion routines further down.
my ($use_strings, $windows_scripting);
my ($pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
[3350]	82
# Print the command-line synopsis on STDERR and terminate the program.
#
# Takes no arguments and never returns: it always exits with status 1.
sub print_usage
{
    my @usage = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this help line
        "\t\t-pdf_complex is set)\n",
    );
    print STDERR @usage;
    exit(1);
}
	104
# Path of the -errlog file that conversion errors are appended to
# ("" when no log was requested).
my $faillogfile = "";
# Value of -timeout: when non-zero, external commands are prefixed with
# "ulimit -t <timeout>;".
my $timeout = 0;

# Entry point: parse the command line, work out the input type and hand
# the file to the matching convertXXX routine.
#
# Arguments: the command-line argument list (normally @ARGV).
# Output:    the value returned by the converter ("html", "text", "item"
#            or "fail") followed by a newline on STDOUT.
# Exits with status 1 (directly or through print_usage) on bad arguments,
# an unreadable input file, or an input type that cannot be determined or
# is not supported. Dies if the working directory cannot be changed.
sub main
{
    my (@ARGV) = @_;
    my ($input_type, $output_type, $verbose);

    # fileparse() is needed below, but the BEGIN block only loads
    # File::Basename when PERLPATH was not already set.
    require File::Basename;

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;
    foreach my $arg (@ARGV) {
        # The raw arguments still carry their leading dash(es), so the
        # dash has to be allowed for here or the switch is never seen.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec has a leading "/" unlike the others --
    # left untouched, confirm against parsargv.
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename extension (without its
    # dot). A file with no extension leaves the type empty.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) or die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        # anchored at both ends so that e.g. "xppt" is not taken for ppt
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) or die "Unable to return to directory $stored_dir";
}
	211
	# Run the converter on the script's command-line arguments.
	main(@ARGV);
	213
	214
	215
# Document-type conversion functions
#
# The following functions attempt to convert documents from their
# input type to the specified output type. If no output type was
# given, then they first attempt HTML, and then TEXT.
#
# Each returns the output type that was produced ("html" or "text", and
# "item" from the routines that can produce page images) or "fail" if no
# conversion is possible.

# Convert a Microsoft word document
#
# The file is sniffed first, because a .doc extension does not guarantee
# Word content; the matching converter is then used.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = find_docfile_type($input_filename);

    if ($detected =~ m/^(?:word[678]|docx)$/) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($detected eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }
    # anything else gets the generic treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
	242
	# Convert a Microsoft word 6/7/8 document
	#
	# HTML is tried first when the output type allows it (through Windows
	# scripting if -windows_scripting was given, wvWare otherwise); if that
	# is not wanted or does not work, the generic converter takes over.
	sub convertWord678 {
	    my ($input_filename, $output_filestem, $output_type) = @_;

	    my $html_wanted = (!$output_type || $output_type =~ m/html/i);
	    if ($html_wanted) {
	        my $converted = $windows_scripting
	            ? native_doc_to_html($input_filename, $output_filestem)
	            : doc_to_html($input_filename, $output_filestem);
	        return "html" if $converted;
	    }

	    return convertAnything($input_filename, $output_filestem, $output_type);
	}
	262
	263
	# Convert a Rich Text Format (RTF) file
	#
	# Only HTML output is attempted. Returns "html" on success and "fail"
	# otherwise (including when HTML output was not requested).
	sub convertRTF {
	    my ($input_filename, $output_filestem, $output_type) = @_;

	    # Attempt specialised conversion to HTML
	    my $html_wanted = (!$output_type || $output_type =~ m/html/i);
	    if ($html_wanted) {
	        my $converted = $windows_scripting
	            ? native_doc_to_html($input_filename, $output_filestem)
	            : rtf_to_html($input_filename, $output_filestem);
	        return "html" if $converted;
	    }

	    # rtf is so ugly that it's not worth running strings over.
	    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	    # return &convertAnything($input_filename, $output_filestem, $output_type);
	    return "fail";
	}
	290
	291
	# Convert an unidentified file
	#
	# Tries the generic HTML conversion and then the generic text
	# conversion, each only if the requested output type permits it.
	# Returns "html", "text" or "fail".
	sub convertAnything {
	    my ($input_filename, $output_filestem, $output_type) = @_;

	    my $any_type_ok = !$output_type;

	    # Attempt simple conversion to HTML
	    if ($any_type_ok || $output_type =~ m/html/i) {
	        return "html" if any_to_html($input_filename, $output_filestem);
	    }

	    # Convert to text
	    if ($any_type_ok || $output_type =~ m/text/i) {
	        return "text" if any_to_text($input_filename, $output_filestem);
	    }

	    return "fail";
	}
	316
	317
[1654]	318
# Convert an Adobe PDF document
#
# Tries, in this order, whichever of the following the requested output
# type allows: page images (jpg/gif/png), HTML, plain text.
# Returns "item" (images), "html", "text", or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # A missing output type means the same as an empty one (try html, then
    # text); normalising it avoids matching against undef below.
    $output_type = "" unless defined $output_type;

    # Keep only what follows the last '-' in the requested type
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if $success;
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if $success;
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if $success;
    }

    return "fail";
}
	353
	354
	# Convert an Adobe PostScript document
	#
	# Tries page images (jpg/gif/png) and then plain text, each only if the
	# requested output type allows it; no HTML conversion is attempted.
	# Returns "item" (images), "text", or "fail".
	sub convertPS {
	    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

	    # A missing output type means the same as an empty one; normalising
	    # it avoids matching against undef below.
	    $output_type = "" unless defined $output_type;

	    # Keep only what follows the last '-' in the requested type
	    $output_type =~ s/.*\-(.*)/$1/i;

	    # Attempt conversion to Image
	    if ($output_type =~ m/jp?g|gif|png/i) {
	        my $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
	        return "item" if $success;
	    }

	    # Attempt conversion to TEXT
	    if (!$output_type || ($output_type =~ m/text/i)) {
	        my $success = &ps_to_text($input_filename, $output_filestem);
	        return "text" if $success;
	    }

	    return "fail";
	}
	379
	380
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract tool is run to produce a directory of slide images
# ("item"). Otherwise, if HTML is acceptable, ppttohtml.pl is run ("html").
# When the chosen route fails or none applies, plain text extraction is
# the last resort ("text"), and "fail" is returned if that fails as well.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pick the pptextract flag for the first image format mentioned
        my $image_flag = "";
        foreach my $candidate ([qr/gif/i, "-g"], [qr/jp?g/i, "-j"], [qr/png/i, "-p"]) {
            my ($pattern, $flag) = @$candidate;
            if ($output_type =~ $pattern) {
                $image_flag = $flag;
                last;
            }
        }

        my $extractor = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                            $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $extractor = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $command = $timeout ? "ulimit -t $timeout;" : "";
        $command .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($command) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $perl_binary = &util::filename_cat($ENV{'PERLPATH'}, "perl");
        my $command = "\"$perl_binary\" -S ppttohtml.pl "
                    . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        my $exit_status = system($command);
        return "html" if $exit_status == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
	444
	445
# Convert a Microsoft Excel document
#
# If HTML output is acceptable, xlstohtml.pl is run ("html"). When that
# fails or HTML was not wanted, plain text extraction is tried ("text");
# "fail" is returned if nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    my $html_wanted = (!$output_type || $output_type =~ m/html/i);
    if ($html_wanted) {
        # formulate the command
        my $perl_binary = &util::filename_cat($ENV{'PERLPATH'}, "perl");
        my $command = "\"$perl_binary\" -S xlstohtml.pl "
                    . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        my $exit_status = system($command);
        return "html" if $exit_status == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # fall back to generic text extraction
    return any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
	479
	480
	481
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: path of the file to inspect.
# Returns:  "docx"    when -windows_scripting is on and the name ends in .docx
#           "rtf"     when the first line starts with "{\rtf"
#           "word6", "word7" or "word8" when a "Word.Document.N" marker is found
#           "unknown" otherwise, including when the file cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open on a lexical handle: the filename is never
    # interpreted as a mode, and the handle cannot be left open by accident.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            $first = 0;
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # closed on every path, including the "unknown" one
    close($chk);
    return $type;
}
	522
	523
# Specific type-to-type conversions
#
# Each of the following functions attempts to convert a document from
# a specific format to another. If they succeed they return 1 and leave
# the output document(s) in the appropriate place; if they fail they
# return 0 and delete any working files.


# Open the -errlog file for appending, if one was requested.
# Returns a filehandle, or nothing when no log was requested or it
# cannot be opened.
sub _doc_to_html_open_fail_log {
    return if $faillogfile eq "";
    open(my $log, '>>', $faillogfile) or return;
    return $log;
}

# Attempt to convert a word document to html with the wv program
#
# Runs wvWare on $input_filename, redirecting its output to
# "$output_filestem.html" and (where stderr can be redirected) its errors
# to "$output_filestem.err". Images are written to a "<docname>_files"
# folder beside the input file, and links to them in the html are made
# relative afterwards.
#
# Returns 1 on success, 0 on failure. Note that when wvWare itself exits
# non-zero the .html/.err files are left in place; they are only removed
# when wvWare ran but did not produce usable html.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # fileparse() is used below; make sure the module is loaded
    require File::Basename;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
        my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # Only append the previous value when there is one: this avoids an
        # "uninitialized value" warning and a dangling ':' entry.
        my $old_ld_path = $ENV{'LD_LIBRARY_PATH'};
        $ENV{'LD_LIBRARY_PATH'} = (defined $old_ld_path && $old_ld_path ne "")
            ? "$wv_home/lib:$old_ld_path"
            : "$wv_home/lib";
        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files, at the same level as the
    # generated html, so that replace_srcdoc_with_html.pl can move the html
    # and its images into the import folder without overwriting similarly
    # named items already there.
    #
    # This uses wvWare's --dir and --basename options, where dir is the
    # full path to the image folder and basename is that path plus the
    # name to be prepended to every image file, e.g. for sample0.jpg to
    # sampleN.jpg the basename is "/full/path/to/imgdir/sample".
    # HOWEVER: basename always takes a full path, not a relative url, so
    # the greenstone browser is unable to display the images (absolute
    # paths cause it to give an "external link" message); the links are
    # therefore rewritten after conversion.
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix given to wvWare for the image files it writes
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog = _doc_to_html_open_fail_log();

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if $faillog;
                }
                next if $line !~ m/startup error/;
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n" if $faillog;
            }
            close($faillog) if $faillog;
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the html
            # was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else {
                # The html generated contains absolute links to the images.
                # Replace them with relative links instead, so the folder
                # can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog = _doc_to_html_open_fail_log();
        if ($faillog) {
            # copy the converter's error output into the fail log
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
	687
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Arguments:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
# Returns 1 when the file was rewritten, 0 (after a message on STDERR) when
# it could not be read or written.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the whole file is read as one single 'line'
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace all occurrences of assoc_dir_path in a href and img src
    # values with assoc_dirname.
    #
    # wvWare puts %20 in place of space, so the prefix we look for has to do
    # the same. The path is then escaped as a whole, so that every character
    # with a special meaning in a regular expression (., -, [, ], (, ), +,
    # Windows backslashes, ...) is matched literally.
    my $path_in_html = $assoc_dir_path;
    $path_in_html =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($path_in_html);

    # Look for <a or <img, followed by any other attributes up to the FIRST
    # (non-greedy) href= or src=, an optional opening quote, the associated
    # folder's pathname, then the rest of the tag up to the first >.
    # Everything before the pathname and everything after it is kept; only
    # the pathname itself is replaced by the folder's directory name.
    # s = let . match newlines too, g = replace all occurrences.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;

    # Now post-process the urls (%20 back to space, backslashes to slashes,
    # % to %25) by calling post_process_assocfile_urls on the matched text.
    # NOTE(review): the (.*) here is greedy and /s is on, so the text handed
    # to the helper looks like it can span far more than a single attribute
    # value -- confirm whether that is intended before tightening it.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;

    # 3. Delete the original file and recreate it with the updated contents
    &util::rm($html_file);

    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print {$fout} $html_contents;
    # buffered write errors only surface at close time
    unless (close($fout)) {
        print STDERR "gsConvert.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
	773
# Utility routine to make sure HTML plugin gets img src/href link pathnames
# that contain url slashes (/) instead of windows-style backwards slashes,
# and to convert all %20 introduced in link pathnames by wvWare into space
# again. Any percent sign still present afterwards is turned into %25.
#
# Arguments: the text before the url, the url text itself, the text after it.
# Returns the three pieces joined back together, with only the middle one
# rewritten.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    # Convert %20s to space and not underscore, since underscores mess
    # with incremental rebuild
    $text =~ s{%20}{ }g;
    # backslash -> forward slash
    $text =~ tr{\\}{/};
    # must come after the %20 step, or those would become %2520
    $text =~ s{%}{%25}g;

    return join("", $pre, $text, $post);
}
	789
# Attempt to convert a word document to html with the word2html scripting
# program (or, for docx input on windows, with the docx2html.vbs script).
#
# Arguments:
#   $input_filename  - the document to convert
#   $output_filestem - output path without extension; the converter is asked
#                      to write "$output_filestem.html" and its STDERR is
#                      captured in "$output_filestem.err" where redirected
#
# Returns 1 if "$output_filestem.html" already exists, or if the converter
# exited with status 0 and produced a non-empty file whose first line
# mentions "html". Returns 0 otherwise (so the caller can try something else).
#
# Uses $timeout, $is_winnt_2000 and $faillogfile, which are declared outside
# this sub.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . &util::filename_cat($vbScript, "word2html") . "\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    $cmd = "ulimit -t $timeout;" if ($timeout);
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s $err_file) {
            # $faillog stays undefined unless the fail log could be opened
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef $faillog;
            }

            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR $line;
                        print {$faillog} $line if ($faillog);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($faillog);
                }
                close $errfile;
            }
            close $faillog if ($faillog);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close $tmp;
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }
    return 0;
}
	890
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments:
#   $input_filename  - the RTF file to convert
#   $output_filestem - output path without extension; rtftohtml is asked to
#                      write "$output_filestem.html"
#
# The result counts as successful only if the html file contains a line with
# a word character somewhere after the line containing "<body>". If
# "${output_filestem}_ToC.html" exists, its contents (from its third line on,
# with link targets reduced to their "#fragment" part) are spliced into the
# html file after the first line of the html file that has no word character.
#
# Returns 1 on success, 0 otherwise (after copying any captured STDERR to the
# fail log and removing the .err and .html files).
#
# Uses $timeout, $is_winnt_2000 and $faillogfile, which are declared outside
# this sub.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    $cmd = "ulimit -t $timeout;" if ($timeout);
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if ((-s $html_file) && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close $html_in;
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # Open the two inputs first and the output last, stopping at the
            # first failure, so the output is never created unless both
            # inputs are readable.
            my @handles;
            my $all_open = 1;
            for my $spec (['<', $src_file], ['<', $toc_file], ['>', $html_file]) {
                my $fh;
                if (open($fh, $spec->[0], $spec->[1])) {
                    push @handles, $fh;
                }
                else {
                    $all_open = 0;
                    last;
                }
            }

            if (!$all_open) {
                # give up on the table of contents, but the plain html
                # conversion still counts as a success
                close $_ for @handles;
                &util::mv($src_file, $html_file);
                return 1;
            }
            my ($htmlsrc, $toc, $html_out) = @handles;

            # print out header info from src html (everything up to, but
            # not including, the first line without a word character).
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc>; } # ignore first 2 lines
            my $list_start = <$toc>;               # line 3 = "<ol>\n"
            print {$html_out} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close $toc;

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close $htmlsrc;
            close $html_out;

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
	1007
	1008
# Convert a pdf file to html with the pdftohtml command
#
# Arguments:
#   $dirname         - not used by this sub
#   $input_filename  - the PDF file to convert
#   $output_filestem - output path without extension, passed to pdftohtml.pl;
#                      success requires a non-empty "$output_filestem.html"
#
# Returns 1 if pdftohtml.pl exited with status 0 and the html file is
# non-empty; otherwise prints the captured error output to STDERR, appends it
# to the fail log (if one is configured), removes the leftovers and returns 0.
#
# Uses $timeout, $is_winnt_2000, $faillogfile and the $pdf_* option
# variables, which are declared outside this sub.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "";
    $cmd = "ulimit -t $timeout;" if ($timeout);
    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'}, "perl");
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no STDERR redirection here; STDOUT goes to the .err file instead
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s $html_file) {
        &util::rm($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }
        print STDERR "***********output filestem $output_filestem.html\n";
        &util::rm($html_file) if (-e $html_file);

        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) { print {$faillog} $line; }
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
	1072
	1073	# Convert a pdf file to various types of image with the convert command
	1074
[17329]	1075	sub pdfps_to_img {
[10357]	1076	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[10401]	1077
	1078	# Check that ImageMagick is installed and available on the path (except for Windows 95/98)
	1079	if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
	1080	my $result = `identify 2>&1`;
	1081	if ($? == -1 \|\| $? == 256) { # Linux and Windows return different values for "program not found"
	1082	#ImageMagick is not installed, thus the convert utility is not available.
[17329]	1083	print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
[10401]	1084	return 0;
	1085	}
	1086	}
	1087
[22429]	1088	my $cmd = "";
[10357]	1089	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	1090	$output_type =~ s/.\_(.)/$1/i;
[24103]	1091	my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
[24124]	1092	$cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
[16435]	1093	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[10357]	1094	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1095	} else {
	1096	$cmd .= " > \"$output_filestem.err\"";
	1097	}
	1098
	1099	# don't include path on windows (to avoid having to play about
	1100	# with quoting when GSDLHOME might contain spaces) but assume
	1101	# that the PATH is set up correctly
	1102	$!=0;
	1103	my $retval=system($cmd);
	1104	if ($retval!=0)
	1105	{
[10401]	1106	print STDERR "Error executing pdftoimg.pl";
[10357]	1107	if ($!) {print STDERR ": $!";}
	1108	print STDERR "\n";
	1109	}
	1110
	1111	#make sure the converter made something
	1112	#if ($retval !=0) \|\| ! -s "$output_filestem")
	1113	if ($retval !=0)
	1114	{
	1115	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1116	#print out the converter's std err, if any
	1117	if (-s "$output_filestem.err") {
	1118	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[17329]	1119	print STDERR "pdfpstoimg error log:\n";
[10357]	1120	while (<ERRLOG>) {
	1121	print STDERR "$_";
	1122	}
	1123	close ERRLOG;
	1124	}
[10534]	1125	#&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[10357]	1126	if (-e "$output_filestem.err") {
	1127	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1128	{
	1129	open (ERRLOG, "$output_filestem.err");
	1130	while (<ERRLOG>) {print FAILLOG $_;}
	1131	close ERRLOG;
	1132	close FAILLOG;
	1133	}
	1134	&util::rm("$output_filestem.err");
	1135	}
	1136	return 0;
	1137	}
[2656]	1138	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1139	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1140	return 1;
	1141	}
	1142
	1143	# Convert a PDF file to text with the pdftotext command
	1144
	1145	sub pdf_to_text {
[2755]	1146	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1147
[2248]	1148	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1149
[16435]	1150	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1151	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1152	} else {
	1153	$cmd .= " > \"$output_filestem.err\"";
	1154	}
[1445]	1155
[2060]	1156	if (system($cmd)!=0)
[1445]	1157	{
	1158	print STDERR "Error executing $cmd: $!\n";
	1159	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1160	}
	1161
[2755]	1162	# make sure there is some extracted text.
	1163	if (-e "$output_filestem.text") {
	1164	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1165	binmode(EXTR_TEXT); # just in case...
	1166	my $line="";
	1167	my $seen_text=0;
	1168	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1169	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1170	}
	1171	close EXTR_TEXT;
	1172	if ($seen_text==0) { # no text was extracted
	1173	print STDERR "Error: pdftotext found no text\n";
	1174	&util::rm("$output_filestem.text");
	1175	}
	1176	}
	1177
[1692]	1178	# make sure the converter made something
[2656]	1179	if (! -s "$output_filestem.text")
[1692]	1180	{
	1181	# print out the converters std err, if any
[2656]	1182	if (-s "$output_filestem.err") {
[1692]	1183	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1184	print STDERR "pdftotext error log:\n";
[1692]	1185	while (<ERRLOG>) {
	1186	print STDERR "$_";
	1187	}
	1188	close ERRLOG;
	1189	}
[2656]	1190	# does this converter create a .out file?
	1191	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1192	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1193	if (-e "$output_filestem.err") {
	1194	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1195	{
	1196	open (ERRLOG,"$output_filestem.err");
	1197	while (<ERRLOG>) {print FAILLOG $_;}
	1198	close ERRLOG;
	1199	close FAILLOG;
	1200	}
	1201	&util::rm("$output_filestem.err");
	1202	}
[1692]	1203	return 0;
	1204	}
[1445]	1205	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	1206	return 1;
	1207	}
	1208
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# If gs cannot be used (on windows), fails, or reports an interpreter error,
# the error is logged and a crude regexp-based extraction is run over the raw
# PostScript instead. Whichever route produced the text, the result is then
# re-wrapped so that lines break at the first whitespace after 72 characters.
#
# Returns 1 on success; 0 if the fallback could not read the input, could not
# write the output, or the input does not start with "%!". Dies if the
# temporary file used for re-wrapping cannot be opened.
#
# Uses $timeout and $faillogfile, which are declared outside this sub.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        $cmd = "ulimit -t $timeout; " if ($timeout);
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$text_file\" $input_filename\"";
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Test the whole wait status rather than just the exit code, so a
        # command that could not be started (-1) or that was killed by a
        # signal also counts as a failure. If the program couldn't be
        # started, $! holds the message.
        my $status = $?;

        if ($status != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            close $psout;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if ((-e $err_file) && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file... (checked before the output file is
        # created, so that a rejected input leaves no empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print {$out} "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > $wrap_length) {
            # peel off the first $wrap_length characters plus the rest of
            # the word they end in; stop if that is not possible so the
            # loop can never spin on an unchanged line
            last unless ($line =~ s/^(.{$wrap_length}[^\s]*)\s*//);
            print {$outfile} "$1\n";
        }
        print {$outfile} "$line" if (length($line) > 0);
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
	1378
	1379
	1380	# Convert any file to HTML with a crude perl implementation of the
	1381	# UNIX strings command.
	1382
	1383	sub any_to_html {
[22429]	1384	my ($input_filename, $output_filestem) = @_;
[1445]	1385
	1386	# First generate a text file
	1387	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1388
	1389	# create an HTML file from the text file
	1390	open(TEXT, "<$output_filestem.text");
	1391	open(HTML, ">$output_filestem.html");
	1392
[2241]	1393	print HTML "<html><head>\n";
	1394	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1395	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1396	print HTML "</head><body>\n\n";
[1734]	1397
[2755]	1398	my $line;
	1399	while ($line=<TEXT>) {
	1400	$line =~ s/</</g;
	1401	$line =~ s/>/>/g;
[16435]	1402	if ($line =~ m/^\s*$/) {
[2755]	1403	print HTML "<p>";
	1404	} else {
	1405	print HTML "<br> ", $line;
	1406	}
[1445]	1407	}
[1734]	1408	print HTML "\n</body></html>\n";
[1445]	1409
[2241]	1410	close HTML;
	1411	close TEXT;
	1412
[1445]	1413	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1414	return 1;
	1415	}
	1416
	1417	# Convert any file to TEXT with a crude perl implementation of the
	1418	# UNIX strings command.
[2755]	1419	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1420
	1421	sub any_to_text {
[22429]	1422	my ($input_filename, $output_filestem) = @_;
[1445]	1423
[3350]	1424	if (!$use_strings) {
	1425	return 0;
	1426	}
[15120]	1427
	1428	print STDERR "\n** In any to text**\n\n";
[2755]	1429	open(IN, "<$input_filename") \|\| return 0;
[1734]	1430	binmode(IN);
[2755]	1431	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1432
	1433	my ($line);
[2755]	1434	my $output_line_count = 0;
[1445]	1435	while (<IN>) {
	1436	$line = $_;
[1734]	1437
[1445]	1438	# delete anything that isn't a printable character
	1439	$line =~ s/[^\040-\176]+/\n/sg;
	1440
	1441	# delete any string less than 10 characters long
[1734]	1442	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1443	while ($line =~ m/^.{1,9}$/m) {
[1734]	1444	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1445	$line =~ s/\n+/\n/sg;
	1446	}
	1447
	1448	# remove extraneous whitespace
	1449	$line =~ s/\n+/\n/gs;
	1450	$line =~ s/^\n//gs;
[1578]	1451
[1445]	1452	# output whatever is left
[16435]	1453	if ($line =~ m/[^\n ]/) {
[1445]	1454	print OUT $line;
[2755]	1455	++$output_line_count;
[1445]	1456	}
	1457	}
[2241]	1458
	1459	close OUT;
	1460	close IN;
	1461
[2755]	1462	if ($output_line_count) { # try to protect against binary only formats
	1463	return 1;
	1464	}
	1465
	1466	&util::rm("$output_filestem.text");
	1467	return 0;
	1468
[1445]	1469	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: