Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24103

Last change on this file since 24103 was 24103, checked in by ak19, 13 years ago
Some minor oversights fixed.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 45.7 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
[24093]	49
	50	if(!$ENV{'PERLPATH'}) {
	51	my $full_perl_exec = $^X;
	52	require File::Basename;
	53	my $perl_path = File::Basename::dirname($full_perl_exec);
	54	$ENV{'PERLPATH'} = $perl_path;
	55	}
	56
[1445]	57	}
	58
[22429]	59	use strict;
	60
[1445]	61	use parsargv;
	62	use util;
	63	use Cwd;
	64
# Are we running on WinNT or Win2000 (or later)?
# If Win32.pm cannot be loaded the eval yields undef, which is mapped to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; filled in by parsargv::parse() in main()
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
[3350]	76
# Print the command-line help text to STDERR and terminate the program
# with exit status 1. This function never returns.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # closing parenthesis was missing from this message
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
	98
# File that converter error output is appended to; "" means no log (-errlog)
my $faillogfile = "";
# CPU-seconds limit put in front of converter commands as "ulimit -t";
# 0 means no limit (-timeout)
my $timeout = 0;
[1445]	101
	102	sub main
	103	{
	104	my (@ARGV) = @_;
[3538]	105	my ($input_type,$output_type,$verbose);
[1960]	106
[23473]	107	# Dynamically figure out what the --type option can support, based on whether -windows_scripting
	108	# is in use or not
	109	my $default_type_re = "(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)";
	110	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	111	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	112	# Currently only have VBA for Word and PPT(but no XLS)
	113	my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xls)";
	114
	115	my $type_re = $default_type_re;
	116
	117	foreach my $a (@ARGV) {
	118	if ($a =~ m/^windows_scripting$/i) {
	119	$type_re = $enhanced_type_re;
	120	}
	121	}
	122
[1445]	123	# read command-line arguments
	124	if (!parsargv::parse(\@ARGV,
[23473]	125	"type/$type_re/", \$input_type,
[2755]	126	'/errlog/.*/', \$faillogfile,
[22596]	127	'output/(auto\|html\|text\|pagedimg).*/', \$output_type,
[1692]	128	'timeout/\d+/0',\$timeout,
[10282]	129	'verbose/\d+/0', \$verbose,
[22429]	130	'windows_scripting',\$windows_scripting,
[3720]	131	'use_strings', \$use_strings,
	132	'pdf_complex', \$pdf_complex,
[9482]	133	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	134	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	135	'pdf_nohidden', \$pdf_nohidden,
[3720]	136	'pdf_zoom/\d+/2', \$pdf_zoom
	137	))
[1445]	138	{
	139	print_usage();
	140	}
[12704]	141
[1445]	142	# Make sure the input file exists and can be opened for reading
	143	if (scalar(@ARGV!=1)) {
	144	print_usage();
	145	}
[1928]	146
[1445]	147	my $input_filename = $ARGV[0];
	148	if (!-r $input_filename) {
	149	print STDERR "Error: unable to open $input_filename for reading\n";
	150	exit(1);
	151	}
	152
	153	# Deduce filenames
	154	my ($tailname,$dirname,$suffix)
[2241]	155	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	156	my $output_filestem = &util::filename_cat($dirname, "$tailname");
[1445]	157
	158	if ($input_type eq "")
	159	{
[2241]	160	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	161	}
	162
	163	# Change to temporary working directory
	164	my $stored_dir = cwd();
	165	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	166
[1445]	167	# Select convert utility
	168	if (!defined $input_type) {
	169	print STDERR "Error: No filename extension or input type defined\n";
	170	exit(1);
	171	}
[23473]	172	elsif ($input_type =~ m/^docx?$/ \|\| $input_type eq "dot") {
[1445]	173	print &convertDOC($input_filename, $output_filestem, $output_type);
	174	print "\n";
	175	}
[1684]	176	elsif ($input_type eq "rtf") {
	177	print &convertRTF($input_filename, $output_filestem, $output_type);
	178	print "\n";
	179	}
[1445]	180	elsif ($input_type eq "pdf") {
	181	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	182	print "\n";
	183	}
	184	elsif ($input_type eq "ps") {
[22429]	185	print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
[1445]	186	print "\n";
	187	}
[23473]	188	elsif ($input_type =~ m/pptx?$/) {
[2977]	189	print &convertPPT($input_filename, $output_filestem, $output_type);
	190	print "\n";
	191	}
[23473]	192	elsif ($input_type =~ m/xlsx?$/) {
[2991]	193	print &convertXLS($input_filename, $output_filestem, $output_type);
	194	print "\n";
	195	}
[1445]	196	else {
	197	print STDERR "Error: Unable to convert type '$input_type'\n";
	198	exit(1);
	199	}
	200
	201	# restore to original working directory
	202	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	203
	204	}
	205
	206	&main(@ARGV);
	207
	208
	209
[2241]	210	# Document-type conversion functions
[1445]	211	#
	212	# The following functions attempt to convert documents from their
	213	# input type to the specified output type. If no output type was
	214	# given, then they first attempt HTML, and then TEXT.
	215	#
	216	# Each returns the output type ("html" or "text") or "fail" if no
	217	# conversion is possible.
	218
	219	# Convert a Microsoft word document
	220
	221	sub convertDOC {
[22429]	222	my ($input_filename, $output_filestem, $output_type) = @_;
[1445]	223
[1654]	224	# Many .doc files are not in fact word documents!
	225	my $realtype = &find_docfile_type($input_filename);
	226
[23473]	227	if ($realtype eq "word6" \|\| $realtype eq "word7"
	228	\|\| $realtype eq "word8" \|\| $realtype eq "docx") {
[1654]	229	return &convertWord678($input_filename, $output_filestem, $output_type);
	230	} elsif ($realtype eq "rtf") {
	231	return &convertRTF($input_filename, $output_filestem, $output_type);
	232	} else {
	233	return &convertAnything($input_filename, $output_filestem, $output_type);
	234	}
	235	}
	236
	237	# Convert a Microsoft word 6/7/8 document
	238
	239	sub convertWord678 {
[22429]	240	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	241
[1445]	242	my $success = 0;
[16435]	243	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	244	if ($windows_scripting) {
	245	$success = &native_doc_to_html($input_filename, $output_filestem);
	246	}
	247	else {
	248	$success = &doc_to_html($input_filename, $output_filestem);
	249	}
[1445]	250	if ($success) {
[10282]	251	return "html";
[1445]	252	}
	253	}
[1654]	254	return &convertAnything($input_filename, $output_filestem, $output_type);
	255	}
	256
	257
	258	# Convert a Rich Text Format (RTF) file
	259
	260	sub convertRTF {
[22429]	261	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	262
	263	my $success = 0;
	264
	265	# Attempt specialised conversion to HTML
[16435]	266	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	267
	268	if ($windows_scripting) {
	269	$success = &native_doc_to_html($input_filename, $output_filestem);
	270	}
	271	else {
	272	$success = &rtf_to_html($input_filename, $output_filestem);
	273	}
[1654]	274	if ($success) {
	275	return "html";
	276	}
	277	}
	278
[2755]	279	# rtf is so ugly that's it's not worth running strings over.
	280	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	281	# return &convertAnything($input_filename, $output_filestem, $output_type);
	282	return "fail";
[1654]	283	}
	284
	285
	286	# Convert an unidentified file
	287
	288	sub convertAnything {
[22429]	289	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	290
	291	my $success = 0;
[10464]	292
[1445]	293	# Attempt simple conversion to HTML
[16435]	294	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	295	$success = &any_to_html($input_filename, $output_filestem);
	296	if ($success) {
	297	return "html";
	298	}
	299	}
	300
	301	# Convert to text
[16435]	302	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	303	$success = &any_to_text($input_filename, $output_filestem);
[1445]	304	if ($success) {
	305	return "text";
	306	}
	307	}
	308	return "fail";
	309	}
	310
	311
[1654]	312
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page images
# (type mentions jpg/jpeg/gif/png), HTML, then text; an empty output type
# accepts both HTML and text. Returns "item" (images), "html", "text" or
# "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type = "" unless defined $output_type;
    # Keep only what follows the last '-' in the output type, if there is one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
	347
	348
	349	# Convert an Adobe PostScript document
	350
	351	sub convertPS {
[22429]	352	my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
[1445]	353
	354	my $success = 0;
[10534]	355	$output_type =~ s/.\-(.)/$1/i;
	356	# Attempt coversion to Image
[16435]	357	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	358	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10534]	359	if ($success){
	360	return "item";
	361	}
	362	}
[1445]	363
	364	# Attempt conversion to TEXT
[16435]	365	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	366	$success = &ps_to_text($input_filename, $output_filestem);
	367	if ($success) {
	368	return "text";
	369	}
	370	}
	371	return "fail";
	372	}
	373
	374
# Convert a Microsoft PowerPoint document.
#
#  - With -windows_scripting and an output type that is neither html nor
#    text, the external "pptextract" program is run (-g/-j/-p selects
#    gif/jpg/png); success, or an already existing output directory,
#    yields "item".
#  - Otherwise, when HTML is acceptable, ppttohtml.pl is run; success
#    yields "html".
#  - If neither produced a result, any_to_text is tried ("text"), and
#    finally "fail" is returned.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    $output_type = "" unless defined $output_type;

    # redirecting STDERR is a bad idea on windows 95/98
    my $redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        # on windows the program is expected to be found through the PATH
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = "";
        if ($timeout) { $cmd = "ulimit -t $timeout;"; }
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $redirect_stderr;

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = util::filename_cat($ENV{'PERLPATH'}, "perl");
        my $cmd = "$full_perl_path -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $redirect_stderr;

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	438
	439
# Convert a Microsoft Excel document.
#
# When HTML is acceptable (empty output type, or one containing "html")
# xlstohtml.pl is run and "html" returned on success. Otherwise, or if that
# fails, any_to_text is tried ("text"); "fail" is returned when nothing
# worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = util::filename_cat($ENV{'PERLPATH'}, "perl");
        my $cmd = "$full_perl_path -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # redirecting STDERR is a bad idea on windows 95/98
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	473
	474
	475
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns:
#   "docx"    - -windows_scripting is on and the name ends in ".docx"
#               (the file is not opened in this case)
#   "rtf"     - the first line starts with "{\rtf"
#   "wordN"   - some line contains "Word.Document.N", N being 6, 7 or 8
#   "unknown" - anything else, including a file that cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    my $chk;
    unless (open($chk, '<', $input_filename)) {
        print STDERR "gsConvert.pl: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so that the handle is closed on every path
    close($chk);
    return $type;
}
	516
	517
[1734]	518	# Specific type-to-type conversions
[1445]	519	#
	520	# Each of the following functions attempts to convert a document from
[2755]	521	# a specific format to another. If they succeed they return 1 and leave
[1445]	522	# the output document(s) in the appropriate place; if they fail they
	523	# return 0 and delete any working files.
	524
	525
	526	# Attempt to convert a word document to html with the wv program
	527	sub doc_to_html {
[22429]	528	my ($input_filename, $output_filestem) = @_;
[1445]	529
[20933]	530	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");
[1928]	531
[20933]	532	if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
	533	$ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}";
	534	$ENV{'LD_LIBRARY_PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib:$ENV{'LD_LIBRARY_PATH'}";
	535	$wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
	536	}
	537
[2241]	538	# don't include path on windows (to avoid having to play about
	539	# with quoting when GSDLHOME might contain spaces) but assume
	540	# that the PATH is set up correctly
[16435]	541	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
[2241]	542
[2512]	543	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
[2574]	544	"packages", "wv", "wvHtml.xml");
[1928]	545
[15120]	546	# Added the following to work with replace_srcdoc_with_html.pl:
	547	# Make wvWare put any associated (image) files of the word doc into
	548	# folder docname-without-extention_files. This folder should be at
	549	# the same level as the html file generated from the doc.
	550	# wvWare will take care of proper interlinking.
	551
	552	# This step is necessary for replace_srcdoc_with_html.pl which will
	553	# move the html and associated files into the import folder. We
	554	# want to ensure that the associated files won't overwrite similarly
	555	# named items already in import. Hence we put them in a folder first
	556	# (to which the html links properly) and that will allow
	557	# replace_srcdoc_with_html.pl to move them safely to /import.
	558
	559	# To do all this, we need to use wvWare's --dir and --basename options
	560	# where dir is the full path to the image folder directory and
	561	# basename is the full path to the image folder appended to the name
	562	# which is to be prepended to every image file:
	563	# eg. if the images were to have names like sample0.jpg to sampleN.jpg,
	564	# then the basename is "/full/path/to/imgdir/sample".
	565	# In this case, basename is the full path to and name of the document.
	566	# HOWEVER: basename always takes full path, not relative url, so
	567	# the greenstone browser is unable to display the images (absolute paths
	568	# cause it to give an "external link" message)
	569	# See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
	570	# and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
	571	# "added --dir option to wvHtml so that pictures can be placed in
	572	# a seperate directory"
	573	# "running wvWare through IMP to view word documents as html. It gets
	574	# invoked like this:
	575	# wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"
	576
	577	# toppath is the folder where html is generated
	578	# docname is the name (without extension) of the html to be generated
	579	# suffix (extension) is thrown away
	580	my ($docname, $toppath)
	581	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	582
	583	# We want the image folder generated to have the same name as windows
	584	# would generate ($windows_scripting) when it converts from word to html.
	585	# That is, foldername=docname_files
	586	my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
	587	#print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files"
	588
	589	# ensure this image directory exists
	590	# if it exists already, just delete and recreate
	591	if(-e $assoc_dir) {
	592	&util::rm_r($assoc_dir);
	593	}
	594	&util::mk_dir($assoc_dir);
	595
	596	# the images are all going to be called image0, image1,..., imageN
	597	my $img_basenames = &util::filename_cat($assoc_dir, $docname);
	598
	599	#print STDERR "**toppath: $toppath\n**docname: $docname\n;
	600	#print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
	601	#print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);
	602
[2241]	603	my $cmd = "";
[1692]	604	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[15120]	605	# wvWare's --dir and --basename options for image directory.
	606	# Replaced the next line with the 2 lines following it:
	607	# $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
	608	$cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
	609	$cmd .= " --charset utf-8 --config \"$wv_conf\"";
[2241]	610	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
[15120]	611
[2241]	612	# redirecting STDERR is a bad idea on windows 95/98
	613	$cmd .= " 2> \"$output_filestem.err\""
[16435]	614	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
[1445]	615	# execute the command
[2755]	616	$!=0;
[2060]	617	if (system($cmd)!=0)
[1445]	618	{
[2755]	619	print STDERR "Error executing wv converter:$!\n";
	620	if (-s "$output_filestem.err") {
	621	open (ERRFILE, "<$output_filestem.err");
	622
	623	my $write_to_fail_log=0;
	624	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	625	{$write_to_fail_log=1;}
	626
	627	my $line;
	628	while ($line=<ERRFILE>) {
[16435]	629	if ($line =~ m/\w/) {
[2755]	630	print STDERR "$line";
	631	print FAILLOG "$line" if ($write_to_fail_log);
	632	}
	633	if ($line !~ m/startup error/) {next;}
	634	print STDERR " (given an invalid .DOC file?)\n";
	635	print FAILLOG " (given an invalid .DOC file?)\n"
	636	if ($write_to_fail_log);
	637
	638	} # while ERRFILE
	639	close FAILLOG if ($write_to_fail_log);
	640	}
	641	return 0; # we can try any_to_text
[1445]	642	}
[1578]	643
[1445]	644	# Was the conversion successful?
[2241]	645
[15120]	646	if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
[1445]	647	open(TMP, "$output_filestem.html");
[22429]	648	my $line = <TMP>;
[1445]	649	close(TMP);
[16435]	650	if ($line && $line =~ m/DOCTYPE HTML/) {
[15120]	651	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
	652
	653	# Inserted this code to remove the images directory if it was still empty after
	654	# the html was generated (in case there were no images in the word document)
[16435]	655	if (&util::is_dir_empty($assoc_dir)) {
[15152]	656	#print STDERR "*gsConvert.pl: Image dir $assoc_dir is empty, removing*\n";
[15120]	657	&util::rm_r($assoc_dir);
	658	} else { # there was an image folder (it was generated)
	659	# Therefore, the html file generated contains absolute links to the images
[16435]	660	# Replace them with relative links instead, so the folder can be moved elsewhere
[15152]	661	&make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
[15120]	662	}
[1445]	663	return 1;
	664	}
	665	}
[2755]	666
	667	# If here, an error of some sort occurred
	668	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
	669	if (-e "$output_filestem.err") {
	670	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	671	open (ERRLOG,"$output_filestem.err");
	672	while (<ERRLOG>) {print FAILLOG $_;}
	673	close FAILLOG;
	674	close ERRLOG;
	675	}
	676	&util::rm("$output_filestem.err");
	677	}
	678
[1445]	679	return 0;
	680	}
	681
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Parameters:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
#
# Returns 1 when the file was rewritten, 0 (after a message on STDERR) when
# it could not be read or written.
sub make_links_to_assocdir_relative {
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the <...> operator reads the entire file at once
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace all occurrences of the assoc_dir_path in a href and img src
    # values with assoc_dirname.
    # wvWare put %20 in place of space, so change our prefix to match. The
    # prefix is then matched literally (\Q...\E), which copes with every regex
    # metacharacter a path may contain: . - [ ] ( ) + Windows backslashes, ...
    my $url_prefix = $assoc_dir_path;
    $url_prefix =~ s/ /%20/g;

    # Look for <a or <img, followed by anything up to the FIRST href= or src=,
    # an optional opening quote, the associated folder's pathname, and then
    # the rest up to the first > that ends the tag. Everything but the
    # folder's pathname is retained; the pathname becomes the folder's name.
    # s: treat html_contents as a single line; g: replace every occurrence.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)\Q$url_prefix\E(.*?(?:\"|\')?.*?>)/$1$assoc_dirname$2/sg;

    # Now tidy up the value of the href or src attribute of every <a> and
    # <img> tag (%20 back to a literal space, etc.). Each attribute value is
    # handed to post_process_assocfile_urls on its own, so that text outside
    # the link is left untouched.
    $html_contents =~ s/(<(?:a|img)\b[^>]*?(?:href|src)=["']?)([^"'>]*)(["']?[^>]*>)/post_process_assocfile_urls($1, $2, $3)/sge;

    # 3. Delete the original file and recreate it with the updated contents
    util::rm($html_file);

    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print {$fout} $html_contents;
    # buffered write errors only show up when the handle is closed
    unless (close($fout)) {
        print STDERR "gsConvert.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
	767
# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
# introduced in link pathnames by wvWare into space again. Converts all percent signs
# introduced by URL encoding filenames generated into %25 in these url links referencing them
#
# Parameters: the text before the link ($pre), the link itself ($text) and
# the text after it ($post). Only $text is altered; the three pieces are
# returned joined together as one string.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    # Convert %20s to space and not underscore since underscores mess with incremental rebuild
    $text =~ s/%20/ /g;
    # backslashes become url slashes
    $text =~ tr{\\}{/};
    # must come after the %20 step, or the % of every %20 would be escaped too
    $text =~ s/%/%25/g;

    return "$pre$text$post";
}
	783
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments:
#   $input_filename  - the document to convert
#   $output_filestem - output path without extension; the converter is asked
#                      to write "$output_filestem.html", and its stderr is
#                      sent to "$output_filestem.err" where redirection is
#                      used (non-windows, or $is_winnt_2000)
# Returns 1 when "$output_filestem.html" already exists, or when the converter
# produced a non-empty file whose first line mentions "html"; returns 0
# otherwise so that the caller can try another conversion.
# Uses the file-level variables $timeout, $faillogfile and $is_winnt_2000.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows only the bare program name is used
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s $err_file) {
            if (open(my $errfile, '<', $err_file)) {
                my $faillog;
                my $write_to_fail_log = 0;
                if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                    $write_to_fail_log = 1;
                }

                while (my $line = <$errfile>) {
                    # echo every non-blank line of the converter's stderr
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                } # while errfile
                close $faillog if ($write_to_fail_log);
                close $errfile;
            }
            else {
                print STDERR "gsConvert.pl: Unable to read $err_file: $!\n";
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            # append the converter's stderr to the fail log
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }
    return 0;
}
	862
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments:
#   $input_filename  - the RTF document to convert
#   $output_filestem - output path without extension; rtftohtml is asked to
#                      write "$output_filestem.html"
# Returns 1 when the generated html has some word content after its <body>
# line, 0 otherwise (in which case the .html and .err files are removed and
# the error output is appended to $faillogfile if one is set).
# When "${output_filestem}_ToC.html" exists, its list is spliced into the
# html after the header section.
# Uses the file-level variables $timeout, $faillogfile and $is_winnt_2000.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$html_file\" \"$input_filename\"";
    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header; an unreadable
        # file simply counts as "no content"
        if (open(my $html, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$html>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close $html;
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # Open both inputs before the output, so that the output file is
            # only created once there is something to copy into it.
            my $htmlsrc_ok = open(my $htmlsrc, '<', $src_file);
            my $toc_ok     = open(my $toc, '<', $toc_file);
            my $html_ok    = ($htmlsrc_ok && $toc_ok)
                ? open(my $html_out, '>', $html_file) : 0;
            my $html;
            if ($html_ok) {
                # re-open under a name visible to the rest of this block
                close $html_out;
                $html_ok = open($html, '>', $html_file);
            }

            if (!($htmlsrc_ok && $toc_ok && $html_ok)) {
                close $htmlsrc if ($htmlsrc_ok);
                close $toc if ($toc_ok);
                # fall back to the html as the converter produced it
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped).
            while (defined(my $line = <$htmlsrc>)) {
                last if ($line !~ m/\w/);
                print {$html} $line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc>; } # ignore first 2 lines
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $line;
            }
            close $toc;

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close $htmlsrc;
            close $html;

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
	979
	980
# Convert a pdf file to html by running pdftohtml.pl
#
# Arguments: $dirname (not referenced in this routine), $input_filename and
# $output_filestem (output path without extension).
# Returns 1 when the command exits with status 0 and left a non-empty
# "$output_filestem.html"; otherwise reports the converter's error output,
# removes the generated files and returns 0.
# Uses the file-level variables $timeout, $pdf_zoom, $pdf_complex,
# $pdf_ignore_images, $pdf_allow_images_only, $pdf_nohidden, $is_winnt_2000
# and $faillogfile.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_path = "$output_filestem.html";
    my $out_path  = "$output_filestem.out";
    my $err_path  = "$output_filestem.err";

    # optional switches handed to pdftohtml.pl
    my @switches;
    push(@switches, " -c") if ($pdf_complex);
    push(@switches, " -i") if ($pdf_ignore_images);
    push(@switches, " -a") if ($pdf_allow_images_only);
    push(@switches, " -hidden") unless ($pdf_nohidden);

    # where the converter's output streams end up
    my $redirection = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000)
        ? " > \"$out_path\" 2> \"$err_path\""
        : " > \"$err_path\"";

    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
    my $cmd = ($timeout ? "ulimit -t $timeout;" : "")
        . "$full_perl_path -S pdftohtml.pl -zoom $pdf_zoom"
        . join('', @switches)
        . " \"$input_filename\" \"$output_filestem\""
        . $redirection;

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl" . ($! ? ": $!" : "") . "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && -s $html_path) {
        &util::rm($err_path) if (-e $err_path);
        &util::rm($out_path) if (-e $out_path);
        return 1;
    }

    &util::rm($out_path) if (-e $out_path);

    # print out the converter's std err, if any
    if (-s $err_path) {
        open(ERRLOG, $err_path) or die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_path) if (-e $html_path);

    # keep a copy of the error output in the fail log before deleting it
    if (-e $err_path) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open(ERRLOG, $err_path);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_path);
    }
    return 0;
}
	1044
	1045	# Convert a pdf file to various types of image with the convert command
	1046
[17329]	1047	sub pdfps_to_img {
[10357]	1048	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[10401]	1049
	1050	# Check that ImageMagick is installed and available on the path (except for Windows 95/98)
	1051	if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
	1052	my $result = `identify 2>&1`;
	1053	if ($? == -1 \|\| $? == 256) { # Linux and Windows return different values for "program not found"
	1054	#ImageMagick is not installed, thus the convert utility is not available.
[17329]	1055	print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
[10401]	1056	return 0;
	1057	}
	1058	}
	1059
[22429]	1060	my $cmd = "";
[10357]	1061	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	1062	$output_type =~ s/.\_(.)/$1/i;
[24103]	1063	my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
	1064	$cmd .= "$full_perl_path -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
[16435]	1065	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[10357]	1066	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1067	} else {
	1068	$cmd .= " > \"$output_filestem.err\"";
	1069	}
	1070
	1071	# don't include path on windows (to avoid having to play about
	1072	# with quoting when GSDLHOME might contain spaces) but assume
	1073	# that the PATH is set up correctly
	1074	$!=0;
	1075	my $retval=system($cmd);
	1076	if ($retval!=0)
	1077	{
[10401]	1078	print STDERR "Error executing pdftoimg.pl";
[10357]	1079	if ($!) {print STDERR ": $!";}
	1080	print STDERR "\n";
	1081	}
	1082
	1083	#make sure the converter made something
	1084	#if ($retval !=0) \|\| ! -s "$output_filestem")
	1085	if ($retval !=0)
	1086	{
	1087	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1088	#print out the converter's std err, if any
	1089	if (-s "$output_filestem.err") {
	1090	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[17329]	1091	print STDERR "pdfpstoimg error log:\n";
[10357]	1092	while (<ERRLOG>) {
	1093	print STDERR "$_";
	1094	}
	1095	close ERRLOG;
	1096	}
[10534]	1097	#&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[10357]	1098	if (-e "$output_filestem.err") {
	1099	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1100	{
	1101	open (ERRLOG, "$output_filestem.err");
	1102	while (<ERRLOG>) {print FAILLOG $_;}
	1103	close ERRLOG;
	1104	close FAILLOG;
	1105	}
	1106	&util::rm("$output_filestem.err");
	1107	}
	1108	return 0;
	1109	}
[2656]	1110	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1111	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1112	return 1;
	1113	}
	1114
	1115	# Convert a PDF file to text with the pdftotext command
	1116
	1117	sub pdf_to_text {
[2755]	1118	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1119
[2248]	1120	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1121
[16435]	1122	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1123	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1124	} else {
	1125	$cmd .= " > \"$output_filestem.err\"";
	1126	}
[1445]	1127
[2060]	1128	if (system($cmd)!=0)
[1445]	1129	{
	1130	print STDERR "Error executing $cmd: $!\n";
	1131	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1132	}
	1133
[2755]	1134	# make sure there is some extracted text.
	1135	if (-e "$output_filestem.text") {
	1136	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1137	binmode(EXTR_TEXT); # just in case...
	1138	my $line="";
	1139	my $seen_text=0;
	1140	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1141	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1142	}
	1143	close EXTR_TEXT;
	1144	if ($seen_text==0) { # no text was extracted
	1145	print STDERR "Error: pdftotext found no text\n";
	1146	&util::rm("$output_filestem.text");
	1147	}
	1148	}
	1149
[1692]	1150	# make sure the converter made something
[2656]	1151	if (! -s "$output_filestem.text")
[1692]	1152	{
	1153	# print out the converters std err, if any
[2656]	1154	if (-s "$output_filestem.err") {
[1692]	1155	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1156	print STDERR "pdftotext error log:\n";
[1692]	1157	while (<ERRLOG>) {
	1158	print STDERR "$_";
	1159	}
	1160	close ERRLOG;
	1161	}
[2656]	1162	# does this converter create a .out file?
	1163	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	1164	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1165	if (-e "$output_filestem.err") {
	1166	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1167	{
	1168	open (ERRLOG,"$output_filestem.err");
	1169	while (<ERRLOG>) {print FAILLOG $_;}
	1170	close ERRLOG;
	1171	close FAILLOG;
	1172	}
	1173	&util::rm("$output_filestem.err");
	1174	}
[1692]	1175	return 0;
	1176	}
[1445]	1177	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	1178	return 1;
	1179	}
	1180
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript document
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
# On non-windows systems gs is run first. If that fails (or on windows) the
# text is stripped out of the PostScript source with a series of regular
# expressions. Either way the resulting text is then re-wrapped.
# Returns 1 on success, 0 when the input cannot be read, the output cannot be
# written, or the input does not start with "%!". Dies when the files for the
# final wrapping step cannot be opened.
# Uses the file-level variables $timeout and $faillogfile.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # Any non-zero wait status counts as a failure, whether it is an exit
        # code, a signal, or a failure to start the program (look at $! for
        # the message in that last case).
        my $retcode = system($cmd);

        if ($retcode != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in_ok = open(my $in, '<', $input_filename);
        warn "Couldn't read file: $!" unless ($in_ok);
        my $out_ok = open(my $out, '>', $text_file);
        warn "Couldn't write file: $!" unless ($out_ok);
        if (!$in_ok || !$out_ok) {
            print STDERR "errors\n";
            close $in if ($in_ok);
            if ($out_ok) {
                # don't leave an empty text file behind
                close $out;
                &util::rm($text_file);
            }
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            # don't leave an empty text file behind
            close $out;
            &util::rm($text_file);
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print {$out} "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Cut after the first run of non-space characters that passes the
            # minimum length. If nothing can be cut off, the remainder is
            # written out as it is so that the loop always finishes.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
	1350
	1351
	1352	# Convert any file to HTML with a crude perl implementation of the
	1353	# UNIX strings command.
	1354
	1355	sub any_to_html {
[22429]	1356	my ($input_filename, $output_filestem) = @_;
[1445]	1357
	1358	# First generate a text file
	1359	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1360
	1361	# create an HTML file from the text file
	1362	open(TEXT, "<$output_filestem.text");
	1363	open(HTML, ">$output_filestem.html");
	1364
[2241]	1365	print HTML "<html><head>\n";
	1366	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1367	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1368	print HTML "</head><body>\n\n";
[1734]	1369
[2755]	1370	my $line;
	1371	while ($line=<TEXT>) {
	1372	$line =~ s/</</g;
	1373	$line =~ s/>/>/g;
[16435]	1374	if ($line =~ m/^\s*$/) {
[2755]	1375	print HTML "<p>";
	1376	} else {
	1377	print HTML "<br> ", $line;
	1378	}
[1445]	1379	}
[1734]	1380	print HTML "\n</body></html>\n";
[1445]	1381
[2241]	1382	close HTML;
	1383	close TEXT;
	1384
[1445]	1385	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1386	return 1;
	1387	}
	1388
	1389	# Convert any file to TEXT with a crude perl implementation of the
	1390	# UNIX strings command.
[2755]	1391	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1392
	1393	sub any_to_text {
[22429]	1394	my ($input_filename, $output_filestem) = @_;
[1445]	1395
[3350]	1396	if (!$use_strings) {
	1397	return 0;
	1398	}
[15120]	1399
	1400	print STDERR "\n** In any to text**\n\n";
[2755]	1401	open(IN, "<$input_filename") \|\| return 0;
[1734]	1402	binmode(IN);
[2755]	1403	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1404
	1405	my ($line);
[2755]	1406	my $output_line_count = 0;
[1445]	1407	while (<IN>) {
	1408	$line = $_;
[1734]	1409
[1445]	1410	# delete anything that isn't a printable character
	1411	$line =~ s/[^\040-\176]+/\n/sg;
	1412
	1413	# delete any string less than 10 characters long
[1734]	1414	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1415	while ($line =~ m/^.{1,9}$/m) {
[1734]	1416	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1417	$line =~ s/\n+/\n/sg;
	1418	}
	1419
	1420	# remove extraneous whitespace
	1421	$line =~ s/\n+/\n/gs;
	1422	$line =~ s/^\n//gs;
[1578]	1423
[1445]	1424	# output whatever is left
[16435]	1425	if ($line =~ m/[^\n ]/) {
[1445]	1426	print OUT $line;
[2755]	1427	++$output_line_count;
[1445]	1428	}
	1429	}
[2241]	1430
	1431	close OUT;
	1432	close IN;
	1433
[2755]	1434	if ($output_line_count) { # try to protect against binary only formats
	1435	return 1;
	1436	}
	1437
	1438	&util::rm("$output_filestem.text");
	1439	return 0;
	1440
[1445]	1441	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: