Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32205

Last change on this file since 32205 was 32205, checked in by ak19, 6 years ago
1. First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes since they work. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit built by the XPDF group. 3. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 4. All the perl code changes to do with using xpdf tools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryPFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftool output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.
Property svn:executable set to `*`; Property svn:keywords set to `Author Date Id Revision`
File size: 41.6 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
[22429]	51	use strict;
	52
[1445]	53	use parsargv;
	54	use util;
[27509]	55	use FileUtils;
[1445]	56	use Cwd;
	57
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module is loaded inside an eval: if the require dies, the eval
# yields undef and the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; their values are assigned by the option parser
# called from main().
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
[3350]	69
# print_usage()
#
# Writes the command-line help text to STDERR and terminates the script
# with exit status 1. Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        # paged_html is handled by convertPDF, so list it with the other types
        "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by the option parser in main() but was undocumented
        "\t-verbose\t<level>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
	91
# File that conversion error messages are appended to (-errlog); the empty
# string means that no fail log is written.
my $faillogfile = "";
# CPU-seconds limit applied with "ulimit -t" (-timeout); 0 disables it.
my $timeout = 0;
# Verbosity level (-verbose), handed on to wvware.pl by doc_to_html.
my $verbosity = 0;

# main(@args)
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the directory of the input
# file and dispatches to the matching convertXXX function. The string that
# function returns ("html", "text", "item", "paged_html" or "fail") is printed
# on STDOUT followed by a newline.
#
# Exits with status 1 on usage errors, an unreadable input file, or an
# unsupported/undeterminable input type.
sub main
{
    # Deliberately a lexical copy named @ARGV: the option parser removes the
    # options it recognises from the array it is given a reference to.
    my (@ARGV) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    foreach my $arg (@ARGV) {
        # Options arrive with their leading dash(es); the previous pattern
        # only matched a bare "windows_scripting" and so never fired.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    my $parsed_ok = parsargv::parse(\@ARGV,
        "type/$type_re/", \$input_type,
        '/errlog/.*/', \$faillogfile,
        'output/(auto|html|text|pagedimg).*/', \$output_type,
        'timeout/\d+/0', \$timeout,
        'verbose/\d+/0', \$verbose,
        'windows_scripting', \$windows_scripting,
        'use_strings', \$use_strings,
        'pdf_complex', \$pdf_complex,
        'pdf_ignore_images', \$pdf_ignore_images,
        'pdf_allow_images_only', \$pdf_allow_images_only,
        'pdf_nohidden', \$pdf_nohidden,
        'pdf_zoom/\d+/2', \$pdf_zoom
    );
    print_usage() unless $parsed_ok;

    $verbosity = $verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = FileUtils::filenameConcatenate($dirname, "$tailname");

    if (!defined $input_type || $input_type eq "") {
        # Fall back on the extension without its leading dot. Leave the type
        # undefined when there is no extension so that the dedicated error
        # message below is actually reachable.
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

main(@ARGV);
	203
	204
	205
# Document-type conversion functions
#
# The following functions attempt to convert documents from their
# input type to the specified output type. If no output type was
# given, then they first attempt HTML, and then TEXT.
#
# Each returns the output type ("html" or "text") or "fail" if no
# conversion is possible.

# Convert a Microsoft word document.
#
# Sniffs the real content type with find_docfile_type (many .doc files are
# not in fact word documents!) and hands the file to the matching converter:
# Word 6/7/8 and docx go to convertWord678, RTF goes to convertRTF, and
# anything else goes to convertAnything. Returns whatever that converter
# returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    if ($detected =~ m/\A(?:word[678]|docx)\z/) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
	232
	# Convert a Microsoft word 6/7/8 document.
#
# When HTML is wanted (or no output type was given), tries the VB-script
# based converter if -windows_scripting is on and the wv-based converter
# otherwise. Returns "html" when that works; in every other case the result
# of convertAnything is returned.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $wants_html = (!$output_type || ($output_type =~ m/html/i));
    if ($wants_html) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
	252
	253
	# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted (when HTML is wanted or
# no output type was given): the VB-script converter if -windows_scripting is
# on, rtftohtml otherwise. Returns "html" on success and "fail" in all other
# cases.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $wants_html = (!$output_type || ($output_type =~ m/html/i));
    if ($wants_html) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
	280
	281
	# Convert an unidentified file.
#
# Tries the generic HTML extraction first and the generic text extraction
# second, each only if the requested output type allows it (an empty output
# type allows both). Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
	306
	307
[1654]	308
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page images
# (returns "item"), HTML via the old pdftohtml (returns "html"), paged HTML
# via the newer pdftohtml of Xpdftools (returns "paged_html"), and plain text
# (returns "text"). Returns "fail" if nothing succeeded.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # NOTE(review): pattern reproduced exactly as it appears here; it may have
    # lost '*' quantifiers during extraction - confirm against the repository.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if (!$output_type || ($output_type =~ m/^html/i)) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This
    # will be the new default for PDFs when output_type for PDF docs is not specified
    # (once our use of xpdftools' pdftohtml has been implemented on win and mac).
    if ($output_type =~ m/paged_html/i) {
        return "paged_html"
            if xpdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
	356
	357
	# Convert an Adobe PostScript document.
#
# Tries page images when an image type is requested (returns "item") and
# plain text when text is requested or no output type was given (returns
# "text"). Returns "fail" otherwise.
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # NOTE(review): pattern reproduced exactly as it appears here; it may have
    # lost '*' quantifiers during extraction - confirm against the repository.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
	382
	383
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract VB script is run to produce per-slide images (returns
# "item"; also returns "item" straight away if the output directory already
# exists). Otherwise, when HTML is wanted or no output type was given,
# ppttohtml.pl is run (returns "html"). If the chosen converter fails, or
# neither branch applies, generic text extraction is tried (returns "text").
# Returns "fail" if that does not work either.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Pick the image-format switch for pptextract (first match wins)
        my $ppt_convert_type = "";
        foreach my $choice ([qr/gif/i, "-g"], [qr/jp?g/i, "-j"], [qr/png/i, "-p"]) {
            my ($pattern, $flag) = @$choice;
            if ($output_type =~ $pattern) {
                $ppt_convert_type = $flag;
                last;
            }
        }

        my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                      $ENV{'GSDLOS'}, "pptextract");
        # now we use the .vbs VBScript
        # (back when the pptextract.exe VB executable was used, this was just "pptextract")
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "CScript //Nologo \"" . $vbScript . ".vbs\"";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	449
	450
# Convert a Microsoft Excel document.
#
# When HTML is wanted (or no output type was given) xlstohtml.pl is run and
# "html" is returned if it exits successfully. Otherwise, or if that fails,
# generic text extraction is tried (returns "text"). Returns "fail" if
# nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	484
	485
	486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Parameters:
#   $input_filename - path of the file to inspect
#
# Returns one of:
#   "docx"    - -windows_scripting is on and the name ends in .docx
#               (any letter case, matching the check in native_doc_to_html)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    # An unreadable file simply cannot be identified
    open(my $chk_fh, '<', $input_filename) or return "unknown";
    binmode($chk_fh);

    my $doctype = "unknown";
    my $first = 1;

    while (my $line = <$chk_fh>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path
    close($chk_fh);

    return $doctype;
}
	527
	528
[1734]	529	# Specific type-to-type conversions
[1445]	530	#
	531	# Each of the following functions attempts to convert a document from
[2755]	532	# a specific format to another. If they succeed they return 1 and leave
[1445]	533	# the output document(s) in the appropriate place; if they fail they
	534	# return 0 and delete any working files.
	535
	536
	# Attempt to convert a word document to html with the wv program.
#
# Launches wvware.pl through the perl executable reported by
# util::get_perl_exec(), passing the input file, the output filestem, the
# fail-log file name, the verbosity and the timeout. Returns the value of
# system() divided by 256.
# NOTE(review): the caller treats a true return value as success, so
# wvware.pl presumably exits non-zero on success - confirm against wvware.pl.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $perl_exec = util::get_perl_exec();
    my $launch_cmd = join(" ",
        "\"$perl_exec\"",
        "-S", "wvware.pl",
        "\"$input_filename\"",
        "\"$output_filestem\"",
        "\"$faillogfile\"",
        $verbosity,
        $timeout);

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    return system($launch_cmd) / 256;
}
	551
# Attempt to convert a word document to html with the word2html scripting program
#
# Parameters:
#   $input_filename  - document to convert
#   $output_filestem - output path without extension; the converter writes
#                      "$output_filestem.html" and its STDERR is captured in
#                      "$output_filestem.err" where redirection is available
#
# Returns 1 if "$output_filestem.html" already existed or was produced and
# its first line mentions "html"; returns 0 otherwise (the caller can then
# try any_to_text). Error output is echoed to STDERR and appended to the
# fail log when one was given with -errlog.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file  = "$output_filestem.html";
    my $err_file   = "$output_filestem.err";
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i);

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($on_windows) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . FileUtils::filenameConcatenate($vbScript, "word2html") . "\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if (!$on_windows || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";

        if (-s $err_file && open(my $err_fh, '<', $err_file)) {
            # The fail log is optional; $faillog_fh stays undef without one
            my $faillog_fh;
            if ($faillogfile ne "") {
                open($faillog_fh, '>>', $faillogfile) or undef $faillog_fh;
            }

            while (my $line = <$err_fh>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog_fh} "$line" if $faillog_fh;
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog_fh} " (given an invalid .DOC file?)\n" if $faillog_fh;
            }

            close($faillog_fh) if $faillog_fh;
            close($err_fh);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $html_fh, '<', $html_file)) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            FileUtils::removeFiles($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    FileUtils::removeFiles($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            if (open(my $errlog_fh, '<', $err_file)) {
                while (my $line = <$errlog_fh>) {
                    print {$faillog_fh} $line;
                }
                close($errlog_fh);
            }
            close($faillog_fh);
        }
        FileUtils::removeFiles($err_file);
    }
    return 0;
}
	652
# Attempt to convert an RTF document to html with rtftohtml
#
# Parameters:
#   $input_filename  - RTF file to convert
#   $output_filestem - output path without extension; rtftohtml is asked to
#                      write "$output_filestem.html"
#
# The conversion counts as successful when the generated HTML contains at
# least one word character after the line holding "<body>". In that case a
# table of contents file "${output_filestem}_ToC.html", if present, is merged
# into the HTML with its links made relative, and 1 is returned.
# Otherwise the converter's error output is appended to the fail log (if one
# was given), the working files are removed and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$html_file\" \"$input_filename\"";
    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    # An HTML file we cannot open is treated like one without content.
    my $was_successful = 0;
    if (-s $html_file && open(my $html_fh, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_fh>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_fh);
    }

    if ($was_successful) {
        FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            FileUtils::moveFiles($html_file, $src_file);

            # Record each open separately. The old
            #   open FH, "file" || ++$open_failed
            # form bound || to the filename string, so failures went unnoticed.
            my $src_ok = open(my $src_fh, '<', $src_file);
            my $toc_ok = open(my $toc_fh, '<', $toc_file);
            my $out_ok = open(my $out_fh, '>', $html_file);

            if (!($src_ok && $toc_ok && $out_ok)) {
                # Give up on the ToC and put the converted HTML back in place
                close($src_fh) if $src_ok;
                close($toc_fh) if $toc_ok;
                close($out_fh) if $out_ok;
                FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but not
            # including) the first line without a word character
            while (defined(my $header_line = <$src_fh>)) {
                last unless ($header_line =~ m/\w/);
                print {$out_fh} $header_line;
            }

            # print out table of contents, making links relative
            <$toc_fh>; <$toc_fh>; # ignore first 2 lines
            my $list_open = <$toc_fh>; # line 3 = "<ol>\n"
            print {$out_fh} $list_open if defined $list_open;
            while (my $toc_line = <$toc_fh>) {
                $toc_line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$out_fh} $toc_line;
            }
            close($toc_fh);

            # rest of html src
            while (my $body_line = <$src_fh>) {
                print {$out_fh} $body_line;
            }
            close($src_fh);
            close($out_fh);

            FileUtils::removeFiles($toc_file);
            FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print {$faillog_fh} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog_fh} " (rtf file might be too recent):\n";
            if (open(my $errlog_fh, '<', $err_file)) {
                while (my $line = <$errlog_fh>) {
                    print {$faillog_fh} $line;
                }
                close($errlog_fh);
            }
            close($faillog_fh);
        }
        FileUtils::removeFiles($err_file);
    }

    FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
	769
	770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the zoom factor and the -c / -i / -a / -hidden
# switches derived from the -pdf_* options. The conversion counts as
# successful when the command exits with status 0 and a non-empty
# "$output_filestem.html" exists: the .err/.out working files are then
# removed and 1 is returned. On failure the converter's error output is
# echoed to STDERR and appended to the fail log (if one was given), the
# working files and any HTML output are removed, and 0 is returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        # no separate STDERR redirection here: STDOUT goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # the converter made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        FileUtils::removeFiles($err_file) if (-e $err_file);
        FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # otherwise clean up after the failed conversion
    FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog_fh, '<', $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog_fh>) {
            print STDERR "$line";
        }
        close($errlog_fh);
    }

    #print STDERR "***********output filestem $output_filestem.html\n";
    FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            if (open(my $errlog_fh, '<', $err_file)) {
                while (my $line = <$errlog_fh>) {
                    print {$faillog_fh} $line;
                }
                close($errlog_fh);
            }
            close($faillog_fh);
        }
        FileUtils::removeFiles($err_file);
    }

    return 0;
}
	834
[32205]	835
	836	# Convert a pdf file to html with the newer Xpdftools' pdftohtml
	837	# This generates "paged HTML" where extracted, selectable text is positioned
	838	# over screenshots of each page.
	839	# Since xpdf's pdftohtml fails if the output dir already exists and for easier
	840	# naming, the output files are created in a "pages" subdirectory of the tmp
	841	# location parent of $output_filestem instead
	842	sub xpdf_to_html {
	843	my ($dirname, $input_filename, $output_filestem) = @_;
	844
	845	my $cmd = "";
	846
	847	# build up the path to the doc-to-html conversion tool we're going to use
	848	my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");
	849
	850	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
	851	# TODO
	852	} elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
	853	# TODO
	854	} else { # unix, use the appropriate bin folder for the bitness of the system
	855
	856	# Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
	857	# $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
	858	# specific subdirectories exist in a greenstone installation.
	859	# None of those locations need exist when xpdf-tools is installed with GS.
	860	# So don't depend on GSDLARCH as forcing that to be exported has side-effects
	861	if($ENV{'BITNESS'}) {
	862	$xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
	863	} else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
	864	$xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
	865	}
	866	}
	867
	868	# We'll create the file by name $output_filestem during post-conversion processing.
	869	# Note that Xpdf tools will only create its conversion products in a dir that does
	870	# not yet exist. So we'll create this location as a subdir of the output_filestem's
	871	# parent directory. The parent dir is the already generated tmp area for conversion. So:
	872	# - tmpdir gs2build/tmp/<random-num> already exists at this stage
	873	# - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
	874	# - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
	875	my ($tailname, $tmp_dirname, $suffix)
	876	= &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
	877	$tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");
	878
	879	$xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");
	880	# xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
	881	$cmd .= "\"$xpdf_pdftohtml\"";
	882	$cmd .= " -z $pdf_zoom" if ($pdf_zoom);
	883	# $cmd .= " -c" if ($pdf_complex);
	884	# $cmd .= " -i" if ($pdf_ignore_images);
	885	# $cmd .= " -a" if ($pdf_allow_images_only);
	886	# $cmd .= " -hidden" unless ($pdf_nohidden);
	887	$cmd .= " \"$input_filename\" \"$tmp_dirname\"";
	888	#$cmd .= " \"$input_filename\" \"$output_filestem\"";
	889
	890	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
	891	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	892	} else {
	893	$cmd .= " > \"$output_filestem.err\"";
	894	}
	895
	896	#print STDERR "@@@@ Running command: $cmd\n";
	897
	898	$!=0;
	899	my $retval=system($cmd);
	900	if ($retval!=0)
	901	{
	902	print STDERR "Error executing xpdf's pdftohtml tool";
	903	if ($!) {print STDERR ": $!";}
	904	print STDERR "\n";
	905	}
	906
	907	# make sure the converter made something
	908	if ($retval!=0 \|\| ! -s &FileUtils::filenameConcatenate($tmp_dirname,"index.html"))
	909	{
	910	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	911	# print out the converter's std err, if any
	912	if (-s "$output_filestem.err") {
	913	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
	914	print STDERR "pdftohtml error log:\n";
	915	while (<ERRLOG>) {
	916	print STDERR "$_";
	917	}
	918	close ERRLOG;
	919	}
	920	#print STDERR "***********output filestem $output_filestem.html\n";
	921	&FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");
	922	if (-e "$output_filestem.err") {
	923	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	924	{
	925	open (ERRLOG, "$output_filestem.err");
	926	while (<ERRLOG>) {print FAILLOG $_;}
	927	close ERRLOG;
	928	close FAILLOG;
	929	}
	930	&FileUtils::removeFiles("$output_filestem.err");
	931	}
	932	return 0;
	933	}
	934
	935	&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
	936	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	937	return 1;
	938	}
	939
	940
	941
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
#
# Arguments: ($dirname, $input_filename, $output_filestem, $output_type).
# $dirname is not used here. Only the part of $output_type after its last
# underscore is passed to pdfpstoimg.pl as the -convert_to format
# (presumably values look like "<prefix>_<format>" -- confirm against the
# callers).
# Returns 1 if pdfpstoimg.pl exited with status 0, 0 otherwise (including
# when ImageMagick cannot be found).
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except
    # for Windows 95/98).
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # Only the exit status matters; the output itself is discarded.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found":
        # Linux returns -1 and Windows 256. Once converted to signed values
        # that is -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and
        # perform unsigned-to-signed conversion on $? to get the expected
        # range. gs-magick.pl already does this before exiting, but by the
        # time we get here it has to be done again.
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore. The greedy ".*" is what
    # makes this strip the whole prefix rather than a single character.
    $output_type =~ s/.*_(.*)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # pdfpstoimg.pl is found via "perl -S", i.e. through the PATH, which is
    # assumed to be set up correctly (this avoids quoting problems when
    # GSDLHOME contains spaces).
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # Success is judged on the exit status alone; the output files are not
    # inspected here.
    if ($retval == 0) {
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # Failure: clean up and pass on the converter's std err, if any.
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    if (-s "$output_filestem.err") {
        open(my $errlog, '<', "$output_filestem.err") or die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $err_line = <$errlog>) {
            print STDERR "$err_line";
        }
        close $errlog;
    }
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            # Copy the error log into the fail log; skip quietly if the
            # error log cannot be reopened.
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print $faillog $err_line;}
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
	1025
	1026	# Convert a PDF file to text with the pdftotext command
	1027
	1028	sub pdf_to_text {
[2755]	1029	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	1030
[2248]	1031	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	1032
[16435]	1033	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	1034	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	1035	} else {
	1036	$cmd .= " > \"$output_filestem.err\"";
	1037	}
[1445]	1038
[2060]	1039	if (system($cmd)!=0)
[1445]	1040	{
	1041	print STDERR "Error executing $cmd: $!\n";
[27509]	1042	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[1445]	1043	}
	1044
[2755]	1045	# make sure there is some extracted text.
	1046	if (-e "$output_filestem.text") {
	1047	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	1048	binmode(EXTR_TEXT); # just in case...
	1049	my $line="";
	1050	my $seen_text=0;
	1051	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	1052	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	1053	}
	1054	close EXTR_TEXT;
	1055	if ($seen_text==0) { # no text was extracted
	1056	print STDERR "Error: pdftotext found no text\n";
[27509]	1057	&FileUtils::removeFiles("$output_filestem.text");
[2755]	1058	}
	1059	}
	1060
[1692]	1061	# make sure the converter made something
[2656]	1062	if (! -s "$output_filestem.text")
[1692]	1063	{
	1064	# print out the converters std err, if any
[2656]	1065	if (-s "$output_filestem.err") {
[1692]	1066	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	1067	print STDERR "pdftotext error log:\n";
[1692]	1068	while (<ERRLOG>) {
	1069	print STDERR "$_";
	1070	}
	1071	close ERRLOG;
	1072	}
[2656]	1073	# does this converter create a .out file?
[27509]	1074	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	1075	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	1076	if (-e "$output_filestem.err") {
	1077	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	1078	{
	1079	open (ERRLOG,"$output_filestem.err");
	1080	while (<ERRLOG>) {print FAILLOG $_;}
	1081	close ERRLOG;
	1082	close FAILLOG;
	1083	}
[27509]	1084	&FileUtils::removeFiles("$output_filestem.err");
[2755]	1085	}
[1692]	1086	return 0;
	1087	}
[27509]	1088	&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	1089	return 1;
	1090	}
	1091
# Convert a PostScript document to text.
#
# Just using "ps2ascii" isn't good enough, as it returns 0 for a postscript
# interpreter error. ps2ascii is only a wrapper around "gs" anyway, so gs is
# invoked directly. If gs cannot be used (Windows) or fails, the text is
# extracted with a crude series of regular expressions instead. Finally the
# text is re-wrapped so that each line breaks at the first whitespace after
# 72 characters.
#
# Arguments: ($input_filename, $output_filestem). The text is written to
# "$output_filestem.text".
# Returns 1 on success; 0 if the fallback extraction could not open its
# files or the input does not start with "%!". Dies if the files used for
# re-wrapping cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # If we're on windows we fall straight through without attempting to
    # use gs.
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        # Quoted so that a filestem containing spaces still works.
        $cmd .= " 2> \"$err_file\"";

        $! = 0;
        system($cmd);
        my $retcode = $? >> 8; # see man perlfunc - system for this...
        # If the program could not be started, look at $! for the message.

        if ($retcode != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        } elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        } elsif (open(my $ps_out, '<', $text_file)) {
            # Make sure the interpreter didn't get an error. It is
            # technically possible for the actual text to start with
            # this, but....
            my $first_line = <$ps_out>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $ps_out;
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {print $faillog $err_line;}
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        print STDERR "Stripping text from postscript\n";
        my $errorcode = 0;
        my ($ps_in, $text_out);
        unless (open($ps_in, '<', $input_filename)) {
            $errorcode = 1;
            warn "Couldn't read file: $!";
        }
        unless (open($text_out, '>', $text_file)) {
            $errorcode = 1;
            warn "Couldn't write file: $!";
        }
        if ($errorcode) {print STDERR "errors\n"; return 0;}

        # Read the whole .ps file (see man perlport, under "System Resources").
        my $text = join('', <$ps_in>);
        close $ps_in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            close $text_out;
            return 0;
        }

        # If ps has Page data, then use it to delete all stuff before it
        # (up to and including the first %%Page: line).
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters: a literal "(|)" becomes "( - )" (em-dash?)
        $text =~ s/\(\|\)/\(\ - \)/g;

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # Attempt to add whitespace between words...
        # This is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n* /g;   # asterisk?
        $text =~ s/\\023/\023/g;   # e acute ('e)
        $text =~ s/\\177/\252/g;   # u"

        print $text_out "$text";
        close $text_out;
    }

    # Wrap the text - use a minimum length, ie break at the first whitespace
    # after this length.
    my $wrap_length = 72;
    &FileUtils::moveFiles($text_file, "$text_file.tmp");
    # "or" (not "||") so that a failed open really does die.
    open(my $unwrapped, '<', "$text_file.tmp")
        or die "Couldn't open file: $!";
    open(my $wrapped, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$unwrapped>) {
        while (length($line) > 0) {
            # Peel off at least $wrap_length characters plus the rest of the
            # current word, and swallow the whitespace that follows it.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}\S*)\s*//) {
                print $wrapped "$1\n";
            } else {
                # Short remainder (or nothing left to split): emit as is.
                # This branch also guarantees that the loop terminates.
                print $wrapped "$line";
                $line = "";
            }
        }
    }
    close $unwrapped;
    close $wrapped;
    &FileUtils::removeFiles("$text_file.tmp");

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
	1262
	1263
	1264	# Convert any file to HTML with a crude perl implementation of the
	1265	# UNIX strings command.
	1266
	1267	sub any_to_html {
[22429]	1268	my ($input_filename, $output_filestem) = @_;
[1445]	1269
	1270	# First generate a text file
	1271	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1272
	1273	# create an HTML file from the text file
	1274	open(TEXT, "<$output_filestem.text");
	1275	open(HTML, ">$output_filestem.html");
	1276
[2241]	1277	print HTML "<html><head>\n";
	1278	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1279	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1280	print HTML "</head><body>\n\n";
[1734]	1281
[2755]	1282	my $line;
	1283	while ($line=<TEXT>) {
	1284	$line =~ s/</</g;
	1285	$line =~ s/>/>/g;
[16435]	1286	if ($line =~ m/^\s*$/) {
[2755]	1287	print HTML "<p>";
	1288	} else {
	1289	print HTML "<br> ", $line;
	1290	}
[1445]	1291	}
[1734]	1292	print HTML "\n</body></html>\n";
[1445]	1293
[2241]	1294	close HTML;
	1295	close TEXT;
	1296
[27509]	1297	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[1445]	1298	return 1;
	1299	}
	1300
	1301	# Convert any file to TEXT with a crude perl implementation of the
	1302	# UNIX strings command.
[2755]	1303	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1304
	1305	sub any_to_text {
[22429]	1306	my ($input_filename, $output_filestem) = @_;
[1445]	1307
[3350]	1308	if (!$use_strings) {
	1309	return 0;
	1310	}
[15120]	1311
	1312	print STDERR "\n** In any to text**\n\n";
[2755]	1313	open(IN, "<$input_filename") \|\| return 0;
[1734]	1314	binmode(IN);
[2755]	1315	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1316
	1317	my ($line);
[2755]	1318	my $output_line_count = 0;
[1445]	1319	while (<IN>) {
	1320	$line = $_;
[1734]	1321
[1445]	1322	# delete anything that isn't a printable character
	1323	$line =~ s/[^\040-\176]+/\n/sg;
	1324
	1325	# delete any string less than 10 characters long
[1734]	1326	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1327	while ($line =~ m/^.{1,9}$/m) {
[1734]	1328	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1329	$line =~ s/\n+/\n/sg;
	1330	}
	1331
	1332	# remove extraneous whitespace
	1333	$line =~ s/\n+/\n/gs;
	1334	$line =~ s/^\n//gs;
[1578]	1335
[1445]	1336	# output whatever is left
[16435]	1337	if ($line =~ m/[^\n ]/) {
[1445]	1338	print OUT $line;
[2755]	1339	++$output_line_count;
[1445]	1340	}
	1341	}
[2241]	1342
	1343	close OUT;
	1344	close IN;
	1345
[2755]	1346	if ($output_line_count) { # try to protect against binary only formats
	1347	return 1;
	1348	}
	1349
[27509]	1350	&FileUtils::removeFiles("$output_filestem.text");
[2755]	1351	return 0;
	1352
[1445]	1353	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: