Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32287

Last change on this file since 32287 was 32287, checked in by ak19, 6 years ago
Cleaning up unused strings, some debug statements and recently commented out code.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 43.9 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
[22429]	51	use strict;
	52
[1445]	53	use parsargv;
	54	use util;
[27509]	55	use FileUtils;
[1445]	56	use Cwd;
	57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to most of these to
# parsargv::parse().
my $use_strings;
my $pdf_tool;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_dpi;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
my $enc;
[3350]	72
sub print_usage
{
    # Print the command-line synopsis to STDERR and terminate the script
    # with exit status 1. Never returns.
    #
    # Every option accepted by the parsargv::parse() call in main() is
    # listed here, including -verbose.
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_tool\tpdftohtml|xpdftools|pdfbox (not all output types are supported by every pdf_tool)\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n",
        "\t-pdf_dpi\tSet the resolution in DPI of background images produced by xpdf's pdftohtml\n";
    exit(1);
}
	96
# Shared conversion settings, overwritten in main() from the command line:
#   $faillogfile - file that converter error output is appended to ("" = none)
#   $timeout     - cpu-seconds limit applied via "ulimit -t" (0 = no limit)
#   $verbosity   - verbosity level passed on to wvware.pl
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
[1445]	100
	101	sub main
	102	{
	103	my (@ARGV) = @_;
[3538]	104	my ($input_type,$output_type,$verbose);
[1960]	105
[23473]	106	# Dynamically figure out what the --type option can support, based on whether -windows_scripting
	107	# is in use or not
	108	my $default_type_re = "(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)";
	109	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	110	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	111	# Currently only have VBA for Word and PPT(but no XLS)
	112	my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xls)";
	113
	114	my $type_re = $default_type_re;
	115
	116	foreach my $a (@ARGV) {
	117	if ($a =~ m/^windows_scripting$/i) {
	118	$type_re = $enhanced_type_re;
	119	}
	120	}
	121
[1445]	122	# read command-line arguments
	123	if (!parsargv::parse(\@ARGV,
[23473]	124	"type/$type_re/", \$input_type,
[2755]	125	'/errlog/.*/', \$faillogfile,
[32273]	126	'output/(auto\|html\|text\|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html
[1692]	127	'timeout/\d+/0',\$timeout,
[10282]	128	'verbose/\d+/0', \$verbose,
[22429]	129	'windows_scripting',\$windows_scripting,
[3720]	130	'use_strings', \$use_strings,
[32273]	131	'pdf_tool/(pdftohtml\|pdfbox\|xpdftools)/', \$pdf_tool, # the old pdftohtml tool, pdfbox extensions or the newer xpdf-tools
	132	'pdf_complex', \$pdf_complex, # options for pdf_tool = pdftohtml (the old pdftohtml tool)
[9482]	133	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	134	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	135	'pdf_nohidden', \$pdf_nohidden,
[32284]	136	'pdf_zoom/\d+/2', \$pdf_zoom,
	137	'pdf_dpi/\d+/96', \$pdf_dpi
[3720]	138	))
[1445]	139	{
	140	print_usage();
	141	}
[24375]	142
	143	$verbosity=$verbose if defined $verbose;
	144
[1445]	145	# Make sure the input file exists and can be opened for reading
	146	if (scalar(@ARGV!=1)) {
	147	print_usage();
	148	}
[1928]	149
[1445]	150	my $input_filename = $ARGV[0];
	151	if (!-r $input_filename) {
	152	print STDERR "Error: unable to open $input_filename for reading\n";
	153	exit(1);
	154	}
	155
	156	# Deduce filenames
	157	my ($tailname,$dirname,$suffix)
[2241]	158	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
[27509]	159	my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");
[1445]	160
	161	if ($input_type eq "")
	162	{
[2241]	163	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	164	}
	165
	166	# Change to temporary working directory
	167	my $stored_dir = cwd();
	168	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	169
[1445]	170	# Select convert utility
	171	if (!defined $input_type) {
	172	print STDERR "Error: No filename extension or input type defined\n";
	173	exit(1);
	174	}
[23473]	175	elsif ($input_type =~ m/^docx?$/ \|\| $input_type eq "dot") {
[1445]	176	print &convertDOC($input_filename, $output_filestem, $output_type);
	177	print "\n";
	178	}
[1684]	179	elsif ($input_type eq "rtf") {
	180	print &convertRTF($input_filename, $output_filestem, $output_type);
	181	print "\n";
	182	}
[1445]	183	elsif ($input_type eq "pdf") {
	184	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	185	print "\n";
	186	}
	187	elsif ($input_type eq "ps") {
[22429]	188	print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
[1445]	189	print "\n";
	190	}
[23473]	191	elsif ($input_type =~ m/pptx?$/) {
[2977]	192	print &convertPPT($input_filename, $output_filestem, $output_type);
	193	print "\n";
	194	}
[23473]	195	elsif ($input_type =~ m/xlsx?$/) {
[2991]	196	print &convertXLS($input_filename, $output_filestem, $output_type);
	197	print "\n";
	198	}
[1445]	199	else {
	200	print STDERR "Error: Unable to convert type '$input_type'\n";
	201	exit(1);
	202	}
	203
	204	# restore to original working directory
	205	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	206
	207	}
	208
	209	&main(@ARGV);
	210
	211
	212
[2241]	213	# Document-type conversion functions
[1445]	214	#
	215	# The following functions attempt to convert documents from their
	216	# input type to the specified output type. If no output type was
	217	# given, then they first attempt HTML, and then TEXT.
	218	#
	219	# Each returns the output type ("html", "text", "item", or the requested
	220	# pretty_html variant) or "fail" if no conversion is possible.
	221
	222	# Convert a Microsoft word document
	223
	224	sub convertDOC {
[22429]	225	my ($input_filename, $output_filestem, $output_type) = @_;
[1445]	226
[1654]	227	# Many .doc files are not in fact word documents!
	228	my $realtype = &find_docfile_type($input_filename);
	229
[23473]	230	if ($realtype eq "word6" \|\| $realtype eq "word7"
	231	\|\| $realtype eq "word8" \|\| $realtype eq "docx") {
[1654]	232	return &convertWord678($input_filename, $output_filestem, $output_type);
	233	} elsif ($realtype eq "rtf") {
	234	return &convertRTF($input_filename, $output_filestem, $output_type);
	235	} else {
	236	return &convertAnything($input_filename, $output_filestem, $output_type);
	237	}
	238	}
	239
	240	# Convert a Microsoft word 6/7/8 document
	241
	242	sub convertWord678 {
[22429]	243	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	244
[1445]	245	my $success = 0;
[16435]	246	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	247	if ($windows_scripting) {
	248	$success = &native_doc_to_html($input_filename, $output_filestem);
	249	}
	250	else {
	251	$success = &doc_to_html($input_filename, $output_filestem);
	252	}
[1445]	253	if ($success) {
[10282]	254	return "html";
[1445]	255	}
	256	}
[1654]	257	return &convertAnything($input_filename, $output_filestem, $output_type);
	258	}
	259
	260
	261	# Convert a Rich Text Format (RTF) file
	262
	263	sub convertRTF {
[22429]	264	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	265
	266	my $success = 0;
	267
	268	# Attempt specialised conversion to HTML
[16435]	269	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	270
	271	if ($windows_scripting) {
	272	$success = &native_doc_to_html($input_filename, $output_filestem);
	273	}
	274	else {
	275	$success = &rtf_to_html($input_filename, $output_filestem);
	276	}
[1654]	277	if ($success) {
	278	return "html";
	279	}
	280	}
	281
[2755]	282	# rtf is so ugly that's it's not worth running strings over.
	283	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	284	# return &convertAnything($input_filename, $output_filestem, $output_type);
	285	return "fail";
[1654]	286	}
	287
	288
	289	# Convert an unidentified file
	290
	291	sub convertAnything {
[22429]	292	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	293
	294	my $success = 0;
[10464]	295
[1445]	296	# Attempt simple conversion to HTML
[16435]	297	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	298	$success = &any_to_html($input_filename, $output_filestem);
	299	if ($success) {
	300	return "html";
	301	}
	302	}
	303
	304	# Convert to text
[16435]	305	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	306	$success = &any_to_text($input_filename, $output_filestem);
[1445]	307	if ($success) {
	308	return "text";
	309	}
	310	}
	311	return "fail";
	312	}
	313
	314
[1654]	315
[1445]	316	# Convert an Adobe PDF document
	317
	318	sub convertPDF {
[2755]	319	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[1445]	320
	321	my $success = 0;
[10357]	322	$output_type =~ s/.\-(.)/$1/i;
[32277]	323
[32287]	324	#print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n";
[32277]	325
[32273]	326	# First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools
	327	# and then decide which conversion command to run based on the output type
	328	# (pdfbox does not currently go through gsConvert.pl
	329	# as PDFBoxConverter inherits from AutoLoadConverters)
	330
	331	if ($pdf_tool eq "pdftohtml" ) { # old pdftohtml tool
[10357]	332	# Attempt coversion to Image
[16435]	333	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	334	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10357]	335	if ($success){
	336	return "item";
	337	}
	338	}
[1445]	339
	340	# Attempt conversion to HTML
[32205]	341	# Uses the old pdftohtml that doesn't work for newer PDF versions
[32223]	342	if ($output_type =~ m/^html/i) {
	343	#if (!$output_type \|\| ($output_type =~ m/^html/i)) {
[1445]	344	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
	345	if ($success) {
	346	return "html";
	347	}
	348	}
	349
[32273]	350	# Attempt conversion to TEXT (not for Windows, but PDFPlugin/PDFv1Plugin takes care of that
	351	if (!$output_type \|\| ($output_type =~ m/text/i)) {
	352	$success = &pdf_to_text($dirname, $input_filename, $output_filestem);
[32205]	353
[1445]	354	if ($success) {
	355	return "text";
	356	}
	357	}
[32273]	358	}
	359
[32277]	360	elsif ($pdf_tool eq "xpdftools" ) {
	361
	362	# default to pretty html output
[32273]	363	if (!$output_type) {
[32277]	364	$output_type = "pretty_html";
[32273]	365	}
	366
	367	# Attempt coversion to Image
	368	#if ($output_type =~ m/jp?g\|gif\|png/i) {
	369	# $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
	370	# if ($success){
	371	# return "item";
	372	# }
	373	#}
	374
[32277]	375	# Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools.
	376	if ($output_type =~ m/pretty_html$/i) {
[32273]	377	$success = &xpdf_to_html($dirname, $input_filename, $output_filestem);
	378	if ($success) {
	379	return $output_type;
	380	}
	381	}
	382
	383	# Attempt conversion to TEXT
[32277]	384	# Proper paged_text processing not yet implemented with xpdf
	385	if ($output_type =~ m/text/i) {
	386	$success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type);
[32273]	387
	388	if ($success) {
	389	return "text";
	390	}
	391	}
[32277]	392	}
	393
[1445]	394	return "fail";
	395
	396	}
	397
	398
	399	# Convert an Adobe PostScript document
	400
	401	sub convertPS {
[22429]	402	my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
[1445]	403
	404	my $success = 0;
[10534]	405	$output_type =~ s/.\-(.)/$1/i;
	406	# Attempt coversion to Image
[16435]	407	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	408	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10534]	409	if ($success){
	410	return "item";
	411	}
	412	}
[1445]	413
	414	# Attempt conversion to TEXT
[16435]	415	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	416	$success = &ps_to_text($input_filename, $output_filestem);
	417	if ($success) {
	418	return "text";
	419	}
	420	}
	421	return "fail";
	422	}
	423
	424
sub convertPPT {
    # Convert a PowerPoint file.
    #
    # With -windows_scripting and an output type that mentions neither
    # html nor text, the pptextract VB script is run and "item" is
    # returned on success (or straight away when the output directory
    # already exists). Otherwise, when HTML is acceptable, ppttohtml.pl is
    # run and "html" returned on success. If neither path succeeds,
    # any_to_text() is tried; returns "text" or finally "fail".
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pick the image-format switch for pptextract
        my $ppt_convert_type = "";
        if    ($output_type =~ m/gif/i)  { $ppt_convert_type = "-g"; }
        elsif ($output_type =~ m/jp?g/i) { $ppt_convert_type = "-j"; }
        elsif ($output_type =~ m/png/i)  { $ppt_convert_type = "-p"; }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        # now we use the .vbs VBScript
        # (back when the pptextract.exe VB executable was used, this was just "pptextract")
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
	490
	491
sub convertXLS {
    # Convert an Excel file: run xlstohtml.pl when HTML output is
    # acceptable, otherwise (or if that fails) fall back to any_to_text().
    # Returns "html", "text" or "fail".
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
	525
	526
	527
[1654]	528	# Find the real type of a .doc file
	529	#
[2012]	530	# We seem to have a lot of files with a .doc extension that are .rtf
[1654]	531	# files or Word 5 files. This function attempts to tell the difference.
	532	sub find_docfile_type {
[22429]	533	my ($input_filename) = @_;
[23473]	534
	535	if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
	536	return "docx";
	537	}
	538
[1654]	539	open(CHK, "<$input_filename");
[1734]	540	binmode(CHK);
[1654]	541	my $line = "";
	542	my $first = 1;
	543
	544	while (<CHK>) {
	545
	546	$line = $_;
[1960]	547
[1654]	548	if ($first) {
	549	# check to see if this is an rtf file
[16435]	550	if ($line =~ m/^\{\\rtf/) {
[1654]	551	close(CHK);
	552	return "rtf";
	553	}
[2755]	554	$first = 0;
[1654]	555	}
	556
[1734]	557	# is this is a word 6/7/8 document?
[16435]	558	if ($line =~ m/Word\.Document\.([678])/) {
[1654]	559	close(CHK);
[23473]	560
[1734]	561	return "word$1";
[1654]	562	}
	563
	564	}
	565
	566	return "unknown";
	567	}
	568
	569
[1734]	570	# Specific type-to-type conversions
[1445]	571	#
	572	# Each of the following functions attempts to convert a document from
[2755]	573	# a specific format to another. If they succeed they return 1 and leave
[1445]	574	# the output document(s) in the appropriate place; if they fail they
	575	# return 0 and delete any working files.
	576
	577
	578	# Attempt to convert a word document to html with the wv program
	579	sub doc_to_html {
[22429]	580	my ($input_filename, $output_filestem) = @_;
[1445]	581
[24371]	582	my $wvware_status = 0;
[24375]	583
[24371]	584	# need to ensure that the path to perl is quoted (in case there's spaces in it)
[24513]	585	my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";
[15120]	586
[30683]	587	# print STDERR "***** wvware launch cmd = $launch_cmd\n";
[15120]	588
[24371]	589	$wvware_status = system($launch_cmd)/256;
	590	return $wvware_status;
[1445]	591	}
	592
[10282]	593	# Attempt to convert a word document to html with the word2html scripting program
	594	sub native_doc_to_html {
[22429]	595	my ($input_filename, $output_filestem) = @_;
[1445]	596
[24166]	597	# build up the path to the doc-to-html conversion tool we're going to use
[27509]	598	my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});
[10282]	599
[24164]	600	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
[24166]	601	# if windows scripting with docx input, use new VBscript to get the local Word install (if
	602	# any) to do the conversion, since docX can't be processed by word2html's windows_scripting
	603
	604	if($input_filename =~ m/docx$/i) { # need to use full path to docx2html script,
	605	# else script launch fails when there are error msgs
[27509]	606	$vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
[28355]	607	$vbScript = "CScript //Nologo \"$vbScript\""; # launch with CScript for error output in STDERR
[24169]	608	# //Nologo flag avoids Microsoft's opening/logo msgs
	609	print STDERR "About to use windows scripting to process docx file $input_filename.\n";
	610	print STDERR " This may take some time. Please wait...\n";
[24166]	611	}
	612	else { # old doc versions. use the usual VB executable word2html for the
	613	# conversion. Doesn't need full path, since bin\windows is on PATH
[27509]	614	$vbScript = "word2html"; #$vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
[24166]	615	}
	616	}
	617	else { # not windows
[27509]	618	$vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
[24164]	619	}
	620
[10445]	621	if (-e "$output_filestem.html") {
[22429]	622	print STDERR " The conversion file:\n";
	623	print STDERR " $output_filestem.html\n";
	624	print STDERR " ... already exists. Skipping\n";
[10445]	625	return 1;
	626	}
[10282]	627
	628	my $cmd = "";
	629	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	630	#$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
[10445]	631	#$cmd .= "$vbScript $input_filename $output_filestem.html";
[10521]	632	$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
[10445]	633
[10282]	634	# redirecting STDERR
[24166]	635
	636	$cmd .= " 2> \"$output_filestem.err\""
	637	if ($ENV {'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
	638	#print STDERR "@@@@@@@@@ cmd=$cmd\n";
	639
[10282]	640	# execute the command
	641	$!=0;
	642	if (system($cmd)!=0)
	643	{
[24164]	644	print STDERR "Error executing $vbScript converter:$!\n";
[10282]	645	if (-s "$output_filestem.err") {
	646	open (ERRFILE, "<$output_filestem.err");
[24166]	647
[10282]	648	my $write_to_fail_log=0;
	649	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	650	{$write_to_fail_log=1;}
	651
	652	my $line;
	653	while ($line=<ERRFILE>) {
[16435]	654	if ($line =~ m/\w/) {
[10282]	655	print STDERR "$line";
	656	print FAILLOG "$line" if ($write_to_fail_log);
	657	}
	658	if ($line !~ m/startup error/) {next;}
	659	print STDERR " (given an invalid .DOC file?)\n";
	660	print FAILLOG " (given an invalid .DOC file?)\n"
	661	if ($write_to_fail_log);
	662
	663	} # while ERRFILE
	664	close FAILLOG if ($write_to_fail_log);
	665	}
	666	return 0; # we can try any_to_text
	667	}
	668
	669	# Was the conversion successful?
	670	if (-s "$output_filestem.html") {
	671	open(TMP, "$output_filestem.html");
[22429]	672	my $line = <TMP>;
[10282]	673	close(TMP);
[22429]	674	if ($line && $line =~ m/html/i) {
[27509]	675	&FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
[10282]	676	return 1;
	677	}
	678	}
	679
	680	# If here, an error of some sort occurred
[27509]	681	&FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
[10282]	682	if (-e "$output_filestem.err") {
	683	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	684	open (ERRLOG,"$output_filestem.err");
	685	while (<ERRLOG>) {print FAILLOG $_;}
	686	close FAILLOG;
	687	close ERRLOG;
	688	}
[27509]	689	&FileUtils::removeFiles("$output_filestem.err");
[10282]	690	}
	691	return 0;
	692	}
	693
[1654]	694	# Attempt to convert an RTF document to html with rtftohtml
	695	sub rtf_to_html {
[2241]	696	my ($input_filename, $output_filestem) = @_;
[1654]	697
	698	# formulate the command
[22429]	699	my $cmd = "";
[1692]	700	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[2574]	701	$cmd .= "rtftohtml";
[10282]	702	#$cmd .= "rtf-converter";
[1654]	703
[3246]	704	$cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
[2574]	705
	706	$cmd .= " 2>\"$output_filestem.err\""
[16435]	707	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
[2574]	708
	709
[1654]	710	# execute the command
[2755]	711	$!=0;
[2060]	712	if (system($cmd)!=0)
[1654]	713	{
[2755]	714	print STDERR "Error executing rtf converter $!\n";
[2656]	715	# don't currently bother printing out error log...
	716	# keep going, in case it still created an HTML file...
[1654]	717	}
	718
	719	# Was the conversion successful?
[2755]	720	my $was_successful=0;
[2656]	721	if (-s "$output_filestem.html") {
[2755]	722	# make sure we have some content other than header
	723	open (HTML, "$output_filestem.html"); # what to do if fail?
	724	my $line;
	725	my $past_header=0;
	726	while ($line=<HTML>) {
	727
	728	if ($past_header == 0) {
[16435]	729	if ($line =~ m/<body>/) {$past_header=1;}
[2755]	730	next;
	731	}
	732
	733	$line =~ s/<[^>]+>//g;
[16435]	734	if ($line =~ m/\w/ && $past_header) { # we found some content...
[2755]	735	$was_successful=1;
	736	last;
	737	}
	738	}
	739	close HTML;
[1654]	740	}
[2574]	741
[2755]	742	if ($was_successful) {
[27509]	743	&FileUtils::removeFiles("$output_filestem.err")
[2755]	744	if (-e "$output_filestem.err");
	745	# insert the (modified) table of contents, if it exists.
	746	if (-e "${output_filestem}_ToC.html") {
[27509]	747	&FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");
[2755]	748	my $open_failed=0;
	749	open HTMLSRC, "$output_filestem.src" \|\| ++$open_failed;
	750	open TOC, "${output_filestem}_ToC.html" \|\| ++$open_failed;
	751	open HTML, ">$output_filestem.html" \|\| ++$open_failed;
	752
	753	if ($open_failed) {
	754	close HTMLSRC;
	755	close TOC;
	756	close HTML;
[27509]	757	&FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
[2755]	758	return 1;
	759	}
	760
	761	# print out header info from src html.
[16435]	762	while (defined($_ = <HTMLSRC>) && $_ =~ m/\w/) {
[2755]	763	print HTML "$_";
	764	}
	765
	766	# print out table of contents, making links relative
	767	<TOC>; <TOC>; # ignore first 2 lines
	768	print HTML scalar(<TOC>); # line 3 = "<ol>\n"
	769	my $line;
	770	while ($line=<TOC>) {
[22429]	771	$line =~ s@</body></html>$@@i ; # only last line has this
[2755]	772	# make link relative
[22429]	773	$line =~ s@href=\"[^\#]+@href=\"@i;
[2755]	774	print HTML $line;
	775	}
	776	close TOC;
	777
	778	# rest of html src
	779	while (<HTMLSRC>) {
	780	print HTML $_;
	781	}
	782	close HTMLSRC;
	783	close HTML;
	784
[27509]	785	&FileUtils::removeFiles("${output_filestem}_ToC.html");
	786	&FileUtils::removeFiles("${output_filestem}.src");
[2755]	787	}
	788	# we don't yet do anything with footnotes ($output_filestem_fn.html) :(
	789	return 1; # success
	790	}
	791
	792	if (-e "$output_filestem.err") {
	793	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	794	{
	795	print FAILLOG "Error - rtftohtml - couldn't extract text\n";
[10282]	796	#print FAILLOG "Error - rtf-converter - couldn't extract text\n";
[2755]	797	print FAILLOG " (rtf file might be too recent):\n";
	798	open (ERRLOG, "$output_filestem.err");
	799	while (<ERRLOG>) {print FAILLOG $_;}
	800	close ERRLOG;
	801	close FAILLOG;
	802	}
[27509]	803	&FileUtils::removeFiles("$output_filestem.err");
[2755]	804	}
	805
[27509]	806	&FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");
[2656]	807
[1654]	808	return 0;
	809	}
	810
	811
[32205]	812	# Convert a pdf file to html with the old pdftohtml command
	813	# which only works for older PDF versions
sub pdf_to_html {
    # Convert a PDF to "$output_filestem.html" by running pdftohtml.pl with
    # the -pdf_zoom / -pdf_complex / -pdf_ignore_images /
    # -pdf_allow_images_only / -pdf_nohidden settings.
    #
    # Returns 1 when the command exits with status 0 and leaves a non-empty
    # HTML file. Otherwise the converter's error output is echoed to STDERR
    # and appended to $faillogfile (if set), the .out/.err/.html working
    # files are removed, and 0 is returned so the caller can fall back to
    # another conversion. $dirname is accepted but not used here.
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # Read the converter's std err once; it is reused for the fail log.
        # Being unable to read it is reported but is not fatal, so that the
        # working files are still cleaned up and the caller gets its 0.
        my @err_lines;
        if (-s "$output_filestem.err") {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                @err_lines = <$errlog>;
                close($errlog);
                print STDERR "pdftohtml error log:\n";
                print STDERR @err_lines;
            } else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }
        }

        &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} @err_lines;
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
	875
[32205]	876
	877	# Convert a pdf file to html with the newer Xpdftools' pdftohtml
	878	# This generates "paged HTML" where extracted, selectable text is positioned
	879	# over screenshots of each page.
	880	# Since xpdf's pdftohtml fails if the output dir already exists and for easier
	881	# naming, the output files are created in a "pages" subdirectory of the tmp
	882	# location parent of $output_filestem instead
	883	sub xpdf_to_html {
	884	my ($dirname, $input_filename, $output_filestem) = @_;
	885
	886	my $cmd = "";
	887
	888	# build up the path to the doc-to-html conversion tool we're going to use
[32224]	889	my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");
[32209]	890
[32205]	891	# We'll create the file by name $output_filestem during post-conversion processing.
	892	# Note that Xpdf tools will only create its conversion products in a dir that does
	893	# not yet exist. So we'll create this location as a subdir of the output_filestem's
	894	# parent directory. The parent dir is the already generated tmp area for conversion. So:
	895	# - tmpdir gs2build/tmp/<random-num> already exists at this stage
	896	# - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
	897	# - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
	898	my ($tailname, $tmp_dirname, $suffix)
	899	= &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
	900	$tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");
[32224]	901
[32205]	902	# xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
	903	$cmd .= "\"$xpdf_pdftohtml\"";
[32284]	904	# resolution, -r in DPI of background images, see https://www.xpdfreader.com/pdftohtml-man.html
	905	$cmd .= " -r $pdf_dpi" if ($pdf_dpi);
[32205]	906	$cmd .= " \"$input_filename\" \"$tmp_dirname\"";
	907	#$cmd .= " \"$input_filename\" \"$output_filestem\"";
	908
	909	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
	910	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	911	} else {
	912	$cmd .= " > \"$output_filestem.err\"";
	913	}
	914
	915	#print STDERR "@@@@ Running command: $cmd\n";
	916
	917	$!=0;
	918	my $retval=system($cmd);
	919	if ($retval!=0)
	920	{
	921	print STDERR "Error executing xpdf's pdftohtml tool";
	922	if ($!) {print STDERR ": $!";}
	923	print STDERR "\n";
	924	}
	925
	926	# make sure the converter made something
	927	if ($retval!=0 \|\| ! -s &FileUtils::filenameConcatenate($tmp_dirname,"index.html"))
	928	{
	929	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	930	# print out the converter's std err, if any
	931	if (-s "$output_filestem.err") {
	932	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
	933	print STDERR "pdftohtml error log:\n";
	934	while (<ERRLOG>) {
	935	print STDERR "$_";
	936	}
	937	close ERRLOG;
	938	}
	939	#print STDERR "***********output filestem $output_filestem.html\n";
	940	&FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");
	941	if (-e "$output_filestem.err") {
	942	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	943	{
	944	open (ERRLOG, "$output_filestem.err");
	945	while (<ERRLOG>) {print FAILLOG $_;}
	946	close ERRLOG;
	947	close FAILLOG;
	948	}
	949	&FileUtils::removeFiles("$output_filestem.err");
	950	}
	951	return 0;
	952	}
	953
	954	&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
	955	&FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
	956	return 1;
	957	}
	958
# Returns the path to the bin directory containing the xpdf-tools executables
# for this machine's OS: $GSDLHOME/bin/$GSDLOS/xpdf-tools/bin
sub _get_xpdftools_bindir {
    my @path_parts = ($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools", "bin");
    return scalar(&FileUtils::filenameConcatenate(@path_parts));
}
[32205]	966
# Convert a PDF or PostScript file to various types of image by running
# pdfpstoimg.pl (which relies on ImageMagick's convert command).
#
# Parameters:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF/PS file to convert
#   $output_filestem - output path stem handed to pdfpstoimg.pl; also the stem
#                      of the .out/.err log files
#   $output_type     - output type; everything up to and including the last
#                      underscore is stripped before it is passed as -convert_to
# Returns 1 if pdfpstoimg.pl exited with status 0, otherwise 0 (after
# reporting the tool's error output on STDERR and appending it to
# $faillogfile when one is configured). Also returns 0 straight away if
# ImageMagick does not appear to be installed.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # the output is captured only to keep it off the terminal; $? is what matters
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # keep only what follows the last underscore of the output type
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";

    # only redirect stderr separately where the shell supports "2>"
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # Only the exit status is checked here; the existence of output files
    # is not verified by this routine.
    if ($retval != 0) {
        my $err_file = "$output_filestem.err";

        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, "<", $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close $errlog;
        }

        if (-e $err_file) {
            # append the converter's error output to the fail log, if configured
            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                if (open(my $errlog, "<", $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
	1050
# Convert a PDF file to text with xpdftools' pdftotext command.
# Works for Windows too, whereas the old pdftotxt didn't.
#
# The command line is assembled here and then run by _run_pdf_to_text_cmd,
# whose return value is passed straight back to the caller.
# For pdftotext's options, see https://www.xpdfreader.com/pdftotext-man.html
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # path to the pdf-to-text conversion tool we're going to use
    my $pdftotext_exe = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    my @cmd_parts = ("\"$pdftotext_exe\"");

    # Decode the bytes in the file using the designated encoding scheme.
    # pdftotext itself defaults to Latin-1, so fall back to UTF-8 when no
    # encoding was requested (see https://www.xpdfreader.com/xpdfrc-man.html)
    my $encoding = $enc ? $enc : "UTF-8";
    push(@cmd_parts, "-enc $encoding");

    # page break markers are only wanted for "paged_text" output
    push(@cmd_parts, "-nopgbrk") if ($output_type ne "paged_text");

    # Use unix style end of line markers (solitary LF), to avoid solitary
    # carriage returns at the ends of lines getting appended to the doc title
    push(@cmd_parts, "-eol unix");

    push(@cmd_parts, "\"$input_filename\"", "\"$output_filestem.text\"");

    my $cmd = join(" ", @cmd_parts);

    print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
	1085
# Convert a PDF file to text with whichever pdftotext command is found on
# the search path. The result is written to $output_filestem.text; the
# return value is that of _run_pdf_to_text_cmd.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_filename = "$output_filestem.text";
    my $cmd = join(" ", "pdftotext", "\"$input_filename\"", "\"$text_filename\"");

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
[2755]	1095
# Run an already assembled pdf-to-text command line and check its result.
#
# Parameters:
#   $cmd             - the command to run; it is expected to write its text
#                      output to $output_filestem.text
#   $output_filestem - path stem of the .text output and of the .out/.err
#                      files the command's stdout/stderr are redirected to
# Returns 1 if the command left behind a .text file containing at least one
# word character, otherwise 0 (after reporting the tool's error output on
# STDERR and appending it to $faillogfile when one is configured).
# The .out and .err files are removed in both cases.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # Only redirect stderr separately where the shell supports "2>". This
    # matches the test used by the other converters in this file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, "<", $text_file)) {
            binmode($extr_text); # just in case...
            # test with defined() so that a false-valued line such as "0"
            # does not end the scan early
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extr_text;
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, "<", $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close $errlog;
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e $err_file) {
            # append the converter's error output to the fail log, if configured
            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                if (open(my $errlog, "<", $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    # the stdout redirection always creates a .out file; don't leave it behind
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
	1157
# Convert a PostScript document to text.
# Note - just using "ps2ascii" isn't good enough, as it returns 0 for a
# postscript interpreter error. ps2ascii is just a wrapper to "gs" anyway,
# so we use that cmd here.
#
# If gs cannot be used (on Windows it is not attempted at all) or it fails,
# the text is instead stripped out of the PostScript with a series of regular
# expressions. Either way the resulting $output_filestem.text is finally
# re-wrapped so that lines break at the first whitespace after 72 characters.
#
# Parameters:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - path stem of the .text output and the .err log file
# Returns 1 on success; 0 if the fallback could not open its files or the
# input does not start with the PostScript "%!" header.
# Dies if the files needed for the final re-wrapping step cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # Any non-zero wait status counts as failure: a non-zero exit code,
        # death by signal (e.g. the ulimit timeout), or failure to start
        # (-1, in which case $! holds the reason).
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, "<", $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $psout;
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, ">>", $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, "<", $err_file)) {
                while (my $err_line = <$errlog>) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $errorcode = 0;
        my ($in, $out);
        open($in, "<", $input_filename)
            or ($errorcode = 1, warn "Couldn't read file: $!");
        open($out, ">", $text_file)
            or ($errorcode = 1, warn "Couldn't write file: $!");
        if ($errorcode) {print STDERR "errors\n"; return 0;}

        # this is for the whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &FileUtils::moveFiles($text_file, "$text_file.tmp");
    # "or" (not "||") so that a failed open really does reach the die
    open(my $infile, "<", "$text_file.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, ">", $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                # break after the word that crosses the wrap length
                print $outfile "$1\n";
            } else {
                # short enough (or nothing to break on): emit the rest as is
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &FileUtils::removeFiles("$text_file.tmp");

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
	1328
	1329
	1330	# Convert any file to HTML with a crude perl implementation of the
	1331	# UNIX strings command.
	1332
	1333	sub any_to_html {
[22429]	1334	my ($input_filename, $output_filestem) = @_;
[1445]	1335
	1336	# First generate a text file
	1337	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1338
	1339	# create an HTML file from the text file
	1340	open(TEXT, "<$output_filestem.text");
	1341	open(HTML, ">$output_filestem.html");
	1342
[2241]	1343	print HTML "<html><head>\n";
	1344	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1345	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1346	print HTML "</head><body>\n\n";
[1734]	1347
[2755]	1348	my $line;
	1349	while ($line=<TEXT>) {
	1350	$line =~ s/</</g;
	1351	$line =~ s/>/>/g;
[16435]	1352	if ($line =~ m/^\s*$/) {
[2755]	1353	print HTML "<p>";
	1354	} else {
	1355	print HTML "<br> ", $line;
	1356	}
[1445]	1357	}
[1734]	1358	print HTML "\n</body></html>\n";
[1445]	1359
[2241]	1360	close HTML;
	1361	close TEXT;
	1362
[27509]	1363	&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
[1445]	1364	return 1;
	1365	}
	1366
	1367	# Convert any file to TEXT with a crude perl implementation of the
	1368	# UNIX strings command.
[2755]	1369	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1370
	1371	sub any_to_text {
[22429]	1372	my ($input_filename, $output_filestem) = @_;
[1445]	1373
[3350]	1374	if (!$use_strings) {
	1375	return 0;
	1376	}
[15120]	1377
	1378	print STDERR "\n** In any to text**\n\n";
[2755]	1379	open(IN, "<$input_filename") \|\| return 0;
[1734]	1380	binmode(IN);
[2755]	1381	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1382
	1383	my ($line);
[2755]	1384	my $output_line_count = 0;
[1445]	1385	while (<IN>) {
	1386	$line = $_;
[1734]	1387
[1445]	1388	# delete anything that isn't a printable character
	1389	$line =~ s/[^\040-\176]+/\n/sg;
	1390
	1391	# delete any string less than 10 characters long
[1734]	1392	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1393	while ($line =~ m/^.{1,9}$/m) {
[1734]	1394	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1395	$line =~ s/\n+/\n/sg;
	1396	}
	1397
	1398	# remove extraneous whitespace
	1399	$line =~ s/\n+/\n/gs;
	1400	$line =~ s/^\n//gs;
[1578]	1401
[1445]	1402	# output whatever is left
[16435]	1403	if ($line =~ m/[^\n ]/) {
[1445]	1404	print OUT $line;
[2755]	1405	++$output_line_count;
[1445]	1406	}
	1407	}
[2241]	1408
	1409	close OUT;
	1410	close IN;
	1411
[2755]	1412	if ($output_line_count) { # try to protect against binary only formats
	1413	return 1;
	1414	}
	1415
[27509]	1416	&FileUtils::removeFiles("$output_filestem.text");
[2755]	1417	return 0;
	1418
[1445]	1419	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: