Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 25798

Last change on this file since 25798 was 25798, checked in by ak19, 12 years ago
Fixing a bug where, on CentOS Linux, a call to identify returns 256, which is not the error code for ImageMagick not being found (on Linux that error code is -1). Oddly, the same issue did not occur on Ubuntu.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 35.9 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
[22429]	51	use strict;
	52
[1445]	53	use parsargv;
	54	use util;
	55	use Cwd;
	56
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded; that case
# is normalised to 0 so the flag can be used directly in boolean tests.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse().
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
[3350]	68
# Print the command-line usage summary to STDERR and terminate.
#
# Takes no arguments and never returns: the process exits with status 1.
# The summary lists every option that main() passes to parsargv::parse(),
# including -verbose, whose value is forwarded to the helper scripts.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-verbose\t<number>\t(verbosity level passed on to helper scripts)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
	90
# Settings shared by the conversion routines below:
#   $faillogfile - file that converter error output is appended to ("" = none)
#   $timeout     - CPU-seconds limit applied via ulimit (0 = no limit)
#   $verbosity   - verbosity level forwarded to helper scripts
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
[1445]	94
	95	sub main
	96	{
	97	my (@ARGV) = @_;
[3538]	98	my ($input_type,$output_type,$verbose);
[1960]	99
[23473]	100	# Dynamically figure out what the --type option can support, based on whether -windows_scripting
	101	# is in use or not
	102	my $default_type_re = "(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)";
	103	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	104	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	105	# Currently only have VBA for Word and PPT(but no XLS)
	106	my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xls)";
	107
	108	my $type_re = $default_type_re;
	109
	110	foreach my $a (@ARGV) {
	111	if ($a =~ m/^windows_scripting$/i) {
	112	$type_re = $enhanced_type_re;
	113	}
	114	}
	115
[1445]	116	# read command-line arguments
	117	if (!parsargv::parse(\@ARGV,
[23473]	118	"type/$type_re/", \$input_type,
[2755]	119	'/errlog/.*/', \$faillogfile,
[22596]	120	'output/(auto\|html\|text\|pagedimg).*/', \$output_type,
[1692]	121	'timeout/\d+/0',\$timeout,
[10282]	122	'verbose/\d+/0', \$verbose,
[22429]	123	'windows_scripting',\$windows_scripting,
[3720]	124	'use_strings', \$use_strings,
	125	'pdf_complex', \$pdf_complex,
[9482]	126	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	127	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	128	'pdf_nohidden', \$pdf_nohidden,
[3720]	129	'pdf_zoom/\d+/2', \$pdf_zoom
	130	))
[1445]	131	{
	132	print_usage();
	133	}
[24375]	134
	135	$verbosity=$verbose if defined $verbose;
	136
[1445]	137	# Make sure the input file exists and can be opened for reading
	138	if (scalar(@ARGV!=1)) {
	139	print_usage();
	140	}
[1928]	141
[1445]	142	my $input_filename = $ARGV[0];
	143	if (!-r $input_filename) {
	144	print STDERR "Error: unable to open $input_filename for reading\n";
	145	exit(1);
	146	}
	147
	148	# Deduce filenames
	149	my ($tailname,$dirname,$suffix)
[2241]	150	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	151	my $output_filestem = &util::filename_cat($dirname, "$tailname");
[1445]	152
	153	if ($input_type eq "")
	154	{
[2241]	155	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	156	}
	157
	158	# Change to temporary working directory
	159	my $stored_dir = cwd();
	160	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	161
[1445]	162	# Select convert utility
	163	if (!defined $input_type) {
	164	print STDERR "Error: No filename extension or input type defined\n";
	165	exit(1);
	166	}
[23473]	167	elsif ($input_type =~ m/^docx?$/ \|\| $input_type eq "dot") {
[1445]	168	print &convertDOC($input_filename, $output_filestem, $output_type);
	169	print "\n";
	170	}
[1684]	171	elsif ($input_type eq "rtf") {
	172	print &convertRTF($input_filename, $output_filestem, $output_type);
	173	print "\n";
	174	}
[1445]	175	elsif ($input_type eq "pdf") {
	176	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	177	print "\n";
	178	}
	179	elsif ($input_type eq "ps") {
[22429]	180	print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
[1445]	181	print "\n";
	182	}
[23473]	183	elsif ($input_type =~ m/pptx?$/) {
[2977]	184	print &convertPPT($input_filename, $output_filestem, $output_type);
	185	print "\n";
	186	}
[23473]	187	elsif ($input_type =~ m/xlsx?$/) {
[2991]	188	print &convertXLS($input_filename, $output_filestem, $output_type);
	189	print "\n";
	190	}
[1445]	191	else {
	192	print STDERR "Error: Unable to convert type '$input_type'\n";
	193	exit(1);
	194	}
	195
	196	# restore to original working directory
	197	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	198
	199	}
	200
	201	&main(@ARGV);
	202
	203
	204
[2241]	205	# Document-type conversion functions
[1445]	206	#
	207	# The following functions attempt to convert documents from their
	208	# input type to the specified output type. If no output type was
	209	# given, then they first attempt HTML, and then TEXT.
	210	#
	211	# Each returns the output type ("html", "text", or "item" for image
	212	# output) or "fail" if no conversion is possible.
	213
	214	# Convert a Microsoft word document
	215
	216	sub convertDOC {
[22429]	217	my ($input_filename, $output_filestem, $output_type) = @_;
[1445]	218
[1654]	219	# Many .doc files are not in fact word documents!
	220	my $realtype = &find_docfile_type($input_filename);
	221
[23473]	222	if ($realtype eq "word6" \|\| $realtype eq "word7"
	223	\|\| $realtype eq "word8" \|\| $realtype eq "docx") {
[1654]	224	return &convertWord678($input_filename, $output_filestem, $output_type);
	225	} elsif ($realtype eq "rtf") {
	226	return &convertRTF($input_filename, $output_filestem, $output_type);
	227	} else {
	228	return &convertAnything($input_filename, $output_filestem, $output_type);
	229	}
	230	}
	231
	232	# Convert a Microsoft word 6/7/8 document
	233
	234	sub convertWord678 {
[22429]	235	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	236
[1445]	237	my $success = 0;
[16435]	238	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	239	if ($windows_scripting) {
	240	$success = &native_doc_to_html($input_filename, $output_filestem);
	241	}
	242	else {
	243	$success = &doc_to_html($input_filename, $output_filestem);
	244	}
[1445]	245	if ($success) {
[10282]	246	return "html";
[1445]	247	}
	248	}
[1654]	249	return &convertAnything($input_filename, $output_filestem, $output_type);
	250	}
	251
	252
	253	# Convert a Rich Text Format (RTF) file
	254
	255	sub convertRTF {
[22429]	256	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	257
	258	my $success = 0;
	259
	260	# Attempt specialised conversion to HTML
[16435]	261	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	262
	263	if ($windows_scripting) {
	264	$success = &native_doc_to_html($input_filename, $output_filestem);
	265	}
	266	else {
	267	$success = &rtf_to_html($input_filename, $output_filestem);
	268	}
[1654]	269	if ($success) {
	270	return "html";
	271	}
	272	}
	273
[2755]	274	# rtf is so ugly that's it's not worth running strings over.
	275	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	276	# return &convertAnything($input_filename, $output_filestem, $output_type);
	277	return "fail";
[1654]	278	}
	279
	280
	281	# Convert an unidentified file
	282
	283	sub convertAnything {
[22429]	284	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	285
	286	my $success = 0;
[10464]	287
[1445]	288	# Attempt simple conversion to HTML
[16435]	289	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	290	$success = &any_to_html($input_filename, $output_filestem);
	291	if ($success) {
	292	return "html";
	293	}
	294	}
	295
	296	# Convert to text
[16435]	297	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	298	$success = &any_to_text($input_filename, $output_filestem);
[1445]	299	if ($success) {
	300	return "text";
	301	}
	302	}
	303	return "fail";
	304	}
	305
	306
[1654]	307
# Convert an Adobe PDF document.
#
# Depending on the requested output type, tries image, HTML and text
# conversion in that order. Returns "item" (images), "html", "text" or
# "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drop the first hyphen together with the character in front of it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i
        && pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # Attempt conversion to HTML
    if ((!$output_type || ($output_type =~ m/html/i))
        && pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if ((!$output_type || ($output_type =~ m/text/i))
        && pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
	342
	343
	344	# Convert an Adobe PostScript document
	345
	346	sub convertPS {
[22429]	347	my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
[1445]	348
	349	my $success = 0;
[10534]	350	$output_type =~ s/.\-(.)/$1/i;
	351	# Attempt coversion to Image
[16435]	352	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	353	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10534]	354	if ($success){
	355	return "item";
	356	}
	357	}
[1445]	358
	359	# Attempt conversion to TEXT
[16435]	360	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	361	$success = &ps_to_text($input_filename, $output_filestem);
	362	if ($success) {
	363	return "text";
	364	}
	365	}
	366	return "fail";
	367	}
	368
	369
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract tool is run to produce output in $output_filestem
# (returns "item"). Otherwise, when HTML is acceptable, ppttohtml.pl is run
# (returns "html"). If neither path succeeds, generic text extraction is
# tried (returns "text"), and finally "fail" is returned.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Map the requested image format onto the matching pptextract switch
        my $ppt_convert_type = "";
        foreach my $format (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $switch) = @$format;
            if ($output_type =~ m/$pattern/i) {
                $ppt_convert_type = $switch;
                last;
            }
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        if (system($cmd) == 0) {
            return "item";
        }
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	433
	434
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run (returns "html" if it exits
# with status 0). Otherwise, or if that fails, generic text extraction is
# tried (returns "text"); "fail" is returned when nothing works.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	468
	469
	470
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the path of the file to inspect.
# Returns one of:
#   "docx"       - -windows_scripting is set and the name ends in ".docx"
#   "rtf"        - the first line starts with "{\rtf"
#   "word6".."word8" - some line contains "Word.Document.<6|7|8>"
#   "unknown"    - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open so odd characters in the filename cannot be
    # mistaken for an open mode; an unreadable file is simply "unknown".
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path
    close($chk);
    return $realtype;
}
	511
	512
[1734]	513	# Specific type-to-type conversions
[1445]	514	#
	515	# Each of the following functions attempts to convert a document from
[2755]	516	# a specific format to another. If they succeed they return 1 and leave
[1445]	517	# the output document(s) in the appropriate place; if they fail they
	518	# return 0 and delete any working files.
	519
	520
	521	# Attempt to convert a word document to html with the wv program
	522	sub doc_to_html {
[22429]	523	my ($input_filename, $output_filestem) = @_;
[1445]	524
[24371]	525	my $wvware_status = 0;
[24375]	526
[24371]	527	# need to ensure that the path to perl is quoted (in case there's spaces in it)
[24513]	528	my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";
[15120]	529
[24371]	530	# print STDERR "***** wvware launch cmd = $launch_cmd\n";
[15120]	531
[24371]	532	$wvware_status = system($launch_cmd)/256;
	533	return $wvware_status;
[1445]	534	}
	535
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows a .docx input is handed to docx2html.vbs (run through CScript);
# any other input is handed to word2html. The converter writes
# "$output_filestem.html"; its STDERR goes to "$output_filestem.err" where
# redirection is available.
#
# Returns 1 when the .html file already exists or was produced and its first
# line mentions "html"; returns 0 otherwise. Converter error output is
# appended to $faillogfile when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . &util::filename_cat($vbScript, "word2html") . "\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $errfile;
            if (open($errfile, '<', "$output_filestem.err")) {
                my $faillog;
                my $write_to_fail_log = 0;
                if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                    $write_to_fail_log = 1;
                }

                # Echo every non-blank line; add a hint after "startup error"
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }

                close($faillog) if ($write_to_fail_log);
                close($errfile);
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        my $html;
        if (open($html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            my $errlog;
            if (open($errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
	636
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml to write "$output_filestem.html" and then checks that the
# file has some word characters after its <body> line. If a
# "${output_filestem}_ToC.html" file exists, its contents are spliced into
# the output after the header lines, with links made relative.
#
# Returns 1 on success. On failure returns 0, appends the converter's error
# output to $faillogfile (when configured) and removes the .err and .html
# working files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; a file that
        # cannot be opened counts as a failed conversion
        if (open(my $html_in, '<', "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # "or" (not "||") so that a failed open is actually noticed
            my $src_ok = open(my $htmlsrc,  '<', "$output_filestem.src");
            my $toc_ok = open(my $toc,      '<', "${output_filestem}_ToC.html");
            my $out_ok = open(my $html_out, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # Give up on the ToC and put the converted file back
                close($htmlsrc)  if $src_ok;
                close($toc)      if $toc_ok;
                close($html_out) if $out_ok;
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without a word character
            while (defined(my $line = <$htmlsrc>)) {
                last unless ($line =~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
	753
	754
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the zoom factor and the -c / -i / -a / -hidden
# switches selected by the -pdf_* command-line options. Returns 1 when the
# command exits with status 0 and "$output_filestem.html" is non-empty.
# Otherwise echoes the converter's error output to STDERR, appends it to
# $faillogfile (when configured), removes the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";

    # Optional switches, in the order they are appended to the command
    foreach my $option ([$pdf_complex,           " -c"],
                        [$pdf_ignore_images,     " -i"],
                        [$pdf_allow_images_only, " -a"],
                        [!$pdf_nohidden,         " -hidden"]) {
        my ($enabled, $switch) = @$option;
        $cmd .= $switch if ($enabled);
    }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Where STDERR cannot be redirected, STDOUT is captured in the .err file
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # Success: the converter ran cleanly and made something
    if ($retval == 0 && -s "$output_filestem.html") {
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # Failure: report, log and clean up
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(ERRLOG, "$output_filestem.err") or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (<ERRLOG>) {
            print STDERR "$_";
        }
        close ERRLOG;
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open(ERRLOG, "$output_filestem.err");
            while (<ERRLOG>) { print FAILLOG $_; }
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
	818
	819	# Convert a pdf file to various types of image with the convert command
	820
[17329]	821	sub pdfps_to_img {
[10357]	822	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[10401]	823
	824	# Check that ImageMagick is installed and available on the path (except for Windows 95/98)
	825	if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
[24600]	826	my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
[24763]	827	$imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
[24600]	828	my $result = `$imagick_cmd identify 2>&1`;
	829
	830	# Linux and Windows return different values for "program not found".
	831	# Linux returns -1 and Windows 256 for "program not found". But once they're
	832	# converted to signed values, it will be -1 for Linux and 1 for Windows.
	833	# Whenever we test for return values other than 0, shift by 8 and perform
	834	# unsigned to signed status conversion on $? to get expected range of return vals
	835	# Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
	836	# and then exits on that, by the time we get here, we need to do it again
	837	my $status = $?;
	838	$status >>= 8;
	839	$status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
[25798]	840	if (($ENV{'GSDLOS'} ne "windows" && $status == -1) \|\| ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
	841	# if ($status == -1 \|\| $status == 1) #if ($status == -1 \|\| $status == 256) {
[10401]	842	#ImageMagick is not installed, thus the convert utility is not available.
[25798]	843	print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
[10401]	844	return 0;
	845	}
	846	}
	847
[22429]	848	my $cmd = "";
[10357]	849	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	850	$output_type =~ s/.\_(.)/$1/i;
[24362]	851	my $full_perl_path = &util::get_perl_exec();
[24124]	852	$cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
[16435]	853	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[10357]	854	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	855	} else {
	856	$cmd .= " > \"$output_filestem.err\"";
	857	}
	858
	859	# don't include path on windows (to avoid having to play about
	860	# with quoting when GSDLHOME might contain spaces) but assume
	861	# that the PATH is set up correctly
	862	$!=0;
	863	my $retval=system($cmd);
	864	if ($retval!=0)
	865	{
[10401]	866	print STDERR "Error executing pdftoimg.pl";
[10357]	867	if ($!) {print STDERR ": $!";}
	868	print STDERR "\n";
	869	}
	870
	871	#make sure the converter made something
	872	#if ($retval !=0) \|\| ! -s "$output_filestem")
	873	if ($retval !=0)
	874	{
	875	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	876	#print out the converter's std err, if any
	877	if (-s "$output_filestem.err") {
	878	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[17329]	879	print STDERR "pdfpstoimg error log:\n";
[10357]	880	while (<ERRLOG>) {
	881	print STDERR "$_";
	882	}
	883	close ERRLOG;
	884	}
[10534]	885	#&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[10357]	886	if (-e "$output_filestem.err") {
	887	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	888	{
	889	open (ERRLOG, "$output_filestem.err");
	890	while (<ERRLOG>) {print FAILLOG $_;}
	891	close ERRLOG;
	892	close FAILLOG;
	893	}
	894	&util::rm("$output_filestem.err");
	895	}
	896	return 0;
	897	}
[2656]	898	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	899	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	900	return 1;
	901	}
	902
	903	# Convert a PDF file to text with the pdftotext command
	904
	sub pdf_to_text {
	    # Convert a PDF file to plain text by running the external
	    # "pdftotext" command.
	    #
	    # Parameters:
	    #   $dirname         - accepted for interface compatibility; unused here
	    #   $input_filename  - path of the PDF file to convert
	    #   $output_filestem - path stem; text is written to "<stem>.text",
	    #                      command output to "<stem>.out" / "<stem>.err"
	    #
	    # Returns 1 if "<stem>.text" exists, is non-empty and contains at least
	    # one word character; otherwise returns 0 after echoing the converter's
	    # error output to STDERR and appending it to $faillogfile (if set).
	    # All temporary .out/.err files are removed before returning.
	    my ($dirname, $input_filename, $output_filestem) = @_;

	    my $text_file = "$output_filestem.text";
	    my $out_file  = "$output_filestem.out";
	    my $err_file  = "$output_filestem.err";

	    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

	    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
	        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
	    } else {
	        # only stdout is redirected on windows; it goes to the .err file
	        $cmd .= " > \"$err_file\"";
	    }

	    if (system($cmd) != 0) {
	        print STDERR "Error executing $cmd: $!\n";
	        &util::rm($text_file) if (-e $text_file);
	    }

	    # make sure there is some extracted text.
	    if (-e $text_file) {
	        my $seen_text = 0;
	        if (open(my $extr_fh, '<', $text_file)) {
	            binmode($extr_fh); # just in case...
	            # defined() so that a final line consisting of "0" is still read
	            while (defined(my $line = <$extr_fh>)) {
	                if ($line =~ m/\w/) {
	                    $seen_text = 1;
	                    last;
	                }
	            }
	            close($extr_fh);
	        } else {
	            warn "open: $!";
	        }
	        if ($seen_text == 0) { # no text was extracted
	            print STDERR "Error: pdftotext found no text\n";
	            &util::rm($text_file);
	        }
	    }

	    # make sure the converter made something
	    if (! -s $text_file) {
	        # print out the converter's std err, if any
	        if (-s $err_file) {
	            # An unreadable error log must not abort the script before
	            # the cleanup below has run, so warn instead of dying.
	            if (open(my $errlog_fh, '<', $err_file)) {
	                print STDERR "pdftotext error log:\n";
	                while (defined(my $err_line = <$errlog_fh>)) {
	                    print STDERR $err_line;
	                }
	                close($errlog_fh);
	            } else {
	                warn "Couldn't read $err_file: $!";
	            }
	        }
	        &util::rm($out_file) if (-e $out_file);
	        &util::rm($text_file) if (-e $text_file);
	        if (-e $err_file) {
	            # append the converter's error output to the failure log
	            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
	                if (open(my $errlog_fh, '<', $err_file)) {
	                    while (defined(my $err_line = <$errlog_fh>)) {
	                        print $faillog_fh $err_line;
	                    }
	                    close($errlog_fh);
	                }
	                close($faillog_fh);
	            }
	            &util::rm($err_file);
	        }
	        return 0;
	    }

	    # success: tidy up the redirect targets
	    &util::rm($err_file) if (-e $err_file);
	    &util::rm($out_file) if (-e $out_file);
	    return 1;
	}
	968
[2012]	969	# Convert a PostScript document to text
	970	# note - just using "ps2ascii" isn't good enough, as it
	971	# returns 0 for a postscript interpreter error. ps2ascii is just
	972	# a wrapper to "gs" anyway, so we use that cmd here.
[1445]	973
	sub ps_to_text {
	    # Convert a PostScript document to plain text.
	    #
	    # Parameters:
	    #   $input_filename  - path of the PostScript file to convert
	    #   $output_filestem - path stem; text is written to "<stem>.text"
	    #
	    # First tries ghostscript ("gs" with ps2ascii.ps); this step is skipped
	    # on windows. If gs cannot be run, fails, or reports an interpreter
	    # error, the problem is logged (STDERR and $faillogfile, if set) and a
	    # crude regexp-based extraction is attempted instead. Whichever way
	    # the text was produced, it is finally re-wrapped so that lines break
	    # at the first whitespace after 72 characters.
	    #
	    # Returns 1 on success, 0 if the input cannot be read, is not
	    # PostScript (does not start with "%!"), or the output cannot be
	    # written. Dies if the files used by the wrapping step cannot be opened.
	    my ($input_filename, $output_filestem) = @_;

	    my $text_file = "$output_filestem.text";
	    my $err_file  = "$output_filestem.err";

	    my $error = "";

	    # if we're on windows we'll fall straight through without attempting
	    # to use gs
	    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
	        $error = "Windows does not support gs";
	    } else {
	        my $cmd = "";
	        if ($timeout) { $cmd = "ulimit -t $timeout; "; }
	        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
	        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
	        # quoted, so that a filestem containing spaces still works
	        $cmd .= " 2> \"$err_file\"";

	        $! = 0;
	        system($cmd);
	        # Test the whole wait status rather than only the exit code
	        # ($? >> 8): a process killed by a signal (for instance when the
	        # ulimit above is exceeded) has exit code 0 but a non-zero status.
	        # See man perlfunc - system.
	        my $wait_status = $?;

	        if ($wait_status != 0) {
	            # if the program couldn't be started, $! holds the reason
	            if ($!) { $error = $!; } else { $error = "couldn't run.\n"; }
	        }
	        elsif (! -e $text_file) {
	            $error = "did not create output file.\n";
	        }
	        else {
	            # make sure the interpreter didn't get an error. It is
	            # technically possible for the actual text to start with
	            # this, but....
	            if (open(my $psout_fh, '<', $text_file)) {
	                my $first_line = <$psout_fh>;
	                close($psout_fh);
	                # the output may be empty, in which case there is no line
	                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
	                    $error = "interpreter error - \"$1\"";
	                }
	            } else {
	                $error = "could not read output file: $!";
	            }
	        }
	    }

	    if ($error ne "") {
	        print STDERR "Warning: Error executing gs: $error\n";
	        &util::rm($text_file) if (-e $text_file);

	        if ("$faillogfile" ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
	            print $faillog_fh "gs - $error\n";
	            if (-e $err_file && open(my $errlog_fh, '<', $err_file)) {
	                while (defined(my $err_line = <$errlog_fh>)) {
	                    print $faillog_fh $err_line;
	                }
	                close($errlog_fh);
	            }
	            close($faillog_fh);
	        }
	        &util::rm($err_file) if (-e $err_file);

	        # Fine then. We'll just do a lousy job by ourselves...
	        # Based on 5-line regexp sed script found at:
	        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
	        #
	        print STDERR "Stripping text from postscript\n";

	        # Read and validate the input before creating the output file, so
	        # that a failure does not leave an empty .text file behind.
	        my $in_fh;
	        if (!open($in_fh, '<', $input_filename)) {
	            warn "Couldn't read file: $!";
	            print STDERR "errors\n";
	            return 0;
	        }
	        # this is for whole .ps file...
	        # see man perlport, under "System Resources"
	        my $text = join('', <$in_fh>);
	        close($in_fh);

	        # Make sure this is a ps file...
	        if ($text !~ m/^%!/) {
	            print STDERR "Bad postscript header: not '%!'\n";
	            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
	                print $faillog_fh "Bad postscript header: not '%!'\n";
	                close($faillog_fh);
	            }
	            return 0;
	        }

	        # if ps has Page data, then use it to delete all stuff before it.
	        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

	        # remove all leading non-data stuff
	        $text =~ s/^.*?\(//s;

	        # remove all newline chars for easier processing
	        $text =~ s/\n//g;

	        # Big assumption here - assume that if any co-ordinates are
	        # given, then we are at the end of a sentence.
	        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

	        # special characters--
	        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

	        # ? ps text formatting (eg italics?) ?
	        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
	        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
	        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
	        # default - remove the rest
	        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

	        # attempt to add whitespace between words...
	        # this is based purely on observation, and may be completely wrong...
	        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
	        # eg I notice "b(" is sometimes NOT a space if preceded by a
	        # negative number.
	        $text =~ s/\)\d+ ?b\(/\) \( /g;

	        # change quoted braces to brackets
	        $text =~ s/([^\\])\\\(/$1\{/g;
	        $text =~ s/([^\\])\\\)/$1\}/g;

	        # remove everything that is not between braces
	        $text =~ s/\)([^\(\)])+?\(//sg;

	        # remove any Trailer eof stuff.
	        $text =~ s/\)[^\)]*$//sg;

	        ### ligatures have special characters...
	        $text =~ s/\\013/ff/g;
	        $text =~ s/\\014/fi/g;
	        $text =~ s/\\015/fl/g;
	        $text =~ s/\\016/ffi/g;
	        $text =~ s/\\214/fi/g;
	        $text =~ s/\\215/fl/g;
	        $text =~ s/\\017/\n\* /g; # asterisk?
	        $text =~ s/\\023/\023/g; # e acute ('e)
	        $text =~ s/\\177/\252/g; # u"
	        # $text =~ s/ ?? /\344/g; # a"

	        my $out_fh;
	        if (!open($out_fh, '>', $text_file)) {
	            warn "Couldn't write file: $!";
	            print STDERR "errors\n";
	            return 0;
	        }
	        print $out_fh "$text";
	        close($out_fh);
	    }

	    # wrap the text - use a minimum length. ie, first space after this length.
	    my $wrap_length = 72;
	    my $tmp_file = "$output_filestem.text.tmp";
	    &util::mv($text_file, $tmp_file);
	    # "or" (not "||") so that the die applies to open() itself rather
	    # than to the filename argument
	    open(my $wrap_in, '<', $tmp_file)
	        or die "Couldn't open file: $!";
	    open(my $wrap_out, '>', $text_file)
	        or die "Couldn't open file for writing: $!";
	    while (defined(my $line = <$wrap_in>)) {
	        while (length($line) > 0) {
	            # Only emit a wrapped piece when the substitution really
	            # removed something; otherwise flush the remainder, so the
	            # loop always terminates.
	            if (length($line) > $wrap_length
	                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
	                print $wrap_out "$1\n";
	            } else {
	                print $wrap_out "$line";
	                $line = "";
	            }
	        }
	    }
	    close($wrap_in);
	    close($wrap_out);
	    &util::rm($tmp_file);

	    &util::rm($err_file) if (-e $err_file);
	    return 1;
	}
	1138
	1139
	1140	# Convert any file to HTML with a crude perl implementation of the
	1141	# UNIX strings command.
	1142
	sub any_to_html {
	    # Convert any file to HTML: extract printable strings with
	    # any_to_text, then wrap each extracted line in minimal HTML markup.
	    #
	    # Parameters:
	    #   $input_filename  - path of the file to convert
	    #   $output_filestem - path stem; the result is "<stem>.html"
	    #                      ("<stem>.text" is used as an intermediate file
	    #                      and removed afterwards)
	    #
	    # Returns 1 on success, 0 if any_to_text fails or if either the
	    # intermediate text file or the HTML file cannot be opened.
	    my ($input_filename, $output_filestem) = @_;

	    # First generate a text file
	    return 0 unless (&any_to_text($input_filename, $output_filestem));

	    my $text_file = "$output_filestem.text";
	    my $html_file = "$output_filestem.html";

	    # create an HTML file from the text file
	    my ($text_fh, $html_fh);
	    if (!open($text_fh, '<', $text_file)) {
	        print STDERR "any_to_html: couldn't read $text_file: $!\n";
	        &util::rm($text_file) if (-e $text_file);
	        return 0;
	    }
	    if (!open($html_fh, '>', $html_file)) {
	        print STDERR "any_to_html: couldn't write $html_file: $!\n";
	        close($text_fh);
	        &util::rm($text_file) if (-e $text_file);
	        return 0;
	    }

	    print $html_fh "<html><head>\n";
	    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	    print $html_fh "</head><body>\n\n";

	    while (defined(my $line = <$text_fh>)) {
	        # Escape HTML metacharacters. '&' must be handled first so that
	        # the entities produced for '<' and '>' are not escaped again.
	        $line =~ s/&/&amp;/g;
	        $line =~ s/</&lt;/g;
	        $line =~ s/>/&gt;/g;
	        if ($line =~ m/^\s*$/) {
	            # a blank line becomes a paragraph break
	            print $html_fh "<p>";
	        } else {
	            print $html_fh "<br> ", $line;
	        }
	    }
	    print $html_fh "\n</body></html>\n";

	    close($html_fh);
	    close($text_fh);

	    &util::rm($text_file) if (-e $text_file);
	    return 1;
	}
	1176
	1177	# Convert any file to TEXT with a crude perl implementation of the
	1178	# UNIX strings command.
[2755]	1179	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1180
	sub any_to_text {
	    # Convert any file to text by keeping only its runs of printable
	    # characters (a crude version of the UNIX strings command).
	    #
	    # Parameters:
	    #   $input_filename  - path of the file to read (read in binary mode)
	    #   $output_filestem - path stem; text is written to "<stem>.text"
	    #
	    # Returns 0 straight away unless the file-level $use_strings flag is
	    # set. Otherwise returns 1 if at least one chunk of text was written,
	    # and 0 (removing "<stem>.text") if nothing usable was found or a
	    # file could not be opened.
	    my ($input_filename, $output_filestem) = @_;

	    if (!$use_strings) {
	        return 0;
	    }

	    print STDERR "\n** In any to text**\n\n";

	    my $text_file = "$output_filestem.text";

	    open(my $in_fh, '<', $input_filename) or return 0;
	    binmode($in_fh);
	    my $out_fh;
	    if (!open($out_fh, '>', $text_file)) {
	        # don't leave the input handle open on failure
	        close($in_fh);
	        return 0;
	    }

	    my $output_line_count = 0;
	    # a lexical loop variable is used so the caller's $_ is left alone
	    while (defined(my $line = <$in_fh>)) {

	        # delete anything that isn't a printable character
	        $line =~ s/[^\040-\176]+/\n/sg;

	        # delete any string less than 10 characters long
	        $line =~ s/^.{0,9}$/\n/mg;
	        while ($line =~ m/^.{1,9}$/m) {
	            $line =~ s/^.{0,9}$/\n/mg;
	            $line =~ s/\n+/\n/sg;
	        }

	        # remove extraneous whitespace
	        $line =~ s/\n+/\n/gs;
	        $line =~ s/^\n//gs;

	        # output whatever is left
	        if ($line =~ m/[^\n ]/) {
	            print $out_fh $line;
	            ++$output_line_count;
	        }
	    }

	    close($out_fh);
	    close($in_fh);

	    if ($output_line_count) { # try to protect against binary only formats
	        return 1;
	    }

	    &util::rm($text_file);
	    return 0;
	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: