Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24371

Last change on this file since 24371 was 24371, checked in by ak19, 13 years ago
Ticket 779: the new wvware.pl script sets the environment for what wvware needs, by setting the LD_LIB_PATH to gnome-lib-minimal in the extension folder, if this exists. wvware.pl is called by gsConvert to run wvware (also checked with the replace src doc with html menu option on rightclick) and the perl script can be launched from the command prompt to do the conversion as well.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 34.9 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
[3013]	11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[3013]	34	# Currently, we can convert the following formats by using external
	35	# conversion utilities:
	36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
	37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]	38	#
[3013]	39	# We can try to convert any file to text with a perl implementation of the
	40	# UNIX strings command.
	41	#
[2032]	42	# We try to convert Postscript files to text using "gs" which is often on
[2755]	43	# *nix machines. We fall back to performing weak text extraction by using
	44	# regular expressions.
[1445]	45
	46	BEGIN {
	47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	49	}
	50
[22429]	51	use strict;
	52
[1445]	53	use parsargv;
	54	use util;
	55	use Cwd;
	56
# Are we running on WinNT or Win2000 (or later)?  When the Win32 module
# cannot be loaded the eval yields undef and the flag falls back to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() fills these in and the converters read them.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
[3350]	68
# Print the command-line synopsis to STDERR and exit with status 1.
# This routine never returns to its caller.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by the argument parser, so list it here too
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
	90
# Settings shared by all converters: the file that converter error output is
# appended to (empty string = none) and the CPU-seconds limit applied via
# ulimit (0 = no limit).
my ($faillogfile, $timeout) = ("", 0);
[1445]	93
	94	sub main
	95	{
	96	my (@ARGV) = @_;
[3538]	97	my ($input_type,$output_type,$verbose);
[1960]	98
[23473]	99	# Dynamically figure out what the --type option can support, based on whether -windows_scripting
	100	# is in use or not
	101	my $default_type_re = "(doc\|dot\|pdf\|ps\|ppt\|rtf\|xls)";
	102	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	103	#my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xlsx?)";
	104	# Currently only have VBA for Word and PPT(but no XLS)
	105	my $enhanced_type_re = "(docx?\|dot\|pdf\|ps\|pptx?\|rtf\|xls)";
	106
	107	my $type_re = $default_type_re;
	108
	109	foreach my $a (@ARGV) {
	110	if ($a =~ m/^windows_scripting$/i) {
	111	$type_re = $enhanced_type_re;
	112	}
	113	}
	114
[1445]	115	# read command-line arguments
	116	if (!parsargv::parse(\@ARGV,
[23473]	117	"type/$type_re/", \$input_type,
[2755]	118	'/errlog/.*/', \$faillogfile,
[22596]	119	'output/(auto\|html\|text\|pagedimg).*/', \$output_type,
[1692]	120	'timeout/\d+/0',\$timeout,
[10282]	121	'verbose/\d+/0', \$verbose,
[22429]	122	'windows_scripting',\$windows_scripting,
[3720]	123	'use_strings', \$use_strings,
	124	'pdf_complex', \$pdf_complex,
[9482]	125	'pdf_ignore_images', \$pdf_ignore_images,
[10451]	126	'pdf_allow_images_only', \$pdf_allow_images_only,
[4103]	127	'pdf_nohidden', \$pdf_nohidden,
[3720]	128	'pdf_zoom/\d+/2', \$pdf_zoom
	129	))
[1445]	130	{
	131	print_usage();
	132	}
[12704]	133
[1445]	134	# Make sure the input file exists and can be opened for reading
	135	if (scalar(@ARGV!=1)) {
	136	print_usage();
	137	}
[1928]	138
[1445]	139	my $input_filename = $ARGV[0];
	140	if (!-r $input_filename) {
	141	print STDERR "Error: unable to open $input_filename for reading\n";
	142	exit(1);
	143	}
	144
	145	# Deduce filenames
	146	my ($tailname,$dirname,$suffix)
[2241]	147	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
	148	my $output_filestem = &util::filename_cat($dirname, "$tailname");
[1445]	149
	150	if ($input_type eq "")
	151	{
[2241]	152	$input_type = lc (substr($suffix,1,length($suffix)-1));
[1445]	153	}
	154
	155	# Change to temporary working directory
	156	my $stored_dir = cwd();
	157	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
[10357]	158
[1445]	159	# Select convert utility
	160	if (!defined $input_type) {
	161	print STDERR "Error: No filename extension or input type defined\n";
	162	exit(1);
	163	}
[23473]	164	elsif ($input_type =~ m/^docx?$/ \|\| $input_type eq "dot") {
[1445]	165	print &convertDOC($input_filename, $output_filestem, $output_type);
	166	print "\n";
	167	}
[1684]	168	elsif ($input_type eq "rtf") {
	169	print &convertRTF($input_filename, $output_filestem, $output_type);
	170	print "\n";
	171	}
[1445]	172	elsif ($input_type eq "pdf") {
	173	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	174	print "\n";
	175	}
	176	elsif ($input_type eq "ps") {
[22429]	177	print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
[1445]	178	print "\n";
	179	}
[23473]	180	elsif ($input_type =~ m/pptx?$/) {
[2977]	181	print &convertPPT($input_filename, $output_filestem, $output_type);
	182	print "\n";
	183	}
[23473]	184	elsif ($input_type =~ m/xlsx?$/) {
[2991]	185	print &convertXLS($input_filename, $output_filestem, $output_type);
	186	print "\n";
	187	}
[1445]	188	else {
	189	print STDERR "Error: Unable to convert type '$input_type'\n";
	190	exit(1);
	191	}
	192
	193	# restore to original working directory
	194	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	195
	196	}
	197
	198	&main(@ARGV);
	199
	200
	201
[2241]	202	# Document-type conversion functions
[1445]	203	#
	204	# The following functions attempt to convert documents from their
	205	# input type to the specified output type. If no output type was
	206	# given, then they first attempt HTML, and then TEXT.
	207	#
	208	# Each returns the output type ("html" or "text") or "fail" if no
	209	# conversion is possible.
	210
	211	# Convert a Microsoft word document
	212
	213	sub convertDOC {
[22429]	214	my ($input_filename, $output_filestem, $output_type) = @_;
[1445]	215
[1654]	216	# Many .doc files are not in fact word documents!
	217	my $realtype = &find_docfile_type($input_filename);
	218
[23473]	219	if ($realtype eq "word6" \|\| $realtype eq "word7"
	220	\|\| $realtype eq "word8" \|\| $realtype eq "docx") {
[1654]	221	return &convertWord678($input_filename, $output_filestem, $output_type);
	222	} elsif ($realtype eq "rtf") {
	223	return &convertRTF($input_filename, $output_filestem, $output_type);
	224	} else {
	225	return &convertAnything($input_filename, $output_filestem, $output_type);
	226	}
	227	}
	228
	229	# Convert a Microsoft word 6/7/8 document
	230
	231	sub convertWord678 {
[22429]	232	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	233
[1445]	234	my $success = 0;
[16435]	235	if (!$output_type \|\| ($output_type =~ m/html/i)){
[10282]	236	if ($windows_scripting) {
	237	$success = &native_doc_to_html($input_filename, $output_filestem);
	238	}
	239	else {
	240	$success = &doc_to_html($input_filename, $output_filestem);
	241	}
[1445]	242	if ($success) {
[10282]	243	return "html";
[1445]	244	}
	245	}
[1654]	246	return &convertAnything($input_filename, $output_filestem, $output_type);
	247	}
	248
	249
	250	# Convert a Rich Text Format (RTF) file
	251
	252	sub convertRTF {
[22429]	253	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	254
	255	my $success = 0;
	256
	257	# Attempt specialised conversion to HTML
[16435]	258	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[12704]	259
	260	if ($windows_scripting) {
	261	$success = &native_doc_to_html($input_filename, $output_filestem);
	262	}
	263	else {
	264	$success = &rtf_to_html($input_filename, $output_filestem);
	265	}
[1654]	266	if ($success) {
	267	return "html";
	268	}
	269	}
	270
[2755]	271	# rtf is so ugly that's it's not worth running strings over.
	272	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	273	# return &convertAnything($input_filename, $output_filestem, $output_type);
	274	return "fail";
[1654]	275	}
	276
	277
	278	# Convert an unidentified file
	279
	280	sub convertAnything {
[22429]	281	my ($input_filename, $output_filestem, $output_type) = @_;
[1654]	282
	283	my $success = 0;
[10464]	284
[1445]	285	# Attempt simple conversion to HTML
[16435]	286	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	287	$success = &any_to_html($input_filename, $output_filestem);
	288	if ($success) {
	289	return "html";
	290	}
	291	}
	292
	293	# Convert to text
[16435]	294	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2241]	295	$success = &any_to_text($input_filename, $output_filestem);
[1445]	296	if ($success) {
	297	return "text";
	298	}
	299	}
	300	return "fail";
	301	}
	302
	303
[1654]	304
[1445]	305	# Convert an Adobe PDF document
	306
	307	sub convertPDF {
[2755]	308	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[1445]	309
	310	my $success = 0;
[10357]	311	$output_type =~ s/.\-(.)/$1/i;
	312	# Attempt coversion to Image
[16435]	313	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	314	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10357]	315	if ($success){
	316	return "item";
	317	}
	318	}
[1445]	319
	320	# Attempt conversion to HTML
[16435]	321	if (!$output_type \|\| ($output_type =~ m/html/i)) {
[1445]	322	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
	323	if ($success) {
	324	return "html";
	325	}
	326	}
	327
	328	# Attempt conversion to TEXT
[16435]	329	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[2117]	330	$success = &pdf_to_text($dirname, $input_filename, $output_filestem);
[1445]	331	if ($success) {
	332	return "text";
	333	}
	334	}
	335
	336	return "fail";
	337
	338	}
	339
	340
	341	# Convert an Adobe PostScript document
	342
	343	sub convertPS {
[22429]	344	my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
[1445]	345
	346	my $success = 0;
[10534]	347	$output_type =~ s/.\-(.)/$1/i;
	348	# Attempt coversion to Image
[16435]	349	if ($output_type =~ m/jp?g\|gif\|png/i) {
[17329]	350	$success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
[10534]	351	if ($success){
	352	return "item";
	353	}
	354	}
[1445]	355
	356	# Attempt conversion to TEXT
[16435]	357	if (!$output_type \|\| ($output_type =~ m/text/i)) {
[1445]	358	$success = &ps_to_text($input_filename, $output_filestem);
	359	if ($success) {
	360	return "text";
	361	}
	362	}
	363	return "fail";
	364	}
	365
	366
# Convert a PowerPoint presentation.
#   - With -windows_scripting and an output type that is neither html nor
#     text, the pptextract tool is run (returns "item").
#   - Otherwise, when HTML is acceptable, ppttohtml.pl is run (returns "html").
#   - If neither path succeeds, plain text extraction is tried ("text").
# Returns "fail" when nothing worked.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Map the requested image format onto the extractor's switch.
        my $image_flag = ($output_type =~ m/gif/i)  ? "-g"
                       : ($output_type =~ m/jp?g/i) ? "-j"
                       : ($output_type =~ m/png/i)  ? "-p"
                       :                              "";

        my $extractor = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $extractor = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $extract_cmd = $timeout ? "ulimit -t $timeout;" : "";
        $extract_cmd .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $extract_cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($extract_cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $perl_exec = util::get_perl_exec();
        my $html_cmd  = "\"$perl_exec\" -S ppttohtml.pl "
                      . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $html_cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($html_cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	430
	431
# Convert an Excel spreadsheet.  When HTML is acceptable xlstohtml.pl is
# run; if that is skipped or fails, plain text extraction is tried.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl_exec = util::get_perl_exec();
        my $html_cmd  = "\"$perl_exec\" -S xlstohtml.pl "
                      . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $html_cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($html_cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back to plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
	465
	466
	467
[1654]	468	# Find the real type of a .doc file
	469	#
[2012]	470	# We seem to have a lot of files with a .doc extension that are .rtf
[1654]	471	# files or Word 5 files. This function attempts to tell the difference.
	472	sub find_docfile_type {
[22429]	473	my ($input_filename) = @_;
[23473]	474
	475	if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
	476	return "docx";
	477	}
	478
[1654]	479	open(CHK, "<$input_filename");
[1734]	480	binmode(CHK);
[1654]	481	my $line = "";
	482	my $first = 1;
	483
	484	while (<CHK>) {
	485
	486	$line = $_;
[1960]	487
[1654]	488	if ($first) {
	489	# check to see if this is an rtf file
[16435]	490	if ($line =~ m/^\{\\rtf/) {
[1654]	491	close(CHK);
	492	return "rtf";
	493	}
[2755]	494	$first = 0;
[1654]	495	}
	496
[1734]	497	# is this is a word 6/7/8 document?
[16435]	498	if ($line =~ m/Word\.Document\.([678])/) {
[1654]	499	close(CHK);
[23473]	500
[1734]	501	return "word$1";
[1654]	502	}
	503
	504	}
	505
	506	return "unknown";
	507	}
	508
	509
[1734]	510	# Specific type-to-type conversions
[1445]	511	#
	512	# Each of the following functions attempts to convert a document from
[2755]	513	# a specific format to another. If they succeed they return 1 and leave
[1445]	514	# the output document(s) in the appropriate place; if they fail they
	515	# return 0 and delete any working files.
	516
	517
	518	# Attempt to convert a word document to html with the wv program
	519	sub doc_to_html {
[22429]	520	my ($input_filename, $output_filestem) = @_;
[1445]	521
[24371]	522	my $wvware_status = 0;
[1928]	523
[24371]	524	# need to ensure that the path to perl is quoted (in case there's spaces in it)
	525	my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl $input_filename $output_filestem $faillogfile $timeout";
[15120]	526
[24371]	527	# print STDERR "***** wvware launch cmd = $launch_cmd\n";
[15120]	528
[24371]	529	$wvware_status = system($launch_cmd)/256;
	530	return $wvware_status;
[1445]	531	}
	532
[10282]	533	# Attempt to convert a word document to html with the word2html scripting program
	534	sub native_doc_to_html {
[22429]	535	my ($input_filename, $output_filestem) = @_;
[1445]	536
[24166]	537	# build up the path to the doc-to-html conversion tool we're going to use
	538	my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});
[10282]	539
[24164]	540	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
[24166]	541	# if windows scripting with docx input, use new VBscript to get the local Word install (if
	542	# any) to do the conversion, since docX can't be processed by word2html's windows_scripting
	543
	544	if($input_filename =~ m/docx$/i) { # need to use full path to docx2html script,
	545	# else script launch fails when there are error msgs
	546	$vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
	547	$vbScript = "CScript //Nologo \"$vbScript\""; # launche with CScript for error output in STDERR
[24169]	548	# //Nologo flag avoids Microsoft's opening/logo msgs
	549	print STDERR "About to use windows scripting to process docx file $input_filename.\n";
	550	print STDERR " This may take some time. Please wait...\n";
[24166]	551	}
	552	else { # old doc versions. use the usual VB executable word2html for the
	553	# conversion. Doesn't need full path, since bin\windows is on PATH
	554	$vbScript = "word2html"; #$vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
	555	}
	556	}
	557	else { # not windows
	558	$vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
[24164]	559	}
	560
[10445]	561	if (-e "$output_filestem.html") {
[22429]	562	print STDERR " The conversion file:\n";
	563	print STDERR " $output_filestem.html\n";
	564	print STDERR " ... already exists. Skipping\n";
[10445]	565	return 1;
	566	}
[10282]	567
	568	my $cmd = "";
	569	if ($timeout) {$cmd = "ulimit -t $timeout;";}
	570	#$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
[10445]	571	#$cmd .= "$vbScript $input_filename $output_filestem.html";
[10521]	572	$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
[10445]	573
[10282]	574	# redirecting STDERR
[24166]	575
	576	$cmd .= " 2> \"$output_filestem.err\""
	577	if ($ENV {'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
	578	#print STDERR "@@@@@@@@@ cmd=$cmd\n";
	579
[10282]	580	# execute the command
	581	$!=0;
	582	if (system($cmd)!=0)
	583	{
[24164]	584	print STDERR "Error executing $vbScript converter:$!\n";
[10282]	585	if (-s "$output_filestem.err") {
	586	open (ERRFILE, "<$output_filestem.err");
[24166]	587
[10282]	588	my $write_to_fail_log=0;
	589	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	590	{$write_to_fail_log=1;}
	591
	592	my $line;
	593	while ($line=<ERRFILE>) {
[16435]	594	if ($line =~ m/\w/) {
[10282]	595	print STDERR "$line";
	596	print FAILLOG "$line" if ($write_to_fail_log);
	597	}
	598	if ($line !~ m/startup error/) {next;}
	599	print STDERR " (given an invalid .DOC file?)\n";
	600	print FAILLOG " (given an invalid .DOC file?)\n"
	601	if ($write_to_fail_log);
	602
	603	} # while ERRFILE
	604	close FAILLOG if ($write_to_fail_log);
	605	}
	606	return 0; # we can try any_to_text
	607	}
	608
	609	# Was the conversion successful?
	610	if (-s "$output_filestem.html") {
	611	open(TMP, "$output_filestem.html");
[22429]	612	my $line = <TMP>;
[10282]	613	close(TMP);
[22429]	614	if ($line && $line =~ m/html/i) {
[10282]	615	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
	616	return 1;
	617	}
	618	}
	619
	620	# If here, an error of some sort occurred
	621	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
	622	if (-e "$output_filestem.err") {
	623	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	624	open (ERRLOG,"$output_filestem.err");
	625	while (<ERRLOG>) {print FAILLOG $_;}
	626	close FAILLOG;
	627	close ERRLOG;
	628	}
	629	&util::rm("$output_filestem.err");
	630	}
	631	return 0;
	632	}
	633
[1654]	634	# Attempt to convert an RTF document to html with rtftohtml
	635	sub rtf_to_html {
[2241]	636	my ($input_filename, $output_filestem) = @_;
[1654]	637
	638	# formulate the command
[22429]	639	my $cmd = "";
[1692]	640	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[2574]	641	$cmd .= "rtftohtml";
[10282]	642	#$cmd .= "rtf-converter";
[1654]	643
[3246]	644	$cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
[2574]	645
	646	$cmd .= " 2>\"$output_filestem.err\""
[16435]	647	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000);
[2574]	648
	649
[1654]	650	# execute the command
[2755]	651	$!=0;
[2060]	652	if (system($cmd)!=0)
[1654]	653	{
[2755]	654	print STDERR "Error executing rtf converter $!\n";
[2656]	655	# don't currently bother printing out error log...
	656	# keep going, in case it still created an HTML file...
[1654]	657	}
	658
	659	# Was the conversion successful?
[2755]	660	my $was_successful=0;
[2656]	661	if (-s "$output_filestem.html") {
[2755]	662	# make sure we have some content other than header
	663	open (HTML, "$output_filestem.html"); # what to do if fail?
	664	my $line;
	665	my $past_header=0;
	666	while ($line=<HTML>) {
	667
	668	if ($past_header == 0) {
[16435]	669	if ($line =~ m/<body>/) {$past_header=1;}
[2755]	670	next;
	671	}
	672
	673	$line =~ s/<[^>]+>//g;
[16435]	674	if ($line =~ m/\w/ && $past_header) { # we found some content...
[2755]	675	$was_successful=1;
	676	last;
	677	}
	678	}
	679	close HTML;
[1654]	680	}
[2574]	681
[2755]	682	if ($was_successful) {
	683	&util::rm("$output_filestem.err")
	684	if (-e "$output_filestem.err");
	685	# insert the (modified) table of contents, if it exists.
	686	if (-e "${output_filestem}_ToC.html") {
	687	&util::mv("$output_filestem.html","$output_filestem.src");
	688	my $open_failed=0;
	689	open HTMLSRC, "$output_filestem.src" \|\| ++$open_failed;
	690	open TOC, "${output_filestem}_ToC.html" \|\| ++$open_failed;
	691	open HTML, ">$output_filestem.html" \|\| ++$open_failed;
	692
	693	if ($open_failed) {
	694	close HTMLSRC;
	695	close TOC;
	696	close HTML;
	697	&util::mv("$output_filestem.src","$output_filestem.html");
	698	return 1;
	699	}
	700
	701	# print out header info from src html.
[16435]	702	while (defined($_ = <HTMLSRC>) && $_ =~ m/\w/) {
[2755]	703	print HTML "$_";
	704	}
	705
	706	# print out table of contents, making links relative
	707	<TOC>; <TOC>; # ignore first 2 lines
	708	print HTML scalar(<TOC>); # line 3 = "<ol>\n"
	709	my $line;
	710	while ($line=<TOC>) {
[22429]	711	$line =~ s@</body></html>$@@i ; # only last line has this
[2755]	712	# make link relative
[22429]	713	$line =~ s@href=\"[^\#]+@href=\"@i;
[2755]	714	print HTML $line;
	715	}
	716	close TOC;
	717
	718	# rest of html src
	719	while (<HTMLSRC>) {
	720	print HTML $_;
	721	}
	722	close HTMLSRC;
	723	close HTML;
	724
	725	&util::rm("${output_filestem}_ToC.html");
	726	&util::rm("${output_filestem}.src");
	727	}
	728	# we don't yet do anything with footnotes ($output_filestem_fn.html) :(
	729	return 1; # success
	730	}
	731
	732	if (-e "$output_filestem.err") {
	733	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	734	{
	735	print FAILLOG "Error - rtftohtml - couldn't extract text\n";
[10282]	736	#print FAILLOG "Error - rtf-converter - couldn't extract text\n";
[2755]	737	print FAILLOG " (rtf file might be too recent):\n";
	738	open (ERRLOG, "$output_filestem.err");
	739	while (<ERRLOG>) {print FAILLOG $_;}
	740	close ERRLOG;
	741	close FAILLOG;
	742	}
	743	&util::rm("$output_filestem.err");
	744	}
	745
[2656]	746	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
	747
[1654]	748	return 0;
	749	}
	750
	751
[1445]	752	# Convert a pdf file to html with the pdftohtml command
	753
	754	sub pdf_to_html {
[2755]	755	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	756
[22429]	757	my $cmd = "";
[1692]	758	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[24362]	759	my $full_perl_path = &util::get_perl_exec();
[24124]	760	$cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
[3720]	761	$cmd .= " -c" if ($pdf_complex);
	762	$cmd .= " -i" if ($pdf_ignore_images);
[10451]	763	$cmd .= " -a" if ($pdf_allow_images_only);
[4103]	764	$cmd .= " -hidden" unless ($pdf_nohidden);
[1928]	765	$cmd .= " \"$input_filename\" \"$output_filestem\"";
[2755]	766
[16435]	767	if ($ENV{'GSDLOS'} !~ m/^windows$/i \|\| $is_winnt_2000) {
[2755]	768	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	769	} else {
	770	$cmd .= " > \"$output_filestem.err\"";
	771	}
	772
[2117]	773	$!=0;
[2241]	774
[2656]	775	my $retval=system($cmd);
	776	if ($retval!=0)
[1445]	777	{
[2755]	778	print STDERR "Error executing pdftohtml.pl";
[2117]	779	if ($!) {print STDERR ": $!";}
	780	print STDERR "\n";
[1445]	781	}
	782
[1692]	783	# make sure the converter made something
[2656]	784	if ($retval!=0 \|\| ! -s "$output_filestem.html")
[1692]	785	{
	786	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
[2656]	787	# print out the converter's std err, if any
	788	if (-s "$output_filestem.err") {
[1692]	789	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	790	print STDERR "pdftohtml error log:\n";
[1692]	791	while (<ERRLOG>) {
	792	print STDERR "$_";
	793	}
	794	close ERRLOG;
	795	}
[22513]	796	print STDERR "***********output filestem $output_filestem.html\n";
[2656]	797	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[2755]	798	if (-e "$output_filestem.err") {
	799	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	800	{
	801	open (ERRLOG, "$output_filestem.err");
	802	while (<ERRLOG>) {print FAILLOG $_;}
	803	close ERRLOG;
	804	close FAILLOG;
	805	}
[10282]	806	&util::rm("$output_filestem.err");
[2755]	807	}
[1692]	808	return 0;
	809	}
[10357]	810
	811	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	812	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	813	return 1;
	814	}
	815
	816	# Convert a pdf file to various types of image with the convert command
	817
# Convert a PDF or PostScript file to page images by running pdfpstoimg.pl.
#
# Returns 0 when ImageMagick's "identify" cannot be run or when
# pdfpstoimg.pl exits with a non-zero status (its error output is echoed to
# STDERR and appended to the fail log when one is set); returns 1 otherwise.
# The .out and .err working files are removed in both cases.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $err_file = "$output_filestem.err";
    my $out_file = "$output_filestem.out";

    # Check that ImageMagick is installed and available on the path (except
    # for Windows 95/98).  The flag computed at start-up is used instead of
    # calling Win32::IsWinNT() directly, so nothing here depends on the
    # Win32 module having loaded.
    if (!($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000)) {
        my $probe_output = `identify 2>&1`;
        # Linux and Windows return different values for "program not found"
        if ($? == -1 || $? == 256) {
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    # Collapse an "x_y" sequence in the type to "y"
    $output_type =~ s/.\_(.)/$1/i;
    my $full_perl_path = util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);

    if ($retval == 0) {
        util::rm($err_file) if (-e $err_file);
        util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # Failure path.  The message names the script that is actually run.
    print STDERR "Error executing pdfpstoimg.pl";
    if ($!) { print STDERR ": $!"; }
    print STDERR "\n";

    util::rm($out_file) if (-e $out_file);

    # Read the converter's std err, if any.  An unreadable log is reported
    # rather than aborting the whole script.
    my @err_lines;
    if (-s $err_file) {
        if (open(my $errlog, '<', $err_file)) {
            @err_lines = <$errlog>;
            close($errlog);
        } else {
            print STDERR "Unable to read $err_file: $!\n";
        }
    }
    if (@err_lines) {
        print STDERR "pdfpstoimg error log:\n";
        print STDERR @err_lines;
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} @err_lines;
            close($faillog);
        }
        util::rm($err_file);
    }
    return 0;
}
	885
	886	# Convert a PDF file to text with the pdftotext command
	887
	888	sub pdf_to_text {
[2755]	889	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	890
[2248]	891	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	892
[16435]	893	if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
[2755]	894	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	895	} else {
	896	$cmd .= " > \"$output_filestem.err\"";
	897	}
[1445]	898
[2060]	899	if (system($cmd)!=0)
[1445]	900	{
	901	print STDERR "Error executing $cmd: $!\n";
	902	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	903	}
	904
[2755]	905	# make sure there is some extracted text.
	906	if (-e "$output_filestem.text") {
	907	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	908	binmode(EXTR_TEXT); # just in case...
	909	my $line="";
	910	my $seen_text=0;
	911	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
[16435]	912	if ($line=~ m/\w/) {$seen_text=1;}
[2755]	913	}
	914	close EXTR_TEXT;
	915	if ($seen_text==0) { # no text was extracted
	916	print STDERR "Error: pdftotext found no text\n";
	917	&util::rm("$output_filestem.text");
	918	}
	919	}
	920
[1692]	921	# make sure the converter made something
[2656]	922	if (! -s "$output_filestem.text")
[1692]	923	{
	924	# print out the converters std err, if any
[2656]	925	if (-s "$output_filestem.err") {
[1692]	926	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	927	print STDERR "pdftotext error log:\n";
[1692]	928	while (<ERRLOG>) {
	929	print STDERR "$_";
	930	}
	931	close ERRLOG;
	932	}
[2656]	933	# does this converter create a .out file?
	934	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	935	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	936	if (-e "$output_filestem.err") {
	937	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	938	{
	939	open (ERRLOG,"$output_filestem.err");
	940	while (<ERRLOG>) {print FAILLOG $_;}
	941	close ERRLOG;
	942	close FAILLOG;
	943	}
	944	&util::rm("$output_filestem.err");
	945	}
[1692]	946	return 0;
	947	}
[1445]	948	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	949	return 1;
	950	}
	951
# Convert a PostScript document to text.
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension.  The text is written to
#                      "$output_filestem.text"; "$output_filestem.err" and
#                      "$output_filestem.text.tmp" are scratch files.
#
# Reads the file-level variables $timeout (when true, used as a CPU limit in
# seconds via "ulimit -t") and $faillogfile (when non-empty, error details
# are appended to that file).
#
# Strategy: run gs (never attempted on Windows).  If that fails, fall back to
# a crude regexp-based extraction of the strings in the PostScript source.
# Whatever text results is then re-wrapped at roughly 72 columns.
#
# Returns 1 on success.  Returns 0 if the fallback cannot read the input,
# cannot write the output, or the input does not start with "%!".  Dies if
# the files needed for the final wrapping step cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) { $cmd = "ulimit -t $timeout; "; }
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other paths so that a filestem with spaces works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # system() returns the raw wait status ($?), see man perlfunc.
        # Any non-zero status counts as failure: a non-zero exit code, death
        # by signal (e.g. the ulimit CPU limit), or -1 when the program could
        # not be started at all, in which case $! holds the reason.
        my $status = system($cmd);

        if ($status != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;   # undef when the file is empty
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $psout;
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if ((-e $err_file) && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        # Slurp the whole .ps file (see man perlport, under "System
        # Resources").  The output file is only created once we know there
        # is something to write, so a failure leaves no empty .text file.
        my $text = "";
        if (open(my $in, '<', $input_filename)) {
            $text = join('', <$in>);
            close $in;
        } else {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it
        # (up to and including the first %%Page: line).
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # "(|)" -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        # NOTE(review): the two replacement characters below are kept exactly
        # as found; confirm they are the intended output code points.
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$out} $text;
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    # "or" rather than "||": with "||" the die attached to the filename
    # string and could never fire.
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Cut after the first word that crosses $wrap_length and drop the
            # whitespace that follows it.  If the pattern cannot match, emit
            # the remainder unchanged so the loop always terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}\S*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} $line;
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
	1121
	1122
	1123	# Convert any file to HTML with a crude perl implementation of the
	1124	# UNIX strings command.
	1125
	1126	sub any_to_html {
[22429]	1127	my ($input_filename, $output_filestem) = @_;
[1445]	1128
	1129	# First generate a text file
	1130	return 0 unless (&any_to_text($input_filename, $output_filestem));
	1131
	1132	# create an HTML file from the text file
	1133	open(TEXT, "<$output_filestem.text");
	1134	open(HTML, ">$output_filestem.html");
	1135
[2241]	1136	print HTML "<html><head>\n";
	1137	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	1138	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	1139	print HTML "</head><body>\n\n";
[1734]	1140
[2755]	1141	my $line;
	1142	while ($line=<TEXT>) {
	1143	$line =~ s/</</g;
	1144	$line =~ s/>/>/g;
[16435]	1145	if ($line =~ m/^\s*$/) {
[2755]	1146	print HTML "<p>";
	1147	} else {
	1148	print HTML "<br> ", $line;
	1149	}
[1445]	1150	}
[1734]	1151	print HTML "\n</body></html>\n";
[1445]	1152
[2241]	1153	close HTML;
	1154	close TEXT;
	1155
[1445]	1156	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	1157	return 1;
	1158	}
	1159
	1160	# Convert any file to TEXT with a crude perl implementation of the
	1161	# UNIX strings command.
[2755]	1162	# Note - this assumes ascii charsets :( (jrm21)
[1445]	1163
	1164	sub any_to_text {
[22429]	1165	my ($input_filename, $output_filestem) = @_;
[1445]	1166
[3350]	1167	if (!$use_strings) {
	1168	return 0;
	1169	}
[15120]	1170
	1171	print STDERR "\n** In any to text**\n\n";
[2755]	1172	open(IN, "<$input_filename") \|\| return 0;
[1734]	1173	binmode(IN);
[2755]	1174	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	1175
	1176	my ($line);
[2755]	1177	my $output_line_count = 0;
[1445]	1178	while (<IN>) {
	1179	$line = $_;
[1734]	1180
[1445]	1181	# delete anything that isn't a printable character
	1182	$line =~ s/[^\040-\176]+/\n/sg;
	1183
	1184	# delete any string less than 10 characters long
[1734]	1185	$line =~ s/^.{0,9}$/\n/mg;
[16435]	1186	while ($line =~ m/^.{1,9}$/m) {
[1734]	1187	$line =~ s/^.{0,9}$/\n/mg;
[1445]	1188	$line =~ s/\n+/\n/sg;
	1189	}
	1190
	1191	# remove extraneous whitespace
	1192	$line =~ s/\n+/\n/gs;
	1193	$line =~ s/^\n//gs;
[1578]	1194
[1445]	1195	# output whatever is left
[16435]	1196	if ($line =~ m/[^\n ]/) {
[1445]	1197	print OUT $line;
[2755]	1198	++$output_line_count;
[1445]	1199	}
	1200	}
[2241]	1201
	1202	close OUT;
	1203	close IN;
	1204
[2755]	1205	if ($output_line_count) { # try to protect against binary only formats
	1206	return 1;
	1207	}
	1208
	1209	&util::rm("$output_filestem.text");
	1210	return 0;
	1211
[1445]	1212	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: