Context Navigation

pdftohtml.pl@ 32130

Last change on this file since 32130 was 27757, checked in by ak19, 11 years ago
Using FileUtils subroutines instead of deprecated calls to util package
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 8.4 KB

Rev	Line
[1928]	1	#!/usr/bin/perl -w
	2
	3
	4	###########################################################################
	5	#
[2715]	6	# pdftohtml.pl -- convert PDF documents to HTML format
[1928]	7	#
	8	# A component of the Greenstone digital library software
	9	# from the New Zealand Digital Library Project at the
	10	# University of Waikato, New Zealand.
	11	#
[2715]	12	# Copyright (C) 2001 New Zealand Digital Library Project
[1928]	13	#
	14	# This program is free software; you can redistribute it and/or modify
	15	# it under the terms of the GNU General Public License as published by
	16	# the Free Software Foundation; either version 2 of the License, or
	17	# (at your option) any later version.
	18	#
	19	# This program is distributed in the hope that it will be useful,
	20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	22	# GNU General Public License for more details.
	23	#
	24	# You should have received a copy of the GNU General Public License
	25	# along with this program; if not, write to the Free Software
	26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	27	#
	28	###########################################################################
	29
	30	# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
	31	# PDF documents to HTML, and converts images to PNG format for display in
	32	# the HTML pages generated
	33
	34	BEGIN {
	35	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	36	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	37	}
	38
	39	use parsargv;
	40	use util;
[27757]	41	use FileUtils;
[1928]	42	use Cwd;
	43	use File::Basename;
	44
	45	sub print_usage {
[2352]	46	# note - we don't actually ever use most of these options...
[1928]	47	print STDERR
[3720]	48	("pdftohtml.pl wrapper for pdftohtml.\n",
[2118]	49	"Usage: pdftohtml [options] <PDF-file> <html-file>\n",
[2755]	50	"Options:\n",
	51	"\t-i\tignore images (don't extract)\n",
[3720]	52	"\t-a\tallow images only (continue even if no text is present)\n",
	53	"\t-c\tproduce complex output (requires ghostscript)\n",
[4103]	54	"\t-hidden\tExtract hidden text\n",
[3720]	55	"\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n"
[1928]	56	);
[1984]	57	exit (1);
[1928]	58	}
	59
	60	sub main {
	61	my (@ARGV) = @_;
[4103]	62	my ($allow_no_text, $ignore_images, $complex, $zoom, $hidden);
[1928]	63
	64	# read command-line arguments so that
	65	# you can change the command in this script
	66	if (!parsargv::parse(\@ARGV,
[2755]	67	'a', \$allow_no_text,
[3720]	68	'i', \$ignore_images,
	69	'c', \$complex,
[4103]	70	'hidden', \$hidden,
[3720]	71	'zoom/\d+/2', \$zoom,
[1928]	72	))
	73	{
	74	print_usage();
	75	}
	76
	77	# Make sure the input file exists and can be opened for reading
[2976]	78	if (scalar(@ARGV) != 2) {
[1928]	79	print_usage();
	80	}
	81
	82	my $input_filename = $ARGV[0];
	83	my $output_filestem = $ARGV[1];
[3522]	84
[3410]	85	$output_filestem =~ s/\.html$//i; # pdftohtml adds this suffix
[1928]	86
[3522]	87	# test that the directories exist to create the output file, or
	88	# we should exit immediately. (File:: is included by util.pm)
	89	my $output_dir = File::Basename::dirname($output_filestem);
	90	if (! -d $output_dir \|\| ! -w $output_dir) {
	91	die "pdftohtml.pl: cannot write to directory $output_dir\n";
	92	}
	93
[1928]	94	my @dir = split (/(\/\|\\)/, $input_filename);
[3410]	95	my $input_basename = pop(@dir);
	96	$input_basename =~ s/\.pdf//i;
[1928]	97	my $dir = join ("", @dir);
	98
	99	if (!-r $input_filename) {
	100	print STDERR "Error: unable to open $input_filename for reading\n";
	101	exit(1);
	102	}
	103
[2575]	104	# Heuristical code removed due to pdftohtml being "fixed" to not
[2352]	105	# create bitmaps for each char in some pdfs. However, this means we
	106	# now create .html files even if we can't extract any text. We should
	107	# check for that now instead someday...
[2118]	108
	109
[1928]	110	# formulate the command
[27757]	111	my $cmd = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "pdftohtml");
[2241]	112
	113	# don't include path on windows (to avoid having to play about
	114	# with quoting when GSDLHOME might contain spaces) but assume
[2575]	115	# that the PATH is set up correctly.
[2248]	116	$cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);
[2241]	117
[2755]	118	$cmd .= " -i" if ($ignore_images);
[3720]	119	$cmd .= " -c" if ($complex);
[4103]	120	$cmd .= " -hidden" if ($hidden);
[3720]	121	$cmd .= " -zoom $zoom";
	122	$cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\"";
[2241]	123
[2289]	124	# system() returns -1 if it can't run, otherwise it's $cmds ret val.
[2655]	125	# note we return 0 if the file is "encrypted"
[2755]	126	$!=0;
[2289]	127	if (system($cmd)!=0) {
[2755]	128	print STDERR "pdftohtml error for $input_filename $!\n";
[2655]	129	# leave these for gsConvert.pl...
[27757]	130	#&FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
	131	#&FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
[2755]	132	return 1;
[1928]	133	}
	134
[2655]	135	if (! -e "$output_filestem.html") {
[2755]	136	return 1;
[2655]	137	}
	138
[2599]	139	# post-process to remove </b><b> and </i><i>, as these break up
	140	# words, screwing up indexing and searching.
[2755]	141	# At the same time, check that our .html file has some textual content.
[27757]	142	&FileUtils::moveFiles("$output_filestem.html","$output_filestem.html.tmp");
[2755]	143	$!=0;
[2599]	144	open INFILE, "$output_filestem.html.tmp" \|\|
	145	die "Couldn't open file: $!";
	146	open OUTFILE, ">$output_filestem.html" \|\|
	147	die "Couldn't open file for writing: $!";
	148	my $line;
[2755]	149	my $seen_textual_content=$allow_no_text;
[7018]	150	# check for unicode byte-order marker at the start of the file
	151	$line = <INFILE>;
	152	$line =~ s#\376\377##g;
	153	while ($line) {
[2599]	154	$line =~ s#</b><b>##g;
	155	$line =~ s#</i><i>##g;
[2715]	156	$line =~ s#\\#\\\\#g; # until macro language parsing is fixed...
[2755]	157	# check for any extracted text
	158	if ($seen_textual_content == 0) {
	159	my $tmp_line=$line;
	160	$tmp_line =~ s/<[^>]*>//g;
	161	$tmp_line =~ s/Page\s\d+//;
	162	$tmp_line =~ s/\s*//g;
	163	if ($tmp_line ne "") {
	164	$seen_textual_content=1;
	165	}
[7120]	166	# special - added to remove the filename from the title
	167	# this should be in the header, before we see "textual content"
[7643]	168	if ($line =~ m@<title>(.*?)</title>@i) {
[7120]	169	my $title=$1;
[7586]	170
[7120]	171	# is this title the name of a filename?
	172	if (-r "$title.pdf" \|\| -r "$title.html") {
	173	# remove the title
[7643]	174	$line =~ s@<title>.*?</title>@<title></title>\n<META NAME=\"filename\" CONTENT=\"$title\">@i;
[7120]	175	}
	176	}
[2755]	177	}
	178
[3410]	179	# relative hrefs to own document...
	180	$line =~ s@href=\"$input_basename\.html\#@href=\"\#@go;
[2715]	181	# escape underscores, but not if they're inside tags (eg img/href names)
	182	my $inatag = 0; # allow multi-line tags
	183	if ($line =~ /_/) {
	184	my @parts=split('_',$line);
	185	my $lastpart=pop @parts;
	186	foreach my $part (@parts) {
	187	if ($part =~ /<[^>]*$/) { # if we're starting a tag...
	188	$inatag=1;
	189	} elsif ($part =~ />[^<]*$/) { # closing a tag
	190	$inatag=0;
	191	}
	192	if ($inatag) {
	193	$part.='_';
	194	} else {
	195	$part.="_";
	196	}
	197	}
	198	$line=join('',@parts,$lastpart);
	199	}
	200
[2599]	201	print OUTFILE $line;
[7018]	202	$line = <INFILE>;
[2599]	203	}
	204	close INFILE;
	205	close OUTFILE;
[27757]	206	&FileUtils::removeFiles("$output_filestem.html.tmp");
[2599]	207
[1928]	208	# Need to convert images from PPM format to PNG format
	209	my @images;
[2118]	210
[2743]	211	my $directory=$output_filestem;
	212	$directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes...
[7120]	213	# newer versions of pdftohtml don't seem to do images this way anymore?
[3720]	214	if (open (IMAGES, "${directory}images.log") \|\|
	215	open (IMAGES, "${directory}image.log")) {
	216	while (<IMAGES>) {
	217	push (@images, $_);
	218	}
	219	close IMAGES;
[27757]	220	&FileUtils::removeFiles("${directory}image.log") if (-e "${directory}image.log");
[3720]	221
[1928]	222	}
[2346]	223
[2755]	224	# no need to go any further if there is no text extracted from pdf.
	225	if ($seen_textual_content == 0) {
	226	print STDERR "Error: PDF contains no extractable text\n";
	227	# remove images...
	228	for $image (@images) {
	229	chomp($image);
[27757]	230	&FileUtils::removeFiles("${directory}$image");
[2755]	231	}
	232	return 1;
	233	}
	234
	235
	236
[1928]	237	for $image (@images) {
	238	chomp($image);
	239	my $cmd = "";
	240	if ($ENV{'GSDLOS'} =~ /^windows/i) {
[2743]	241	$cmd = "pnmtopng \"${directory}$image\"";
[2118]	242	if (system($cmd)!=0) {
[1928]	243	print STDERR "Error executing $cmd\n";
[2755]	244	#return 1; # not sure about whether to leave this one in or take it out
[2599]	245	next;
[1928]	246	}
	247	} else {
	248	my @nameparts = split(/\./, $image);
	249	my $image_base = shift(@nameparts);
[2930]	250	$cmd = "pnmtopng \"${directory}$image\" > \"${directory}$image_base.png\" 2>/dev/null";
[2118]	251	if (system($cmd)!=0) {
[24600]	252	$cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl convert \"${directory}$image\" \"${directory}$image_base.png\" 2>/dev/null";
[2118]	253	if (system($cmd)!=0) {
[2028]	254	print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
[2755]	255	#return 1; # not sure about whether to leave this one in or take it out
[2599]	256	next;
[1928]	257	}
	258	}
	259	}
[27757]	260	&FileUtils::removeFiles($image);
[1928]	261	}
	262
[2755]	263	return 0;
[1928]	264	}
	265
# Run the conversion and report its status to the caller: 0 = success.
my $exit_status = main(@ARGV);
exit($exit_status);
	268

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/bin/script/pdftohtml.pl@ 32130

Download in other formats: