Context Navigation

source: gsdl/trunk/perllib/plugins/PDFPlugin.pm@ 18339

Last change on this file since 18339 was 18145, checked in by ak19, 16 years ago
When PDFPlugin now converts PDF pages to images it no longer uses the default screenviewsize of 500 but sets it to 1000. This didn't work previously because the convert_to pattern-match on pagedimg failed. It turned out that the pattern should have been looking for PagedImage instead.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.2 KB

Rev	Line
[1410]	1	###########################################################################
	2	#
[15872]	3	# PDFPlugin.pm -- reasonably with-it pdf plugin
[1410]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[2661]	8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
[1410]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[15872]	25	package PDFPlugin;
[1410]	26
[15872]	27	use ConvertBinaryFile;
	28	use ReadTextFile;
[8218]	29	use unicode;
[10353]	30	use strict;
	31	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
[1410]	32
# Establish inheritance at compile time: PDFPlugin derives from
# ConvertBinaryFile first and ReadTextFile second.
BEGIN {
    @PDFPlugin::ISA = qw(ConvertBinaryFile ReadTextFile);
}
[1410]	36
# Values accepted by the -convert_to option.  Every entry is a hash with a
# 'name' and a 'desc'; the description is always the resource key
# "{ConvertBinaryFile.convert_to.<name>}".
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => '{ConvertBinaryFile.convert_to.' . $_ . '}' } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
	51
	52
	53	my $arguments =
[10889]	54	[
	55	{ 'name' => "convert_to",
[15872]	56	'desc' => "{ConvertBinaryFile.convert_to}",
[10889]	57	'type' => "enum",
	58	'reqd' => "yes",
	59	'list' => $convert_to_list,
	60	'deft' => "html" },
	61	{ 'name' => "process_exp",
[15872]	62	'desc' => "{BasePlugin.process_exp}",
[10889]	63	'type' => "regexp",
	64	'deft' => &get_default_process_exp(),
	65	'reqd' => "no" },
	66	{ 'name' => "block_exp",
[15872]	67	'desc' => "{BasePlugin.block_exp}",
[10889]	68	'type' => "regexp",
	69	'deft' => &get_default_block_exp() },
	70	{ 'name' => "metadata_fields",
[15872]	71	'desc' => "{HTMLPlugin.metadata_fields}",
[10889]	72	'type' => "string",
	73	'deft' => "" },
	74	{ 'name' => "noimages",
[15872]	75	'desc' => "{PDFPlugin.noimages}",
[10889]	76	'type' => "flag" },
	77	{ 'name' => "allowimagesonly",
[15872]	78	'desc' => "{PDFPlugin.allowimagesonly}",
[10889]	79	'type' => "flag" },
	80	{ 'name' => "complex",
[15872]	81	'desc' => "{PDFPlugin.complex}",
[10889]	82	'type' => "flag" },
	83	{ 'name' => "nohidden",
[15872]	84	'desc' => "{PDFPlugin.nohidden}",
[10889]	85	'type' => "flag" },
	86	{ 'name' => "zoom",
[15872]	87	'desc' => "{PDFPlugin.zoom}",
[10889]	88	'deft' => "2",
	89	'range' => "1,3", # actually the range is 0.5-3
	90	'type' => "int" },
	91	{ 'name' => "use_sections",
[15872]	92	'desc' => "{PDFPlugin.use_sections}",
[10889]	93	'type' => "flag" },
	94	{ 'name' => "description_tags",
[15872]	95	'desc' => "{HTMLPlugin.description_tags}",
[10889]	96	'type' => "flag" }
	97	];
[3540]	98
# Plugin description record handed to the option-list machinery in new().
my $options = {
    'name'     => "PDFPlugin",
    'desc'     => "{PDFPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'     => $arguments,
};
	105
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin options; a default -title_sub
#                      is appended before the base constructor sees it
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the object blessed into $class.  When the base constructor sets
# 'info_only', the object is returned immediately without configuring the
# conversion options or loading secondary plugins.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Removes "Page 1" added by pdftohtml, and a leading "1", which is
    # often the page number at the top of the page.  Bad luck if your
    # document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub");
    push(@$inputargs, $title_sub);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self->{'filename_extension'} = "pdf";
    $self->{'file_type'}          = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex"           if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden"          if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images"     if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{'HTMLPlugin'}) {
        $secondary_plugin_options->{'HTMLPlugin'} = [];
    }
    if (!defined $secondary_plugin_options->{'TextPlugin'}) {
        $secondary_plugin_options->{'TextPlugin'} = [];
    }

    # Paged-image output: only seed PagedImagePlugin's options when the
    # caller has not supplied any.  The alternation must use an unescaped
    # '|' - an escaped bar would match a literal pipe and never fire.
    if (defined $self->{'convert_to'}
        && $self->{'convert_to'} =~ m/(?:pagedimage|pagedimg)/i) {
        if (!defined $secondary_plugin_options->{'PagedImagePlugin'}) {
            $secondary_plugin_options->{'PagedImagePlugin'} =
                [ "-title_sub", $title_sub, "-screenviewsize", "1000" ];
        }
    }

    my $html_options     = $secondary_plugin_options->{'HTMLPlugin'};
    my $text_options     = $secondary_plugin_options->{'TextPlugin'};
    # undef unless a PagedImagePlugin option list exists (see above)
    my $pagedimg_options = $secondary_plugin_options->{'PagedImagePlugin'};

    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    push(@$html_options, "-input_encoding", "utf8");
    push(@$html_options, "-extract_language") if $self->{'extract_language'};

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        push(@$html_options, "-metadata_fields", $self->{'metadata_fields'});
    }
    else {
        push(@$html_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
    }

    # use_sections implies description_tags
    if ($self->{'use_sections'} || $self->{'description_tags'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options, "-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$html_options, "-associate_tail_re", $associate_tail_re);
        push(@$text_options, "-associate_tail_re", $associate_tail_re);
        # Only meaningful when a PagedImagePlugin option list exists;
        # otherwise there is nothing for the option to be attached to.
        push(@$pagedimg_options, "-associate_tail_re", $associate_tail_re)
            if defined $pagedimg_options;
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
	196
	197	sub get_default_process_exp {
	198	my $self = shift (@_);
	199
	200	return q^(?i)\.pdf$^;
	201	}
[2661]	202
	203	# so we don't inherit HTMLPlug's block exp...
	204	sub get_default_block_exp {
	205	return "";
	206	}
[1410]	207
# Post-process the converted file in place.
#
# Arguments:
#   $conv_filename - path of the converted file; it is read as utf8,
#                    rewritten and written back to the same path.
#
# Effects:
#   * Stores the number of "<a name=...>" anchors found in the text in
#     $self->{'num_pages'} (the metadata itself is added later, in process()).
#   * When 'use_sections' is set and the file was converted to HTML, splits
#     the text at each anchor and wraps every piece in <Section> description
#     comments carrying a Title.
sub convert_post_process
{
    my $self = shift(@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    # read in file ($text will be in utf8)
    # encoding will be utf8 for html files - what about other types?
    my $text = "";
    $self->read_file($conv_filename, "utf8", "", \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1>
    # etc for each page).  This is done even if we are not breaking the
    # document into pages, as it can give an indication of document length.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page; it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar(@sections) == 1) {    # only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        }
        else {
            # keep HTML header etc as top_section
            $top_section .= shift @sections;
        }

        # Derive the top-level title from the text of the first section.
        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//;             # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g;        # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;   # entity, or utf-8 bytes, for nbsp
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s;                   # in case title_sub introduced any...

        # Cut long titles at a word boundary and mark the cut with "...".
        # Titles that already fit are left alone, so a short title does not
        # lose its last word.
        if (length($title) > 100) {
            $title = substr($title, 0, 100);
            $title =~ s/\s\S*$/.../;
        }

        if (scalar(@sections) == 1) {    # no sections found
            $top_section .= $sections[0];
            @sections = ();
        }
        else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # Section names are not always just digits, may be like "outline".
            # Greenstone does magic if sections are titled digits.
            my $section_title;

            # Strip the leftover from the split expression.  $1 is only read
            # when the substitution succeeded; after a failed match it would
            # still hold the capture of some earlier, unrelated match.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) {
                $section_title = $1;
            }
            else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection;    # aliases the element of @sections
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file(\$text, $conv_filename);
}
[7287]	300
	301
# Do plugin specific processing of doc_obj for HTML type.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
# only $base_dir, $file and $doc_obj are used here.
#
# Delegates to process_type(), then on the document's top section:
#   * replaces each extracted "date" value with a "Date" value in
#     YYYYMMDD form (values that hold no Y-M-D date are simply removed),
#   * adds "NumPages" from $self->{'num_pages'},
#   * marks the document as "Paged" when sections were generated from HTML.
#
# Returns whatever process_type() returned.
sub process {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time.
        # Only read the captures after a successful match: on failure
        # $1..$3 would still hold values from an earlier, unrelated match.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        next if $year == 0;

        $year += 1900 if $year < 100;             # just to be safe
        $month = "0$month" if $month =~ /^\d$/;   # single digit
        $day   = "0$day"   if $day   =~ /^\d$/;   # single digit

        $doc_obj->add_utf8_metadata($cursection, "Date", "$year$month$day");
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
	341
	342	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: