Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm@ 22953

Last change on this file since 22953 was 22953, checked in by davidb, 14 years ago
Further code tweaks to correctly support Unicode aware strings in our plugin code
Property svn:keywords set to `Author Date Id Revision`
File size: 13.2 KB

Rev	Line
[1410]	1	###########################################################################
	2	#
[15872]	3	# PDFPlugin.pm -- reasonably with-it pdf plugin
[1410]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[2661]	8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
[1410]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[15872]	25	package PDFPlugin;
[1410]	26
[10353]	27	use strict;
[22702]	28	no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
[1410]	29
[22705]	30	use ReadTextFile;
	31	use unicode;
[22702]	32
[22861]	33	use AutoLoadConverters;
[22864]	34	use ConvertBinaryFile;
[1410]	35
[22861]	36	@PDFPlugin::ISA = ('ConvertBinaryFile', 'AutoLoadConverters', 'ReadTextFile');
[22705]	37
	38
# Allowed values for the -convert_to option.  Each entry pairs the option
# value with its resource-bundle description key, which always has the form
# {ConvertBinaryFile.convert_to.<value>}.
my $convert_to_list = [
    map { +{ 'name' => $_,
             'desc' => "{ConvertBinaryFile.convert_to.$_}" } }
        qw(auto html text pagedimg_jpg pagedimg_gif pagedimg_png)
];
	53
	54
	55	my $arguments =
[10889]	56	[
	57	{ 'name' => "convert_to",
[15872]	58	'desc' => "{ConvertBinaryFile.convert_to}",
[10889]	59	'type' => "enum",
	60	'reqd' => "yes",
	61	'list' => $convert_to_list,
	62	'deft' => "html" },
	63	{ 'name' => "process_exp",
[15872]	64	'desc' => "{BasePlugin.process_exp}",
[10889]	65	'type' => "regexp",
	66	'deft' => &get_default_process_exp(),
	67	'reqd' => "no" },
	68	{ 'name' => "block_exp",
[15872]	69	'desc' => "{BasePlugin.block_exp}",
[10889]	70	'type' => "regexp",
	71	'deft' => &get_default_block_exp() },
	72	{ 'name' => "metadata_fields",
[15872]	73	'desc' => "{HTMLPlugin.metadata_fields}",
[10889]	74	'type' => "string",
	75	'deft' => "" },
[21800]	76	{ 'name' => "metadata_field_separator",
	77	'desc' => "{HTMLPlugin.metadata_field_separator}",
	78	'type' => "string",
	79	'deft' => "" },
[10889]	80	{ 'name' => "noimages",
[15872]	81	'desc' => "{PDFPlugin.noimages}",
[10889]	82	'type' => "flag" },
	83	{ 'name' => "allowimagesonly",
[15872]	84	'desc' => "{PDFPlugin.allowimagesonly}",
[10889]	85	'type' => "flag" },
	86	{ 'name' => "complex",
[15872]	87	'desc' => "{PDFPlugin.complex}",
[10889]	88	'type' => "flag" },
	89	{ 'name' => "nohidden",
[15872]	90	'desc' => "{PDFPlugin.nohidden}",
[10889]	91	'type' => "flag" },
	92	{ 'name' => "zoom",
[15872]	93	'desc' => "{PDFPlugin.zoom}",
[10889]	94	'deft' => "2",
	95	'range' => "1,3", # actually the range is 0.5-3
	96	'type' => "int" },
	97	{ 'name' => "use_sections",
[15872]	98	'desc' => "{PDFPlugin.use_sections}",
[10889]	99	'type' => "flag" },
	100	{ 'name' => "description_tags",
[15872]	101	'desc' => "{HTMLPlugin.description_tags}",
[10889]	102	'type' => "flag" }
	103	];
[3540]	104
# Plugin description record registered in the "OptList" by new().
my $options = {
    'name'           => "PDFPlugin",
    'desc'           => "{PDFPlugin.desc}",
    'abstract'       => "no",
    'inherits'       => "yes",
    # Source docs in PDF can be replaced with GS-generated html
    'srcreplaceable' => "yes",
    'args'           => $arguments,
};
	111
# Constructor.
#
# Arguments: ($class, $pluginlist, $inputargs, $hashArgOptLists)
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      pattern is appended before the parent constructors run
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument table and option record
#
# Builds the object by merging an AutoLoadConverters instance (offering
# "PDFBoxConverter") with a ConvertBinaryFile instance, derives the
# conversion options from the parsed arguments, and configures the options
# handed to the secondary plugin that will read the converted output.
# Returns the blessed plugin object.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Removes "Page 1" added by pdftohtml, and a leading "1", which is often
    # the page number at the top of the page. Bad luck if your document
    # title actually starts with "1 " - is there a better way?
    my $title_sub_re = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub");
    push(@$inputargs, $title_sub_re);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method-call syntax: the indirect "new Class(...)" form is parsed
    # heuristically, and this package defines its own sub new.
    my $auto_converter_self = AutoLoadConverters->new($pluginlist, $inputargs, $hashArgOptLists, ["PDFBoxConverter"], 1);
    my $cbf_self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BasePlugin::merge_inheritance($auto_converter_self, $cbf_self);

    $self = bless $self, $class;

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return $self;
    }

    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    # (GSDLOS may be unset when run outside a configured Greenstone
    # environment, so test definedness before matching)
    if ($self->{'convert_to'} eq "text"
        && defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    push(@$specific_options, "-title_sub", $title_sub_re);
    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8 - What about pdfbox???
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");
        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options, "-metadata_fields", $self->{'metadata_fields'});
        } else {
            push(@$specific_options, "-metadata_fields", "Title,GENERATOR,date,author<Creator>");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options, "-metadata_field_separator", $self->{'metadata_field_separator'});
        }
        # use_sections relies on description tags, so it switches them on
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self->load_secondary_plugins($class, $secondary_plugin_options, $hashArgOptLists);
    return $self;
}
	203
	204	sub get_default_process_exp {
	205	my $self = shift (@_);
	206
	207	return q^(?i)\.pdf$^;
	208	}
[2661]	209
	210	# so we don't inherit HTMLPlug's block exp...
	211	sub get_default_block_exp {
	212	return "";
	213	}
[22861]	214
	215	sub init {
	216	my $self = shift (@_);
	217
	218	# ConvertBinaryFile init
	219	$self->SUPER::init(@_);
	220	$self->AutoLoadConverters::init();
	221
	222	}
	223
	224	sub begin {
	225	my $self = shift (@_);
	226
	227	$self->AutoLoadConverters::begin();
	228	$self->SUPER::begin(@_);
	229
	230	}
	231
	232	sub deinit {
	233	my $self = shift (@_);
[1410]	234
[22861]	235	$self->AutoLoadConverters::deinit();
	236	$self->SUPER::deinit(@_);
	237
	238	}
	239
	240
	241	sub tmp_area_convert_file {
	242
	243	my $self = shift (@_);
	244	return $self->AutoLoadConverters::tmp_area_convert_file(@_);
	245
	246	}
	247
# Post-process the converted file in place.
#
# Arguments: ($self, $conv_filename) - path of the converted file.
#
# Reads the file via read_file(), records the number of <a name=...> page
# anchors in $self->{'num_pages'}, and - when use_sections is set and the
# conversion produced HTML - wraps each page in Greenstone <Section>
# description-tag comments with a Title.  The (possibly rewritten) text is
# then written back to the same file via utf8_write_file().
sub convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    $self->{'num_pages'} = scalar(@pages);

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) <= 1) {
            # Only one piece (or none at all, for empty text) - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # handle first section specially for title? Or all use first 100...

        # Guard against an empty list so the substitutions below never see undef.
        my $title = (@sections) ? $sections[0] : "";
        $title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        # NOTE(review): \xc2\xa0 only matches nbsp in a raw UTF-8 byte string;
        # confirm whether read_file() now returns decoded characters.
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        # NOTE(review): this also replaces the last word of titles shorter
        # than 100 characters with "..." - confirm that is intended.
        $title =~ s/\s\S*$/.../;

        if (scalar (@sections) <= 1) { # no sections found
            # NOTE(review): a document with exactly one page anchor also lands
            # here after the shift above, and its "<a name=" prefix has been
            # consumed by split - confirm this is the desired handling.
            $top_section .= $sections[0] if (@sections);
            @sections = ();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits.
            my $section_title;
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            } else {
                # Only trust $1 when the substitution matched: after a failed
                # match it still holds the capture of an earlier, unrelated
                # match (e.g. the title_sub one above).
                print STDERR "no title: $section\n";
                $section_title = " ";
            }
            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the @sections element
        }

        $text = join('', ($top_section, @sections));
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}
[7287]	345
	346
# do plugin specific processing of doc_obj for HTML type
#
# Arguments: ($self, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli)
#
# Runs process_type() on the document, then on the top section:
#   - replaces each extracted "date" value with a "Date" value in
#     Greenstone's YYYYMMDD form (values that do not parse are dropped),
#   - records $self->{'num_pages'} as "NumPages",
#   - marks the document as "Paged" when sectioning HTML output.
# Returns the result of process_type().
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir, $file, $doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        #
        # Capture in list context: a failed match yields an empty list, so
        # stale $1..$3 from an earlier match can never be mistaken for a date.
        my ($year, $month, $day) = ($datemeta =~ /(\d+)-(\d+)-(\d+)/);
        next unless (defined($year) && defined($month) && defined($day));

        next if ($year == 0);
        $year += 1900 if ($year < 100);            # just to be safe
        $month = "0$month" if ($month =~ /^\d$/);  # single digit
        $day = "0$day" if ($day =~ /^\d$/);        # single digit

        my $date = "$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}
	386
	387	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: