Context Navigation

source: trunk/gsdl/perllib/plugins/PDFPlug.pm@ 8795

Last change on this file since 8795 was 8795, checked in by kjdon, 19 years ago
If use_sections is on, we are now a bit more relaxed about what the sections are split on. It used to be <a name=, now it's <[Aa] name=, and instead of matching \d+ for the title, we use \"?\w+\"?, because sometimes you get <a name="outline">. And since we can now have non-digit titles in the document, we explicitly set gsdlthistype to be paged; otherwise Greenstone will treat it as hierarchical if there are non-digit titles.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.1 KB

Rev	Line
[1410]	1	###########################################################################
	2	#
	3	# PDFPlug.pm -- reasonably with-it pdf plugin
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[2661]	8	# Copyright (C) 1999-2001 New Zealand Digital Library Project
[1410]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[8227]	25	use strict;
	26	no strict 'refs'; # so we can use a var for filehandles (eg STDERR)
[1410]	27
	28	package PDFPlug;
	29
	30	use ConvertToPlug;
[8218]	31	use unicode;
[1410]	32
# Establish the inheritance chain at compile time: PDFPlug is a ConvertToPlug.
BEGIN {
    @PDFPlug::ISA = qw(ConvertToPlug);
}
[1410]	36
# Description of the options PDFPlug accepts. Each entry gives the option
# name, a description key, its type and (where relevant) a default value
# and permitted range.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'noimages',
        'desc' => '{PDFPlug.noimages}',
        'type' => 'flag',
    },
    {
        'name' => 'complex',
        'desc' => '{PDFPlug.complex}',
        'type' => 'flag',
    },
    {
        'name' => 'nohidden',
        'desc' => '{PDFPlug.nohidden}',
        'type' => 'flag',
    },
    {
        'name'  => 'zoom',
        'desc'  => '{PDFPlug.zoom}',
        'deft'  => '2',
        'range' => '1,3',    # actually the range is 0.5-3
        'type'  => 'int',
    },
    {
        'name' => 'use_sections',
        'desc' => '{PDFPlug.use_sections}',
        'type' => 'flag',
    },
];

# Plugin-level description; new() pushes this onto the object's option_list
# so that the arguments above are reported alongside the inherited ones.
my $options = {
    'name'     => 'PDFPlug',
    'desc'     => '{PDFPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
	70
# Constructor.
#
# Picks the PDFPlug-specific options out of the argument list, builds the
# underlying ConvertToPlug object and records the settings that are later
# passed through to gsConvert.pl.
#
# Arguments: the class name, followed by the plugin options (-noimages,
#            -complex, -zoom N, -nohidden, -use_sections, plus any extra
#            options, which are allowed and handed to ConvertToPlug).
# Returns:   the new object, blessed into $class.
# On an option parsing failure a usage message is printed to the plugin's
# outhandle and the process exits with status 1.
sub new {
    my $class = shift (@_);

    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # copy of the options taken before they are handed to the parser; this
    # copy is what the parent constructor receives on the success path
    my @args = @_;

    if (!parsargv::parse(\@_,
                         q^noimages^, \$noimages,
                         q^complex^, \$complex,
                         q^zoom/\d+/2^, \$zoom,
                         q^nohidden^, \$nohidden,
                         q^use_sections^, \$use_sections,
                         "allow_extra_options")) {

        # Explicit Class->method call: the indirect-object form
        # ("new ConvertToPlug ...") is parsed heuristically by perl and is
        # ambiguous inside a package that defines its own new().
        my $self = ConvertToPlug->new($class, @_);
        my $outhandle = $self->{'outhandle'};
        print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        exit 1;
    }

    if ($use_sections) {
        push (@args, "-description_tags");
    }

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?

    my $self = ConvertToPlug->new($class, @args, "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?');
    $self->{'plugin_type'} = "PDFPlug";
    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden" if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}
	126
	127	sub get_default_process_exp {
	128	my $self = shift (@_);
	129
	130	return q^(?i)\.pdf$^;
	131	}
[2661]	132
	133	# so we don't inherit HTMLPlug's block exp...
	134	sub get_default_block_exp {
	135	return "";
	136	}
[1410]	137
	138
	139	# do plugin specific processing of doc_obj for HTML type
	140	sub process {
	141	my $self = shift (@_);
[7019]	142	my $outhandle=$self->{'outhandle'};
	143
[8218]	144	my $textref=$_[0];
	145
[3411]	146	if ($self->{'use_sections'}
	147	&& $self->{'converted_to'} eq "HTML") {
	148
[7019]	149	print $outhandle "PDFPlug: Calculating sections...\n";
[3411]	150
[3614]	151	# we have "<a name=1></a>" etc for each page
[8795]	152	# it may be <A name=
	153	my @sections = split('<[Aa] name=', $$textref);
[3411]	154
[7019]	155	if (scalar (@sections) == 1) { #only one section - no split!
	156	print $outhandle "PDFPlug: warning - no sections found\n";
	157	} else {
	158	shift @sections; # don't need HTML header, etc
	159	}
	160
[3411]	161	# handle first section specially for title? Or all use first 100...
	162
	163	my $title = $sections[0];
[8795]	164	$title =~ s/^\"?\w+\"?>//; # specific for pdftohtml...
[3411]	165	$title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
	166	$title =~ s/<[^>]*>/ /g;
	167	$title =~ s/(?: \|\xc2\xa0)/ /g; # utf-8 for nbsp...
	168	$title =~ s/^\s+//s;
	169	$title =~ s/\s+$//;
	170	$title =~ s/\s+/ /gs;
	171	$title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
	172	$title =~ s/^\s+//s; # in case title_sub introduced any...
	173	$title = substr ($title, 0, 100);
	174	$title =~ s/\s\S*$/.../;
	175
[7019]	176	my $top_section;
	177	if (scalar (@sections) == 1) { # no sections found
	178	$top_section=$sections[0];
	179	@sections=();
	180	} else {
	181	$top_section = "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
	182	}
[3411]	183
	184	# add metadata per section...
	185	foreach my $section (@sections) {
[8795]	186	# section names are not always just digits, may be like "outline"
	187	$section =~ s@^\"?(\w+)\"?></a>@@; # leftover from split expression...
[3614]	188
[3411]	189	$title = $1; # Greenstone does magic if sections are titled digits
	190	if (! defined($title) ) {
	191	print STDERR "no title: $section\n";
[8795]	192	$title = " "; # get rid of the undefined warning in next line
[3411]	193	}
	194	my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
	195	$newsection .= "<Metadata name=\"Title\">" . $title
	196	. "</Metadata>\n--><p>\n";
	197	$newsection .= $section;
	198	$newsection .= "<!--</Section>-->\n";
	199	$section = $newsection;
	200	}
	201
	202	$$textref=join('', ($top_section, @sections));
	203	}
	204
[8218]	205	# turn any high bytes that aren't valid utf-8 into utf-8.
	206	unicode::ensure_utf8($textref);
	207
[2979]	208	print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
[1410]	209	if $self->{'verbosity'} > 1;
[7287]	210
[8226]	211	# tell htmlplug to extract these metadata fields from the HEAD META fields
	212	$self->{'metadata_fields'} .= ",date,author<Creator>";
[7287]	213
	214	my $result = ConvertToPlug::process_type($self,"pdf",@_);
	215
	216	my $doc_obj = pop(@_);
[8226]	217
	218	# fix up the extracted date metadata to be in Greenstone date format,
	219	# and fix the capitalisation of 'date'
[8227]	220	my $cursection = $doc_obj->get_top_section();
	221
	222	foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
[7287]	223	$doc_obj->delete_metadata($cursection, "date", $datemeta);
	224
	225	# We're just interested in the date bit, not the time
[8278]	226	# some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
	227	# set a /CreationDate, and set /ModDate to 000000000. pdftohtml
	228	# extracts the ModDate, so it is 0...
	229	$datemeta =~ /(\d+)-(\d+)-(\d+)/;
	230	my ($year, $month, $day) = ($1,$2,$3);
	231	if (defined($year) && defined($month) && defined($day)) {
	232	if ($year == 0) {next}
	233	if ($year < 100) {$year += 1900} # just to be safe
	234	if ($month =~ /^\d$/) {$month="0$month"} # single digit
	235	if ($day =~ /^\d$/) {$day="0$day"} # single digit
	236	my $date="$year$month$day";
	237	$doc_obj->add_utf8_metadata($cursection, "Date", $date);
	238	}
[7287]	239	}
	240
[8139]	241	# Add NumPages metadata (we have "<a name=1>" etc for each page)
[8795]	242	my @pages = ($$textref =~ /\<[Aa] name=\"?\w+\"?>/ig);
[8139]	243	$doc_obj->add_utf8_metadata($cursection, "NumPages", scalar(@pages));
[8795]	244
	245	if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
	246	# we explicitly make it a paged document, cos greenstone won't get it
	247	# right if any section has an empty title, or one with letters in it
	248	$doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
	249	}
[7287]	250	return $result;
[1410]	251	}
	252
	253	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: