Context Navigation

source: trunk/gsdl/perllib/plugins/PagedImgPlug.pm@ 10274

Last change on this file since 10274 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 30.1 KB

Rev	Line
[6555]	1	###########################################################################
	2	#
	3	# PagedImgPlug.pm -- plugin for sets of images and OCR text that
	4	# make up a document
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	# PagedImgPlug
	28	# processes sequences of images, with optional OCR text
	29	#
[7352]	30	# This plugin takes *.item files, which contain metadata and lists of image
[6555]	31	# files, and produces a document containing sections, one for each page.
[7352]	32	# The files should be named something.item, then you can have more than one
	33	# book in a directory. You will need to create these files, one for each
	34	# document/book.
[6555]	35	#
[10168]	36	#There are two formats for the item files: a plain text format, and an xml
	37	#format. You can use either format, and can have both formats in the same
	38	#collection if you like. If you use the plain format, you must not start the
	39	#file off with <PagedDocument>
	40
	41	#### PLAIN FORMAT
[7352]	42	# The format of the xxx.item file is as follows:
[6555]	43	# The first lines contain any metadata for the whole document
	44	# <metadata-name>metadata-value
	45	# eg.
	46	# <Title>Snail farming
	47	# <Date>19230102
	48	# Then comes a list of pages, one page per line, each line has the format
[7352]	49	#
[6555]	50	# pagenum:imagefile:textfile:r
[7352]	51	#
[6555]	52	# page num and imagefile are required. pagenum is used for the Title
	53	# of the section, and in the display is shown as page <pagenum>.
	54	# imagefile is the image for the page. textfile is an optional text
	55	# file containing the OCR (or any) text for the page - this gets added
	56	# as the text for the section. r is optional, and signals that the image
	57	# should be rotated 180deg. Eg use this if the image has been made upside down.
	58	# So an example item file looks like:
	59	# <Title>Snail farming
	60	# <Date>19960403
	61	# 1:p1.gif:p1.txt:
	62	# 2:p2.gif::
	63	# 3:p3.gif:p3.txt:
	64	# 3b:p3b.gif:p3b.txt:r
	65	# The second page has no text, the fourth page is a back page, and
	66	# should be rotated.
	67	#
[10168]	68
	69	#### XML FORMAT
	70	# The xml format looks like the following
	71	#<PagedDocument>
	72	#<Metadata name="Title">The Title of the entire document</Metadata>
	73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
	74	#<Metadata name="Title">The Title of this page</Metadata>
	75	#</Page>
	76	#... more pages
	77	#</PagedDocument>
	78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
	79	#that is not inside another tag will belong to the document.
	80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
	81	#These are both optional - if neither is used, the section will have no content.
	82	#Pages can also have metadata associated with them.
	83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
	84	#For example
	85	#<PagedDocument>
	86	#<PageGroup>
	87	#<Page>
	88	#<Page>
	89	#</PageGroup>
	90	#<Page>
	91	#</PagedDocument>
	92	#would generate a structure like
	93	#X
	94	#--X
	95	# --X
	96	# --X
	97	#--X
	98	#PageGroup tags can also have imgfile/txtfile attributes if you like - this way they get some content themselves.
	99
	100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
	101	#There is still a bit of work to do on this format:
	102	#* enable other text file types, eg html, pdf etc
	103	#* make the document paging work properly
	104	#* add pagenum as Title unless a Title is present?
	105
[6555]	106	# All the supplementary image and text files should be in the same folder as
	107	# the .item file.
	108	#
	109	# To display the images instead of the document text, you can use [srcicon]
[7106]	110	# in the DocumentText format statement.
	111	# For example,
[6555]	112	#
[7106]	113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
[7352]	114	#
	115	# To have it create thumbnail size images, use the '-thumbnail' option.
	116	# To have it create medium size images for display, use the '-screenview'
	117	# option. As usual, running
	118	# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
[7106]	119
[7352]	120	# If you want the resulting documents to be presented with a table of
	121	# contents, use '-documenttype hierarchy', otherwise they will have
	122	# next and previous arrows, and a goto page X box.
	123
[7106]	124	# If you have used -screenview, you can also use [screenicon] in the format
	125	# statement to display the smaller image. Here is an example that switches
	126	# between the two:
	127	#
[10153]	128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
[6555]	129	#
	130	# Additional metadata can be added into the .item files, alternatively you can
[7352]	131	# use normal metadata.xml files, with the name of the xxx.item file as the
[10168]	132	# FileName (only for document level metadata).
[6555]	133
	134	package PagedImgPlug;
	135
[10168]	136	use XMLPlug;
[10254]	137	use strict;
	138	no strict 'refs'; # allow filehandles to be variables and viceversa
[6555]	139
	140	sub BEGIN {
[10218]	141	@PagedImgPlug::ISA = ('XMLPlug');
[6555]	142	}
	143
# Allowed values for the -documenttype option.
my $type_list = [
    {
        'name' => "paged",
        'desc' => "{PagedImgPlug.documenttype.paged}",
    },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImgPlug.documenttype.hierarchy}",
    },
];

# Command-line arguments understood by this plugin.  The 'desc' values are
# keys of the form {Plugin.option}, not literal descriptions.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "string",
        'deft' => get_default_block_exp(),
        'reqd' => "no",
    },
    {
        'name' => "noscaleup",
        'desc' => "{ImagePlug.noscaleup}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "thumbnail",
        'desc' => "{PagedImgPlug.thumbnail}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name'  => "thumbnailsize",
        'desc'  => "{ImagePlug.thumbnailsize}",
        'type'  => "int",
        'deft'  => "100",
        'range' => "1,",
        'reqd'  => "no",
    },
    {
        'name' => "thumbnailtype",
        'desc' => "{ImagePlug.thumbnailtype}",
        'type' => "string",
        'deft' => "gif",
        'reqd' => "no",
    },
    {
        'name' => "screenview",
        'desc' => "{PagedImgPlug.screenview}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name'  => "screenviewsize",
        'desc'  => "{PagedImgPlug.screenviewsize}",
        'type'  => "int",
        'deft'  => "500",
        'range' => "1,",
        'reqd'  => "no",
    },
    {
        'name' => "screenviewtype",
        'desc' => "{PagedImgPlug.screenviewtype}",
        'type' => "string",
        'deft' => "jpg",
        'reqd' => "no",
    },
    {
        'name' => "converttotype",
        'desc' => "{ImagePlug.converttotype}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
    },
    {
        'name'  => "minimumsize",
        'desc'  => "{ImagePlug.minimumsize}",
        'type'  => "int",
        'deft'  => "100",
        'range' => "1,",
        'reqd'  => "no",
    },
    {
        'name' => "headerpage",
        'desc' => "{PagedImgPlug.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "documenttype",
        'desc' => "{PagedImgPlug.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no",
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "PagedImgPlug",
    'desc'     => "{PagedImgPlug.desc}",
    'inherits' => "yes",
    'args'     => $arguments,
};
	222
	223	sub new {
[10218]	224	my ($class) = shift (@_);
	225	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	226	push(@$pluginlist, $class);
[6555]	227
[10218]	228	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	229	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
[6555]	230
[10218]	231	my $self = (defined $hashArgOptLists)? new XMLPlug($pluginlist,$inputargs,$hashArgOptLists): new XMLPlug($pluginlist,$inputargs);
[6555]	232
	233	return bless $self, $class;
	234	}
	235
	236	sub get_default_process_exp {
	237	my $self = shift (@_);
	238
	239	return q^\.item$^;
	240	}
	241
	242	# want to block everything except the .item ones
	243	# but instead we will block images and txt files
	244	sub get_default_block_exp {
	245	my $self = shift (@_);
	246
	247	return q^(?i)(\.jpe?g\|\.gif\|\.png\|\.tif?f\|\.te?xt\|~)$^
	248	}
	249	# Create the thumbnail and screenview images, and discover the Image's
	250	# size, width, and height using the convert utility.
	251	sub process_image {
	252	my $self = shift (@_);
	253	my $filename = shift (@_); # filename with full path
	254	my $srcfile = shift (@_); # filename without path
	255	my $doc_obj = shift (@_);
	256	my $section = shift (@_); #the current section
	257	my $rotate = shift (@_); # whether to rotate the image or not
	258
	259	my $top=0;
	260	if ($section eq $doc_obj->get_top_section()) {
	261	$top=1;
	262	}
	263	my $verbosity = $self->{'verbosity'};
	264	my $outhandle = $self->{'outhandle'};
	265
	266	# check the filename is okay
	267	return 0 if ($srcfile eq "" \|\| $filename eq "");
	268
	269	my $minimumsize = $self->{'minimumsize'};
	270	if (defined $minimumsize && (-s $filename < $minimumsize)) {
	271	print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
	272	if ($verbosity > 1);
	273	}
	274
	275	# Convert the image to a new type (if required), and rotate if required.
	276	my $converttotype = $self->{'converttotype'};
	277	my $originalfilename = ""; # only set if we do a conversion
	278	my $type = "unknown";
	279	my $converted = 0;
	280	my $rotated=0;
	281	if ($converttotype ne "" && $filename !~ /$converttotype$/) {
	282	$converted=1;
	283	$originalfilename = $filename;
	284	my $filehead = &util::get_tmp_filename();
	285	$filename = $filehead . ".$converttotype";
[10254]	286	my $n = 1;
[6555]	287	while (-e $filename) {
	288	$filename = "$filehead$n\.$converttotype";
	289	$n++;
	290	}
	291	$self->{'tmp_filename1'} = $filename;
	292
	293	my $rotate_option = "";
	294	if ($rotate eq "r") {
	295	$rotate_option = "-rotate 180 ";
	296	}
	297
	298	my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
	299	print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
	300	my $result = '';
	301	$result = `$command`;
	302	print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);
	303
	304	$type = $converttotype;
	305	} elsif ($rotate eq "r") {
	306	$rotated=1;
	307	$originalfilename = $filename;
	308	$filename = &util::get_tmp_filename();
	309
	310	my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
	311	print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
	312	my $result = '';
	313	$result = `$command`;
	314	print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
	315
	316	}
	317
	318
	319	# Add the image metadata
	320	my $file; # the new file name
[8117]	321	my $id = $srcfile;
	322	$id =~ s/\.([^\.]*)$//; # the new file name without an extension
[6555]	323	if ($converted) {
	324	# we have converted the image
	325	# add on the new extension
	326	$file .= "$id.$converttotype";
	327	} else {
	328	$file = $srcfile;
	329	}
	330
	331	my $url =$file; # the new file name prepared for a url
	332	my $srcurl = $srcfile;
	333	$url =~ s/ /%20/g;
	334	$srcurl =~ s/ /%20/g;
	335
	336	$doc_obj->add_metadata ($section, "Image", $url);
	337
	338	# Also want to set filename as 'Source' metadata to be
	339	# consistent with other plugins
	340	$doc_obj->add_metadata ($section, "Source", $srcurl);
	341
	342	my ($image_type, $image_width, $image_height, $image_size)
	343	= &identify($filename, $outhandle, $verbosity);
	344
	345	$doc_obj->add_metadata ($section, "ImageType", $image_type);
	346	$doc_obj->add_metadata ($section, "ImageWidth", $image_width);
	347	$doc_obj->add_metadata ($section, "ImageHeight", $image_height);
	348	$doc_obj->add_metadata ($section, "ImageSize", $image_size);
[8121]	349	$doc_obj->add_metadata ($section, "FileFormat", "PagedImg");
[6555]	350
	351	if ($type eq "unknown" && $image_type) {
	352	$type = $image_type;
	353	}
	354
	355	if ($top) {
	356	$doc_obj->add_metadata ($section, "srclink",
[8365]	357	"<a href=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");
	358	$doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");
[6555]	359
	360	} else {
	361	$doc_obj->add_metadata ($section, "srclink",
[10168]	362	"<a href=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");
	363	$doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");
[6555]	364
	365	}
	366	$doc_obj->add_metadata ($section, "/srclink", "</a>");
	367
	368
	369	# Add the image as an associated file
	370	$doc_obj->associate_file($filename,$file,"image/$type",$section);
	371	print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);
	372
	373	if ($self->{'thumbnail'}) {
	374	# Make the thumbnail image
	375	my $thumbnailsize = $self->{'thumbnailsize'} \|\| 100;
	376	my $thumbnailtype = $self->{'thumbnailtype'} \|\| 'gif';
	377
	378	my $filehead = &util::get_tmp_filename();
	379	my $thumbnailfile = $filehead . ".$thumbnailtype";
	380	my $n=1;
	381	while (-e $thumbnailfile) {
	382	$thumbnailfile = $filehead . $n . ".$thumbnailtype";
	383	$n++;
	384	}
	385
	386	$self->{'tmp_filename2'} = $thumbnailfile;
	387
	388	# Generate the thumbnail with convert
	389	my $command = "convert -verbose -geometry $thumbnailsize"
	390	. "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
	391	print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
	392	my $result = '';
	393	$result = `$command 2>&1` ;
	394	print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);
	395
	396	# Add the thumbnail as an associated file ...
	397	if (-e "$thumbnailfile") {
	398	$doc_obj->associate_file("$thumbnailfile", $id."thumb.$thumbnailtype", "image/$thumbnailtype",$section);
	399	$doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
	400	$doc_obj->add_metadata ($section, "Thumb", $id."thumb.$thumbnailtype");
[10168]	401	if ($top) {
	402	$doc_obj->add_metadata ($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
	403	} else {
	404	$doc_obj->add_metadata ($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
	405	}
[6555]	406	}
	407
	408	# Extract Thumnail metadata from convert output
	409	if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
	410	$doc_obj->add_metadata ($section, "ThumbWidth", $1);
	411	$doc_obj->add_metadata ($section, "ThumbHeight", $2);
	412	}
	413	}
	414	# Make a screen-sized version of the picture if requested
	415	if ($self->{'screenview'}) {
	416
	417	# To do: if the actual image is smaller than the screenview size,
	418	# we should use the original !
	419
	420	my $screenviewsize = $self->{'screenviewsize'} \|\| 500;
	421	my $screenviewtype = $self->{'screenviewtype'} \|\| 'jpeg';
	422	my $filehead = &util::get_tmp_filename();
	423	my $screenviewfilename = $filehead . ".$screenviewtype";
	424	my $n=1;
	425	while (-e $screenviewfilename) {
	426	$screenviewfilename = "$filehead$n\.$screenviewtype";
	427	$n++;
	428	}
	429	$self->{'tmp_filename3'} = $screenviewfilename;
	430
	431	# make the screenview image
	432	my $command = "convert -verbose -geometry $screenviewsize"
	433	. "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
	434	print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
	435	my $result = "";
	436	$result = `$command 2>&1` ;
	437	print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);
	438
	439	# get screenview dimensions, size and type
	440	if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
	441	$doc_obj->add_metadata ($section, "ScreenWidth", $1);
	442	$doc_obj->add_metadata ($section, "ScreenHeight", $2);
[8245]	443	}elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
	444	#if the image hasn't changed size, the previous regex doesn't match
	445	$doc_obj->add_metadata ($section, "ScreenWidth", $1);
	446	$doc_obj->add_metadata ($section, "ScreenHeight", $2);
[6555]	447	}
	448
	449	#add the screenview as an associated file ...
	450	if (-e "$screenviewfilename") {
	451	$doc_obj->associate_file("$screenviewfilename", $id."sv.$screenviewtype",
	452	"image/$screenviewtype",$section);
	453	print $outhandle "associating screen file $screenviewfilename as name $id sv.$screenviewtype\n" if ($verbosity > 2);
	454
	455	$doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
	456	$doc_obj->add_metadata ($section, "Screen", $id."sv.$screenviewtype");
	457
	458	if ($top) {
[8365]	459	$doc_obj->add_metadata ($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
[6555]	460	} else {
[10168]	461	$doc_obj->add_metadata ($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
[6555]	462
	463	}
	464	} else {
	465	print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
	466	}
	467	}
	468
	469	return $type;
	470
	471
	472	}
	473
	474
	475
	476	# Discover the characteristics of an image file with the ImageMagick
	477	# "identify" command.
	478
	479	sub identify {
	480	my ($image, $outhandle, $verbosity) = @_;
	481
	482	# Use the ImageMagick "identify" command to get the file specs
[8245]	483	my $command = "identify \"$image\" 2>&1";
[6555]	484	print $outhandle "$command\n" if ($verbosity > 2);
	485	my $result = '';
	486	$result = `$command`;
	487	print $outhandle "$result\n" if ($verbosity > 3);
	488
	489	# Read the type, width, and height
	490	my $type = 'unknown';
	491	my $width = 'unknown';
	492	my $height = 'unknown';
	493
	494	my $image_safe = quotemeta $image;
	495	if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
	496	$type = $1;
	497	$width = $2;
	498	$height = $3;
	499	}
	500
	501	# Read the size
	502	my $size = "unknown";
	503	if ($result =~ m/^.* ([0-9]+)b/) {
	504	$size = $1;
	505	} elsif ($result =~ m/^.* ([0-9]+)kb/) {
	506	$size = 1024 * $1;
	507	}
	508
	509	print $outhandle "file: $image:\t $type, $width, $height, $size\n"
	510	if ($verbosity > 3);
	511
	512	# Return the specs
	513	return ($type, $width, $height, $size);
	514	}
	515
	516
	517	# The PagedImgPlug read() function. This function does all the right things
	518	# to make general options work for a given plugin. It calls the process()
	519	# function which does all the work specific to a plugin (like the old
	520	# read functions used to do). Most plugins should define their own
	521	# process() function and let this read() function keep control.
	522	#
	523	# PagedImgPlug overrides read() because there is no need to read the actual
	524	# text of the file in, because the contents of the file is not text...
	525	#
	526	# Return number of files processed, undef if can't process
	527	# Note that $base_dir might be "" and that $file might
	528	# include directories
	529
	530	sub read {
[10254]	531	my $self = shift (@_);
[9853]	532	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[8909]	533	my $outhandle = $self->{'outhandle'};
	534	my $smart_block = $self->{'smart_block'};
	535
[6555]	536	my $filename = &util::filename_cat($base_dir, $file);
[8909]	537
	538	if ($self->associate_with($file,$filename,$metadata)) {
	539	# a form of smart block
	540	$self->{'num_blocked'} ++;
	541	return 0; # blocked
	542	}
	543
	544	if ($smart_block) {
	545	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
	546	$self->{'num_blocked'} ++;
	547	return 0; # blocked
	548	}
	549	} elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
	550	$self->{'num_blocked'} ++;
	551	return 0; # blocked
	552	}
[10168]	553
[6555]	554	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
	555	return undef;
	556	}
	557
	558	print $outhandle "PagedImgPlug processing \"$filename\"\n"
	559	if $self->{'verbosity'} > 1;
[9466]	560	print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);
[6555]	561
[10168]	562	# here we need to decide if we have an old text .item file, or a new xml
	563	# .item file - for now the test is if the first non-empty line is
	564	# <PagedDocument> then its xml
	565	my $xml_version = 0;
	566	open (ITEMFILE, $filename) \|\| die "couldn't open $filename\n";
	567	my $line = "";
	568	my $num = 0;
	569	$line = <ITEMFILE>;
	570	while ($line !~ /\w/) {
	571	$line = <ITEMFILE>;
	572	}
	573	chomp $line;
	574	if ($line =~ /^<PagedDocument/) {
	575	$xml_version = 1;
	576	}
	577	close ITEMFILE;
	578	my $doc_obj;
	579	if ($xml_version) {
[6555]	580
[10168]	581	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
	582	$self->{'file'} = $file;
	583	$self->{'filename'} = $filename;
	584	$self->{'processor'} = $processor;
	585	$self->{'metadata'} = $metadata;
	586	$self->{'gli'} = $gli;
	587	eval {
	588	$@ = "";
	589	my $xslt = $self->{'xslt'};
	590	if (defined $xslt && ($xslt ne "")) {
	591	# perform xslt
	592	my $transformed_xml = $self->apply_xslt($xslt,$filename);
	593
	594	# feed transformed file (now in memory as string) into XML parser
	595	#$self->{'parser'}->parse($transformed_xml);
	596	$self->parse_string($transformed_xml);
	597	}
	598	else {
	599	#$self->{'parser'}->parsefile($filename);
	600	$self->parse_file($filename);
	601	}
	602	};
	603
	604	if ($@) {
	605
	606	# parsefile may either croak somewhere in XML::Parser (e.g. because
	607	# the document is not well formed) or die somewhere in XMLPlug or a
	608	# derived plugin (e.g. because we're attempting to process a
	609	# document whose DOCTYPE is not meant for this plugin). For the
	610	# first case we'll print a warning and continue, for the second
	611	# we'll just continue quietly
	612
	613	print STDERR "**** XML Parse Error is: $@\n";
	614
	615	my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
	616	if (defined $msg) {
	617	my $outhandle = $self->{'outhandle'};
	618	my $plugin_name = ref ($self);
	619	print $outhandle "$plugin_name failed to process $file ($msg)\n";
	620	}
[6555]	621
[10168]	622	# reset ourself for the next document
	623	$self->{'section_level'}=0;
	624	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	625	return -1; # error during processing
	626	}
	627	$doc_obj = $self->{'doc_obj'};
	628
	629	} else {
	630	my ($dir);
	631	($dir, $file) = $filename =~ /^(.?)([^\/\\])$/;
	632
	633	#process the .item file
	634	$doc_obj = $self->process_item($filename, $dir, $file, $processor);
	635
	636	}
	637
[8909]	638	if ($self->{'cover_image'}) {
	639	$self->associate_cover_image($doc_obj, $filename);
	640	}
	641
[6555]	642	# include any metadata passed in from previous plugins
	643	# note that this metadata is associated with the top level section
	644	my $section = $doc_obj->get_top_section();
	645	$self->extra_metadata ($doc_obj, $section, $metadata);
	646
[10254]	647	# do plugin specific processing of doc_obj - don't need this unless
	648	# something inherits from PagedImgPlug
	649	#unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
	650	# print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	651	# return -1;
	652	# }
[6555]	653
	654	# do any automatic metadata extraction
	655	$self->auto_extract_metadata ($doc_obj);
	656
	657	# process the document
	658	$processor->process($doc_obj);
	659
	660	# clean up temporary files - we do this here instead of in
	661	# process_image becuase associated files aren't actually copied
	662	# until after process has been run.
	663	if (defined $self->{'tmp_filename1'} &&
	664	-e $self->{'tmp_filename1'}) {
	665	&util::rm($self->{'tmp_filename1'})
	666	}
	667	if (defined $self->{'tmp_filename2'} &&
	668	-e $self->{'tmp_filename2'}) {
	669	&util::rm($self->{'tmp_filename2'})
	670	}
	671	if (defined $self->{'tmp_filename3'} &&
	672	-e $self->{'tmp_filename3'}) {
	673	&util::rm($self->{'tmp_filename3'})
	674	}
	675
	676	$self->{'num_processed'}++;
	677
	678	return 1;
	679	}
	680
# Start-tag handler for XML .item files.
#
# Arguments:
#   $expat   - the parser object (not used)
#   $element - name of the element being opened
#
# The element's attributes are read from the global hash %_.
# NOTE(review): presumably %_ is filled in by the XML parser's stream-style
# callbacks set up in XMLPlug - confirm there.
#
# <PagedDocument> makes the top section current.  <PageGroup> and <Page>
# append a new child section to the current one, make it current, count it
# in 'num_pages', and attach the image and/or text named by the imgfile and
# txtfile attributes (dummy text is added when there is no txtfile).
# <Metadata> records the metadata name for xml_end_tag().
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'}
            = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'};    #TODO!!
        $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            $self->process_image($self->{'base_dir'} . $imgfile, $imgfile,
                $doc_obj, $self->{'current_section'});
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile)) {
            $self->process_text($self->{'base_dir'} . $txtfile, $txtfile,
                $doc_obj, $self->{'current_section'});
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($self->{'current_section'},
                gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
	711
	712	sub xml_end_tag {
	713	my $self = shift(@_);
	714	my ($expat, $element) = @_;
	715
	716	my $doc_obj = $self->{'doc_obj'};
	717	if ($element eq "Page" \|\| $element eq "PageGroup") {
	718	# move the current section back to the parent
	719	$self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
	720	} elsif ($element eq "Metadata") {
	721
	722	$doc_obj->add_utf8_metadata ($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
	723	$self->{'metadata_name'} = "";
	724	$self->{'metadata_value'} = "";
	725
	726	}
	727	# otherwise we ignore the end tag
	728	}
	729
	730
	731	sub xml_text {
	732	my $self = shift(@_);
	733	my ($expat) = @_;
	734
	735	if ($self->{'element'} eq "Metadata") {
	736	$self->{'metadata_value'} .= $_;
	737	}
	738	}
	739
	740	sub xml_doctype {
	741	}
	742
	743	sub open_document {
	744	my $self = shift(@_);
	745
	746	# create a new document
	747	$self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
	748	my $doc_obj = $self->{'doc_obj'};
	749	$doc_obj->set_OIDtype ($self->{'processor'}->{'OIDtype'});
	750	my ($dir, $file) = $self->{'filename'} =~ /^(.?)([^\/\\])$/;
	751	$self->{'base_dir'} = $dir;
	752	$self->{'num_pages'} = 0;
	753	my $topsection = $doc_obj->get_top_section();
[10218]	754	if ($self->{'documenttype'} eq 'paged') {
[10168]	755	# set the gsdlthistype metadata to Paged - this ensures this document will
	756	# be treated as a Paged doc, even if Titles are not numeric
	757
	758	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
	759	} else {
	760	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
	761	}
	762
	763	$doc_obj->add_metadata ($topsection, "Source", $file);
	764	if ($self->{'headerpage'}) {
	765	$doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
	766	}
	767
	768	}
	769
	770	sub close_document {
	771	my $self = shift(@_);
	772	my $doc_obj = $self->{'doc_obj'};
	773
	774	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
	775	$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "PagedImg");
	776
	777	# add numpages metadata
	778	$doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(), 'NumPages', $self->{'num_pages'});
	779
	780	# add an OID
	781	$doc_obj->set_OID();
	782
	783	}
	784
# Build a document from a plain-text .item file.
#
# Arguments:
#   $filename  - full path of the .item file
#   $dir       - directory part of $filename, including the trailing
#                separator; image and text file names are appended to it
#   $file      - bare name of the .item file, stored as Source metadata
#   $processor - the document processor; its 'OIDtype' is applied to the
#                new document
#
# Lines of the form "<name>value" become metadata of the top section.  Every
# other non-blank line is taken to be "pagenum:imagefile:textfile:r" and
# produces one child section whose Title is the page number.
#
# Returns the new document object.  Dies if the file cannot be opened.
sub process_item {
    my $self = shift(@_);
    my ($filename, $dir, $file, $processor) = @_;

    my $doc_obj = doc->new($filename, "indexed_doc");
    # use the same OID scheme as open_document() does for XML item files
    if (defined $processor) {
        $doc_obj->set_OIDtype($processor->{'OIDtype'});
    }
    my $topsection = $doc_obj->get_top_section();

    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "Paged");
    }
    else {
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata($topsection, "Source", $file);

    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $num = 0;    # number of page lines seen
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;

        # document-level metadata: <name>value
        if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
            $doc_obj->set_utf8_metadata_element($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
        $imgname = "" unless defined $imgname;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page
        my $result = $self->process_image($dir . $imgname, $imgname, $doc_obj, $cursection, $rotate);
        if (!defined $result) {
            print "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            $result = $self->process_text($dir . $txtname, $txtname, $doc_obj, $cursection);
            if (!defined $result) {
                print "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    close $item_fh;

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");
    return $doc_obj;
}
	857
	858	sub process_text {
	859	my $self = shift (@_);
	860	my ($fullpath, $file, $doc_obj, $cursection) = @_;
	861
	862	# Do encoding stuff
	863	my ($language, $encoding) = $self->textcat_get_language_encoding ($fullpath);
	864
	865	my $text="";
	866	&BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
	867	if (!length ($text)) {
	868	my $plugin_name = ref ($self);
	869	print "PagedImgPlug: ERROR: $fullpath contains no text\n" if $self->{'verbosity'};
	870	return 0;
	871	}
	872
	873	# we need to escape the escape character, or else mg will convert into
	874	# eg literal newlines, instead of leaving the text as '\n'
	875	$text =~ s/\\/\\\\/g; # macro language
	876	$text =~ s/_/\\_/g; # macro language
	877	$text =~ s/</</g;
	878	$text =~ s/>/>/g;
	879
	880	# insert preformat tags and add text to document object
	881	$doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
	882
	883	return 1;
	884	}
	885
	886	# do plugin specific processing of doc_obj
	887	sub process {
	888	my $self = shift (@_);
	889	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
	890	my $outhandle = $self->{'outhandle'};
	891
	892	return 1;
	893	}
	894
	895	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: