Context Navigation

source: gsdl/trunk/perllib/plugins/PagedImgPlug.pm@ 14174

Last change on this file since 14174 was 14174, checked in by qq6, 17 years ago
add NoText metadata which can be used to suppress the dummy text
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 32.5 KB

Rev	Line
[6555]	1	###########################################################################
	2	#
	3	# PagedImgPlug.pm -- plugin for sets of images and OCR text that
	4	# make up a document
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
	9	# Copyright (C) 1999 New Zealand Digital Library Project
	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	# PagedImgPlug
	28	# processes sequences of images, with optional OCR text
	29	#
[7352]	30	# This plugin takes *.item files, which contain metadata and lists of image
[6555]	31	# files, and produces a document containing sections, one for each page.
[7352]	32	# The files should be named something.item, then you can have more than one
	33	# book in a directory. You will need to create these files, one for each
	34	# document/book.
[6555]	35	#
[10168]	36	#There are two formats for the item files: a plain text format, and an xml
	37	#format. You can use either format, and can have both formats in the same
	38	#collection if you like. If you use the plain format, you must not start the
	39	#file off with <PagedDocument>
	40
	41	#### PLAIN FORMAT
[7352]	42	# The format of the xxx.item file is as follows:
[6555]	43	# The first lines contain any metadata for the whole document
	44	# <metadata-name>metadata-value
	45	# eg.
	46	# <Title>Snail farming
	47	# <Date>19230102
	48	# Then comes a list of pages, one page per line, each line has the format
[7352]	49	#
[6555]	50	# pagenum:imagefile:textfile:r
[7352]	51	#
[6555]	52	# page num and imagefile are required. pagenum is used for the Title
	53	# of the section, and in the display is shown as page <pagenum>.
	54	# imagefile is the image for the page. textfile is an optional text
	55	# file containing the OCR (or any) text for the page - this gets added
	56	# as the text for the section. r is optional, and signals that the image
	57	# should be rotated 180deg. Eg use this if the image has been made upside down.
	58	# So an example item file looks like:
	59	# <Title>Snail farming
	60	# <Date>19960403
	61	# 1:p1.gif:p1.txt:
	62	# 2:p2.gif::
	63	# 3:p3.gif:p3.txt:
	64	# 3b:p3b.gif:p3b.txt:r
	65	# The second page has no text, the fourth page is a back page, and
	66	# should be rotated.
	67	#
[10168]	68
	69	#### XML FORMAT
	70	# The xml format looks like the following
	71	#<PagedDocument>
	72	#<Metadata name="Title">The Title of the entire document</Metadata>
	73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.jpg">
	74	#<Metadata name="Title">The Title of this page</Metadata>
	75	#</Page>
	76	#... more pages
	77	#</PagedDocument>
	78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
	79	#that is not inside another tag will belong to the document.
	80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
	81	#These are both optional - if neither is used, the section will have no content.
	82	#Pages can also have metadata associated with them.
	83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
	84	#For example
	85	#<PagedDocument>
	86	#<PageGroup>
	87	#<Page>
	88	#<Page>
	89	#</PageGroup>
	90	#<Page>
	91	#</PagedDocument>
	92	#would generate a structure like
	93	#X
	94	#--X
	95	# --X
	96	# --X
	97	#--X
	98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
	99
	100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
	101	#There is still a bit of work to do on this format:
	102	#* enable other text file types, eg html, pdf etc
	103	#* make the document paging work properly
	104	#* add pagenum as Title unless a Title is present?
	105
[6555]	106	# All the supplementary image and text files should be in the same folder as
	107	# the .item file.
	108	#
	109	# To display the images instead of the document text, you can use [srcicon]
[7106]	110	# in the DocumentText format statement.
	111	# For example,
[6555]	112	#
[7106]	113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
[7352]	114	#
	115	# To have it create thumbnail size images, use the '-thumbnail' option.
	116	# To have it create medium size images for display, use the '-screenview'
	117	# option. As usual, running
	118	# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
[7106]	119
[7352]	120	# If you want the resulting documents to be presented with a table of
	121	# contents, use '-documenttype hierarchy', otherwise they will have
	122	# next and previous arrows, and a goto page X box.
	123
[7106]	124	# If you have used -screenview, you can also use [screenicon] in the format
	125	# statement to display the smaller image. Here is an example that switches
	126	# between the two:
	127	#
[10153]	128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
[6555]	129	#
	130	# Additional metadata can be added into the .item files, alternatively you can
[7352]	131	# use normal metadata.xml files, with the name of the xxx.item file as the
[10168]	132	# FileName (only for document level metadata).
[6555]	133
	134	package PagedImgPlug;
	135
[10168]	136	use XMLPlug;
[10254]	137	use strict;
	138	no strict 'refs'; # allow filehandles to be variables and viceversa
[6555]	139
	140	sub BEGIN {
[10218]	141	@PagedImgPlug::ISA = ('XMLPlug');
[6555]	142	}
	143
# Permitted values for the -documenttype option (used as the 'list' of the
# "documenttype" entry in $arguments).
my $type_list = [
    {
        'name' => 'paged',
        'desc' => '{PagedImgPlug.documenttype.paged}',
    },
    {
        'name' => 'hierarchy',
        'desc' => '{PagedImgPlug.documenttype.hierarchy}',
    },
];
	149
# Command-line options understood by this plugin.  Each entry is a hash with
# a 'name', a 'desc' (a resource-string key), a 'type', and optionally a
# default ('deft'), a numeric 'range', an enum 'list' and a 'reqd' marker.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'string',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'string',
        'deft' => get_default_block_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlug.title_sub}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'noscaleup',
        'desc' => '{ImagePlug.noscaleup}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'thumbnail',
        'desc' => '{PagedImgPlug.thumbnail}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'  => 'thumbnailsize',
        'desc'  => '{ImagePlug.thumbnailsize}',
        'type'  => 'int',
        'deft'  => '100',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {
        'name' => 'thumbnailtype',
        'desc' => '{ImagePlug.thumbnailtype}',
        'type' => 'string',
        'deft' => 'gif',
        'reqd' => 'no',
    },
    {
        'name' => 'screenview',
        'desc' => '{PagedImgPlug.screenview}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'  => 'screenviewsize',
        'desc'  => '{PagedImgPlug.screenviewsize}',
        'type'  => 'int',
        'deft'  => '500',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {
        'name' => 'screenviewtype',
        'desc' => '{PagedImgPlug.screenviewtype}',
        'type' => 'string',
        'deft' => 'jpg',
        'reqd' => 'no',
    },
    {
        'name' => 'converttotype',
        'desc' => '{ImagePlug.converttotype}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {
        'name'  => 'minimumsize',
        'desc'  => '{ImagePlug.minimumsize}',
        'type'  => 'int',
        'deft'  => '100',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {
        'name' => 'headerpage',
        'desc' => '{PagedImgPlug.headerpage}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'documenttype',
        'desc' => '{PagedImgPlug.documenttype}',
        'type' => 'enum',
        'list' => $type_list,
        'deft' => 'paged',
        'reqd' => 'no',
    },
];
	220
[6769]	221
# Plugin description record pushed onto the "OptList" by new().
my $options = {
    'name'     => 'PagedImgPlug',
    'desc'     => '{PagedImgPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
	227
	228	sub new {
[10218]	229	my ($class) = shift (@_);
	230	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	231	push(@$pluginlist, $class);
[6555]	232
[10218]	233	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	234	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
[10276]	235
[12169]	236	my $self = new XMLPlug($pluginlist, $inputargs, $hashArgOptLists);
[10354]	237
[6555]	238	return bless $self, $class;
	239	}
	240
	241	sub get_default_process_exp {
	242	my $self = shift (@_);
	243
	244	return q^\.item$^;
	245	}
	246
# Name of the XML root element / doctype handled by this plugin.
sub get_doctype {
    my ($self) = @_;

    return 'PagedDocument';
}
	252
	253
# Default regular expression (as a string) for files that should be blocked
# from other plugins.  Ideally everything except the .item files would be
# blocked; instead we block the image and text files the .item files refer
# to (jpg/jpeg, gif, png, tif/tiff, txt/text) plus editor backups ending in
# "~".  Matching is case-insensitive.
sub get_default_block_exp {
    my ($self) = @_;

    # The alternatives must be separated by unescaped '|'; an escaped bar
    # would only ever match a literal pipe character.
    return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$^;
}
[10344]	261
# Return a temporary file name ending in ".$extension" that does not exist
# yet.  A counter is appended to the base name until the name is unused.
sub _unique_tmp_filename {
    my ($extension) = @_;

    my $filehead  = util::get_tmp_filename();
    my $candidate = "$filehead.$extension";
    my $n         = 1;
    while (-e $candidate) {
        $candidate = "$filehead$n.$extension";
        $n++;
    }
    return $candidate;
}

# Attach one page image to a section of the document: optionally convert
# and/or rotate it, record its metadata, associate the file with the
# document, and create thumbnail / screenview versions when the -thumbnail /
# -screenview options are set.  The ImageMagick "convert" and "identify"
# utilities do the image work.
#
# Arguments:
#   $filename - image file name with full path
#   $srcfile  - image file name without path
#   $doc_obj  - document object that receives metadata and associated files
#   $section  - the section of $doc_obj the image belongs to
#   $rotate   - "r" to rotate the image by 180 degrees (optional)
#
# Returns 0 if the image was skipped (file missing, empty name, or smaller
# than -minimumsize); otherwise the image type (the -converttotype value,
# or the type reported by identify).
#
# NOTE(review): file names are interpolated into shell command lines inside
# double quotes; names containing '"', '$' or '`' will confuse the shell.
sub process_image {
    my $self = shift (@_);
    my ($filename, $srcfile, $doc_obj, $section, $rotate) = @_;
    $rotate = 0 unless defined $rotate;

    # check that the image file exists!!
    if (!-f $filename) {
        print "PagedImgPlug: ERROR: File $filename does not exist, skipping\n";
        return 0;
    }

    my $top       = ($section eq $doc_obj->get_top_section()) ? 1 : 0;
    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if ($srcfile eq "" || $filename eq "");

    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize && (-s $filename < $minimumsize)) {
        print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
            if ($verbosity > 1);
        # actually skip the image, as the message says
        return 0;
    }

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype    = $self->{'converttotype'};
    my $originalfilename = "";    # only set if we do a conversion/rotation
    my $type             = "unknown";
    my $converted        = 0;

    # \Q..\E: the type is a literal suffix, not a pattern
    if ($converttotype ne "" && $filename !~ /\Q$converttotype\E$/) {
        $converted        = 1;
        $originalfilename = $filename;
        $filename         = _unique_tmp_filename($converttotype);
        $self->{'tmp_filename1'} = $filename;

        my $rotate_option = "";
        if ($rotate eq "r") {
            $rotate_option = "-rotate 180 ";
        }

        my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
        print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        $result = '' unless defined $result;
        print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

        $type = $converttotype;
    }
    elsif ($rotate eq "r") {
        $originalfilename = $filename;
        $filename         = util::get_tmp_filename();
        # register the rotated copy so that read() removes it afterwards
        $self->{'tmp_filename1'} = $filename;

        my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
        print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        $result = '' unless defined $result;
        print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
    }

    # Work out the name the image is stored under.
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//;    # the source file name without its extension
    # a converted image gets the new extension
    my $file = $converted ? "$id.$converttotype" : $srcfile;

    # Add the image metadata
    $doc_obj->add_metadata ($section, "Image", $file);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata ($section, "Source", $srcfile);

    my ($image_type, $image_width, $image_height, $image_size)
        = identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata ($section, "ImageType",   $image_type);
    $doc_obj->add_metadata ($section, "ImageWidth",  $image_width);
    $doc_obj->add_metadata ($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata ($section, "ImageSize",   $image_size);
    $doc_obj->add_metadata ($section, "FileFormat",  "PagedImg");
    # add NoText metadata which can be used to suppress the dummy text
    $doc_obj->add_metadata ($section, "NoText", "1");

    if ($type eq "unknown" && $image_type) {
        $type = $image_type;
    }

    # Associated files live with the top-level section, so sub-sections
    # have to refer to the parent's assocfilepath.
    my $assocfilepath = $top ? '[assocfilepath]' : '[parent(Top):assocfilepath]';
    my $assoc_url = '_httpprefix_/collect/[collection]/index/assoc/' . $assocfilepath;

    $doc_obj->add_metadata ($section, "srclink", "<a href=\"$assoc_url/[Image]\">");
    $doc_obj->add_metadata ($section, "srcicon", "<img src=\"$assoc_url/[Image]\">");
    $doc_obj->add_metadata ($section, "/srclink", "</a>");

    # Add the image as an associated file
    $doc_obj->associate_file($filename, $file, "image/$type", $section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
        # Make the thumbnail image
        my $thumbnailsize = $self->{'thumbnailsize'} || 100;
        my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

        my $thumbnailfile = _unique_tmp_filename($thumbnailtype);
        $self->{'tmp_filename2'} = $thumbnailfile;

        # Generate the thumbnail with convert
        my $command = "convert -verbose -geometry ${thumbnailsize}x${thumbnailsize}"
            . " \"$filename\" \"$thumbnailfile\"";
        print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        $result = '' unless defined $result;
        print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

        # Add the thumbnail as an associated file ...
        if (-e $thumbnailfile) {
            my $thumbname = $id . "thumb.$thumbnailtype";
            $doc_obj->associate_file($thumbnailfile, $thumbname, "image/$thumbnailtype", $section);
            $doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
            $doc_obj->add_metadata ($section, "Thumb", $thumbname);
            $doc_obj->add_metadata ($section, "thumbicon",
                "<img src=\"$assoc_url/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
        }

        # Extract Thumbnail metadata from convert output
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ThumbWidth", $1);
            $doc_obj->add_metadata ($section, "ThumbHeight", $2);
        }
    }

    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

        # To do: if the actual image is smaller than the screenview size,
        # we should use the original !

        my $screenviewsize = $self->{'screenviewsize'} || 500;
        my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';

        my $screenviewfilename = _unique_tmp_filename($screenviewtype);
        $self->{'tmp_filename3'} = $screenviewfilename;

        # make the screenview image
        my $command = "convert -verbose -geometry ${screenviewsize}x${screenviewsize}"
            . " \"$filename\" \"$screenviewfilename\"";
        print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        $result = '' unless defined $result;
        print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

        # get screenview dimensions, size and type
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }
        elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
            # if the image hasn't changed size, the previous regex doesn't match
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }

        # add the screenview as an associated file ...
        if (-e $screenviewfilename) {
            my $screenname = $id . "sv.$screenviewtype";
            $doc_obj->associate_file($screenviewfilename, $screenname,
                                     "image/$screenviewtype", $section);
            print $outhandle "associating screen file $screenviewfilename as name $screenname\n"
                if ($verbosity > 2);

            $doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
            $doc_obj->add_metadata ($section, "Screen", $screenname);
            $doc_obj->add_metadata ($section, "screenicon",
                "<img src=\"$assoc_url/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
        }
        else {
            print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
        }
    }

    return $type;
}
	497
	498
	499
	500	# Discover the characteristics of an image file with the ImageMagick
	501	# "identify" command.
	502
	503	sub identify {
	504	my ($image, $outhandle, $verbosity) = @_;
	505
	506	# Use the ImageMagick "identify" command to get the file specs
[8245]	507	my $command = "identify \"$image\" 2>&1";
[6555]	508	print $outhandle "$command\n" if ($verbosity > 2);
	509	my $result = '';
	510	$result = `$command`;
	511	print $outhandle "$result\n" if ($verbosity > 3);
	512
	513	# Read the type, width, and height
	514	my $type = 'unknown';
	515	my $width = 'unknown';
	516	my $height = 'unknown';
	517
	518	my $image_safe = quotemeta $image;
	519	if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
	520	$type = $1;
	521	$width = $2;
	522	$height = $3;
	523	}
	524
	525	# Read the size
	526	my $size = "unknown";
	527	if ($result =~ m/^.* ([0-9]+)b/) {
	528	$size = $1;
	529	} elsif ($result =~ m/^.* ([0-9]+)kb/) {
	530	$size = 1024 * $1;
	531	}
	532
	533	print $outhandle "file: $image:\t $type, $width, $height, $size\n"
	534	if ($verbosity > 3);
	535
	536	# Return the specs
	537	return ($type, $width, $height, $size);
	538	}
	539
	540
	541	# The PagedImgPlug read() function. This function does all the right things
	542	# to make general options work for a given plugin. It calls the process()
	543	# function which does all the work specific to a plugin (like the old
	544	# read functions used to do). Most plugins should define their own
	545	# process() function and let this read() function keep control.
	546	#
	547	# PagedImgPlug overrides read() because there is no need to read the actual
	548	# text of the file in, because the contents of the file is not text...
	549	#
	550	# Return number of files processed, undef if can't process
	551	# Note that $base_dir might be "" and that $file might
	552	# include directories
	553
[10276]	554	sub read_into_doc_obj {
[10254]	555	my $self = shift (@_);
[9853]	556	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[8909]	557	my $outhandle = $self->{'outhandle'};
[10276]	558
[11090]	559	#check process and block exps, smart block, etc
	560	my ($block_status,$filename) = $self->read_block(@_);
	561	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
[10276]	562
[6555]	563	print $outhandle "PagedImgPlug processing \"$filename\"\n"
[10276]	564	if $self->{'verbosity'} > 1;
[9466]	565	print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);
[10276]	566
[10168]	567	# here we need to decide if we have an old text .item file, or a new xml
	568	# .item file - for now the test is if the first non-empty line is
	569	# <PagedDocument> then its xml
	570	my $xml_version = 0;
	571	open (ITEMFILE, $filename) \|\| die "couldn't open $filename\n";
[10403]	572
	573	my $backup_filename = "backup.item";
	574	open (BACKUP,">$backup_filename")\|\| die "couldn't write to $backup_filename\n";
[10168]	575	my $line = "";
	576	my $num = 0;
	577	$line = <ITEMFILE>;
	578	while ($line !~ /\w/) {
	579	$line = <ITEMFILE>;
	580	}
	581	chomp $line;
[10276]	582	if ($line =~ /<PagedDocument/) {
[10168]	583	$xml_version = 1;
	584	}
	585	close ITEMFILE;
[10403]	586	open (ITEMFILE, $filename) \|\| die "couldn't open $filename\n";
[12683]	587	$line = <ITEMFILE>;
	588	$line =~ s/^\xEF\xBB\xBF//; # strip BOM
	589	$line =~ s/\x0B+//ig;
	590	$line =~ s/&/&/g;
	591	print BACKUP ($line);
[10403]	592	#Tidy up the item file some metadata title contains \vt-vertical tab
	593	while ($line = <ITEMFILE>) {
	594	$line =~ s/\x0B+//ig;
[10434]	595	$line =~ s/&/&/g;
[10403]	596	print BACKUP ($line);
	597	}
	598	close ITEMFILE;
	599	close BACKUP;
	600	&File::Copy::copy ($backup_filename, $filename);
	601	&util::rm($backup_filename);
[12683]	602
[10168]	603	my $doc_obj;
	604	if ($xml_version) {
	605	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
	606	$self->{'file'} = $file;
	607	$self->{'filename'} = $filename;
	608	$self->{'processor'} = $processor;
	609	$self->{'metadata'} = $metadata;
[11333]	610
[10168]	611	eval {
	612	$@ = "";
	613	my $xslt = $self->{'xslt'};
	614	if (defined $xslt && ($xslt ne "")) {
	615	# perform xslt
	616	my $transformed_xml = $self->apply_xslt($xslt,$filename);
	617
	618	# feed transformed file (now in memory as string) into XML parser
	619	#$self->{'parser'}->parse($transformed_xml);
	620	$self->parse_string($transformed_xml);
	621	}
	622	else {
	623	#$self->{'parser'}->parsefile($filename);
	624	$self->parse_file($filename);
	625	}
	626	};
	627
[10403]	628
	629
[10168]	630	if ($@) {
	631
	632	# parsefile may either croak somewhere in XML::Parser (e.g. because
	633	# the document is not well formed) or die somewhere in XMLPlug or a
	634	# derived plugin (e.g. because we're attempting to process a
	635	# document whose DOCTYPE is not meant for this plugin). For the
	636	# first case we'll print a warning and continue, for the second
	637	# we'll just continue quietly
	638
	639	print STDERR "**** XML Parse Error is: $@\n";
	640
	641	my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
	642	if (defined $msg) {
	643	my $outhandle = $self->{'outhandle'};
	644	my $plugin_name = ref ($self);
	645	print $outhandle "$plugin_name failed to process $file ($msg)\n";
	646	}
[6555]	647
[10168]	648	# reset ourself for the next document
	649	$self->{'section_level'}=0;
	650	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	651	return -1; # error during processing
	652	}
	653	$doc_obj = $self->{'doc_obj'};
	654	} else {
	655	my ($dir);
	656	($dir, $file) = $filename =~ /^(.?)([^\/\\])$/;
	657
	658	#process the .item file
	659	$doc_obj = $self->process_item($filename, $dir, $file, $processor);
	660
	661	}
	662
[8909]	663	if ($self->{'cover_image'}) {
	664	$self->associate_cover_image($doc_obj, $filename);
	665	}
	666
[6555]	667	# include any metadata passed in from previous plugins
	668	# note that this metadata is associated with the top level section
	669	my $section = $doc_obj->get_top_section();
	670	$self->extra_metadata ($doc_obj, $section, $metadata);
[10276]	671	#my $text="";
	672	# do plugin specific processing of doc_obj
[10254]	673	#unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
[10276]	674	#print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	675	#return -1;
	676	#}
[6555]	677	# do any automatic metadata extraction
	678	$self->auto_extract_metadata ($doc_obj);
	679
[10276]	680	$self->{'num_processed'}++;
	681	return (1,$doc_obj);
	682	}
[6555]	683
# Read one file: build the document with read_into_doc_obj(), pass it to
# the processor when that succeeded, then delete the temporary image files
# recorded by process_image().
#
# Returns the status from read_into_doc_obj() (1 means the file has been
# processed).
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    my $was_processed = (defined $process_status) && ($process_status == 1);
    if ($was_processed) {
        # process the document
        $processor->process($doc_obj);
        undef $doc_obj;
    }

    # clean up temporary files - we do this here instead of in
    # process_image because associated files aren't actually copied
    # until after process has been run.
    foreach my $slot ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
        my $tmp_file = $self->{$slot};
        next unless defined $tmp_file;
        next unless -e $tmp_file;
        util::rm($tmp_file);
    }

    # if process_status == 1, then the file has been processed.
    return $process_status;
}
	721
# Start-tag handler for the XML .item format.  The element's attributes are
# read from the global hash %_.
#
#   <PagedDocument>       - makes the document's top section current
#   <PageGroup> / <Page>  - appends a new child section to the current one,
#                           makes it current, and attaches the pagenum,
#                           imgfile and txtfile attributes to it
#   <Metadata>            - remembers the metadata name; the value is
#                           collected by xml_text() and stored by
#                           xml_end_tag()
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        my $section = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'current_section'} = $section;
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'};    #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($section, 'PageNum', $pagenum);
        }

        # an empty imgfile attribute means "no image", just as for txtfile
        my $imgfile = $_{'imgfile'};
        if (defined $imgfile && $imgfile ne "") {
            $self->process_image($self->{'base_dir'} . $imgfile, $imgfile, $doc_obj, $section);
        }

        my $txtfile = $_{'txtfile'};
        if (defined $txtfile && $txtfile ne "") {
            $self->process_text($self->{'base_dir'} . $txtfile, $txtfile, $doc_obj, $section);
        }
        else {
            # otherwise add in some dummy text, so that we don't break
            # downstream plugins
            my $text = gsprintf::lookup_string("{BasPlug.dummy_text}", 1);
            $doc_obj->add_utf8_text($section, $text);
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
	756
	757	sub xml_end_tag {
	758	my $self = shift(@_);
	759	my ($expat, $element) = @_;
	760
	761	my $doc_obj = $self->{'doc_obj'};
	762	if ($element eq "Page" \|\| $element eq "PageGroup") {
[10344]	763	# if Title hasn't been assigned, set PageNum as Title
	764	if (!defined $doc_obj->get_metadata_element ($self->{'current_section'}, "Title") && defined $doc_obj->get_metadata_element ($self->{'current_section'}, "PageNum" )) {
	765	$doc_obj->add_utf8_metadata ($self->{'current_section'}, "Title", $doc_obj->get_metadata_element ($self->{'current_section'}, "PageNum" ));
	766	}
[10168]	767	# move the current section back to the parent
	768	$self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
	769	} elsif ($element eq "Metadata") {
	770
	771	$doc_obj->add_utf8_metadata ($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
	772	$self->{'metadata_name'} = "";
	773	$self->{'metadata_value'} = "";
	774
	775	}
	776	# otherwise we ignore the end tag
	777	}
	778
	779
	780	sub xml_text {
	781	my $self = shift(@_);
	782	my ($expat) = @_;
	783
[11506]	784	if ($self->{'element'} eq "Metadata" && $self->{'metadata_name'}) {
[10168]	785	$self->{'metadata_value'} .= $_;
	786	}
	787	}
	788
	789	sub xml_doctype {
	790	}
	791
	792	sub open_document {
	793	my $self = shift(@_);
	794
	795	# create a new document
	796	$self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc");
	797	my $doc_obj = $self->{'doc_obj'};
	798	$doc_obj->set_OIDtype ($self->{'processor'}->{'OIDtype'});
	799	my ($dir, $file) = $self->{'filename'} =~ /^(.?)([^\/\\])$/;
	800	$self->{'base_dir'} = $dir;
	801	$self->{'num_pages'} = 0;
	802	my $topsection = $doc_obj->get_top_section();
[10218]	803	if ($self->{'documenttype'} eq 'paged') {
[10168]	804	# set the gsdlthistype metadata to Paged - this ensures this document will
	805	# be treated as a Paged doc, even if Titles are not numeric
	806
	807	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
	808	} else {
	809	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
	810	}
	811
	812	$doc_obj->add_metadata ($topsection, "Source", $file);
	813	if ($self->{'headerpage'}) {
	814	$doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
	815	}
	816
	817	}
	818
	819	sub close_document {
	820	my $self = shift(@_);
	821	my $doc_obj = $self->{'doc_obj'};
	822
	823	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
	824	$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "PagedImg");
	825
	826	# add numpages metadata
	827	$doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(), 'NumPages', $self->{'num_pages'});
	828
	829	# add an OID
	830	$doc_obj->set_OID();
	831
	832	}
	833
# Build a document from a .item file.
#
# Parameters:
#   $filename  - path of the item file to read
#   $dir       - prefix prepended to the image/text file names listed in
#                the item file
#   $file      - value recorded as the document's Source metadata
#   $processor - hash ref supplying 'OIDtype' and 'OIDmetadata'
#
# Item file format, one entry per line:
#   - lines without any word character, and lines starting with '#', are
#     skipped
#   - "<Name> value" sets top-section metadata Name to value
#   - anything else is a page: page:imagefilename:textfilename:r
#     (the trailing r is optional and is passed on to process_image as the
#     rotate flag)
#
# Returns the new doc object; dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename, $dir, $file, $processor) = @_;

    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    my $topsection = $doc_obj->get_top_section();
    $doc_obj->add_utf8_metadata($topsection, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($topsection, "FileFormat", "PagedImg");

    # setting gsdlthistype to Paged ensures this document will be treated
    # as a Paged doc, even if Titles are not numeric
    my $thistype = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $thistype);

    $doc_obj->add_metadata($topsection, "Source", $file);

    # three-argument open with a lexical handle: the filename can never be
    # misread as an open mode, and the handle cannot clash with a global
    open(my $item_fh, '<', $filename)
        or die "couldn't open $filename: $!\n";

    my $num = 0;
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        # metadata line: <Name> value (surrounding whitespace is dropped)
        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            $doc_obj->set_utf8_metadata_element($topsection, $1, $2);
            next;
        }

        # page line
        $num++;
        $line =~ s/^\s+//; # remove space at the front
        $line =~ s/\s+$//; # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each page
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
        # add NoText metadata which can be used to suppress the dummy text
        # NOTE(review): this is added to every page, including pages that
        # do have a text file -- confirm that is intended.
        $doc_obj->add_metadata($cursection, "NoText", "1");

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
            if (!defined $result1) {
                print "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            # NOTE(review): process_text returns 0 (not undef) for a missing
            # file and prints its own error, so this message is only shown
            # when it returns undef.
            if (!defined $result2) {
                print "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }

    close $item_fh;

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID ();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
	913
	914	sub process_text {
	915	my $self = shift (@_);
	916	my ($fullpath, $file, $doc_obj, $cursection) = @_;
[11249]	917
	918	# check that the text file exists!!
	919	if (!-f $fullpath) {
	920	print "PagedImgPlug: ERROR: File $fullpath does not exist, skipping\n";
	921	return 0;
	922	}
[6555]	923
	924	# Do encoding stuff
	925	my ($language, $encoding) = $self->textcat_get_language_encoding ($fullpath);
[10276]	926
[6555]	927	my $text="";
	928	&BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
	929	if (!length ($text)) {
[13524]	930	# It's a bit unusual but not out of the question to have no text, so just give a warning
	931	print "PagedImgPlug: WARNING: $fullpath contains no text\n";
[6555]	932	}
	933
	934	# we need to escape the escape character, or else mg will convert into
	935	# eg literal newlines, instead of leaving the text as '\n'
	936	$text =~ s/\\/\\\\/g; # macro language
	937	$text =~ s/_/\\_/g; # macro language
	938	$text =~ s/</</g;
	939	$text =~ s/>/>/g;
	940
	941	# insert preformat tags and add text to document object
	942	$doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
	943
	944	return 1;
	945	}
	946
	947	# do plugin specific processing of doc_obj
	948	sub process {
	949	my $self = shift (@_);
	950	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
	951	my $outhandle = $self->{'outhandle'};
	952
	953	return 1;
	954	}
	955
	956	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: