Context Navigation

source: trunk/gsdl/perllib/plugins/PagedImgPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 30.1 KB

Line
1	###########################################################################
2	#
3	# PagedImgPlug.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImgPlug
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-thumbnail' option.
116	# To have it create medium size images for display, use the '-screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImgPlug;
135
136	use XMLPlug;
137	use strict;
138	no strict 'refs'; # allow filehandles to be variables and viceversa
139
# Set up inheritance at compile time: PagedImgPlug is a kind of XMLPlug.
BEGIN {
    @PagedImgPlug::ISA = qw(XMLPlug);
}
143
# The values accepted by the -documenttype option.
my $type_list = [
    { 'name' => 'paged',
      'desc' => '{PagedImgPlug.documenttype.paged}' },
    { 'name' => 'hierarchy',
      'desc' => '{PagedImgPlug.documenttype.hierarchy}' },
];

# The command line arguments understood by this plugin.  Each entry gives
# the option name, the key of its description string, its type, and
# (where applicable) its default value and permitted range.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'string',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'string',
        'deft' => get_default_block_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'noscaleup',
        'desc' => '{ImagePlug.noscaleup}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'thumbnail',
        'desc' => '{PagedImgPlug.thumbnail}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'  => 'thumbnailsize',
        'desc'  => '{ImagePlug.thumbnailsize}',
        'type'  => 'int',
        'deft'  => '100',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {
        'name' => 'thumbnailtype',
        'desc' => '{ImagePlug.thumbnailtype}',
        'type' => 'string',
        'deft' => 'gif',
        'reqd' => 'no',
    },
    {
        'name' => 'screenview',
        'desc' => '{PagedImgPlug.screenview}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'  => 'screenviewsize',
        'desc'  => '{PagedImgPlug.screenviewsize}',
        'type'  => 'int',
        'deft'  => '500',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {
        'name' => 'screenviewtype',
        'desc' => '{PagedImgPlug.screenviewtype}',
        'type' => 'string',
        'deft' => 'jpg',
        'reqd' => 'no',
    },
    {
        'name' => 'converttotype',
        'desc' => '{ImagePlug.converttotype}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {
        'name'  => 'minimumsize',
        'desc'  => '{ImagePlug.minimumsize}',
        'type'  => 'int',
        'deft'  => '100',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {
        'name' => 'headerpage',
        'desc' => '{PagedImgPlug.headerpage}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'documenttype',
        'desc' => '{PagedImgPlug.documenttype}',
        'type' => 'enum',
        'list' => $type_list,
        'deft' => 'paged',
        'reqd' => 'no',
    },
];


# The option block handed to the argument-parsing machinery by new().
my $options = {
    'name'     => 'PagedImgPlug',
    'desc'     => '{PagedImgPlug.desc}',
    'inherits' => 'yes',
    'args'     => $arguments,
};
222
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through untouched to the XMLPlug constructor
#   $hashArgOptLists - optional hash ref with "ArgList" and "OptList" arrays,
#                      to which this plugin's arguments and options are added
#
# Returns the object built by XMLPlug, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Note: when the caller passes no $hashArgOptLists, these pushes
    # autovivify it as a fresh hash, so the three-argument constructor
    # call below is the one normally taken.
    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit Class->new avoids the indirect object syntax, which is
    # ambiguous inside a package that defines its own new().
    my $self = (defined $hashArgOptLists)
        ? XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : XMLPlug->new($pluginlist, $inputargs);

    return bless $self, $class;
}
235
# Default value for -process_exp: a regular expression (as a string) that
# matches file names ending in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.item$';
}
241
# Default value for -block_exp.
# We want to block everything except the .item files, but instead we
# block the image and text files that an item file normally refers to
# (jpg/jpeg, gif, png, tif/tiff, txt/text) plus editor backup files
# ending in "~".  The match is case-insensitive.
sub get_default_block_exp {
    my $self = shift (@_);

    return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$^;
}
# Create the thumbnail and screenview images, and discover the image's
# type, width, height and size using the ImageMagick utilities.
#
# Arguments:
#   $filename - image file name with full path
#   $srcfile  - image file name without path
#   $doc_obj  - document object that receives the metadata and the
#               associated files
#   $section  - the section of $doc_obj the image belongs to
#   $rotate   - optional; "r" means rotate the image by 180 degrees
#
# Returns the image type on success, or 0 when the image is skipped
# (empty file name, or file smaller than the minimumsize option).
#
# Temporary files are recorded in $self->{'tmp_filename1'..'tmp_filename3'}
# so that read() can remove them once the document has been processed.
sub process_image {
    my $self = shift (@_);
    my ($filename, $srcfile, $doc_obj, $section, $rotate) = @_;

    # xml_start_tag() calls us without a rotate flag
    $rotate = "" unless defined $rotate;

    my $top = ($section eq $doc_obj->get_top_section()) ? 1 : 0;
    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if ($srcfile eq "" || $filename eq "");

    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize) {
        my $filesize = -s $filename;
        if (($filesize || 0) < $minimumsize) {
            print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
                if ($verbosity > 1);
            # actually skip the image, as the message promises
            return 0;
        }
    }

    # NOTE(review): file names are interpolated into shell command lines
    # below; names containing double quotes or shell metacharacters will
    # break (or subvert) the command.

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype = $self->{'converttotype'};
    my $originalfilename = ""; # only set if we do a conversion
    my $type = "unknown";
    my $converted = 0;
    if ($converttotype ne "" && $filename !~ /\Q$converttotype\E$/) {
        $converted = 1;
        $originalfilename = $filename;
        my $filehead = &util::get_tmp_filename();
        $filename = $filehead . ".$converttotype";
        my $n = 1;
        while (-e $filename) {
            $filename = $filehead . $n . ".$converttotype";
            $n++;
        }
        $self->{'tmp_filename1'} = $filename;

        my $rotate_option = ($rotate eq "r") ? "-rotate 180 " : "";

        my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
        print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

        $type = $converttotype;
    } elsif ($rotate eq "r") {
        $originalfilename = $filename;
        $filename = &util::get_tmp_filename();
        # remember the rotated copy so that read() cleans it up too
        $self->{'tmp_filename1'} = $filename;

        my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
        print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
    }

    # Add the image metadata
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//; # the file name without an extension
    # the new file name: if we converted the image it gets the new extension
    my $file = $converted ? "$id.$converttotype" : $srcfile;

    my $url = $file; # the new file name prepared for a url
    my $srcurl = $srcfile;
    $url =~ s/ /%20/g;
    $srcurl =~ s/ /%20/g;

    $doc_obj->add_metadata ($section, "Image", $url);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata ($section, "Source", $srcurl);

    my ($image_type, $image_width, $image_height, $image_size)
        = identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata ($section, "ImageType", $image_type);
    $doc_obj->add_metadata ($section, "ImageWidth", $image_width);
    $doc_obj->add_metadata ($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata ($section, "ImageSize", $image_size);
    $doc_obj->add_metadata ($section, "FileFormat", "PagedImg");

    if ($type eq "unknown" && $image_type) {
        $type = $image_type;
    }

    # Associated files hang off the top section, so sub-sections have to
    # refer to the top section's assocfilepath.
    my $assoc_url = "_httpcollection_/index/assoc/"
        . ($top ? "[assocfilepath]" : "[parent(Top):assocfilepath]");

    $doc_obj->add_metadata ($section, "srclink",
                            "<a href=\"" . $assoc_url . "/[Image]\">");
    $doc_obj->add_metadata ($section, "srcicon",
                            "<img src=\"" . $assoc_url . "/[Image]\">");
    $doc_obj->add_metadata ($section, "/srclink", "</a>");

    # Add the image as an associated file
    $doc_obj->associate_file($filename, $file, "image/$type", $section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
        # Make the thumbnail image
        my $thumbnailsize = $self->{'thumbnailsize'} || 100;
        my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

        my $filehead = &util::get_tmp_filename();
        my $thumbnailfile = $filehead . ".$thumbnailtype";
        my $n = 1;
        while (-e $thumbnailfile) {
            $thumbnailfile = $filehead . $n . ".$thumbnailtype";
            $n++;
        }

        $self->{'tmp_filename2'} = $thumbnailfile;

        # Generate the thumbnail with convert
        my $command = "convert -verbose -geometry $thumbnailsize"
            . "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
        print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

        # Add the thumbnail as an associated file ...
        if (-e $thumbnailfile) {
            my $thumbname = $id . "thumb.$thumbnailtype";
            $doc_obj->associate_file($thumbnailfile, $thumbname, "image/$thumbnailtype", $section);
            $doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
            $doc_obj->add_metadata ($section, "Thumb", $thumbname);
            $doc_obj->add_metadata ($section, "thumbicon",
                                    "<img src=\"" . $assoc_url . "/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
        }

        # Extract thumbnail metadata from convert output
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ThumbWidth", $1);
            $doc_obj->add_metadata ($section, "ThumbHeight", $2);
        }
    }

    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

        # To do: if the actual image is smaller than the screenview size,
        # we should use the original !

        my $screenviewsize = $self->{'screenviewsize'} || 500;
        my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';
        my $filehead = &util::get_tmp_filename();
        my $screenviewfilename = $filehead . ".$screenviewtype";
        my $n = 1;
        while (-e $screenviewfilename) {
            $screenviewfilename = $filehead . $n . ".$screenviewtype";
            $n++;
        }
        $self->{'tmp_filename3'} = $screenviewfilename;

        # make the screenview image
        my $command = "convert -verbose -geometry $screenviewsize"
            . "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
        print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

        # get screenview dimensions
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        } elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
            # if the image hasn't changed size, the previous regex doesn't match
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }

        # add the screenview as an associated file ...
        if (-e $screenviewfilename) {
            my $screenname = $id . "sv.$screenviewtype";
            $doc_obj->associate_file($screenviewfilename, $screenname,
                                     "image/$screenviewtype", $section);
            print $outhandle "associating screen file $screenviewfilename as name $screenname\n" if ($verbosity > 2);

            $doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
            $doc_obj->add_metadata ($section, "Screen", $screenname);
            $doc_obj->add_metadata ($section, "screenicon",
                                    "<img src=\"" . $assoc_url . "/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
        } else {
            print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
        }
    }

    return $type;
}
473
474
475
# Discover the characteristics of an image file with the ImageMagick
# "identify" command.
#
# Arguments:
#   $image     - path of the image file
#   $outhandle - file handle for progress output
#   $verbosity - the command is echoed above 2, its output above 3
#
# Returns the list ($type, $width, $height, $size), where $size is in
# bytes.  Any value that cannot be read from the identify output is the
# string "unknown".
sub identify {
    my ($image, $outhandle, $verbosity) = @_;

    # Use the ImageMagick "identify" command to get the file specs
    my $command = "identify \"$image\" 2>&1";
    print $outhandle "$command\n" if ($verbosity > 2);
    my $result = `$command`;
    $result = '' unless defined $result;
    print $outhandle "$result\n" if ($verbosity > 3);

    # Read the type, width, and height.  The file name may be followed by
    # a frame index such as "[0]".
    my ($type, $width, $height) = ('unknown', 'unknown', 'unknown');
    my $image_safe = quotemeta $image;
    if ($result =~ /^$image_safe(?:\[\d+\])? (\w+) (\d+)x(\d+)/) {
        ($type, $width, $height) = ($1, $2, $3);
    }

    # Read the size.  Accept an integer or decimal number followed by
    # b, kb or mb in either case (also the "KiB" spelling).  The greedy
    # prefix picks the last such token on the first line, so a file name
    # like "page 3b.gif" is not mistaken for the size.
    my $size = "unknown";
    if ($result =~ m/^.*\s([0-9]+(?:\.[0-9]+)?)([km]?)i?b\b/i) {
        my %multiplier = ('' => 1, 'k' => 1024, 'm' => 1024 * 1024);
        $size = int($1 * $multiplier{lc $2} + 0.5);
    }

    print $outhandle "file: $image:\t $type, $width, $height, $size\n"
        if ($verbosity > 3);

    # Return the specs
    return ($type, $width, $height, $size);
}
515
516
# The PagedImgPlug read() function.
#
# PagedImgPlug overrides read() because there is no need to read the text
# of the file in the usual way: the .item file only describes the
# document.  The first non-empty line decides how it is handled: a line
# starting with <PagedDocument selects the XML format (parsed through the
# XML callbacks), anything else the plain text format (process_item()).
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor,
#             $maxdocs, $total_count, $gli)
# Note that $base_dir might be "" and that $file might include directories.
#
# Returns 1 if the file was processed, 0 if it was blocked, undef if this
# plugin does not handle it, and -1 if an error occurred while parsing.
# Dies if the item file cannot be opened.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $smart_block = $self->{'smart_block'};

    my $filename = &util::filename_cat($base_dir, $file);

    if ($self->associate_with($file, $filename, $metadata)) {
        # a form of smart block
        $self->{'num_blocked'}++;
        return 0; # blocked
    }

    if ($smart_block) {
        if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1) {
            $self->{'num_blocked'}++;
            return 0; # blocked
        }
    } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0; # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }

    print $outhandle "PagedImgPlug processing \"$filename\"\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file - for now the test is if the first non-empty line is
    # <PagedDocument> then its xml
    my $xml_version = 0;
    open (my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    # stop at end of file, so a file with no content cannot loop forever
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        $xml_version = 1 if ($line =~ /^<PagedDocument/);
        last;
    }
    close $item_fh;

    my $doc_obj;
    if ($xml_version) {

        $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
        $self->{'file'} = $file;
        $self->{'filename'} = $filename;
        $self->{'processor'} = $processor;
        $self->{'metadata'} = $metadata;
        $self->{'gli'} = $gli;
        eval {
            my $xslt = $self->{'xslt'};
            if (defined $xslt && ($xslt ne "")) {
                # perform xslt
                my $transformed_xml = $self->apply_xslt($xslt, $filename);

                # feed transformed file (now in memory as string) into XML parser
                $self->parse_string($transformed_xml);
            }
            else {
                $self->parse_file($filename);
            }
        };
        # $@ is a global: take a copy before anything else can clobber it
        my $error = $@;

        if ($error) {

            # parsefile may either croak somewhere in XML::Parser (e.g. because
            # the document is not well formed) or die somewhere in XMLPlug or a
            # derived plugin (e.g. because we're attempting to process a
            # document whose DOCTYPE is not meant for this plugin). For the
            # first case we'll print a warning and continue, for the second
            # we'll just continue quietly

            print STDERR "**** XML Parse Error is: $error\n";

            my ($msg) = $error =~ /Carp::croak\(\'(.*?)\'\)/;
            if (defined $msg) {
                my $plugin_name = ref ($self);
                print $outhandle "$plugin_name failed to process $file ($msg)\n";
            }

            # reset ourself for the next document
            $self->{'section_level'} = 0;
            print STDERR "<ProcessingError n='$file'>\n" if ($gli);
            return -1; # error during processing
        }
        $doc_obj = $self->{'doc_obj'};

    } else {
        # split the full path into the directory part (with its trailing
        # separator) and the bare file name
        my ($dir);
        ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename, $dir, $file, $processor);
    }

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    $self->extra_metadata ($doc_obj, $section, $metadata);

    # plugin specific processing of doc_obj (process()) is not called
    # here - we don't need it unless something inherits from PagedImgPlug

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # process the document
    $processor->process($doc_obj);

    # clean up temporary files - we do this here instead of in
    # process_image because associated files aren't actually copied
    # until after process has been run.
    # NOTE(review): process_image() overwrites these keys for every page,
    # so only the files recorded by the last call are removed here.
    foreach my $key ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
        my $tmp_filename = $self->{$key};
        if (defined $tmp_filename && -e $tmp_filename) {
            &util::rm($tmp_filename);
        }
    }

    $self->{'num_processed'}++;

    return 1;
}
680
# XML callback for an opening tag.
#
#   PagedDocument  - makes the top section the current section
#   PageGroup/Page - adds a new child section to the current section, makes
#                    it current, and attaches the image and text files
#                    named by the imgfile/txtfile attributes (dummy text is
#                    added when there is no txtfile)
#   Metadata       - remembers the metadata name; the value is collected
#                    by xml_text() and stored by xml_end_tag()
#
# The tag's attributes are read from the global hash %_.
# NOTE(review): presumably %_ is filled in by the XML parser before this
# callback runs - confirm against XMLPlug.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    } elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        # note that PageGroups are counted as pages too
        $self->{'num_pages'}++;
        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            $self->process_image($self->{'base_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }
        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile)) {
            $self->process_text ($self->{'base_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        } else {
            # otherwise add in some dummy text
            $doc_obj->add_text($self->{'current_section'}, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
711
# XML callback for a closing tag.
#
#   Page/PageGroup - the parent of the current section becomes current
#   Metadata       - the collected name/value pair is added to the current
#                    section and the collection buffers are reset
#
# Any other closing tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
    } elsif ($element eq "Metadata") {
        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
729
730
# XML callback for character data.  The text itself arrives in $_; it is
# appended to the pending metadata value while the most recently opened
# element is a Metadata tag, and ignored otherwise.
sub xml_text {
    my ($self, $expat) = @_;

    return unless $self->{'element'} eq "Metadata";
    $self->{'metadata_value'} .= $_;
}
739
# XML callback for the DOCTYPE declaration: deliberately does nothing.
sub xml_doctype {
    return;
}
742
# Start a new document for the XML format item file named by
# $self->{'filename'}.
#
# Creates the document object ($self->{'doc_obj'}), records the directory
# of the item file in $self->{'base_dir'}, resets the page counter, and
# sets the gsdlthistype and Source metadata on the top section.  With the
# headerpage option, dummy text is added to the top section.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc");
    my $doc_obj = $self->{'doc_obj'};
    $doc_obj->set_OIDtype ($self->{'processor'}->{'OIDtype'});

    # split the full path into the directory part (with its trailing
    # separator) and the bare file name
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'} = $dir;
    $self->{'num_pages'} = 0;

    my $topsection = $doc_obj->get_top_section();
    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }
}
769
# Finish the document built from an XML format item file: record the
# plugin name, the file format and the number of pages on the top
# section, then give the document its OID.
sub close_document {
    my ($self) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $topsection = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($topsection, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($topsection, "FileFormat", "PagedImg");

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', $self->{'num_pages'});

    # add an OID
    $doc_obj->set_OID();
}
784
# Build a document from a plain format .item file.
#
# Lines of the form <name>value set metadata on the top section.  Every
# other non-empty line describes one page, as
# pagenum:imagefile:textfile:r (textfile and r are optional), and becomes
# a child section whose Title is the page number.
#
# Arguments:
#   $filename  - full path of the .item file
#   $dir       - directory containing the .item file; it is prepended as is
#                to the image and text file names
#   $file      - the .item file name without path (stored as Source)
#   $processor - not used here
#
# Returns the new document object.  Dies if the file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename, $dir, $file, $processor) = @_;

    my $doc_obj = doc->new($filename, "indexed_doc");
    my $topsection = $doc_obj->get_top_section();

    # set the gsdlthistype metadata - "Paged" ensures this document will
    # be treated as a Paged doc, even if Titles are not numeric
    my $thistype = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $thistype);

    $doc_obj->add_metadata ($topsection, "Source", $file);

    open (my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $num = 0;
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;

        if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
            # document level metadata
            $doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//; # remove space at the front
        $line =~ s/\s+$//; # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
        $imgname = "" unless defined $imgname;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page; process_image() returns 0
        # (not undef) when it gives up on an image
        my $result = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
        if (!$result) {
            print "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            $result = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!$result) {
                print "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        } else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    close $item_fh;

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID ();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
857
# Read the text file for a page and add its contents, wrapped in <pre>
# tags, as the text of the given section.
#
# Arguments:
#   $fullpath   - text file name with full path
#   $file       - text file name without path (not used here)
#   $doc_obj    - document object the text is added to
#   $cursection - section of $doc_obj that receives the text
#
# Returns 1 on success, 0 if the file yielded no text.
sub process_text {
    my $self = shift (@_);
    my ($fullpath, $file, $doc_obj, $cursection) = @_;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($fullpath);

    my $text = "";
    &BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
    if (!length ($text)) {
        print "PagedImgPlug: ERROR: $fullpath contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g;   # macro language
    # the text ends up inside HTML, so angle brackets must become entities
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # insert preformat tags and add text to document object
    $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");

    return 1;
}
885
# Plugin specific processing of doc_obj.  PagedImgPlug has nothing left
# to do at this stage, so this simply reports success by returning 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    return 1;
}
894
895	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: