Context Navigation

source: trunk/gsdl/perllib/plugins/PagedImgPlug.pm@ 10168

Last change on this file since 10168 was 10168, checked in by kjdon, 19 years ago
modified this to use a new xml format. it should work as before on the old format. now inherits from XMLPlug.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 30.5 KB

Line
1	###########################################################################
2	#
3	# PagedImgPlug.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImgPlug
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.jpg">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-thumbnail' option.
116	# To have it create medium size images for display, use the '-screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImgPlug;
135
136	use XMLPlug;
137
# Register XMLPlug as the parent class at compile time.
BEGIN {
    @ISA = qw(XMLPlug);
}
141
# Values accepted by the -documenttype option.
my $type_list = [
    {
        'name' => "paged",
        'desc' => "{PagedImgPlug.documenttype.paged}",
    },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImgPlug.documenttype.hierarchy}",
    },
];

# Descriptions of every option this plugin understands.  The 'desc' values
# are resource keys, not literal text.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "string",
        'deft' => get_default_block_exp(),
        'reqd' => "no",
    },
    {
        'name' => "noscaleup",
        'desc' => "{ImagePlug.noscaleup}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "thumbnail",
        'desc' => "{PagedImgPlug.thumbnail}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name'  => "thumbnailsize",
        'desc'  => "{ImagePlug.thumbnailsize}",
        'type'  => "int",
        'deft'  => "100",
        'range' => "1,",
        'reqd'  => "no",
    },
    {
        'name' => "thumbnailtype",
        'desc' => "{ImagePlug.thumbnailtype}",
        'type' => "string",
        'deft' => "gif",
        'reqd' => "no",
    },
    {
        'name' => "screenview",
        'desc' => "{PagedImgPlug.screenview}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name'  => "screenviewsize",
        'desc'  => "{PagedImgPlug.screenviewsize}",
        'type'  => "int",
        'deft'  => "500",
        'range' => "1,",
        'reqd'  => "no",
    },
    {
        'name' => "screenviewtype",
        'desc' => "{PagedImgPlug.screenviewtype}",
        'type' => "string",
        'deft' => "jpg",
        'reqd' => "no",
    },
    {
        'name' => "converttotype",
        'desc' => "{ImagePlug.converttotype}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
    },
    {
        'name'  => "minimumsize",
        'desc'  => "{ImagePlug.minimumsize}",
        'type'  => "int",
        'deft'  => "100",
        'range' => "1,",
        'reqd'  => "no",
    },
    {
        'name' => "headerpage",
        'desc' => "{PagedImgPlug.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "documenttype",
        'desc' => "{PagedImgPlug.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no",
    },
];

# Top-level option record; new() pushes it onto the plugin's option list.
my $options = {
    'name'     => "PagedImgPlug",
    'desc'     => "{PagedImgPlug.desc}",
    'inherits' => "yes",
    'args'     => $arguments,
};
220
# Constructor.
#
# Builds the underlying XMLPlug object, registers this plugin's option
# description on its option list, and parses the plugin options out of the
# remaining arguments.  Dies (after printing usage) if an option fails to
# parse.
#
# Returns the new object blessed into $class.
sub new {
    my ($class) = @_;
    my $plugin_name = shift (@_);

    # $self is lexical so that two plugin instances cannot clobber each other
    # through a shared package variable.
    my $self = XMLPlug->new("PagedImgPlug", @_);

    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
			 q^noscaleup^, \$self->{'noscaleup'},
			 q^converttotype/.*/^, \$self->{'converttotype'},
			 q^minimumsize/[0-9]*/100^, \$self->{'minimumsize'},

			 q^thumbnailsize/[0-9]*/100^, \$self->{'thumbnailsize'},
			 q^thumbnailtype/.*/gif^, \$self->{'thumbnailtype'},
			 # default matches the 'deft' declared in $arguments;
			 # process_image already treats 0 as 500
			 q^screenviewsize/[0-9]*/500^, \$self->{'screenviewsize'},
			 q^screenviewtype/.*/jpg^, \$self->{'screenviewtype'},
			 q^thumbnail^, \$self->{'thumbnail'},
			 q^screenview^, \$self->{'screenview'},
			 q^headerpage^, \$self->{'headerpage'},
			 # unescaped alternation: accepts "paged" or "hierarchy"
			 'documenttype/^(paged|hierarchy)$/paged', \$self->{'doctype'},
			 "allow_extra_options")) {

	print STDERR "\nPagedImgPlug uses an incorrect option.\n";
	print STDERR "Check your collect.cfg configuration file.\n";
	$self->print_txt_usage("");  # Use default resource bundle
	die "\n";
    }

    return bless $self, $class;
}
252
# Default pattern for files this plugin processes: names ending in ".item".
# The invocant, if any, is ignored.
sub get_default_process_exp {
    my ($self) = @_;

    my $item_files = q^\.item$^;
    return $item_files;
}
258
# Default pattern for files this plugin blocks from other plugins.
#
# Ideally everything except the .item files would be blocked; instead the
# image and text files that .item files normally refer to are blocked
# (case-insensitively), together with editor backup files ending in "~".
sub get_default_block_exp {
    my $self = shift (@_);

    # The alternation bars must be unescaped: "\|" would match a literal
    # pipe character and the pattern would never block anything.
    return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$^;
}
# Create the thumbnail and screenview images, and discover the image's
# type, width, height and size using the ImageMagick utilities.
#
# Arguments:
#   $filename - image filename with full path
#   $srcfile  - image filename without path
#   $doc_obj  - document object that receives the metadata and associated files
#   $section  - the section of $doc_obj this image belongs to
#   $rotate   - optional; the string "r" requests a 180 degree rotation
#
# Returns 0 when the image is not processed (empty filename, or file smaller
# than the 'minimumsize' option), otherwise the image type string.
#
# Side effects: may create temporary files, whose names are stored in
# $self->{'tmp_filename1'} (converted or rotated image),
# $self->{'tmp_filename2'} (thumbnail) and $self->{'tmp_filename3'}
# (screenview) so that read() can delete them later.
#
# NOTE(review): the convert commands are built by string interpolation and
# run through the shell; a filename containing a double quote or shell
# metacharacters would break them.  Left as is; worth confirming that .item
# files are trusted input.
sub process_image {
    my $self = shift (@_);
    my $filename = shift (@_); # filename with full path
    my $srcfile = shift (@_); # filename without path
    my $doc_obj = shift (@_);
    my $section = shift (@_); # the current section
    my $rotate = shift (@_); # whether to rotate the image or not

    # callers handling the xml format do not pass a rotate flag at all
    my $do_rotate = (defined $rotate && $rotate eq "r") ? 1 : 0;

    my $top=0;
    if ($section eq $doc_obj->get_top_section()) {
	$top=1;
    }
    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if ($srcfile eq "" || $filename eq "");

    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize) {
	my $bytes = -s $filename;
	$bytes = 0 unless defined $bytes; # missing file counts as too small
	if ($bytes < $minimumsize) {
	    print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
		if ($verbosity > 1);
	    # actually skip, as the message promises
	    return 0;
	}
    }

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype = $self->{'converttotype'};
    my $originalfilename = ""; # only set if we do a conversion
    my $type = "unknown";
    my $converted = 0;
    my $rotated=0;
    if ($converttotype ne "" && $filename !~ /$converttotype$/) {
	$converted=1;
	$originalfilename = $filename;
	my $filehead = &util::get_tmp_filename();
	$filename = $filehead . ".$converttotype";
	my $n = 1;
	while (-e $filename) {
	    $filename = "$filehead$n\.$converttotype";
	    $n++;
	}
	$self->{'tmp_filename1'} = $filename;

	my $rotate_option = "";
	if ($do_rotate) {
	    $rotate_option = "-rotate 180 ";
	}

	my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
	print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
	my $result = '';
	$result = `$command`;
	print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

	$type = $converttotype;
    } elsif ($do_rotate) {
	$rotated=1;
	$originalfilename = $filename;
	$filename = &util::get_tmp_filename();
	# remember the rotated copy so that it gets cleaned up too
	$self->{'tmp_filename1'} = $filename;

	my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
	print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
	my $result = '';
	$result = `$command`;
	print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);

    }


    # Add the image metadata
    my $file; # the new file name
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//; # the new file name without an extension
    if ($converted) {
	# we have converted the image
	# add on the new extension
	$file = "$id.$converttotype";
    } else {
	$file = $srcfile;
    }

    my $url =$file; # the new file name prepared for a url
    my $srcurl = $srcfile;
    $url =~ s/ /%20/g;
    $srcurl =~ s/ /%20/g;

    $doc_obj->add_metadata ($section, "Image", $url);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata ($section, "Source", $srcurl);

    my ($image_type, $image_width, $image_height, $image_size)
	= &identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata ($section, "ImageType", $image_type);
    $doc_obj->add_metadata ($section, "ImageWidth", $image_width);
    $doc_obj->add_metadata ($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata ($section, "ImageSize", $image_size);
    $doc_obj->add_metadata ($section, "FileFormat", "PagedImg");

    if ($type eq "unknown" && $image_type) {
	$type = $image_type;
    }

    # sub-sections have to look up the assocfilepath of the top section
    if ($top) {
	$doc_obj->add_metadata ($section, "srclink",
				"<a href=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");
	$doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");

    } else {
	$doc_obj->add_metadata ($section, "srclink",
				"<a href=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");
	$doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");

    }
    $doc_obj->add_metadata ($section, "/srclink", "</a>");


    # Add the image as an associated file
    $doc_obj->associate_file($filename,$file,"image/$type",$section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
	# Make the thumbnail image
	my $thumbnailsize = $self->{'thumbnailsize'} || 100;
	my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

	my $filehead = &util::get_tmp_filename();
	my $thumbnailfile = $filehead . ".$thumbnailtype";
	my $n=1;
	while (-e $thumbnailfile) {
	    $thumbnailfile = $filehead . $n . ".$thumbnailtype";
	    $n++;
	}

	$self->{'tmp_filename2'} = $thumbnailfile;

	# Generate the thumbnail with convert
	my $command = "convert -verbose -geometry $thumbnailsize"
	    . "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
	print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
	my $result = '';
	$result = `$command 2>&1` ;
	print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

	# Add the thumbnail as an associated file ...
	if (-e "$thumbnailfile") {
	    $doc_obj->associate_file("$thumbnailfile", $id."thumb.$thumbnailtype", "image/$thumbnailtype",$section);
	    $doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
	    $doc_obj->add_metadata ($section, "Thumb", $id."thumb.$thumbnailtype");
	    if ($top) {
		$doc_obj->add_metadata ($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
	    } else {
		$doc_obj->add_metadata ($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
	    }
	}

	# Extract Thumbnail metadata from convert output
	if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
	    $doc_obj->add_metadata ($section, "ThumbWidth", $1);
	    $doc_obj->add_metadata ($section, "ThumbHeight", $2);
	}
    }
    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

	# To do: if the actual image is smaller than the screenview size,
	# we should use the original !

	my $screenviewsize = $self->{'screenviewsize'} || 500;
	my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';
	my $filehead = &util::get_tmp_filename();
	my $screenviewfilename = $filehead . ".$screenviewtype";
	my $n=1;
	while (-e $screenviewfilename) {
	    $screenviewfilename = "$filehead$n\.$screenviewtype";
	    $n++;
	}
	$self->{'tmp_filename3'} = $screenviewfilename;

	# make the screenview image
	my $command = "convert -verbose -geometry $screenviewsize"
	    . "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
	print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
	my $result = "";
	$result = `$command 2>&1` ;
	print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

	# get screenview dimensions, size and type
	if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
	    $doc_obj->add_metadata ($section, "ScreenWidth", $1);
	    $doc_obj->add_metadata ($section, "ScreenHeight", $2);
	} elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
	    # if the image hasn't changed size, the previous regex doesn't match
	    $doc_obj->add_metadata ($section, "ScreenWidth", $1);
	    $doc_obj->add_metadata ($section, "ScreenHeight", $2);
	}

	# add the screenview as an associated file ...
	if (-e "$screenviewfilename") {
	    $doc_obj->associate_file("$screenviewfilename", $id."sv.$screenviewtype",
				     "image/$screenviewtype",$section);
	    print $outhandle "associating screen file $screenviewfilename as name $id sv.$screenviewtype\n" if ($verbosity > 2);

	    $doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
	    $doc_obj->add_metadata ($section, "Screen", $id."sv.$screenviewtype");

	    if ($top) {
		$doc_obj->add_metadata ($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
	    } else {
		$doc_obj->add_metadata ($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");

	    }
	} else {
	    print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
	}
    }

    return $type;
}
490
491
492
# Discover the characteristics of an image file by running the ImageMagick
# "identify" command on it and scraping the output.
#
# Arguments: the image filename, an output handle for log messages, and the
# verbosity level.
#
# Returns the list (type, width, height, size); any value that could not be
# read from the command output is the string "unknown".
sub identify {
    my ($image, $outhandle, $verbosity) = @_;

    # Run "identify", folding stderr into the captured output
    my $command = "identify \"$image\" 2>&1";
    if ($verbosity > 2) {
	print $outhandle "$command\n";
    }
    my $output = `$command`;
    if ($verbosity > 3) {
	print $outhandle "$output\n";
    }

    # Type, width and height follow the (literal) image name on the first line
    my ($type, $width, $height) = ('unknown', 'unknown', 'unknown');
    my $quoted_name = quotemeta $image;
    if ($output =~ /^$quoted_name (\w+) (\d+)x(\d+)/) {
	($type, $width, $height) = ($1, $2, $3);
    }

    # Size is reported either in bytes ("123b") or in kilobytes ("12kb")
    my $size = "unknown";
    if ($output =~ m/^.* ([0-9]+)b/) {
	$size = $1;
    }
    elsif ($output =~ m/^.* ([0-9]+)kb/) {
	$size = 1024 * $1;
    }

    if ($verbosity > 3) {
	print $outhandle "file: $image:\t $type, $width, $height, $size\n";
    }

    # Return the specs
    return ($type, $width, $height, $size);
}
532
533
# The PagedImgPlug read() function. This function does all the right things
# to make general options work for a given plugin. It calls the process()
# function which does all the work specific to a plugin (like the old
# read functions used to do). Most plugins should define their own
# process() function and let this read() function keep control.
#
# PagedImgPlug overrides read() because the .item file is not document text
# itself: it is either a plain text page list (handled by process_item) or
# an xml description (handled by the xml parser callbacks).
#
# Returns:
#   undef - the file is not one this plugin processes
#   0     - the file was blocked
#   -1    - an error occurred during processing
#   1     - one document was processed
# Note that $base_dir might be "" and that $file might include directories.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $smart_block = $self->{'smart_block'};

    my $filename = &util::filename_cat($base_dir, $file);

    if ($self->associate_with($file,$filename,$metadata)) {
	# a form of smart block
	$self->{'num_blocked'} ++;
	return 0; # blocked
    }

    if ($smart_block) {
	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
	    $self->{'num_blocked'} ++;
	    return 0; # blocked
	}
    } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
	$self->{'num_blocked'} ++;
	return 0; # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
	return undef;
    }

    print $outhandle "PagedImgPlug processing \"$filename\"\n"
	if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file - for now the test is if the first non-empty line is
    # <PagedDocument> then its xml
    my $xml_version = 0;
    open (my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $line = <$item_fh>;
    # stop at end of file too, otherwise an empty (or all-blank) .item file
    # would make this loop spin forever
    while (defined $line && $line !~ /\w/) {
	$line = <$item_fh>;
    }
    close $item_fh;
    if (defined $line) {
	chomp $line;
	if ($line =~ /^<PagedDocument/) {
	    $xml_version = 1;
	}
    }

    my $doc_obj;
    if ($xml_version) {

	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
	$self->{'file'} = $file;
	$self->{'filename'} = $filename;
	$self->{'processor'} = $processor;
	$self->{'metadata'} = $metadata;
	$self->{'gli'} = $gli;
	eval {
	    $@ = "";
	    my $xslt = $self->{'xslt'};
	    if (defined $xslt && ($xslt ne "")) {
		# perform xslt
		my $transformed_xml = $self->apply_xslt($xslt,$filename);

		# feed transformed file (now in memory as string) into XML parser
		$self->parse_string($transformed_xml);
	    }
	    else {
		$self->parse_file($filename);
	    }
	};

	if ($@) {

	    # parsefile may either croak somewhere in XML::Parser (e.g. because
	    # the document is not well formed) or die somewhere in XMLPlug or a
	    # derived plugin (e.g. because we're attempting to process a
	    # document whose DOCTYPE is not meant for this plugin). For the
	    # first case we'll print a warning and continue, for the second
	    # we'll just continue quietly

	    print STDERR "**** XML Parse Error is: $@\n";

	    my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
	    if (defined $msg) {
		my $plugin_name = ref ($self);
		print $outhandle "$plugin_name failed to process $file ($msg)\n";
	    }

	    # reset ourself for the next document
	    $self->{'section_level'}=0;
	    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	    return -1; # error during processing
	}
	$doc_obj = $self->{'doc_obj'};

    } else {
	# split the full path into the directory part (with trailing
	# separator) and the bare file name
	my ($dir);
	($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

	# process the .item file
	$doc_obj = $self->process_item($filename, $dir, $file, $processor);

    }

    if ($self->{'cover_image'}) {
	$self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    $self->extra_metadata ($doc_obj, $section, $metadata);

    # do plugin specific processing of doc_obj
    # (there is no document text to hand over, so pass an empty buffer)
    my $text = "";
    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # process the document
    $processor->process($doc_obj);

    # clean up temporary files - we do this here instead of in
    # process_image because associated files aren't actually copied
    # until after process has been run.
    # NOTE(review): process_image stores only one name per slot, so for a
    # multi-page document only the most recent page's files are removed here.
    foreach my $slot ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
	my $tmp_file = delete $self->{$slot};
	if (defined $tmp_file && -e $tmp_file) {
	    &util::rm($tmp_file);
	}
    }

    $self->{'num_processed'}++;

    return 1;
}
696
# XML start-tag callback.
#
# PagedDocument - make the document's top section the current section.
# PageGroup / Page - append a new child section to the current section, make
#     it current, count it as a page, and process its image and text files.
#     When no txtfile attribute is given, dummy text is added to the section.
# Metadata - remember the metadata name; the value is collected by xml_text
#     and stored by xml_end_tag.
#
# Element attributes are read from the global hash %_.
# NOTE(review): presumably %_ is filled in by the XML parser driving these
# callbacks - confirm against XMLPlug.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
	$self->{'current_section'} = $doc_obj->get_top_section();
    } elsif ($element eq "PageGroup" || $element eq "Page") {
	# create a new section as a child
	$self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
	$self->{'num_pages'}++;
	# assign pagenum as what??
	my $pagenum = $_{'pagenum'}; #TODO!!
	$doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
	my ($imgfile) = $_{'imgfile'};
	if (defined $imgfile) {
	    $self->process_image($self->{'base_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
	}
	my ($txtfile) = $_{'txtfile'};
	if (defined($txtfile)) {
	    $self->process_text ($self->{'base_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
	} else {
	    # otherwise add in some dummy text
	    $doc_obj->add_text($self->{'current_section'}, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
	}
    } elsif ($element eq "Metadata") {
	$self->{'metadata_name'} = $_{'name'};
    }
}
727
# XML end-tag callback.
#
# Page / PageGroup - the section is finished, so the current section moves
#     back to its parent.
# Metadata - store the name/value pair collected since the start tag on the
#     current section, then reset both for the next Metadata element.
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
	# move the current section back to the parent
	$self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
    } elsif ($element eq "Metadata") {

	$doc_obj->add_utf8_metadata ($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
	$self->{'metadata_name'} = "";
	$self->{'metadata_value'} = "";

    }
    # otherwise we ignore the end tag
}
745
746
# XML character-data callback.  The text itself arrives in $_; it is appended
# to the pending metadata value only while the most recently opened element
# is a Metadata element.
sub xml_text {
    my ($self, $expat) = @_;

    return unless $self->{'element'} eq "Metadata";
    $self->{'metadata_value'} .= $_;
}
755
# DOCTYPE callback: deliberately does nothing, so any doctype is accepted.
sub xml_doctype {
    return;
}
758
# Start a new document for the xml .item format.
#
# Creates the document object for $self->{'filename'}, stores it in
# $self->{'doc_obj'}, records the directory of the .item file in
# $self->{'base_dir'} (with its trailing separator, so that file names can
# simply be appended), resets the page counter, and sets the gsdlthistype and
# Source metadata on the top section.  With the headerpage option, dummy text
# is added to the top section as well.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc");
    my $doc_obj = $self->{'doc_obj'};
    $doc_obj->set_OIDtype ($self->{'processor'}->{'OIDtype'});

    # split the full path into directory part and bare file name
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'} = $dir;
    $self->{'num_pages'} = 0;
    my $topsection = $doc_obj->get_top_section();
    if ($self->{'doctype'} eq 'paged') {
	# set the gsdlthistype metadata to Paged - this ensures this document will
	# be treated as a Paged doc, even if Titles are not numeric

	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);
    if ($self->{'headerpage'}) {
	$doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

}
785
# Finish the document built from an xml .item file: record the plugin type,
# the file format and the number of pages on the top section, then give the
# document its OID.
sub close_document {
    my ($self) = @_;
    my $doc = $self->{'doc_obj'};

    $doc->add_utf8_metadata($doc->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc->add_metadata($doc->get_top_section(), "FileFormat", "PagedImg");

    # number of Page / PageGroup elements seen since open_document
    my $page_count = $self->{'num_pages'};
    $doc->set_utf8_metadata_element($doc->get_top_section(), 'NumPages', $page_count);

    # add an OID
    $doc->set_OID();

}
800
# Build a document from a plain-format .item file.
#
# Arguments:
#   $filename  - full path of the .item file
#   $dir       - directory part of $filename, including the trailing
#                separator (image/text file names are appended to it)
#   $file      - bare name of the .item file; becomes the Source metadata
#   $processor - not used here
#
# Lines of the form <name>value set metadata on the top section.  Every other
# non-blank line describes one page as pagenum:imagefile:textfile:r and
# produces one child section whose Title is the page number.
#
# Returns the document object.  Dies if the .item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename, $dir, $file, $processor) = @_;

    my $doc_obj = doc->new($filename, "indexed_doc");
    my $topsection = $doc_obj->get_top_section();

    if ($self->{'doctype'} eq 'paged') {
	# set the gsdlthistype metadata to Paged - this ensures this document will
	# be treated as a Paged doc, even if Titles are not numeric

	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
	$doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);

    open (my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $num = 0;
    while (defined (my $line = <$item_fh>)) {
	next unless $line =~ /\w/;
	chomp $line;
	if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
	    # copy the captures before calling anything that might run
	    # another regex
	    my ($meta_name, $meta_value) = ($1, $2);
	    $doc_obj->set_utf8_metadata_element ($topsection, $meta_name, $meta_value);
	    # package-level record of the document metadata, kept from the
	    # original code; nothing in this file reads it
	    $meta->{$meta_name} = $meta_value;
	} else {
	    $num++;
	    # line should be like page:imagefilename:textfilename:r - the r is optional -> means rotate the image 180 deg
	    $line =~ s/^\s+//; #remove space at the front
	    $line =~ s/\s+$//; #remove space at the end
	    my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
	    # a malformed line may have no image field at all
	    $imgname = "" unless defined $imgname;

	    # create a new section for each image file
	    my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
	    # the page number becomes the Title
	    $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
	    # process the image for this page
	    my $result = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);

	    # process_image signals failure with 0, not undef
	    if (!$result)
	    {
		print "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
	    }

	    # process the text file if one is there
	    if (defined $txtname && $txtname ne "") {
		$result = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
		# process_text signals failure with 0 as well
		if (!$result) {
		    print "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
		}
	    } else {
		# otherwise add in some dummy text
		$doc_obj->add_text($cursection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
	    }
	}
    }

    close $item_fh;

    # if we want a header page, we need to add some text into the top section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
	$doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }
    $doc_obj->set_OID ();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
873
# Read the text file for one page and add it to a section of the document.
#
# Arguments:
#   $fullpath   - text file name with full path
#   $file       - text file name without path (not used here)
#   $doc_obj    - document object that receives the text
#   $cursection - section of $doc_obj to add the text to
#
# Returns 1 on success, or 0 (after printing an error when verbosity is set)
# if the file yields no text.
sub process_text {
    my $self = shift (@_);
    my ($fullpath, $file, $doc_obj, $cursection) = @_;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($fullpath);

    my $text="";
    &BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
    if (!length ($text)) {
	print "PagedImgPlug: ERROR: $fullpath contains no text\n" if $self->{'verbosity'};
	return 0;
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g; # macro language
    # the text ends up inside <pre> tags in an html page, so angle brackets
    # have to become entities
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # insert preformat tags and add text to document object
    $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");

    return 1;
}
901
# Plugin specific processing of the document object.  Everything has already
# been done while the .item file was read, so this only reports success by
# returning 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    return 1;
}
910
911	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: