Context Navigation

source: trunk/gsdl/perllib/plugins/PagedImgPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago
Jeffrey's new parsing modifications, committed approx 6 July, 15.16
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 29.9 KB

Line
1	###########################################################################
2	#
3	# PagedImgPlug.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImgPlug
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/txtfile attributes if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-thumbnail' option.
116	# To have it create medium size images for display, use the '-screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImgPlug;
135
136	use XMLPlug;
137
# Set up inheritance at compile time: PagedImgPlug derives from XMLPlug.
BEGIN {
    @PagedImgPlug::ISA = qw(XMLPlug);
}
141
# The values accepted by the "documenttype" enum argument declared in
# $arguments below. Each entry pairs an option value ('name') with a
# description key ('desc'); the text behind the {PagedImgPlug...} keys is
# not defined in this file.
my $type_list =
    [ { 'name' => "paged",
        'desc' => "{PagedImgPlug.documenttype.paged}" },
      { 'name' => "hierarchy",
        'desc' => "{PagedImgPlug.documenttype.hierarchy}" } ];
147
# Argument declarations for this plugin. new() appends this list to the
# "ArgList" entry of the option hash handed on to XMLPlug.
# Every entry is a hash with:
#   name  - the option name
#   desc  - a description key (the text is looked up elsewhere)
#   type  - "string", "int", "flag" or "enum"
#   deft  - the default value (absent for flags)
#   range - for ints, the permitted range ("1," is presumably "1 or more"
#           - TODO confirm against the argument parser)
#   list  - for enums, the list of permitted values
#   reqd  - whether the option must be given
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "string",
        # default comes from this plugin's own sub, see get_default_process_exp
        'deft' => &get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "string",
        # default comes from this plugin's own sub, see get_default_block_exp
        'deft' => &get_default_block_exp(),
        'reqd' => "no" },
      { 'name' => "noscaleup",
        'desc' => "{ImagePlug.noscaleup}",
        'type' => "flag",
        'reqd' => "no" },
      # thumbnail generation (used by process_image)
      { 'name' => "thumbnail",
        'desc' => "{PagedImgPlug.thumbnail}",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "thumbnailsize",
        'desc' => "{ImagePlug.thumbnailsize}",
        'type' => "int",
        'deft' => "100",
        'range' => "1,",
        'reqd' => "no" },
      { 'name' => "thumbnailtype",
        'desc' => "{ImagePlug.thumbnailtype}",
        'type' => "string",
        'deft' => "gif",
        'reqd' => "no" },
      # screen-sized image generation (used by process_image)
      { 'name' => "screenview",
        'desc' => "{PagedImgPlug.screenview}",
        'type' => "flag",
        'reqd' => "no" },
      { 'name' => "screenviewsize",
        'desc' => "{PagedImgPlug.screenviewsize}",
        'type' => "int",
        'deft' => "500",
        'range' => "1,",
        'reqd' => "no" },
      { 'name' => "screenviewtype",
        'desc' => "{PagedImgPlug.screenviewtype}",
        'type' => "string",
        'deft' => "jpg",
        'reqd' => "no" },
      # an empty converttotype means "do not convert" (see process_image)
      { 'name' => "converttotype",
        'desc' => "{ImagePlug.converttotype}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no" },
      # compared against the image file size in process_image
      { 'name' => "minimumsize",
        'desc' => "{ImagePlug.minimumsize}",
        'type' => "int",
        'deft' => "100",
        'range' => "1,",
        'reqd' => "no" },
      # when set, dummy text is added to the top section of each document
      { 'name' => "headerpage",
        'desc' => "{PagedImgPlug.headerpage}",
        'type' => "flag",
        'reqd' => "no" },
      # "paged" or "hierarchy"; decides the gsdlthistype metadata value
      { 'name' => "documenttype",
        'desc' => "{PagedImgPlug.documenttype}",
        'type' => "enum",
        'list' => $type_list,
        'deft' => "paged",
        'reqd' => "no" } ];
214
215
# Plugin description record. new() appends it to the "OptList" entry of
# the option hash handed on to XMLPlug; 'args' points at the argument
# declarations above.
my $options = { 'name' => "PagedImgPlug",
                'desc' => "{PagedImgPlug.desc}",
                'inherits' => "yes",
                'args' => $arguments };
220
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to the XMLPlug constructor untouched
#   $hashArgOptLists - hash ref collecting "ArgList" and "OptList"; this
#                      plugin's argument and option declarations are
#                      appended to those entries
# Returns the object built by XMLPlug, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    # register what this plugin understands before the parent sees the lists
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments} if defined $arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options if defined $options;

    my $self;
    if (defined $hashArgOptLists) {
        $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = XMLPlug->new($pluginlist, $inputargs);
    }

    return bless $self, $class;
}
233
# Default value of the process_exp option: a regular expression (as a
# string) matching file names that end in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.item$';
}
239
# Default value of the block_exp option.
# We would like to block everything except the .item files, but instead we
# block the supporting files: images (jpg/jpeg, gif, png, tif/tiff), text
# files (txt/text) and editor backup files ending in "~". The match is
# case-insensitive and anchored at the end of the file name.
# Returns the regular expression as a string.
sub get_default_block_exp {
    my ($self) = @_;

    # The alternatives must be separated by plain "|"; an escaped "\|" would
    # only match a literal pipe character and nothing would be blocked.
    return '(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$';
}
# Return the name of a temporary file ending in ".$extension" that does not
# exist yet. The base name comes from util::get_tmp_filename(); a counter
# is appended to it until the resulting name is unused.
sub _unique_tmp_filename {
    my ($extension) = @_;

    my $filehead = &util::get_tmp_filename();
    my $candidate = $filehead . ".$extension";
    my $n = 1;
    while (-e $candidate) {
        $candidate = "$filehead$n.$extension";
        $n++;
    }
    return $candidate;
}

# Attach one page image to a section of the document.
#
# Optionally converts the image to another type and/or rotates it by 180
# degrees, records the image's type, width, height and size (discovered
# with the ImageMagick "identify" utility), associates the image file with
# the section, and creates the thumbnail and screenview images with the
# ImageMagick "convert" utility when those options are switched on.
#
#   $filename - image file name with full path
#   $srcfile  - image file name without path
#   $doc_obj  - the document object being built
#   $section  - the section the image belongs to
#   $rotate   - "r" to rotate the image by 180 degrees (optional)
#
# Returns 0 if either file name is empty, otherwise the image type (the
# conversion type, the type reported by identify, or "unknown").
# The names of the temporary files are stored in $self->{'tmp_filename1'},
# {'tmp_filename2'} and {'tmp_filename3'} so that read() can delete them
# once the document has been processed.
sub process_image {
    my $self = shift (@_);
    my $filename = shift (@_); # filename with full path
    my $srcfile = shift (@_);  # filename without path
    my $doc_obj = shift (@_);
    my $section = shift (@_);  # the current section
    my $rotate = shift (@_);   # whether to rotate the image or not

    # the XML code path calls us without a rotate argument
    $rotate = "" unless defined $rotate;

    my $top = ($section eq $doc_obj->get_top_section()) ? 1 : 0;
    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if ($srcfile eq "" || $filename eq "");

    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize && (-s $filename < $minimumsize)) {
        # NOTE(review): the message says "skipping" but the image is still
        # processed below. Left unchanged because returning here would
        # alter which images end up in a collection - confirm the intent.
        print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
            if ($verbosity > 1);
    }

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype = $self->{'converttotype'};
    $converttotype = "" unless defined $converttotype;
    my $type = "unknown";
    my $converted = 0;
    if ($converttotype ne "" && $filename !~ /\Q$converttotype\E$/) {
        $converted = 1;
        my $originalfilename = $filename;
        $filename = _unique_tmp_filename($converttotype);
        $self->{'tmp_filename1'} = $filename;

        my $rotate_option = ($rotate eq "r") ? "-rotate 180 " : "";

        my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
        print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

        $type = $converttotype;
    } elsif ($rotate eq "r") {
        my $originalfilename = $filename;
        $filename = &util::get_tmp_filename();
        # remember the rotated copy so that read() can remove it afterwards
        $self->{'tmp_filename1'} = $filename;

        my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
        print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
    }

    # Add the image metadata
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//; # the file name without an extension
    # the new file name: a converted image gets the new extension
    my $file = $converted ? "$id.$converttotype" : $srcfile;

    my $url = $file; # the new file name prepared for a url
    my $srcurl = $srcfile;
    $url =~ s/ /%20/g;
    $srcurl =~ s/ /%20/g;

    $doc_obj->add_metadata ($section, "Image", $url);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata ($section, "Source", $srcurl);

    my ($image_type, $image_width, $image_height, $image_size)
        = &identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata ($section, "ImageType", $image_type);
    $doc_obj->add_metadata ($section, "ImageWidth", $image_width);
    $doc_obj->add_metadata ($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata ($section, "ImageSize", $image_size);
    $doc_obj->add_metadata ($section, "FileFormat", "PagedImg");

    if ($type eq "unknown" && $image_type) {
        $type = $image_type;
    }

    # The top section refers to its own assocfilepath, every other section
    # to the one of the top-level section.
    my $assocpath = $top ? "[assocfilepath]" : "[parent(Top):assocfilepath]";
    my $assocurl = "_httpcollection_/index/assoc/" . $assocpath;

    $doc_obj->add_metadata ($section, "srclink", "<a href=\"$assocurl/[Image]\">");
    $doc_obj->add_metadata ($section, "srcicon", "<img src=\"$assocurl/[Image]\">");
    $doc_obj->add_metadata ($section, "/srclink", "</a>");

    # Add the image as an associated file
    $doc_obj->associate_file($filename,$file,"image/$type",$section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
        # Make the thumbnail image
        my $thumbnailsize = $self->{'thumbnailsize'} || 100;
        my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

        my $thumbnailfile = _unique_tmp_filename($thumbnailtype);
        $self->{'tmp_filename2'} = $thumbnailfile;

        # Generate the thumbnail with convert
        my $command = "convert -verbose -geometry $thumbnailsize"
            . "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
        print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

        # Add the thumbnail as an associated file ...
        if (-e "$thumbnailfile") {
            my $thumbname = $id . "thumb.$thumbnailtype";
            $doc_obj->associate_file("$thumbnailfile", $thumbname, "image/$thumbnailtype",$section);
            $doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
            $doc_obj->add_metadata ($section, "Thumb", $thumbname);
            $doc_obj->add_metadata ($section, "thumbicon", "<img src=\"$assocurl/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
        }

        # Extract Thumbnail metadata from convert output
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ThumbWidth", $1);
            $doc_obj->add_metadata ($section, "ThumbHeight", $2);
        }
    }

    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

        # To do: if the actual image is smaller than the screenview size,
        # we should use the original !

        my $screenviewsize = $self->{'screenviewsize'} || 500;
        my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';

        my $screenviewfilename = _unique_tmp_filename($screenviewtype);
        $self->{'tmp_filename3'} = $screenviewfilename;

        # make the screenview image
        my $command = "convert -verbose -geometry $screenviewsize"
            . "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
        print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

        # get screenview dimensions, size and type
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        } elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
            # if the image hasn't changed size, the previous regex doesn't match
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }

        # add the screenview as an associated file ...
        if (-e "$screenviewfilename") {
            my $screenname = $id . "sv.$screenviewtype";
            $doc_obj->associate_file("$screenviewfilename", $screenname,
                                     "image/$screenviewtype",$section);
            print $outhandle "associating screen file $screenviewfilename as name $screenname\n" if ($verbosity > 2);

            $doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
            $doc_obj->add_metadata ($section, "Screen", $screenname);
            $doc_obj->add_metadata ($section, "screenicon", "<img src=\"$assocurl/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
        } else {
            print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
        }
    }

    return $type;
}
471
472
473
# Discover the characteristics of an image file with the ImageMagick
# "identify" command.
#   $image     - path of the image file
#   $outhandle - file handle for progress messages
#   $verbosity - messages are printed for verbosity above 2 (the command)
#                and above 3 (the raw output and the parsed values)
# Returns the list (type, width, height, size); every value that cannot be
# read from the command output is the string "unknown".
sub identify {
    my ($image, $outhandle, $verbosity) = @_;

    # Run "identify", capturing its error output as well
    my $command = "identify \"$image\" 2>&1";
    print $outhandle "$command\n" if ($verbosity > 2);
    my $result = `$command`;
    print $outhandle "$result\n" if ($verbosity > 3);

    # The output is expected to start with "<file> <type> <width>x<height>"
    my ($type, $width, $height) = ('unknown') x 3;
    my $image_safe = quotemeta $image;
    if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
        ($type, $width, $height) = ($1, $2, $3);
    }

    # The size is given either in bytes ("b") or in kilobytes ("kb")
    my $size = ($result =~ m/^.* ([0-9]+)b/)  ? $1
             : ($result =~ m/^.* ([0-9]+)kb/) ? 1024 * $1
             :                                  "unknown";

    print $outhandle "file: $image:\t $type, $width, $height, $size\n"
        if ($verbosity > 3);

    return ($type, $width, $height, $size);
}
513
514
# The PagedImgPlug read() function. It decides whether $file is an .item
# file this plugin should handle, builds a document object from it and
# hands that object to the processor.
#
# PagedImgPlug overrides read() because the .item file is not the text of
# the document: it is either a plain text list of pages (handled by
# process_item) or an XML description (handled by the XML parser with the
# xml_* callbacks of this plugin). A file whose first non-blank line starts
# with <PagedDocument is taken to be XML, anything else is plain format.
#
#   $pluginfo, $maxdocs, $total_count - not used by this function itself
#       ($pluginfo is passed on to process())
#   $base_dir, $file - location of the file; $base_dir might be "" and
#       $file might include directories
#   $metadata  - metadata passed in from previous plugins; added to the
#       top level section
#   $processor - receives the finished document object
#   $gli       - when true, progress and error tags are printed to STDERR
#
# Returns 1 if the file was processed, 0 if it was blocked, undef if this
# plugin does not process the file, and -1 if an error occurred.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $smart_block = $self->{'smart_block'};

    my $filename = &util::filename_cat($base_dir, $file);

    if ($self->associate_with($file,$filename,$metadata)) {
        # a form of smart block
        $self->{'num_blocked'}++;
        return 0; # blocked
    }

    if ($smart_block) {
        if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1) {
            $self->{'num_blocked'}++;
            return 0; # blocked
        }
    } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0; # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }

    print $outhandle "PagedImgPlug processing \"$filename\"\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file - for now the test is if the first non-empty line is
    # <PagedDocument> then its xml
    my $xml_version = 0;
    open (my $itemfh, '<', $filename) or die "couldn't open $filename\n";
    my $line = <$itemfh>;
    # stop at end of file too, otherwise an empty or all-blank file would
    # keep this loop spinning forever
    while (defined $line && $line !~ /\w/) {
        $line = <$itemfh>;
    }
    close $itemfh;
    if (defined $line && $line =~ /^<PagedDocument/) {
        $xml_version = 1;
    }

    my $doc_obj;
    if ($xml_version) {

        $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
        $self->{'file'} = $file;
        $self->{'filename'} = $filename;
        $self->{'processor'} = $processor;
        $self->{'metadata'} = $metadata;
        $self->{'gli'} = $gli;
        eval {
            $@ = "";
            my $xslt = $self->{'xslt'};
            if (defined $xslt && ($xslt ne "")) {
                # perform xslt
                my $transformed_xml = $self->apply_xslt($xslt,$filename);

                # feed transformed file (now in memory as string) into XML parser
                $self->parse_string($transformed_xml);
            }
            else {
                $self->parse_file($filename);
            }
        };

        if ($@) {

            # parsefile may either croak somewhere in XML::Parser (e.g. because
            # the document is not well formed) or die somewhere in XMLPlug or a
            # derived plugin (e.g. because we're attempting to process a
            # document whose DOCTYPE is not meant for this plugin). For the
            # first case we'll print a warning and continue, for the second
            # we'll just continue quietly

            print STDERR "**** XML Parse Error is: $@\n";

            my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
            if (defined $msg) {
                my $plugin_name = ref ($self);
                print $outhandle "$plugin_name failed to process $file ($msg)\n";
            }

            # reset ourself for the next document
            $self->{'section_level'} = 0;
            print STDERR "<ProcessingError n='$file'>\n" if ($gli);
            return -1; # error during processing
        }
        $doc_obj = $self->{'doc_obj'};

    } else {
        # split the full name into the directory (with its trailing
        # separator) and the bare file name
        my $dir;
        ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename, $dir, $file, $processor);
    }

    # without a document object there is nothing left that could be done
    unless (defined $doc_obj) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    $self->extra_metadata ($doc_obj, $section, $metadata);

    # do plugin specific processing of doc_obj; there is no document text
    # at this level, so process() is handed a reference to an empty string
    my $text = "";
    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # process the document
    $processor->process($doc_obj);

    # clean up temporary files - we do this here instead of in
    # process_image because associated files aren't actually copied
    # until after process has been run.
    foreach my $slot ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
        my $tmpfile = delete $self->{$slot};
        if (defined $tmpfile && -e $tmpfile) {
            &util::rm($tmpfile);
        }
    }

    $self->{'num_processed'}++;

    return 1;
}
677
# Handler for an opening tag of an XML .item file. The attributes of the
# tag are read from the global hash %_.
#   PagedDocument  - makes the top section the current section
#   PageGroup/Page - appends a new child section to the current section,
#                    makes it current, counts it as a page and attaches
#                    the image (imgfile attribute) and the text (txtfile
#                    attribute); a section without a txtfile gets the
#                    dummy text instead
#   Metadata       - remembers the metadata name until the closing tag
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    $self->{'element'} = $element;
    my $doc_obj = $self->{'doc_obj'};

    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        my $last_child = $doc_obj->get_end_child($self->{'current_section'});
        $self->{'current_section'} = $doc_obj->insert_section($last_child);
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);

        my $imgfile = $_{'imgfile'};
        if (defined $imgfile) {
            my $imgpath = $self->{'base_dir'} . $imgfile;
            $self->process_image($imgpath, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my $txtfile = $_{'txtfile'};
        if (defined $txtfile) {
            my $txtpath = $self->{'base_dir'} . $txtfile;
            $self->process_text($txtpath, $txtfile, $doc_obj, $self->{'current_section'});
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($self->{'current_section'}, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
708
# Handler for a closing tag of an XML .item file.
#   Page/PageGroup - the parent of the current section becomes current
#   Metadata       - the collected name and value are added to the current
#                    section and the collecting state is reset
# All other closing tags are ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
    } elsif ($element eq "Metadata") {
        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
        # We have left the Metadata element. xml_text appends to the
        # metadata value for as long as 'element' is "Metadata", so any
        # character data reported after this closing tag would otherwise
        # end up at the front of the next metadata value.
        $self->{'element'} = "";
    }
    # otherwise we ignore the end tag
}
726
727
# Handler for character data; the text itself is taken from $_. It is
# appended to the pending metadata value while the most recently opened
# element is Metadata, and ignored otherwise.
sub xml_text {
    my ($self, $expat) = @_;

    $self->{'metadata_value'} .= $_ if $self->{'element'} eq "Metadata";
}
736
# Handler for the document type declaration. Intentionally empty: this
# plugin takes no action on it.
sub xml_doctype {
}
739
# Start a new document for the XML .item file named in $self->{'filename'}.
# Creates the document object (stored in $self->{'doc_obj'}), remembers the
# directory of the .item file in $self->{'base_dir'} (the image and text
# files of the pages are looked up relative to it), resets the page
# counter and sets the document level metadata gsdlthistype and Source.
# With the headerpage option the top section also gets the dummy text.
sub open_document {
    my $self = shift(@_);

    # create a new document
    my $doc_obj = doc->new($self->{'filename'}, "indexed_doc");
    $self->{'doc_obj'} = $doc_obj;
    $doc_obj->set_OIDtype ($self->{'processor'}->{'OIDtype'});

    # split the full name into the directory (with its trailing separator)
    # and the bare file name
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'} = $dir;
    $self->{'num_pages'} = 0;

    my $topsection = $doc_obj->get_top_section();
    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }
}
766
# Finish the document built from an XML .item file: record the plugin
# type, the file format and the number of pages on the top section, then
# give the document its OID.
sub close_document {
    my ($self) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $topsection = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($topsection, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($topsection, "FileFormat", "PagedImg");

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', $self->{'num_pages'});

    # add an OID
    $doc_obj->set_OID();
}
781
# Build a document object from a plain format .item file.
#
# Lines of the form "<name>value" become document level metadata; every
# other non-blank line describes a page as "pagenum:imagefile:textfile:r"
# (textfile and r are optional, r means rotate the image by 180 degrees).
# Each page becomes a section whose Title is the page number.
#
#   $filename  - full path of the .item file
#   $dir       - directory of the .item file including the trailing
#                separator; the image and text file names are appended to it
#   $file      - name of the .item file, stored as Source metadata
#   $processor - not used by this function
#
# Returns the document object. Dies if the .item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename, $dir, $file, $processor) = @_;

    my $doc_obj = doc->new($filename, "indexed_doc");
    my $topsection = $doc_obj->get_top_section();

    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);

    open (my $itemfh, '<', $filename) or die "couldn't open $filename\n";
    my $num = 0; # number of pages seen
    while (defined (my $line = <$itemfh>)) {
        next unless $line =~ /\w/;
        chomp $line;

        # document level metadata: <name>value
        if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
            $doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//; # remove space at the front
        $line =~ s/\s+$//; # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
        # a line without any colon has no image name at all
        $imgname = "" unless defined $imgname;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page; process_image reports failure
        # with 0, not with undef
        my $result = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
        if (!$result) {
            print "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            # process_text reports failure with 0 as well
            $result = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!$result) {
                print "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        } else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    close $itemfh;

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID ();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
854
# Read the text file of a page and add its contents, wrapped in <pre>
# tags, as the text of a section.
#   $fullpath   - text file name with full path
#   $file       - text file name without path (not used here)
#   $doc_obj    - the document object being built
#   $cursection - the section the text belongs to
# Returns 1 on success and 0 if the file yielded no text.
sub process_text {
    my $self = shift (@_);
    my ($fullpath, $file, $doc_obj, $cursection) = @_;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($fullpath);

    my $text = "";
    &BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
    if (!length ($text)) {
        print "PagedImgPlug: ERROR: $fullpath contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g;   # macro language
    # the text goes into an HTML <pre> element, so angle brackets have to
    # become entities or they would be taken for markup
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # insert preformat tags and add text to document object
    $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");

    return 1;
}
882
# Plugin specific processing of the document object. PagedImgPlug has
# nothing further to do at this stage, so this simply reports success.
# Returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    return 1;
}
891
892	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: