source: trunk/gsdl/perllib/plugins/PagedImgPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 30.1 KB
Line 
1###########################################################################
2#
3# PagedImgPlug.pm -- plugin for sets of images and OCR text that
4# make up a document
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# PagedImgPlug
28# processes sequences of images, with optional OCR text
29#
30# This plugin takes *.item files, which contain metadata and lists of image
31# files, and produces a document containing sections, one for each page.
32# The files should be named something.item, then you can have more than one
33# book in a directory. You will need to create these files, one for each
34# document/book.
35#
36#There are two formats for the item files: a plain text format, and an xml
37#format. You can use either format, and can have both formats in the same
38#collection if you like. If you use the plain format, you must not start the
39#file off with <PagedDocument>
40
41#### PLAIN FORMAT
42# The format of the xxx.item file is as follows:
43# The first lines contain any metadata for the whole document
44# <metadata-name>metadata-value
45# eg.
46# <Title>Snail farming
47# <Date>19230102
48# Then comes a list of pages, one page per line, each line has the format
49#
50# pagenum:imagefile:textfile:r
51#
52# page num and imagefile are required. pagenum is used for the Title
53# of the section, and in the display is shown as page <pagenum>.
54# imagefile is the image for the page. textfile is an optional text
55# file containing the OCR (or any) text for the page - this gets added
56# as the text for the section. r is optional, and signals that the image
57# should be rotated 180deg. Eg use this if the image has been made upside down.
58# So an example item file looks like:
59# <Title>Snail farming
60# <Date>19960403
61# 1:p1.gif:p1.txt:
62# 2:p2.gif::
63# 3:p3.gif:p3.txt:
64# 3b:p3b.gif:p3b.txt:r
65# The second page has no text, the fourth page is a back page, and
66# should be rotated.
67#
68
69#### XML FORMAT
70# The xml format looks like the following
71#<PagedDocument>
72#<Metadata name="Title">The Title of the entire document</Metadata>
#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74#<Metadata name="Title">The Title of this page</Metadata>
75#</Page>
76#... more pages
77#</PagedDocument>
78#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79#that is not inside another tag will belong to the document.
80#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81#These are both optional - if neither is used, the section will have no content.
82#Pages can also have metadata associated with them.
83#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84#For example
85#<PagedDocument>
86#<PageGroup>
87#<Page>
88#<Page>
89#</PageGroup>
90#<Page>
91#</PagedDocument>
92#would generate a structure like
93#X
94#--X
95# --X
96# --X
97#--X
98#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101#There is still a bit of work to do on this format:
102#* enable other text file types, eg html, pdf etc
103#* make the document paging work properly
104#* add pagenum as Title unless a Title is present?
105
# All the supplementary image and text files should be in the same folder as
# the .item file.
108#
109# To display the images instead of the document text, you can use [srcicon]
110# in the DocumentText format statement.
111# For example,
112#
113# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114#
115# To have it create thumbnail size images, use the '-thumbnail' option.
116# To have it create medium size images for display, use the '-screenview'
117# option. As usual, running
118# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
119
120# If you want the resulting documents to be presented with a table of
121# contents, use '-documenttype hierarchy', otherwise they will have
122# next and previous arrows, and a goto page X box.
123
124# If you have used -screenview, you can also use [screenicon] in the format
125# statement to display the smaller image. Here is an example that switches
126# between the two:
127#
128# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129#
130# Additional metadata can be added into the .item files, alternatively you can
131# use normal metadata.xml files, with the name of the xxx.item file as the
132# FileName (only for document level metadata).
133
134package PagedImgPlug;
135
136use XMLPlug;
137use strict;
138no strict 'refs'; # allow filehandles to be variables and viceversa
139
# Set up inheritance at compile time: PagedImgPlug is a kind of XMLPlug.
BEGIN {
    @PagedImgPlug::ISA = qw(XMLPlug);
}
143
# Values accepted by the -documenttype option.
my $type_list = [
    {   'name' => 'paged',
        'desc' => '{PagedImgPlug.documenttype.paged}',
    },
    {   'name' => 'hierarchy',
        'desc' => '{PagedImgPlug.documenttype.hierarchy}',
    },
];

# Descriptions of the options this plugin understands.  Each entry gives the
# option name, the key of its description string, its type, and (where
# applicable) a default value and a permitted range.
my $arguments = [
    {   'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'string',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {   'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'string',
        'deft' => get_default_block_exp(),
        'reqd' => 'no',
    },
    {   'name' => 'noscaleup',
        'desc' => '{ImagePlug.noscaleup}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'thumbnail',
        'desc' => '{PagedImgPlug.thumbnail}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name'  => 'thumbnailsize',
        'desc'  => '{ImagePlug.thumbnailsize}',
        'type'  => 'int',
        'deft'  => '100',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {   'name' => 'thumbnailtype',
        'desc' => '{ImagePlug.thumbnailtype}',
        'type' => 'string',
        'deft' => 'gif',
        'reqd' => 'no',
    },
    {   'name' => 'screenview',
        'desc' => '{PagedImgPlug.screenview}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name'  => 'screenviewsize',
        'desc'  => '{PagedImgPlug.screenviewsize}',
        'type'  => 'int',
        'deft'  => '500',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {   'name' => 'screenviewtype',
        'desc' => '{PagedImgPlug.screenviewtype}',
        'type' => 'string',
        'deft' => 'jpg',
        'reqd' => 'no',
    },
    {   'name' => 'converttotype',
        'desc' => '{ImagePlug.converttotype}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
    {   'name'  => 'minimumsize',
        'desc'  => '{ImagePlug.minimumsize}',
        'type'  => 'int',
        'deft'  => '100',
        'range' => '1,',
        'reqd'  => 'no',
    },
    {   'name' => 'headerpage',
        'desc' => '{PagedImgPlug.headerpage}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'documenttype',
        'desc' => '{PagedImgPlug.documenttype}',
        'type' => 'enum',
        'list' => $type_list,
        'deft' => 'paged',
        'reqd' => 'no',
    },
];


# Top-level description of the plugin, wrapping the argument list above.
my $options = {
    'name'     => 'PagedImgPlug',
    'desc'     => '{PagedImgPlug.desc}',
    'inherits' => 'yes',
    'args'     => $arguments,
};
222
# Constructor.  Appends this class to the plugin list, adds this plugin's
# argument and option descriptions to the shared lists, builds the object
# through the XMLPlug constructor and reblesses it into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    if (defined $arguments) {
        push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    }
    if (defined $options) {
        push @{ $hashArgOptLists->{"OptList"} }, $options;
    }

    my $self;
    if (defined $hashArgOptLists) {
        $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = XMLPlug->new($pluginlist, $inputargs);
    }

    return bless $self, $class;
}
235
# Default regular expression (as a string) selecting the files this plugin
# processes: anything whose name ends in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.item$';
}
241
# Ideally everything except the .item files would be blocked; instead the
# default expression blocks image files, text files and "~" backup files
# (matched case-insensitively).
sub get_default_block_exp {
    my ($self) = @_;

    return '(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$';
}
# Process the image for one section: optionally convert it to another type
# and/or rotate it, record the image metadata, associate the image with the
# document, and create the thumbnail and screenview images if requested.
# The ImageMagick "convert" and "identify" utilities do the image work.
#
# Arguments:
#   $filename - image file name with full path
#   $srcfile  - image file name without path
#   $doc_obj  - the document object being built
#   $section  - the section of $doc_obj this image belongs to
#   $rotate   - "r" if the image should be rotated 180 degrees; callers may
#               omit it
#
# Returns 0 if no file name was given, otherwise the image type: the
# -converttotype value if a conversion was done, else the type reported by
# identify() (which may be "unknown").
#
# Temporary files are recorded in $self->{'tmp_filename1'} (converted or
# rotated image), {'tmp_filename2'} (thumbnail) and {'tmp_filename3'}
# (screenview) so that they can be deleted after the document has been
# processed.
# NOTE: each call overwrites these slots, so they only ever describe the
# temporary files made by the most recent call.
sub process_image {
    my $self = shift (@_);
    my ($filename, $srcfile, $doc_obj, $section, $rotate) = @_;

    # the rotate flag is optional - treat a missing one as "don't rotate"
    $rotate = "" unless defined $rotate;

    # the top section refers to its own assocfilepath, all other sections
    # have to go through their top-level parent
    my $top = ($section eq $doc_obj->get_top_section()) ? 1 : 0;
    my $assocpath = $top ? "[assocfilepath]" : "[parent(Top):assocfilepath]";

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if (!defined $srcfile || !defined $filename
                 || $srcfile eq "" || $filename eq "");

    # NOTE: despite the wording of the message the image is not skipped,
    # processing carries on below.
    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize && ((-s $filename) < $minimumsize)) {
        print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
            if ($verbosity > 1);
    }

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype = $self->{'converttotype'};
    my $originalfilename = ""; # only set if we do a conversion or rotation
    my $type = "unknown";
    my $converted = 0;
    if ($converttotype ne "" && $filename !~ /$converttotype$/) {
        $converted = 1;
        $originalfilename = $filename;

        # find an unused temporary file name with the new extension
        my $filehead = &util::get_tmp_filename();
        $filename = $filehead . ".$converttotype";
        my $n = 1;
        while (-e $filename) {
            $filename = "$filehead$n\.$converttotype";
            $n++;
        }
        $self->{'tmp_filename1'} = $filename;

        my $rotate_option = "";
        if ($rotate eq "r") {
            $rotate_option = "-rotate 180 ";
        }

        my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
        print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

        $type = $converttotype;
    } elsif ($rotate eq "r") {
        $originalfilename = $filename;
        $filename = &util::get_tmp_filename();
        # remember the rotated copy so that it gets cleaned up like a
        # converted image does (it used to be left behind)
        $self->{'tmp_filename1'} = $filename;

        my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
        print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
    }

    # Add the image metadata
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//; # the file name without an extension

    # the name the image is stored under: a converted image gets the new
    # extension, otherwise the source name is kept
    my $file = $converted ? "$id.$converttotype" : $srcfile;

    my $url = $file;       # the new file name prepared for a url
    my $srcurl = $srcfile;
    $url =~ s/ /%20/g;
    $srcurl =~ s/ /%20/g;

    $doc_obj->add_metadata ($section, "Image", $url);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata ($section, "Source", $srcurl);

    my ($image_type, $image_width, $image_height, $image_size)
        = identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata ($section, "ImageType", $image_type);
    $doc_obj->add_metadata ($section, "ImageWidth", $image_width);
    $doc_obj->add_metadata ($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata ($section, "ImageSize", $image_size);
    $doc_obj->add_metadata ($section, "FileFormat", "PagedImg");

    if ($type eq "unknown" && $image_type) {
        $type = $image_type;
    }

    $doc_obj->add_metadata ($section, "srclink",
        "<a href=\"_httpcollection_/index/assoc/" . $assocpath . "/[Image]\">");
    $doc_obj->add_metadata ($section, "srcicon",
        "<img src=\"_httpcollection_/index/assoc/" . $assocpath . "/[Image]\">");
    $doc_obj->add_metadata ($section, "/srclink", "</a>");

    # Add the image as an associated file
    $doc_obj->associate_file($filename,$file,"image/$type",$section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
        # Make the thumbnail image
        my $thumbnailsize = $self->{'thumbnailsize'} || 100;
        my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

        my $filehead = &util::get_tmp_filename();
        my $thumbnailfile = $filehead . ".$thumbnailtype";
        my $n = 1;
        while (-e $thumbnailfile) {
            $thumbnailfile = $filehead . $n . ".$thumbnailtype";
            $n++;
        }

        $self->{'tmp_filename2'} = $thumbnailfile;

        # Generate the thumbnail with convert
        my $command = "convert -verbose -geometry $thumbnailsize"
            . "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
        print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

        # Add the thumbnail as an associated file ...
        if (-e "$thumbnailfile") {
            $doc_obj->associate_file("$thumbnailfile", $id."thumb.$thumbnailtype", "image/$thumbnailtype",$section);
            $doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
            $doc_obj->add_metadata ($section, "Thumb", $id."thumb.$thumbnailtype");
            $doc_obj->add_metadata ($section, "thumbicon",
                "<img src=\"_httpcollection_/index/assoc/" . $assocpath
                . "/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
        }

        # Extract Thumbnail metadata from convert output
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ThumbWidth", $1);
            $doc_obj->add_metadata ($section, "ThumbHeight", $2);
        }
    }

    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

        # To do: if the actual image is smaller than the screenview size,
        # we should use the original !

        my $screenviewsize = $self->{'screenviewsize'} || 500;
        my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';
        my $filehead = &util::get_tmp_filename();
        my $screenviewfilename = $filehead . ".$screenviewtype";
        my $n = 1;
        while (-e $screenviewfilename) {
            $screenviewfilename = "$filehead$n\.$screenviewtype";
            $n++;
        }
        $self->{'tmp_filename3'} = $screenviewfilename;

        # make the screenview image
        my $command = "convert -verbose -geometry $screenviewsize"
            . "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
        print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

        # get screenview dimensions, size and type
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        } elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
            # if the image hasn't changed size, the previous regex doesn't match
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }

        # add the screenview as an associated file ...
        if (-e "$screenviewfilename") {
            $doc_obj->associate_file("$screenviewfilename", $id."sv.$screenviewtype",
                                     "image/$screenviewtype",$section);
            print $outhandle "associating screen file $screenviewfilename as name $id sv.$screenviewtype\n" if ($verbosity > 2);

            $doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
            $doc_obj->add_metadata ($section, "Screen", $id."sv.$screenviewtype");
            $doc_obj->add_metadata ($section, "screenicon",
                "<img src=\"_httpcollection_/index/assoc/" . $assocpath
                . "/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
        } else {
            print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
        }
    }

    return $type;
}
473
474
475
# Discover the characteristics of an image file with the ImageMagick
# "identify" command.
#
# Arguments: the image file name, a handle for diagnostic output, and the
# verbosity level (the command is echoed above 2, its output above 3).
# Returns the list (type, width, height, size); every value that could not
# be parsed from the command output is the string "unknown".
sub identify {
    my ($image, $outhandle, $verbosity) = @_;

    # Use the ImageMagick "identify" command to get the file specs
    my $command = "identify \"$image\" 2>&1";
    print $outhandle "$command\n" if ($verbosity > 2);
    my $output = `$command`;
    print $outhandle "$output\n" if ($verbosity > 3);

    # Read the type, width, and height - the output is expected to start
    # with the file name, followed by the type and the WIDTHxHEIGHT geometry
    my ($type, $width, $height) = ('unknown', 'unknown', 'unknown');
    my $image_safe = quotemeta $image;
    if ($output =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
        ($type, $width, $height) = ($1, $2, $3);
    }

    # Read the size, given either in bytes ("b") or kilobytes ("kb")
    my $size = "unknown";
    if ($output =~ m/^.* ([0-9]+)b/) {
        $size = $1;
    }
    elsif ($output =~ m/^.* ([0-9]+)kb/) {
        $size = 1024 * $1;
    }

    print $outhandle "file: $image:\t $type, $width, $height, $size\n"
        if ($verbosity > 3);

    # Return the specs
    return ($type, $width, $height, $size);
}
515
516
# The PagedImgPlug read() function. This function does all the right things
# to make general options work for a given plugin.
#
# PagedImgPlug overrides read() because the .item file is not the text of
# the document: it only lists the metadata, images and text files that make
# the document up. The first non-empty line of the file decides how it is
# handled: if it starts with <PagedDocument the file is parsed as XML,
# otherwise it is handed to process_item() as a plain text item file.
#
# Returns 0 if the file is blocked, undef if this plugin can't process it,
# -1 if there was an error while parsing an XML item file, and 1 once the
# document has been processed.
# Note that $base_dir might be "" and that $file might
# include directories
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $smart_block = $self->{'smart_block'};

    my $filename = &util::filename_cat($base_dir, $file);

    if ($self->associate_with($file,$filename,$metadata)) {
        # a form of smart block
        $self->{'num_blocked'} ++;
        return 0; # blocked
    }

    if ($smart_block) {
        if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
            $self->{'num_blocked'} ++;
            return 0; # blocked
        }
    } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0; # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }

    print $outhandle "PagedImgPlug processing \"$filename\"\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file - for now the test is if the first non-empty line is
    # <PagedDocument> then it's xml
    my $xml_version = 0;
    open (my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $line;
    # stop at end of file as well as at the first non-empty line, otherwise
    # an empty or all-blank .item file would keep us looping forever
    while (defined ($line = <$item_fh>)) {
        last if $line =~ /\w/;
    }
    close $item_fh;
    if (defined $line && $line =~ /^<PagedDocument/) {
        $xml_version = 1;
    }

    my $doc_obj;
    if ($xml_version) {

        $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
        $self->{'file'} = $file;
        $self->{'filename'} = $filename;
        $self->{'processor'} = $processor;
        $self->{'metadata'} = $metadata;
        $self->{'gli'} = $gli;
        eval {
            $@ = "";
            my $xslt = $self->{'xslt'};
            if (defined $xslt && ($xslt ne "")) {
                # perform xslt
                my $transformed_xml = $self->apply_xslt($xslt,$filename);

                # feed transformed file (now in memory as string) into XML parser
                $self->parse_string($transformed_xml);
            }
            else {
                $self->parse_file($filename);
            }
        };

        if ($@) {

            # parsefile may either croak somewhere in XML::Parser (e.g. because
            # the document is not well formed) or die somewhere in XMLPlug or a
            # derived plugin (e.g. because we're attempting to process a
            # document whose DOCTYPE is not meant for this plugin). The error
            # is always reported on STDERR; the croak message, if there is
            # one, is also reported on the plugin's output handle.

            print STDERR "**** XML Parse Error is: $@\n";

            my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
            if (defined $msg) {
                my $plugin_name = ref ($self);
                print $outhandle "$plugin_name failed to process $file ($msg)\n";
            }

            # reset ourself for the next document
            $self->{'section_level'}=0;
            print STDERR "<ProcessingError n='$file'>\n" if ($gli);
            return -1; # error during processing
        }
        $doc_obj = $self->{'doc_obj'};

    } else {
        my ($dir);
        ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

        #process the .item file
        $doc_obj = $self->process_item($filename, $dir, $file, $processor);

    }

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    $self->extra_metadata ($doc_obj, $section, $metadata);

    # plugin specific processing of doc_obj (a call to $self->process) is
    # not done here - we don't need it unless something inherits from
    # PagedImgPlug

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # process the document
    $processor->process($doc_obj);

    # clean up temporary files - we do this here instead of in
    # process_image because associated files aren't actually copied
    # until after process has been run.
    foreach my $slot ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
        my $tmp_file = $self->{$slot};
        if (defined $tmp_file && -e $tmp_file) {
            &util::rm($tmp_file);
        }
    }

    $self->{'num_processed'}++;

    return 1;
}
680
# Handle the start of an XML element.  The attributes of the element are
# read from the global hash %_.
#   PagedDocument    - the top section becomes the current section
#   Page / PageGroup - a new section is added as the last child of the
#                      current section and becomes the current section; its
#                      image and text files (imgfile / txtfile attributes)
#                      are processed
#   Metadata         - the metadata name is remembered until the end tag
sub xml_start_tag {
    my ($self, $expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        my $last_child = $doc_obj->get_end_child($self->{'current_section'});
        my $section = $doc_obj->insert_section($last_child);
        $self->{'current_section'} = $section;
        $self->{'num_pages'}++;

        # assign pagenum as what??
        $doc_obj->set_utf8_metadata_element($section, 'PageNum', $_{'pagenum'}); #TODO!!

        my $base_dir = $self->{'base_dir'};
        my $imgfile = $_{'imgfile'};
        if (defined $imgfile) {
            $self->process_image($base_dir.$imgfile, $imgfile, $doc_obj, $section);
        }

        my $txtfile = $_{'txtfile'};
        if (defined $txtfile) {
            $self->process_text($base_dir.$txtfile, $txtfile, $doc_obj, $section);
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($section, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
711
# Handle the end of an XML element.  Closing a Page or PageGroup makes the
# parent section current again; closing a Metadata element stores the
# collected name/value pair on the current section and clears both.  All
# other end tags are ignored.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        # move the current section back to the parent
        my $finished = $self->{'current_section'};
        $self->{'current_section'} = $doc_obj->get_parent_section($finished);
    }
    elsif ($element eq "Metadata") {
        my ($name, $value) = ($self->{'metadata_name'}, $self->{'metadata_value'});
        $doc_obj->add_utf8_metadata($self->{'current_section'}, $name, $value);
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    }
}
729
730
# Handle character data, which arrives in $_.  Only text seen while the most
# recently started element is a Metadata element is kept; it is appended to
# the metadata value collected so far.
sub xml_text {
    my ($self, $expat) = @_;

    $self->{'metadata_value'} .= $_ if ($self->{'element'} eq "Metadata");
}
739
# Doctype handler: this plugin does nothing with the doctype.
sub xml_doctype {
    return;
}
742
# Start a new document for the XML item file named in $self->{'filename'}:
# create the document object, note the directory of the item file (the
# image and text file names are appended to it), reset the page count and
# set up the top section.
sub open_document {
    my ($self) = @_;

    # create a new document
    my $doc_obj = doc->new($self->{'filename'}, "indexed_doc");
    $self->{'doc_obj'} = $doc_obj;
    $doc_obj->set_OIDtype($self->{'processor'}->{'OIDtype'});

    # split the file name into its directory part and the bare file name
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'} = $dir;
    $self->{'num_pages'} = 0;

    # setting gsdlthistype to Paged ensures this document will be treated
    # as a Paged doc, even if Titles are not numeric
    my $topsection = $doc_obj->get_top_section();
    my $thistype = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", $thistype);

    $doc_obj->add_metadata($topsection, "Source", $file);
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }
}
769
# Finish the document built from an XML item file: record the plugin name,
# the file format and the number of pages on the top section, then give the
# document its OID.
sub close_document {
    my ($self) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $topsection = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($topsection, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($topsection, "FileFormat", "PagedImg");

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', $self->{'num_pages'});

    # add an OID
    $doc_obj->set_OID();
}
784
# Build a document from a plain text .item file.
#
# Lines of the form <name>value set metadata on the top section. Every
# other non-empty line describes one page and should look like
#   pagenum:imagefilename:textfilename:r
# where the r is optional and means rotate the image 180 degrees. Each page
# becomes a child section of the top section, with pagenum as its Title.
#
# Arguments:
#   $filename  - the .item file, with full path
#   $dir       - the directory part of $filename; the image and text file
#                names are appended to it
#   $file      - the name of the .item file without its path
#   $processor - the document processor; its OIDtype is applied to the
#                new document
#
# Returns the new document object. Dies if the item file can't be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename, $dir, $file, $processor) = @_;
    my $outhandle = $self->{'outhandle'};

    my $doc_obj = doc->new($filename, "indexed_doc");
    # use the OID type the processor asks for, as is done for the documents
    # created by open_document
    if (defined $processor && defined $processor->{'OIDtype'}) {
        $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    }
    my $topsection = $doc_obj->get_top_section();

    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);

    open (my $item_fh, '<', $filename) or die "couldn't open $filename\n";
    my $num = 0;
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;

        # document level metadata
        if ($line =~ /^<([^>]*)>(.*?)\s*$/) {
            $doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
            next;
        }

        # anything else is a page
        $num++;
        $line =~ s/^\s+//; #remove space at the front
        $line =~ s/\s+$//; #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page - process_image returns 0 if it
        # couldn't do anything with the file name it was given
        my $imgpath = $dir . (defined $imgname ? $imgname : "");
        my $result = $self->process_image($imgpath, $imgname, $doc_obj, $cursection, $rotate);
        if (!$result) {
            print $outhandle "PagedImgPlug: couldn't process image \"$imgpath\" for item \"$filename\"\n";
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            # process_text returns 0 if the file contained no text
            $result = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!$result) {
                print $outhandle "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        } else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    close $item_fh;

    # if we want a header page, we need to add some text into the top section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID ();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
857
# Read the text file for a page and add its contents, escaped and wrapped
# in <pre> tags, as the text of the given section.
#
# Arguments: the text file with full path, the text file name without path,
# the document object and the section to add the text to.
# Returns 1 if text was added, 0 if the file contained no text.
sub process_text {
    my ($self, $fullpath, $file, $doc_obj, $cursection) = @_;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($fullpath);

    my $text = "";
    BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
    unless (length $text) {
        print "PagedImgPlug: ERROR: $fullpath contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    for ($text) {
        s/\\/\\\\/g; # macro language
        s/_/\\_/g;   # macro language
        s/</&lt;/g;
        s/>/&gt;/g;
    }

    # insert preformat tags and add text to document object
    $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");

    return 1;
}
885
# Plugin specific processing of doc_obj.  PagedImgPlug has none to do, so
# this just reports success.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    return 1;
}
894
8951;
Note: See TracBrowser for help on using the repository browser.