source: trunk/gsdl/perllib/plugins/PagedImgPlug.pm@ 10613

Last change on this file since 10613 was 10613, checked in by kjdon, 19 years ago

modified the item file metadata regex so that space is allowed (and removed) from the front of the value

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.7 KB
Line 
1###########################################################################
2#
3# PagedImgPlug.pm -- plugin for sets of images and OCR text that
4# make up a document
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# PagedImgPlug
28# processes sequences of images, with optional OCR text
29#
30# This plugin takes *.item files, which contain metadata and lists of image
31# files, and produces a document containing sections, one for each page.
32# The files should be named something.item, then you can have more than one
33# book in a directory. You will need to create these files, one for each
34# document/book.
35#
36#There are two formats for the item files: a plain text format, and an xml
37#format. You can use either format, and can have both formats in the same
38#collection if you like. If you use the plain format, you must not start the
39#file off with <PagedDocument>
40
41#### PLAIN FORMAT
42# The format of the xxx.item file is as follows:
43# The first lines contain any metadata for the whole document
44# <metadata-name>metadata-value
45# eg.
46# <Title>Snail farming
47# <Date>19230102
48# Then comes a list of pages, one page per line, each line has the format
49#
50# pagenum:imagefile:textfile:r
51#
52# page num and imagefile are required. pagenum is used for the Title
53# of the section, and in the display is shown as page <pagenum>.
54# imagefile is the image for the page. textfile is an optional text
55# file containing the OCR (or any) text for the page - this gets added
56# as the text for the section. r is optional, and signals that the image
57# should be rotated 180deg. Eg use this if the image has been made upside down.
58# So an example item file looks like:
59# <Title>Snail farming
60# <Date>19960403
61# 1:p1.gif:p1.txt:
62# 2:p2.gif::
63# 3:p3.gif:p3.txt:
64# 3b:p3b.gif:p3b.txt:r
65# The second page has no text, the fourth page is a back page, and
66# should be rotated.
67#
68
69#### XML FORMAT
70# The xml format looks like the following
71#<PagedDocument>
72#<Metadata name="Title">The Title of the entire document</Metadata>
73#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74#<Metadata name="Title">The Title of this page</Metadata>
75#</Page>
76#... more pages
77#</PagedDocument>
78#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79#that is not inside another tag will belong to the document.
80#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81#These are both optional - if neither is used, the section will have no content.
82#Pages can also have metadata associated with them.
83#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84#For example
85#<PagedDocument>
86#<PageGroup>
87#<Page>
88#<Page>
89#</PageGroup>
90#<Page>
91#</PagedDocument>
92#would generate a structure like
93#X
94#--X
95# --X
96# --X
97#--X
98#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101#There is still a bit of work to do on this format:
102#* enable other text file types, eg html, pdf etc
103#* make the document paging work properly
104#* add pagenum as Title unless a Title is present?
105
106# All the supplementary image and text files should be in the same folder as
107# the .item file.
108#
109# To display the images instead of the document text, you can use [srcicon]
110# in the DocumentText format statement.
111# For example,
112#
113# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114#
115# To have it create thumbnail size images, use the '-thumbnail' option.
116# To have it create medium size images for display, use the '-screenview'
117# option. As usual, running
118# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
119
120# If you want the resulting documents to be presented with a table of
121# contents, use '-documenttype hierarchy', otherwise they will have
122# next and previous arrows, and a goto page X box.
123
124# If you have used -screenview, you can also use [screenicon] in the format
125# statement to display the smaller image. Here is an example that switches
126# between the two:
127#
128# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129#
130# Additional metadata can be added into the .item files, alternatively you can
131# use normal metadata.xml files, with the name of the xxx.item file as the
132# FileName (only for document level metadata).
133
134package PagedImgPlug;
135
136use XMLPlug;
137use strict;
138no strict 'refs'; # allow filehandles to be variables and viceversa
139
140sub BEGIN {
141 @PagedImgPlug::ISA = ('XMLPlug');
142}
143
# Values accepted by the -documenttype option.
my $type_list = [
    {   name => 'paged',
        desc => '{PagedImgPlug.documenttype.paged}',
    },
    {   name => 'hierarchy',
        desc => '{PagedImgPlug.documenttype.hierarchy}',
    },
];

# Option descriptions for this plugin. The two default expressions are
# computed by the subs defined further down in this file.
my $arguments = [
    {   name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'string',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'string',
        deft => get_default_block_exp(),
        reqd => 'no',
    },
    {   name => 'title_sub',
        desc => '{HTMLPlug.title_sub}',
        type => 'string',
        deft => '',
    },
    {   name => 'noscaleup',
        desc => '{ImagePlug.noscaleup}',
        type => 'flag',
        reqd => 'no',
    },
    {   name => 'thumbnail',
        desc => '{PagedImgPlug.thumbnail}',
        type => 'flag',
        reqd => 'no',
    },
    {   name  => 'thumbnailsize',
        desc  => '{ImagePlug.thumbnailsize}',
        type  => 'int',
        deft  => '100',
        range => '1,',
        reqd  => 'no',
    },
    {   name => 'thumbnailtype',
        desc => '{ImagePlug.thumbnailtype}',
        type => 'string',
        deft => 'gif',
        reqd => 'no',
    },
    {   name => 'screenview',
        desc => '{PagedImgPlug.screenview}',
        type => 'flag',
        reqd => 'no',
    },
    {   name  => 'screenviewsize',
        desc  => '{PagedImgPlug.screenviewsize}',
        type  => 'int',
        deft  => '500',
        range => '1,',
        reqd  => 'no',
    },
    {   name => 'screenviewtype',
        desc => '{PagedImgPlug.screenviewtype}',
        type => 'string',
        deft => 'jpg',
        reqd => 'no',
    },
    {   name => 'converttotype',
        desc => '{ImagePlug.converttotype}',
        type => 'string',
        deft => '',
        reqd => 'no',
    },
    {   name  => 'minimumsize',
        desc  => '{ImagePlug.minimumsize}',
        type  => 'int',
        deft  => '100',
        range => '1,',
        reqd  => 'no',
    },
    {   name => 'headerpage',
        desc => '{PagedImgPlug.headerpage}',
        type => 'flag',
        reqd => 'no',
    },
    {   name => 'documenttype',
        desc => '{PagedImgPlug.documenttype}',
        type => 'enum',
        list => $type_list,
        deft => 'paged',
        reqd => 'no',
    },
];


# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    name     => 'PagedImgPlug',
    desc     => '{PagedImgPlug.desc}',
    inherits => 'yes',
    args     => $arguments,
};
226
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to XMLPlug's constructor
#   $hashArgOptLists - hash ref; this plugin's argument list and option
#                      record are appended under "ArgList" and "OptList"
# Returns the XMLPlug object re-blessed into $class.
sub new {
    my $class = shift @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    if (defined $arguments) {
        push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    }
    if (defined $options) {
        push @{ $hashArgOptLists->{"OptList"} }, $options;
    }

    # The pushes above run before this check, exactly as in the original
    # ordering of statements.
    my $self;
    if (defined $hashArgOptLists) {
        $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = XMLPlug->new($pluginlist, $inputargs);
    }

    return bless $self, $class;
}
239
# Default -process_exp: a regular expression (as a string) matching
# file names that end in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.item$';
}
245
# Default -block_exp. Ideally everything except the .item files would be
# blocked; instead this blocks the common image and text file extensions
# (case-insensitively) plus names ending in "~".
sub get_default_block_exp {
    my ($self) = @_;

    return '(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$';
}
253
# Return the name of a temporary file with the given extension that does
# not exist yet. Starts from util::get_tmp_filename() and appends a
# counter to the base name until the candidate is unused.
sub _unique_tmp_filename {
    my ($extension) = @_;

    my $filehead  = &util::get_tmp_filename();
    my $candidate = "$filehead.$extension";
    my $n         = 1;
    while (-e $candidate) {
        $candidate = "$filehead$n.$extension";
        $n++;
    }
    return $candidate;
}

# Attach one page image to a section of the document: optionally convert
# and/or rotate it, record its metadata, associate the file, and create
# the thumbnail and screenview versions when those options are set.
# Image details are discovered with the ImageMagick "identify" and
# "convert" command line tools.
#
#   $filename - image file name with full path
#   $srcfile  - image file name without path
#   $doc_obj  - document object that receives metadata and associated files
#   $section  - the section the image belongs to
#   $rotate   - "r" to rotate the image by 180 degrees (optional)
#
# Returns 0 if either file name is empty or the file is smaller than the
# 'minimumsize' option; otherwise returns the image type string.
#
# Temporary files are recorded in $self->{'tmp_filename1'..'tmp_filename3'};
# read() removes them after the document has been processed.
sub process_image {
    my $self = shift @_;
    my ($filename, $srcfile, $doc_obj, $section, $rotate) = @_;
    $rotate = 0 unless defined $rotate;

    # the top section refers to [assocfilepath] directly, child sections
    # have to go through their parent
    my $top = ($section eq $doc_obj->get_top_section()) ? 1 : 0;
    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if ($srcfile eq "" || $filename eq "");

    # Skip files below the minimum size. The original code printed the
    # "skipping" message but then carried on processing the image.
    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize) {
        my $filesize = (-s $filename) || 0;    # missing file counts as 0 bytes
        if ($filesize < $minimumsize) {
            print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
                if ($verbosity > 1);
            return 0;
        }
    }

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype    = $self->{'converttotype'};
    my $originalfilename = "";           # only set if we convert or rotate
    my $type             = "unknown";
    my $converted        = 0;

    if ($converttotype ne "" && $filename !~ /$converttotype$/) {
        $converted        = 1;
        $originalfilename = $filename;
        $filename         = _unique_tmp_filename($converttotype);
        $self->{'tmp_filename1'} = $filename;

        my $rotate_option = ($rotate eq "r") ? "-rotate 180 " : "";

        my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
        print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

        $type = $converttotype;
    }
    elsif ($rotate eq "r") {
        $originalfilename = $filename;
        $filename         = &util::get_tmp_filename();
        # remember the rotated copy so that read() can delete it; it was
        # previously left behind
        $self->{'tmp_filename1'} = $filename;

        my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
        print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
    }

    # Add the image metadata
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//;    # the source file name without its extension

    # name under which the image is associated: a converted image gets the
    # new extension, otherwise the source name is used as is
    my $file = $converted ? "$id.$converttotype" : $srcfile;

    my $url    = $file;        # the new file name prepared for a url
    my $srcurl = $srcfile;
    $url    =~ s/ /%20/g;
    $srcurl =~ s/ /%20/g;

    $doc_obj->add_metadata ($section, "Image", $url);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata ($section, "Source", $srcurl);

    my ($image_type, $image_width, $image_height, $image_size)
        = &identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata ($section, "ImageType", $image_type);
    $doc_obj->add_metadata ($section, "ImageWidth", $image_width);
    $doc_obj->add_metadata ($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata ($section, "ImageSize", $image_size);
    $doc_obj->add_metadata ($section, "FileFormat", "PagedImg");

    if ($type eq "unknown" && $image_type) {
        $type = $image_type;
    }

    if ($top) {
        $doc_obj->add_metadata ($section, "srclink",
            "<a href=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");
        $doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");
    }
    else {
        $doc_obj->add_metadata ($section, "srclink",
            "<a href=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");
        $doc_obj->add_metadata ($section, "srcicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");
    }
    $doc_obj->add_metadata ($section, "/srclink", "</a>");

    # Add the image as an associated file
    $doc_obj->associate_file($filename, $file, "image/$type", $section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
        # Make the thumbnail image
        my $thumbnailsize = $self->{'thumbnailsize'} || 100;
        my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

        my $thumbnailfile = _unique_tmp_filename($thumbnailtype);
        $self->{'tmp_filename2'} = $thumbnailfile;

        # Generate the thumbnail with convert
        my $command = "convert -verbose -geometry $thumbnailsize"
            . "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
        print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

        # Add the thumbnail as an associated file ...
        if (-e "$thumbnailfile") {
            $doc_obj->associate_file("$thumbnailfile", $id."thumb.$thumbnailtype", "image/$thumbnailtype", $section);
            $doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
            $doc_obj->add_metadata ($section, "Thumb", $id."thumb.$thumbnailtype");
            if ($top) {
                $doc_obj->add_metadata ($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
            }
            else {
                $doc_obj->add_metadata ($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
            }
        }

        # Extract thumbnail dimensions from the convert output
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ThumbWidth", $1);
            $doc_obj->add_metadata ($section, "ThumbHeight", $2);
        }
    }

    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

        # To do: if the actual image is smaller than the screenview size,
        # we should use the original !

        my $screenviewsize = $self->{'screenviewsize'} || 500;
        my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';

        my $screenviewfilename = _unique_tmp_filename($screenviewtype);
        $self->{'tmp_filename3'} = $screenviewfilename;

        # make the screenview image
        my $command = "convert -verbose -geometry $screenviewsize"
            . "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
        print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

        # get screenview dimensions
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }
        elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
            # if the image hasn't changed size, the previous regex doesn't match
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }

        # add the screenview as an associated file ...
        if (-e "$screenviewfilename") {
            $doc_obj->associate_file("$screenviewfilename", $id."sv.$screenviewtype",
                "image/$screenviewtype", $section);
            # report the name that is actually used (no space before "sv")
            print $outhandle "associating screen file $screenviewfilename as name ${id}sv.$screenviewtype\n" if ($verbosity > 2);

            $doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
            $doc_obj->add_metadata ($section, "Screen", $id."sv.$screenviewtype");

            if ($top) {
                $doc_obj->add_metadata ($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
            }
            else {
                $doc_obj->add_metadata ($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
            }
        }
        else {
            print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
        }
    }

    return $type;
}
480
481
482
# Discover the characteristics of an image file by running the
# ImageMagick "identify" command and parsing its output.
#
#   $image     - image file name
#   $outhandle - filehandle for progress output
#   $verbosity - the command is echoed above 2, its output above 3
#
# Returns the list (type, width, height, size); every element that could
# not be parsed from the output is the string "unknown".
sub identify {
    my ($image, $outhandle, $verbosity) = @_;

    # Run "identify", capturing stderr along with stdout
    my $command = "identify \"$image\" 2>&1";
    print $outhandle "$command\n" if ($verbosity > 2);
    my $result = `$command`;
    print $outhandle "$result\n" if ($verbosity > 3);

    # Type, width and height follow the (literal) file name in the output
    my ($type, $width, $height) = ('unknown', 'unknown', 'unknown');
    my $image_safe = quotemeta $image;
    if ($result =~ /^$image_safe (\w+) (\d+)x(\d+)/) {
        ($type, $width, $height) = ($1, $2, $3);
    }

    # The size is reported either in bytes ("b") or kilobytes ("kb")
    my $size = "unknown";
    if ($result =~ m/^.* ([0-9]+)b/) {
        $size = $1;
    }
    elsif ($result =~ m/^.* ([0-9]+)kb/) {
        $size = 1024 * $1;
    }

    print $outhandle "file: $image:\t $type, $width, $height, $size\n"
        if ($verbosity > 3);

    return ($type, $width, $height, $size);
}
522
523
# Build a document object from one .item file.
#
# Applies the usual blocking / process_exp checks, tidies the item file in
# place (vertical tabs removed, bare ampersands escaped), decides whether
# it is in the XML or the plain text format, and parses it accordingly.
#
# Returns:
#   0             - the file was blocked
#   undef         - the file is not one this plugin processes
#   -1            - the XML item file could not be parsed
#   (1, $doc_obj) - success
# Note that $base_dir might be "" and that $file might include directories.
sub read_into_doc_obj {
    my $self = shift @_;
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle   = $self->{'outhandle'};
    my $smart_block = $self->{'smart_block'};

    my $filename = &util::filename_cat($base_dir, $file);

    if ($self->associate_with($file, $filename, $metadata)) {
        # a form of smart block
        $self->{'num_blocked'}++;
        return 0;    # blocked
    }

    if ($smart_block) {
        if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1) {
            $self->{'num_blocked'}++;
            return 0;    # blocked
        }
    }
    elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;    # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        # kept as an explicit undef so list-context callers still receive
        # one (undefined) status element
        return undef;
    }

    print $outhandle "PagedImgPlug processing \"$filename\"\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);

    # Decide whether this is an old text .item file or a new xml .item
    # file: it is xml if the first non-empty line contains <PagedDocument.
    # The "defined" guard stops the scan at end of file; without it an
    # empty or all-blank item file made this loop spin forever.
    my $xml_version = 0;
    open(my $item_fh, '<', $filename) || die "couldn't open $filename\n";
    my $line = <$item_fh>;
    while (defined $line && $line !~ /\w/) {
        $line = <$item_fh>;
    }
    close $item_fh;
    if (defined $line && $line =~ /<PagedDocument/) {
        $xml_version = 1;
    }

    # Tidy up the item file: some metadata titles contain vertical tabs,
    # and bare ampersands have to be escaped. The tidied copy replaces the
    # original file, so ampersands that already start an entity are left
    # alone - otherwise every re-import would turn "&amp;" into
    # "&amp;amp;" and so on.
    my $backup_filename = "backup.item";
    open(my $backup_fh, '>', $backup_filename) || die "couldn't write to $backup_filename\n";
    open($item_fh, '<', $filename) || die "couldn't open $filename\n";
    while (defined(my $raw = <$item_fh>)) {
        $raw =~ s/\x0B+//g;
        $raw =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)/&amp;/g;
        print $backup_fh $raw;
    }
    close $item_fh;
    close $backup_fh;
    require File::Copy;    # make sure copy() is loaded before it is called
    &File::Copy::copy ($backup_filename, $filename);
    &util::rm($backup_filename);

    my $doc_obj;
    if ($xml_version) {
        $file =~ s/^[\/\\]+//;    # $file often begins with / so we'll tidy it up
        $self->{'file'}      = $file;
        $self->{'filename'}  = $filename;
        $self->{'processor'} = $processor;
        $self->{'metadata'}  = $metadata;
        $self->{'gli'}       = $gli;
        eval {
            $@ = "";
            my $xslt = $self->{'xslt'};
            if (defined $xslt && ($xslt ne "")) {
                # perform xslt
                my $transformed_xml = $self->apply_xslt($xslt, $filename);

                # feed transformed file (now in memory as string) into XML parser
                $self->parse_string($transformed_xml);
            }
            else {
                $self->parse_file($filename);
            }
        };

        if ($@) {

            # parsefile may either croak somewhere in XML::Parser (e.g. because
            # the document is not well formed) or die somewhere in XMLPlug or a
            # derived plugin (e.g. because we're attempting to process a
            # document whose DOCTYPE is not meant for this plugin). For the
            # first case we'll print a warning and continue, for the second
            # we'll just continue quietly

            print STDERR "**** XML Parse Error is: $@\n";

            my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
            if (defined $msg) {
                my $plugin_name = ref ($self);
                print $outhandle "$plugin_name failed to process $file ($msg)\n";
            }

            # reset ourself for the next document
            $self->{'section_level'} = 0;
            print STDERR "<ProcessingError n='$file'>\n" if ($gli);
            return -1;    # error during processing
        }
        $doc_obj = $self->{'doc_obj'};
    }
    else {
        # split the full name into directory and bare file name
        my ($dir);
        ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename, $dir, $file, $processor);
    }

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    $self->extra_metadata ($doc_obj, $section, $metadata);

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    $self->{'num_processed'}++;
    return (1, $doc_obj);
}
680
# Read one file: build the document with read_into_doc_obj(), hand it to
# the processor when that succeeded, then remove the temporary image
# files recorded in $self->{'tmp_filename1'..'tmp_filename3'}.
#
# Returns the status from read_into_doc_obj(); 1 means the file has been
# processed.
sub read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    my $processed_ok = defined $process_status && $process_status == 1;
    if ($processed_ok) {
        # process the document
        $processor->process($doc_obj);
        undef $doc_obj;
    }

    # The temporary files are cleaned up here rather than in process_image
    # because associated files aren't actually copied until after process
    # has been run.
    foreach my $slot ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
        next unless defined $self->{$slot};
        next unless -e $self->{$slot};
        &util::rm($self->{$slot});
    }

    return $process_status;
}
718
# XML start-tag handler. The attributes of the element are read from the
# global hash %_.
#   PagedDocument    - makes the top section the current section
#   PageGroup / Page - appends a child section to the current section,
#                      counts it, records its PageNum, and processes the
#                      imgfile / txtfile attributes (dummy text is added
#                      when there is no txtfile)
#   Metadata         - remembers the metadata name until the end tag
sub xml_start_tag {
    my $self = shift @_;
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as the last child of the current one
        my $last_child = $doc_obj->get_end_child($self->{'current_section'});
        $self->{'current_section'} = $doc_obj->insert_section($last_child);
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'};    #TODO!!
        $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);

        my $imgfile = $_{'imgfile'};
        if (defined $imgfile) {
            $self->process_image($self->{'base_dir'} . $imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my $txtfile = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text ($self->{'base_dir'} . $txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($self->{'current_section'}, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
749
# XML end-tag handler.
#   Page / PageGroup - if the section has no Title but has a PageNum, the
#                      PageNum is added as its Title; the current section
#                      then moves back to the parent
#   Metadata         - adds the collected name/value pair to the current
#                      section and clears both
# Every other end tag is ignored.
sub xml_end_tag {
    my $self = shift @_;
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # if Title hasn't been assigned, set PageNum as Title
        if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
            my $pagenum = $doc_obj->get_metadata_element ($section, "PageNum");
            if (defined $pagenum) {
                $doc_obj->add_utf8_metadata ($section, "Title", $pagenum);
            }
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    }
    elsif ($element eq "Metadata") {
        my $name  = $self->{'metadata_name'};
        my $value = $self->{'metadata_value'};
        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $name, $value);
        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
}
771
772
# XML character-data handler: while the most recently opened element is
# Metadata, the text (read from $_) is appended to the pending metadata
# value; text seen at any other time is ignored.
sub xml_text {
    my $self = shift @_;
    my ($expat) = @_;

    $self->{'metadata_value'} .= $_ if $self->{'element'} eq "Metadata";
}
781
# DOCTYPE handler: deliberately does nothing.
sub xml_doctype {
    return;
}
784
# Start a new document for the XML item file named in $self->{'filename'}:
# creates the document object, records the item file's directory as
# 'base_dir', resets the page counter, and sets the gsdlthistype and
# Source metadata on the top section.
sub open_document {
    my $self = shift @_;

    # create a new document
    my $doc_obj = doc->new($self->{'filename'}, "indexed_doc");
    $self->{'doc_obj'} = $doc_obj;
    $doc_obj->set_OIDtype ($self->{'processor'}->{'OIDtype'});

    # split the full name into its directory part and the bare file name
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'}  = $dir;
    $self->{'num_pages'} = 0;

    # Setting gsdlthistype to Paged ensures the document will be treated
    # as a Paged doc, even if Titles are not numeric
    my $topsection = $doc_obj->get_top_section();
    my $thistype = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $thistype);

    $doc_obj->add_metadata ($topsection, "Source", $file);
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }
}
811
# Finish the current XML document: adds the Plugin, FileFormat and
# NumPages metadata to the top section and assigns the document's OID.
sub close_document {
    my $self = shift @_;

    my $doc_obj = $self->{'doc_obj'};

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "PagedImg");

    # add numpages metadata
    my $page_count = $self->{'num_pages'};
    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(), 'NumPages', $page_count);

    # add an OID
    $doc_obj->set_OID();
}
826
# Build a document from a plain text .item file.
#
#   $filename  - item file name with full path
#   $dir       - directory part of $filename; image and text file names
#                from the item file are appended to it
#   $file      - item file name without path (stored as Source metadata)
#   $processor - supplies the OIDtype for the new document
#
# Lines of the form "<name>value" become top-section metadata. Every other
# non-blank line is "pagenum:imagefile:textfile:r" (textfile and r are
# optional) and becomes one child section. Returns the document object.
sub process_item {
    my $self = shift @_;
    my ($filename, $dir, $file, $processor) = @_;

    my $outhandle = $self->{'outhandle'};

    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    my $topsection = $doc_obj->get_top_section();

    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    }
    else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);

    open(my $item_fh, '<', $filename) || die "couldn't open $filename\n";
    my $num = 0;
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # document level metadata; space around the value is dropped
            $doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
        # a line without any colon has no image name at all
        $imgname = "" unless defined $imgname;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # Process the image for this page. process_image returns 0 (never
        # undef) when it gives up, so test for a false value.
        my $result = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
        if (!$result) {
            print $outhandle "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            # process_text likewise returns 0 on failure
            $result = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);
            if (!$result) {
                print $outhandle "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    close $item_fh;

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID ();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
899
# Add the contents of a text file to a section as preformatted text.
#
#   $fullpath   - text file name with full path
#   $file       - text file name without path (not used here)
#   $doc_obj    - document object that receives the text
#   $cursection - section the text is added to
#
# Returns 1 on success, or 0 when no text could be read from the file.
sub process_text {
    my $self = shift @_;
    my ($fullpath, $file, $doc_obj, $cursection) = @_;

    # Work out language and encoding, then read the file with them
    my ($language, $encoding) = $self->textcat_get_language_encoding ($fullpath);

    my $text = "";
    &BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
    unless (length ($text)) {
        print "PagedImgPlug: ERROR: $fullpath contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    for ($text) {
        s/\\/\\\\/g;    # macro language
        s/_/\\_/g;      # macro language
        s/</&lt;/g;
        s/>/&gt;/g;
    }

    # insert preformat tags and add text to document object
    $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");

    return 1;
}
927
# Plugin specific processing of doc_obj. This plugin has nothing further
# to do at this stage, so the arguments are accepted and 1 is returned.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    return 1;
}
936
9371;
Note: See TracBrowser for help on using the repository browser.