Context Navigation

source: trunk/gsdl/perllib/plugins/PagedImgPlug.pm@ 10613

Last change on this file since 10613 was 10613, checked in by kjdon, 19 years ago
modified the item file metadata regex so that space is allowed (and removed) from the front of the value
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 31.7 KB

Line
1	###########################################################################
2	#
3	# PagedImgPlug.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImgPlug
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-thumbnail' option.
116	# To have it create medium size images for display, use the '-screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImgPlug;
135
136	use XMLPlug;
137	use strict;
138	no strict 'refs'; # allow filehandles to be variables and viceversa
139
# Set up inheritance at compile time: PagedImgPlug is a subclass of XMLPlug.
BEGIN {
    @PagedImgPlug::ISA = qw(XMLPlug);
}
143
# The values accepted by the 'documenttype' option (an enum); each entry
# pairs the value with the resource key of its description.
my $type_list = [
    {
        name => 'paged',
        desc => '{PagedImgPlug.documenttype.paged}',
    },
    {
        name => 'hierarchy',
        desc => '{PagedImgPlug.documenttype.hierarchy}',
    },
];
149
# Option descriptions for this plugin.  Each entry gives the option name,
# the resource key of its description, its type, and (where present) its
# default value, allowed range and whether it is required.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'string',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'string',
        deft => get_default_block_exp(),
        reqd => 'no',
    },
    {
        name => 'title_sub',
        desc => '{HTMLPlug.title_sub}',
        type => 'string',
        deft => '',
    },
    {
        name => 'noscaleup',
        desc => '{ImagePlug.noscaleup}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'thumbnail',
        desc => '{PagedImgPlug.thumbnail}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name  => 'thumbnailsize',
        desc  => '{ImagePlug.thumbnailsize}',
        type  => 'int',
        deft  => '100',
        range => '1,',
        reqd  => 'no',
    },
    {
        name => 'thumbnailtype',
        desc => '{ImagePlug.thumbnailtype}',
        type => 'string',
        deft => 'gif',
        reqd => 'no',
    },
    {
        name => 'screenview',
        desc => '{PagedImgPlug.screenview}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name  => 'screenviewsize',
        desc  => '{PagedImgPlug.screenviewsize}',
        type  => 'int',
        deft  => '500',
        range => '1,',
        reqd  => 'no',
    },
    {
        name => 'screenviewtype',
        desc => '{PagedImgPlug.screenviewtype}',
        type => 'string',
        deft => 'jpg',
        reqd => 'no',
    },
    {
        name => 'converttotype',
        desc => '{ImagePlug.converttotype}',
        type => 'string',
        deft => '',
        reqd => 'no',
    },
    {
        name  => 'minimumsize',
        desc  => '{ImagePlug.minimumsize}',
        type  => 'int',
        deft  => '100',
        range => '1,',
        reqd  => 'no',
    },
    {
        name => 'headerpage',
        desc => '{PagedImgPlug.headerpage}',
        type => 'flag',
        reqd => 'no',
    },
    {
        name => 'documenttype',
        desc => '{PagedImgPlug.documenttype}',
        type => 'enum',
        list => $type_list,
        deft => 'paged',
        reqd => 'no',
    },
];
220
221
# Top-level description of the plugin: its name, the resource key of its
# description, the 'inherits' flag and the option list defined in $arguments.
my $options = {
    name     => 'PagedImgPlug',
    desc     => '{PagedImgPlug.desc}',
    inherits => 'yes',
    args     => $arguments,
};
226
# Constructor.
#
# Arguments:
#   $class           - the class being instantiated
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the XMLPlug constructor
#   $hashArgOptLists - hash ref; this plugin's $arguments are appended to its
#                      "ArgList" entry and $options to its "OptList" entry
#
# Returns the object built by the XMLPlug constructor, re-blessed into $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    }
    if (defined $options) {
        push(@{ $hashArgOptLists->{"OptList"} }, $options);
    }

    # Use explicit Class->method calls rather than the indirect object form
    # ("new XMLPlug (...)"), which Perl has to guess at and which is easy to
    # misparse inside a package that defines its own new().
    my $self = (defined $hashArgOptLists)
        ? XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : XMLPlug->new($pluginlist, $inputargs);

    return bless $self, $class;
}
239
# Default 'process_exp': a regular expression (as a string) matching
# filenames that end in ".item".  The argument, if any, is ignored.
sub get_default_process_exp {
    my ($self) = @_;

    my $item_file_pattern = '\.item$';
    return $item_file_pattern;
}
245
# Default 'block_exp'.
#
# want to block everything except the .item ones
# but instead we will block images and txt files
#
# Returns a case-insensitive regular expression (as a string) matching
# filenames ending in .jpg/.jpeg, .gif, .png, .tif/.tiff, .txt/.text or "~".
# The alternatives must be separated by unescaped "|": with "\|" the pattern
# would only match names containing literal pipe characters and would
# therefore never block anything.
sub get_default_block_exp {
    my $self = shift(@_);

    return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$^;
}
253
# Create the thumbnail and screenview images, and discover the Image's
# size, width, and height using the convert utility.
#
# Arguments:
#   $filename - image filename with full path
#   $srcfile  - image filename without path
#   $doc_obj  - the document object the metadata and files are attached to
#   $section  - the current section of $doc_obj
#   $rotate   - optional; "r" means rotate the image by 180 degrees
#
# Returns 0 if the image was skipped (empty filename, or file smaller than
# the 'minimumsize' option); otherwise the image type string that was used
# for the associated file's mime type.
#
# Temporary files created here are recorded in $self->{'tmp_filename1'}
# (converted/rotated image), 'tmp_filename2' (thumbnail) and 'tmp_filename3'
# (screenview).  Each call overwrites these slots, so only the files from
# the most recent call are remembered.
sub process_image {
    my $self = shift(@_);
    my ($filename, $srcfile, $doc_obj, $section, $rotate) = @_;
    $rotate = 0 unless defined $rotate;

    my $top       = ($section eq $doc_obj->get_top_section()) ? 1 : 0;
    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if ($srcfile eq "" || $filename eq "");

    # Skip images that are too small.  (The message always said "skipping";
    # the early return makes the code actually do so.)
    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize && ((-s $filename) || 0) < $minimumsize) {
        print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
            if ($verbosity > 1);
        return 0;
    }

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype    = $self->{'converttotype'};
    my $originalfilename = "";    # only set if we do a conversion
    my $type             = "unknown";
    my $converted        = 0;

    if ($converttotype ne "" && $filename !~ /\Q$converttotype\E$/) {
        $converted        = 1;
        $originalfilename = $filename;

        # pick a temporary output name that does not exist yet
        my $filehead = util::get_tmp_filename();
        $filename = $filehead . ".$converttotype";
        my $n = 1;
        while (-e $filename) {
            $filename = $filehead . $n . ".$converttotype";
            $n++;
        }
        $self->{'tmp_filename1'} = $filename;

        my $rotate_option = "";
        if ($rotate eq "r") {
            $rotate_option = "-rotate 180 ";
        }

        my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
        print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

        $type = $converttotype;
    }
    elsif ($rotate eq "r") {
        $originalfilename = $filename;
        $filename         = util::get_tmp_filename();

        # remember the rotated copy so that it is removed by the same
        # clean-up that removes converted images
        $self->{'tmp_filename1'} = $filename;

        my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
        print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
    }

    # Add the image metadata
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//;    # the file name without an extension

    # the name the image is associated under: with the new extension if we
    # converted it, otherwise unchanged
    my $file = $converted ? "$id.$converttotype" : $srcfile;

    my $url    = $file;        # the new file name prepared for a url
    my $srcurl = $srcfile;
    $url    =~ s/ /%20/g;
    $srcurl =~ s/ /%20/g;

    $doc_obj->add_metadata($section, "Image", $url);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata($section, "Source", $srcurl);

    my ($image_type, $image_width, $image_height, $image_size)
        = identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata($section, "ImageType",   $image_type);
    $doc_obj->add_metadata($section, "ImageWidth",  $image_width);
    $doc_obj->add_metadata($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata($section, "ImageSize",   $image_size);
    $doc_obj->add_metadata($section, "FileFormat",  "PagedImg");

    if ($type eq "unknown" && $image_type) {
        $type = $image_type;
    }

    if ($top) {
        $doc_obj->add_metadata($section, "srclink",
            "<a href=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");
        $doc_obj->add_metadata($section, "srcicon",
            "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Image]\">");
    }
    else {
        $doc_obj->add_metadata($section, "srclink",
            "<a href=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");
        $doc_obj->add_metadata($section, "srcicon",
            "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Image]\">");
    }
    $doc_obj->add_metadata($section, "/srclink", "</a>");

    # Add the image as an associated file
    $doc_obj->associate_file($filename, $file, "image/$type", $section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
        # Make the thumbnail image
        my $thumbnailsize = $self->{'thumbnailsize'} || 100;
        my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

        my $filehead      = util::get_tmp_filename();
        my $thumbnailfile = $filehead . ".$thumbnailtype";
        my $n             = 1;
        while (-e $thumbnailfile) {
            $thumbnailfile = $filehead . $n . ".$thumbnailtype";
            $n++;
        }

        $self->{'tmp_filename2'} = $thumbnailfile;

        # Generate the thumbnail with convert
        my $command = "convert -verbose -geometry $thumbnailsize"
            . "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
        print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

        # Add the thumbnail as an associated file ...
        if (-e "$thumbnailfile") {
            $doc_obj->associate_file("$thumbnailfile", $id . "thumb.$thumbnailtype",
                "image/$thumbnailtype", $section);
            $doc_obj->add_metadata($section, "ThumbType", $thumbnailtype);
            $doc_obj->add_metadata($section, "Thumb", $id . "thumb.$thumbnailtype");
            if ($top) {
                $doc_obj->add_metadata($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
            }
            else {
                $doc_obj->add_metadata($section, "thumbicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
            }
        }

        # Extract Thumbnail metadata from convert output
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata($section, "ThumbWidth",  $1);
            $doc_obj->add_metadata($section, "ThumbHeight", $2);
        }
    }

    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

        # To do: if the actual image is smaller than the screenview size,
        # we should use the original !

        my $screenviewsize = $self->{'screenviewsize'} || 500;
        my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';

        my $filehead           = util::get_tmp_filename();
        my $screenviewfilename = $filehead . ".$screenviewtype";
        my $n                  = 1;
        while (-e $screenviewfilename) {
            $screenviewfilename = $filehead . $n . ".$screenviewtype";
            $n++;
        }
        $self->{'tmp_filename3'} = $screenviewfilename;

        # make the screenview image
        my $command = "convert -verbose -geometry $screenviewsize"
            . "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
        print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

        # get screenview dimensions, size and type
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata($section, "ScreenWidth",  $1);
            $doc_obj->add_metadata($section, "ScreenHeight", $2);
        }
        elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
            # if the image hasn't changed size, the previous regex doesn't match
            $doc_obj->add_metadata($section, "ScreenWidth",  $1);
            $doc_obj->add_metadata($section, "ScreenHeight", $2);
        }

        # add the screenview as an associated file ...
        if (-e "$screenviewfilename") {
            $doc_obj->associate_file("$screenviewfilename", $id . "sv.$screenviewtype",
                "image/$screenviewtype", $section);
            # report the name actually used for the associated file
            print $outhandle "associating screen file $screenviewfilename as name ${id}sv.$screenviewtype\n" if ($verbosity > 2);

            $doc_obj->add_metadata($section, "ScreenType", $screenviewtype);
            $doc_obj->add_metadata($section, "Screen", $id . "sv.$screenviewtype");

            if ($top) {
                $doc_obj->add_metadata($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
            }
            else {
                $doc_obj->add_metadata($section, "screenicon", "<img src=\"_httpcollection_/index/assoc/[parent(Top):assocfilepath]/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
            }
        }
        else {
            print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
        }
    }

    return $type;
}
480
481
482
# Run the ImageMagick "identify" command on an image file and pick the
# image's characteristics out of the command's output.
#
# Arguments:
#   $image     - path of the image file
#   $outhandle - filehandle used for progress messages
#   $verbosity - messages are printed when this is above 2 (the command)
#                or above 3 (the raw output and the parsed values)
#
# Returns the list ($type, $width, $height, $size); any value that could not
# be found in the output is the string "unknown".
sub identify {
    my ($image, $outhandle, $verbosity) = @_;

    # Ask "identify" for the file specs, capturing stderr as well
    my $command = "identify \"$image\" 2>&1";
    print $outhandle "$command\n" if ($verbosity > 2);
    my $result = `$command`;
    print $outhandle "$result\n" if ($verbosity > 3);

    # Type, width and height follow the (literal) file name in the output
    my ($type, $width, $height) = ('unknown', 'unknown', 'unknown');
    my $image_pattern = quotemeta $image;
    if ($result =~ /^$image_pattern (\w+) (\d+)x(\d+)/) {
        ($type, $width, $height) = ($1, $2, $3);
    }

    # The size is given either in bytes ("...b") or in kilobytes ("...kb")
    my $size;
    if ($result =~ m/^.* ([0-9]+)b/) {
        $size = $1;
    }
    elsif ($result =~ m/^.* ([0-9]+)kb/) {
        $size = 1024 * $1;
    }
    else {
        $size = "unknown";
    }

    print $outhandle "file: $image:\t $type, $width, $height, $size\n"
        if ($verbosity > 3);

    return ($type, $width, $height, $size);
}
522
523
# Build a document object from one .item file.
#
# PagedImgPlug does not read the file's text in the usual way: the .item file
# only lists metadata and the image/text files that make up the document.
#
# Note that $base_dir might be "" and that $file might include directories.
#
# Steps:
#   1. return 0 if the file is blocked, undef if it is not ours to process
#   2. decide whether the item file is the plain or the xml format (it is
#      xml if its first line containing a word character has <PagedDocument)
#   3. tidy the item file in place (strip vertical tabs, escape bare "&")
#   4. parse it (xml) or hand it to process_item (plain)
#   5. add cover image, inherited metadata and auto-extracted metadata
#
# Returns:
#   undef          - the file is not processed by this plugin
#   0              - the file was blocked
#   -1             - the xml item file could not be parsed
#   (1, $doc_obj)  - success
sub read_into_doc_obj {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle   = $self->{'outhandle'};
    my $smart_block = $self->{'smart_block'};

    my $filename = util::filename_cat($base_dir, $file);

    if ($self->associate_with($file, $filename, $metadata)) {
        # a form of smart block
        $self->{'num_blocked'}++;
        return 0;    # blocked
    }

    if ($smart_block) {
        if (defined $self->{'file_blocks'}->{$filename}
            && $self->{'file_blocks'}->{$filename} == 1)
        {
            $self->{'num_blocked'}++;
            return 0;    # blocked
        }
    }
    elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;        # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }

    print $outhandle "PagedImgPlug processing \"$filename\"\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);

    # Read the item file once; it is used both to detect the format and to
    # produce the tidied copy.
    open(my $item_fh, '<', $filename) || die "couldn't open $filename\n";
    my @lines = <$item_fh>;
    close($item_fh);

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file - for now the test is if the first non-empty line is
    # <PagedDocument> then its xml.  Iterating over the lines (rather than
    # reading until one matches) also terminates on a file with no such line.
    my $xml_version = 0;
    foreach my $line (@lines) {
        next unless $line =~ /\w/;
        $xml_version = 1 if ($line =~ /<PagedDocument/);
        last;
    }

    # Tidy up the item file: some metadata titles contain \vt (vertical tab),
    # and bare ampersands are escaped.  Ampersands that already start an
    # entity are left alone so that tidying the same file again is harmless.
    # NOTE(review): the ampersand substitution as found was a no-op
    # (s/&/&/g), presumably a mangled "&amp;" replacement - confirm.
    my $changed = 0;
    my @tidied;
    foreach my $line (@lines) {
        my $tidy = $line;
        $tidy =~ s/\x0B+//g;
        $tidy =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
        $changed = 1 if ($tidy ne $line);
        push(@tidied, $tidy);
    }

    # Only rewrite the source file when tidying actually changed something.
    # The tidied copy goes to a temporary file rather than a fixed
    # "backup.item" in the current directory.
    if ($changed) {
        my $backup_filename = util::get_tmp_filename() . ".item";
        open(my $backup_fh, '>', $backup_filename)
            || die "couldn't write to $backup_filename\n";
        print $backup_fh @tidied;
        close($backup_fh) || die "couldn't write to $backup_filename\n";
        File::Copy::copy($backup_filename, $filename);
        util::rm($backup_filename);
    }

    my $doc_obj;
    if ($xml_version) {
        $file =~ s/^[\/\\]+//;    # $file often begins with / so we'll tidy it up
        $self->{'file'}      = $file;
        $self->{'filename'}  = $filename;
        $self->{'processor'} = $processor;
        $self->{'metadata'}  = $metadata;
        $self->{'gli'}       = $gli;
        eval {
            my $xslt = $self->{'xslt'};
            if (defined $xslt && ($xslt ne "")) {
                # perform xslt
                my $transformed_xml = $self->apply_xslt($xslt, $filename);

                # feed transformed file (now in memory as string) into XML parser
                $self->parse_string($transformed_xml);
            }
            else {
                $self->parse_file($filename);
            }
        };

        if ($@) {

            # parsefile may either croak somewhere in XML::Parser (e.g. because
            # the document is not well formed) or die somewhere in XMLPlug or a
            # derived plugin (e.g. because we're attempting to process a
            # document whose DOCTYPE is not meant for this plugin). For the
            # first case we'll print a warning and continue, for the second
            # we'll just continue quietly

            print STDERR "**** XML Parse Error is: $@\n";

            my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
            if (defined $msg) {
                my $plugin_name = ref($self);
                print $outhandle "$plugin_name failed to process $file ($msg)\n";
            }

            # reset ourself for the next document
            $self->{'section_level'} = 0;
            print STDERR "<ProcessingError n='$file'>\n" if ($gli);
            return -1;    # error during processing
        }
        $doc_obj = $self->{'doc_obj'};
    }
    else {
        # split the full path into directory (with trailing separator) and
        # file name
        my ($dir);
        ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename, $dir, $file, $processor);
    }

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    $self->extra_metadata($doc_obj, $section, $metadata);

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    $self->{'num_processed'}++;
    return (1, $doc_obj);
}
680
# Read one file: build the document object with read_into_doc_obj, pass it to
# the processor, then remove the temporary image files.
#
# Returns the status from read_into_doc_obj unchanged; a status of 1 means
# the file has been processed.
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    my $was_processed = (defined $process_status) && ($process_status == 1);
    if ($was_processed) {
        # process the document, then let go of it
        $processor->process($doc_obj);
        undef $doc_obj;
    }

    # clean up temporary files - we do this here instead of in
    # process_image because associated files aren't actually copied
    # until after process has been run.
    foreach my $slot ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
        next unless defined $self->{$slot};
        next unless -e $self->{$slot};
        util::rm($self->{$slot});
    }

    return $process_status;
}
718
# Handle the start of an xml element in an xml format item file.
#
#   PagedDocument     - the current section becomes the top section
#   PageGroup / Page  - a new child section is appended to the current
#                       section and becomes current; its image and text
#                       files (if given) are processed
#   Metadata          - the metadata name is remembered until the end tag
#
# The element's attributes are read from the global hash %_.
# NOTE(review): presumably %_ is filled in by the xml parser before this
# handler is called - confirm against XMLPlug.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        my $parent_section = $self->{'current_section'};
        $self->{'current_section'}
            = $doc_obj->insert_section($doc_obj->get_end_child($parent_section));
        $self->{'num_pages'}++;
        my $new_section = $self->{'current_section'};

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'};    #TODO!!
        $doc_obj->set_utf8_metadata_element($new_section, 'PageNum', $pagenum);

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            $self->process_image($self->{'base_dir'} . $imgfile, $imgfile, $doc_obj, $new_section);
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text($self->{'base_dir'} . $txtfile, $txtfile, $doc_obj, $new_section);
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($new_section, gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
749
# Handle the end of an xml element in an xml format item file.
#
#   Page / PageGroup - if the section has PageNum metadata but no Title, the
#                      PageNum is added as the Title; the current section
#                      then moves back to its parent
#   Metadata         - the collected name/value pair is added to the current
#                      section and the collectors are reset
#
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $section = $self->{'current_section'};

    if ($element eq "Page" || $element eq "PageGroup") {
        # if Title hasn't been assigned, set PageNum as Title
        if (!defined $doc_obj->get_metadata_element($section, "Title")
            && defined $doc_obj->get_metadata_element($section, "PageNum"))
        {
            $doc_obj->add_utf8_metadata($section, "Title",
                $doc_obj->get_metadata_element($section, "PageNum"));
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    }
    elsif ($element eq "Metadata") {
        $doc_obj->add_utf8_metadata($section, $self->{'metadata_name'}, $self->{'metadata_value'});
        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
771
772
# Handle character data: while the current element is Metadata, the text
# (read from $_) is appended to the metadata value being collected.  Text
# inside any other element is ignored.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    return unless $self->{'element'} eq "Metadata";

    $self->{'metadata_value'} = $self->{'metadata_value'} . $_;
}
781
# Doctype handler: deliberately does nothing.
sub xml_doctype {
    return;
}
784
# Start a new document for the xml format item file in $self->{'filename'}.
#
# Creates the document object (stored in $self->{'doc_obj'}), sets its OID
# type from the processor, records the item file's directory in
# $self->{'base_dir'}, resets the page count and sets the gsdlthistype and
# Source metadata on the top section.  With the 'headerpage' option, dummy
# text is added to the top section as well.
sub open_document {
    my $self = shift(@_);

    # create a new document
    my $doc_obj = doc->new($self->{'filename'}, "indexed_doc");
    $self->{'doc_obj'} = $doc_obj;
    $doc_obj->set_OIDtype($self->{'processor'}->{'OIDtype'});

    # split the full path into directory (with trailing separator) and file
    # name; the quantifiers are needed for paths longer than two characters
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'}  = $dir;
    $self->{'num_pages'} = 0;

    my $topsection = $doc_obj->get_top_section();
    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "Paged");
    }
    else {
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata($topsection, "Source", $file);
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }
}
811
# Finish the document built from an xml format item file: add the Plugin,
# FileFormat and NumPages metadata to the top section, then set the OID.
sub close_document {
    my $self = shift(@_);

    my $doc_obj     = $self->{'doc_obj'};
    my $plugin_type = "$self->{'plugin_type'}";

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", $plugin_type);
    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "PagedImg");

    # add numpages metadata
    my $page_count = $self->{'num_pages'};
    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), 'NumPages', $page_count);

    # add an OID
    $doc_obj->set_OID();
}
826
# Build a document object from a plain format item file.
#
# Arguments:
#   $filename  - the item file, with full path
#   $dir       - the directory part of $filename; image and text file names
#                from the item file are appended directly to it
#   $file      - the item file name without path (stored as Source metadata)
#   $processor - supplies the OID type for the new document
#
# Lines of the form <name>value become metadata on the top section (space
# around the value is removed).  Every other line containing a word character
# is a page, pagenum:imagefile:textfile:r, and becomes a child section whose
# Title is the page number.
#
# Returns the new document object.
sub process_item {
    my $self = shift(@_);
    my ($filename, $dir, $file, $processor) = @_;

    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype($processor->{'OIDtype'});
    my $topsection = $doc_obj->get_top_section();

    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "Paged");
    }
    else {
        $doc_obj->set_utf8_metadata_element($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata($topsection, "Source", $file);

    open(my $item_fh, '<', $filename) || die "couldn't open $filename\n";
    my $num = 0;
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # document level metadata: <name>value
            $doc_obj->set_utf8_metadata_element($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//;    #remove space at the front
        $line =~ s/\s+$//;    #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;
        $imgname = "" unless defined $imgname;    # line without an image field

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page; process_image returns 0 when it
        # skipped the image, so test for a false value rather than for undef
        my $result = $self->process_image($dir . $imgname, $imgname, $doc_obj, $cursection, $rotate);
        if (!$result) {
            print "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            $result = $self->process_text($dir . $txtname, $txtname, $doc_obj, $cursection);
            if (!defined $result) {
                print "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        }
        else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    close($item_fh);

    # if we want a header page, we need to add some text into the top section,
    # otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($topsection, 'NumPages', "$num");
    return $doc_obj;
}
899
# Read a page's text file and add its contents, wrapped in <pre> tags, as the
# text of a section.
#
# Arguments:
#   $fullpath   - the text file, with full path
#   $file       - the text file name without path (not used here)
#   $doc_obj    - the document object
#   $cursection - the section the text is added to
#
# Returns 0 if the file yielded no text (a message is printed when verbosity
# is set), 1 otherwise.
sub process_text {
    my $self = shift(@_);
    my ($fullpath, $file, $doc_obj, $cursection) = @_;

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($fullpath);

    my $text = "";
    BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
    if (!length($text)) {
        print "PagedImgPlug: ERROR: $fullpath contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g;    # macro language
    $text =~ s/_/\\_/g;      # macro language

    # the text is displayed inside <pre> tags, so angle brackets in it must
    # be turned into entities or they would be taken for markup
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # insert preformat tags and add text to document object
    $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");

    return 1;
}
927
# Plugin specific processing of doc_obj: there is none for this plugin, so
# this just reports success by returning 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    return 1;
}
936
937	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: