Context Navigation

source: gsdl/trunk/perllib/plugins/PagedImgPlug.pm@ 14174

Last change on this file since 14174 was 14174, checked in by qq6, 17 years ago
add NoText metadata which can be used to suppress the dummy text
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 32.5 KB

Line
1	###########################################################################
2	#
3	# PagedImgPlug.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImgPlug
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	# There are two formats for the item files: a plain text format, and an XML
37	# format. You can use either format, and can have both formats in the same
38	# collection if you like. If you use the plain format, the file must not
39	# start with <PagedDocument>, because that is how the XML format is detected.
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-thumbnail' option.
116	# To have it create medium size images for display, use the '-screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImgPlug' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImgPlug;
135
136	use XMLPlug;
137	use strict;
138	no strict 'refs'; # allow filehandles to be variables and viceversa
139
# Set up inheritance at compile time: PagedImgPlug derives from XMLPlug.
BEGIN {
    @PagedImgPlug::ISA = qw(XMLPlug);
}
143
# Values accepted by the -documenttype option (see $arguments below).
my $type_list = [
    { name => 'paged',     desc => '{PagedImgPlug.documenttype.paged}' },
    { name => 'hierarchy', desc => '{PagedImgPlug.documenttype.hierarchy}' },
];

# Plugin-specific arguments.  Each entry gives the option name, the key of
# its description string, its type, and (where present) its default value,
# permitted range and whether it is required.
my $arguments = [
    {   name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'string',
        deft => &get_default_process_exp(),
        reqd => 'no',
    },
    {   name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'string',
        deft => &get_default_block_exp(),
        reqd => 'no',
    },
    {   name => 'title_sub',
        desc => '{HTMLPlug.title_sub}',
        type => 'string',
        deft => '',
    },
    {   name => 'noscaleup',
        desc => '{ImagePlug.noscaleup}',
        type => 'flag',
        reqd => 'no',
    },
    {   name => 'thumbnail',
        desc => '{PagedImgPlug.thumbnail}',
        type => 'flag',
        reqd => 'no',
    },
    {   name  => 'thumbnailsize',
        desc  => '{ImagePlug.thumbnailsize}',
        type  => 'int',
        deft  => '100',
        range => '1,',
        reqd  => 'no',
    },
    {   name => 'thumbnailtype',
        desc => '{ImagePlug.thumbnailtype}',
        type => 'string',
        deft => 'gif',
        reqd => 'no',
    },
    {   name => 'screenview',
        desc => '{PagedImgPlug.screenview}',
        type => 'flag',
        reqd => 'no',
    },
    {   name  => 'screenviewsize',
        desc  => '{PagedImgPlug.screenviewsize}',
        type  => 'int',
        deft  => '500',
        range => '1,',
        reqd  => 'no',
    },
    {   name => 'screenviewtype',
        desc => '{PagedImgPlug.screenviewtype}',
        type => 'string',
        deft => 'jpg',
        reqd => 'no',
    },
    {   name => 'converttotype',
        desc => '{ImagePlug.converttotype}',
        type => 'string',
        deft => '',
        reqd => 'no',
    },
    {   name  => 'minimumsize',
        desc  => '{ImagePlug.minimumsize}',
        type  => 'int',
        deft  => '100',
        range => '1,',
        reqd  => 'no',
    },
    {   name => 'headerpage',
        desc => '{PagedImgPlug.headerpage}',
        type => 'flag',
        reqd => 'no',
    },
    {   name => 'documenttype',
        desc => '{PagedImgPlug.documenttype}',
        type => 'enum',
        list => $type_list,
        deft => 'paged',
        reqd => 'no',
    },
];
220
221
# Option record for this plugin; new() pushes it onto the shared "OptList".
my $options = {
    name     => 'PagedImgPlug',
    desc     => '{PagedImgPlug.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
227
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the XMLPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option tables
#                      are appended to its "ArgList" and "OptList" entries
#
# Returns the object built by XMLPlug, re-blessed into $class.
sub new {
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push (@$pluginlist, $class);

    if (defined $arguments) {
        push (@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push (@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit class-method call instead of the ambiguous indirect-object
    # form "new XMLPlug(...)".
    my $self = XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
240
# Default regular expression (as a string) selecting the files this plugin
# processes: anything whose name ends in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    my $item_file_exp = '\.item$';
    return $item_file_exp;
}
246
# Name of the XML root element handled by this plugin.
sub get_doctype {
    my ($self) = @_;

    my $doctype = "PagedDocument";
    return $doctype;
}
252
253
# Default regular expression (as a string) for files to block from other
# plugins.  Ideally everything except the .item files would be blocked;
# instead we block the image and text files that item files refer to, plus
# editor backup files ending in "~".  Matching is case-insensitive.
sub get_default_block_exp {
    my $self = shift (@_);

    # The alternatives must be separated by a bare "|"; an escaped "\|"
    # would only match a literal pipe character and block nothing.
    return q^(?i)(\.jpe?g|\.gif|\.png|\.tif?f|\.te?xt|~)$^;
}
261
# Build a temporary filename ending in ".$extension" that does not exist
# yet, based on util::get_tmp_filename(); a counter is inserted before the
# extension until the name is free.
sub _pagedimg_tmp_filename {
    my ($extension) = @_;

    my $filehead  = &util::get_tmp_filename();
    my $candidate = "$filehead.$extension";
    my $n = 1;
    while (-e $candidate) {
        $candidate = "$filehead$n.$extension";
        $n++;
    }
    return $candidate;
}

# Attach one page image to a section of the document: optionally convert
# and/or rotate it, record its metadata (type, width, height, size as
# reported by the "identify" command), associate the image file with the
# section, and optionally generate thumbnail and screenview versions with
# the "convert" command.
#
# Arguments:
#   $filename - image filename with full path
#   $srcfile  - image filename without path
#   $doc_obj  - the document object being built
#   $section  - the section the image belongs to
#   $rotate   - optional; the string "r" means rotate the image 180 degrees
#
# Returns 0 if the image was skipped (missing file, empty name, or smaller
# than the 'minimumsize' setting); otherwise the image type string.
#
# Temporary files are recorded in $self->{'tmp_filename1'..'tmp_filename3'};
# they are not deleted here.
#
# NOTE(review): filenames are interpolated into shell command lines inside
# double quotes only; names containing quotes or shell metacharacters are
# not safe here.
sub process_image {
    my $self = shift (@_);
    my ($filename, $srcfile, $doc_obj, $section, $rotate) = @_;
    $rotate = 0 unless defined $rotate;

    # check that the image file exists!!
    if (!-f $filename) {
        print "PagedImgPlug: ERROR: File $filename does not exist, skipping\n";
        return 0;
    }

    # Top-level sections reference [assocfilepath] directly; child sections
    # have to go through their top-level parent.
    my $top = ($section eq $doc_obj->get_top_section()) ? 1 : 0;
    my $assocpath = $top ? "[assocfilepath]" : "[parent(Top):assocfilepath]";

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};

    # check the filename is okay
    return 0 if ($srcfile eq "" || $filename eq "");

    my $minimumsize = $self->{'minimumsize'};
    if (defined $minimumsize && ((-s $filename) < $minimumsize)) {
        print $outhandle "PagedImgPlug: \"$filename\" too small, skipping\n"
            if ($verbosity > 1);
        # Actually skip the image, as the message says.
        return 0;
    }

    # Convert the image to a new type (if required), and rotate if required.
    my $converttotype = $self->{'converttotype'};
    my $originalfilename = ""; # only set if we do a conversion or rotation
    my $type = "unknown";
    my $converted = 0;

    if ($converttotype ne "" && $filename !~ /$converttotype$/) {
        $converted = 1;
        $originalfilename = $filename;
        $filename = _pagedimg_tmp_filename($converttotype);
        $self->{'tmp_filename1'} = $filename;

        my $rotate_option = ($rotate eq "r") ? "-rotate 180 " : "";

        my $command = "convert -verbose \"$originalfilename\" $rotate_option \"$filename\"";
        print $outhandle "CONVERT: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "CONVERT RESULT = $result\n" if ($verbosity > 2);

        $type = $converttotype;
    } elsif ($rotate eq "r") {
        $originalfilename = $filename;
        $filename = &util::get_tmp_filename();
        # Record the rotated copy like the converted one, so that code which
        # cleans up tmp_filename1 removes it too.
        $self->{'tmp_filename1'} = $filename;

        my $command = "convert \"$originalfilename\" -rotate 180 \"$filename\"";
        print $outhandle "ROTATE: $command\n" if ($verbosity > 2);
        my $result = `$command`;
        print $outhandle "ROTATE RESULT = $result\n" if ($verbosity > 2);
    }

    # Add the image metadata
    my $id = $srcfile;
    $id =~ s/\.([^\.]*)$//; # the source file name without an extension

    # the new file name: a converted image gets the new extension
    my $file = $converted ? "$id.$converttotype" : $srcfile;

    my $url = $file;       # the new file name prepared for a url
    my $srcurl = $srcfile;
    ##$url =~ s/ /%20/g;
    ##$srcurl =~ s/ /%20/g;

    $doc_obj->add_metadata ($section, "Image", $url);

    # Also want to set filename as 'Source' metadata to be
    # consistent with other plugins
    $doc_obj->add_metadata ($section, "Source", $srcurl);

    my ($image_type, $image_width, $image_height, $image_size)
        = &identify($filename, $outhandle, $verbosity);

    $doc_obj->add_metadata ($section, "ImageType", $image_type);
    $doc_obj->add_metadata ($section, "ImageWidth", $image_width);
    $doc_obj->add_metadata ($section, "ImageHeight", $image_height);
    $doc_obj->add_metadata ($section, "ImageSize", $image_size);
    $doc_obj->add_metadata ($section, "FileFormat", "PagedImg");
    # add NoText metadata which can be used to suppress the dummy text
    $doc_obj->add_metadata ($section, "NoText", "1");

    if ($type eq "unknown" && $image_type) {
        $type = $image_type;
    }

    $doc_obj->add_metadata ($section, "srclink",
        "<a href=\"_httpprefix_/collect/[collection]/index/assoc/${assocpath}/[Image]\">");
    $doc_obj->add_metadata ($section, "srcicon",
        "<img src=\"_httpprefix_/collect/[collection]/index/assoc/${assocpath}/[Image]\">");
    $doc_obj->add_metadata ($section, "/srclink", "</a>");

    # Add the image as an associated file
    $doc_obj->associate_file($filename, $file, "image/$type", $section);
    print $outhandle "associating file $filename as name $file\n" if ($verbosity > 2);

    if ($self->{'thumbnail'}) {
        # Make the thumbnail image
        my $thumbnailsize = $self->{'thumbnailsize'} || 100;
        my $thumbnailtype = $self->{'thumbnailtype'} || 'gif';

        my $thumbnailfile = _pagedimg_tmp_filename($thumbnailtype);
        $self->{'tmp_filename2'} = $thumbnailfile;

        # Generate the thumbnail with convert
        my $command = "convert -verbose -geometry $thumbnailsize"
            . "x$thumbnailsize \"$filename\" \"$thumbnailfile\"";
        print $outhandle "THUMBNAIL: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "THUMB RESULT: $result\n" if ($verbosity > 2);

        # Add the thumbnail as an associated file ...
        if (-e "$thumbnailfile") {
            $doc_obj->associate_file("$thumbnailfile", $id."thumb.$thumbnailtype", "image/$thumbnailtype", $section);
            $doc_obj->add_metadata ($section, "ThumbType", $thumbnailtype);
            $doc_obj->add_metadata ($section, "Thumb", $id."thumb.$thumbnailtype");
            $doc_obj->add_metadata ($section, "thumbicon",
                "<img src=\"_httpprefix_/collect/[collection]/index/assoc/${assocpath}/[Thumb]\" width=[ThumbWidth] height=[ThumbHeight]>");
        }

        # Extract thumbnail dimensions from the convert output
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ThumbWidth", $1);
            $doc_obj->add_metadata ($section, "ThumbHeight", $2);
        }
    }

    # Make a screen-sized version of the picture if requested
    if ($self->{'screenview'}) {

        # To do: if the actual image is smaller than the screenview size,
        # we should use the original !

        my $screenviewsize = $self->{'screenviewsize'} || 500;
        my $screenviewtype = $self->{'screenviewtype'} || 'jpeg';

        my $screenviewfilename = _pagedimg_tmp_filename($screenviewtype);
        $self->{'tmp_filename3'} = $screenviewfilename;

        # make the screenview image
        my $command = "convert -verbose -geometry $screenviewsize"
            . "x$screenviewsize \"$filename\" \"$screenviewfilename\"";
        print $outhandle "SCREENVIEW: $command\n" if ($verbosity > 2);
        my $result = `$command 2>&1`;
        print $outhandle "SCREENVIEW RESULT: $result\n" if ($verbosity > 3);

        # get screenview dimensions
        if ($result =~ m/[0-9]+x[0-9]+=>([0-9]+)x([0-9]+)/) {
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        } elsif ($result =~ m/([0-9]+)x([0-9]+)/) {
            # if the image hasn't changed size, the previous regex doesn't match
            $doc_obj->add_metadata ($section, "ScreenWidth", $1);
            $doc_obj->add_metadata ($section, "ScreenHeight", $2);
        }

        # add the screenview as an associated file ...
        if (-e "$screenviewfilename") {
            $doc_obj->associate_file("$screenviewfilename", $id."sv.$screenviewtype",
                                     "image/$screenviewtype", $section);
            print $outhandle "associating screen file $screenviewfilename as name $id sv.$screenviewtype\n" if ($verbosity > 2);

            $doc_obj->add_metadata ($section, "ScreenType", $screenviewtype);
            $doc_obj->add_metadata ($section, "Screen", $id."sv.$screenviewtype");
            $doc_obj->add_metadata ($section, "screenicon",
                "<img src=\"_httpprefix_/collect/[collection]/index/assoc/${assocpath}/[Screen]\" width=[ScreenWidth] height=[ScreenHeight]>");
        } else {
            print $outhandle "PagedImgPlug: couldn't find \"$screenviewfilename\"\n";
        }
    }

    return $type;
}
497
498
499
# Discover the characteristics of an image file by running the ImageMagick
# "identify" command and parsing its output.
#
# Arguments:
#   $image     - path of the image file
#   $outhandle - filehandle for diagnostic output
#   $verbosity - the command is echoed above 2, results are echoed above 3
#
# Returns the list (type, width, height, size); any value that cannot be
# parsed from the command output is the string "unknown".
sub identify {
    my ($image, $outhandle, $verbosity) = @_;

    # Run identify, folding stderr into the captured output
    my $cmd = qq{identify "$image" 2>&1};
    print $outhandle "$cmd\n" if ($verbosity > 2);
    my $output = `$cmd`;
    print $outhandle "$output\n" if ($verbosity > 3);

    # Type, width and height follow the (literally matched) file name
    my ($type, $width, $height) = ('unknown', 'unknown', 'unknown');
    if ($output =~ /^\Q$image\E (\w+) (\d+)x(\d+)/) {
        ($type, $width, $height) = ($1, $2, $3);
    }

    # Size is reported either in bytes ("b") or in kilobytes ("kb")
    my $size = "unknown";
    if ($output =~ m/^.* ([0-9]+)b/) {
        $size = $1;
    }
    elsif ($output =~ m/^.* ([0-9]+)kb/) {
        $size = 1024 * $1;
    }

    print $outhandle "file: $image:\t $type, $width, $height, $size\n"
        if ($verbosity > 3);

    return ($type, $width, $height, $size);
}
539
540
# Build a document object from one .item file.
#
# PagedImgPlug provides its own version of this function because an item
# file is a list of page files rather than the text of the document.
#
# The item file is first tidied in place (BOM and vertical tabs removed,
# bare ampersands escaped), then handled either as the XML format (the
# first non-blank line contains "<PagedDocument") or as the plain format.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor,
#             $maxdocs, $total_count, $gli).
# Note that $base_dir might be "" and that $file might include directories.
#
# Returns:
#   the status from read_block() if that is undef or 0;
#   -1 if parsing the XML format failed;
#   (1, $doc_obj) on success.
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # check process and block exps, smart block, etc
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status == 0));

    print $outhandle "PagedImgPlug processing \"$filename\"\n"
        if $self->{'verbosity'} > 1;
    print STDERR "<Processing n='$file' p='PagedImgPlug'>\n" if ($gli);

    # Here we need to decide if we have an old text .item file, or a new xml
    # .item file - the test is whether the first non-empty line contains
    # <PagedDocument.  In the same pass, write a tidied copy of the file.
    my $xml_version = 0;
    my $backup_filename = "backup.item";
    open (my $item_fh, '<', $filename) || die "couldn't open $filename\n";
    open (my $backup_fh, '>', $backup_filename) || die "couldn't write to $backup_filename\n";

    my $seen_content = 0;
    my $first_line = 1;
    # Looping until end of file (rather than until a line with content is
    # found) means an empty or all-blank item file cannot hang the import.
    while (defined (my $line = <$item_fh>)) {
        if (!$seen_content && $line =~ /\w/) {
            $seen_content = 1;
            $xml_version = 1 if ($line =~ /<PagedDocument/);
        }
        if ($first_line) {
            $line =~ s/^\xEF\xBB\xBF//; # strip BOM
            $first_line = 0;
        }
        # some metadata titles contain \vt (vertical tab) characters
        $line =~ s/\x0B+//g;
        # Escape ampersands that are not already the start of an entity, so
        # re-importing an already tidied file does not escape them twice.
        $line =~ s/&(?!(?:\w+|#\d+|#x[0-9a-fA-F]+);)/&amp;/g;
        print $backup_fh ($line);
    }
    close $item_fh;
    close $backup_fh;

    &File::Copy::copy ($backup_filename, $filename)
        or print $outhandle "PagedImgPlug: WARNING: couldn't update \"$filename\": $!\n";
    &util::rm($backup_filename);

    my $doc_obj;
    if ($xml_version) {
        $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
        $self->{'file'} = $file;
        $self->{'filename'} = $filename;
        $self->{'processor'} = $processor;
        $self->{'metadata'} = $metadata;

        eval {
            $@ = "";
            my $xslt = $self->{'xslt'};
            if (defined $xslt && ($xslt ne "")) {
                # perform xslt
                my $transformed_xml = $self->apply_xslt($xslt, $filename);

                # feed transformed file (now in memory as string) into XML parser
                $self->parse_string($transformed_xml);
            }
            else {
                $self->parse_file($filename);
            }
        };

        if ($@) {
            # parsefile may either croak somewhere in XML::Parser (e.g. because
            # the document is not well formed) or die somewhere in XMLPlug or a
            # derived plugin (e.g. because we're attempting to process a
            # document whose DOCTYPE is not meant for this plugin). For the
            # first case we'll print a warning and continue, for the second
            # we'll just continue quietly

            print STDERR "**** XML Parse Error is: $@\n";

            my ($msg) = $@ =~ /Carp::croak\(\'(.*?)\'\)/;
            if (defined $msg) {
                my $plugin_name = ref ($self);
                print $outhandle "$plugin_name failed to process $file ($msg)\n";
            }

            # reset ourself for the next document
            $self->{'section_level'} = 0;
            print STDERR "<ProcessingError n='$file'>\n" if ($gli);
            return -1; # error during processing
        }
        $doc_obj = $self->{'doc_obj'};
    } else {
        # split the full path into directory (with trailing separator) and
        # bare file name
        my ($dir);
        ($dir, $file) = $filename =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename, $dir, $file, $processor);
    }

    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    my $section = $doc_obj->get_top_section();
    $self->extra_metadata ($doc_obj, $section, $metadata);

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    $self->{'num_processed'}++;
    return (1, $doc_obj);
}
683
# Read one file: build the document object, hand it to the processor, then
# remove the temporary image files recorded on $self.
#
# Returns the status from read_into_doc_obj(); 1 means the file has been
# processed.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($process_status, $doc_obj) = $self->read_into_doc_obj(@_);

    my $built_ok = (defined $process_status) && ($process_status == 1);
    if ($built_ok) {
        # process the document
        $processor->process($doc_obj);
        undef $doc_obj;
    }

    # Clean up temporary files - we do this here instead of in
    # process_image because associated files aren't actually copied
    # until after process has been run.
    foreach my $slot ('tmp_filename1', 'tmp_filename2', 'tmp_filename3') {
        my $tmp_file = $self->{$slot};
        next unless defined $tmp_file;
        next unless -e $tmp_file;
        &util::rm($tmp_file);
    }

    # if process_status == 1, then the file has been processed.
    return $process_status;
}
721
# Handler for an opening XML tag.
#
#   PagedDocument  - the current section becomes the document's top section
#   PageGroup/Page - a new child section is appended under the current
#                    section and becomes current; its image and text files
#                    (if any) are processed
#   Metadata       - the metadata name is remembered until the end tag
#
# The tag's attributes are read from the global hash %_.
# NOTE(review): presumably %_ is filled in by the XML parser before this
# handler is called - confirm in XMLPlug.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    } elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        }

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            $self->process_image($self->{'base_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text ($self->{'base_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        } else {
            # otherwise add in some dummy text, so the section is not empty
            # and we don't break downstream plugins
            my $text = &gsprintf::lookup_string("{BasPlug.dummy_text}", 1);
            $doc_obj->add_utf8_text($self->{'current_section'}, $text);
        }
    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
756
# Handler for a closing XML tag.
#
#   Page/PageGroup - if the section has a PageNum but no Title, the PageNum
#                    is used as the Title; the current section then moves
#                    back to the parent section
#   Metadata       - the accumulated name/value pair is added to the
#                    current section and the accumulators are reset
#
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # if Title hasn't been assigned, set PageNum as Title
        my $title   = $doc_obj->get_metadata_element ($section, "Title");
        my $pagenum = $doc_obj->get_metadata_element ($section, "PageNum");
        if (!defined $title && defined $pagenum) {
            $doc_obj->add_utf8_metadata ($section, "Title", $pagenum);
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    } elsif ($element eq "Metadata") {
        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $self->{'metadata_name'}, $self->{'metadata_value'});
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
778
779
# Handler for character data.  While inside a named Metadata element, the
# text (read from $_) is appended to the pending metadata value; all other
# text is ignored.
sub xml_text {
    my ($self, $expat) = @_;

    return unless $self->{'element'} eq "Metadata";
    return unless $self->{'metadata_name'};

    $self->{'metadata_value'} .= $_;
}
788
# Handler for the DOCTYPE declaration: deliberately does nothing.
sub xml_doctype {
    return;
}
791
# Start a new document for the XML item format: create the document object
# for $self->{'filename'}, reset the page counter, remember the item file's
# directory in $self->{'base_dir'}, and set the top-level metadata
# (gsdlthistype and Source).
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc");
    my $doc_obj = $self->{'doc_obj'};
    # pass OIDmetadata as well, matching what process_item() does
    $doc_obj->set_OIDtype ($self->{'processor'}->{'OIDtype'}, $self->{'processor'}->{'OIDmetadata'});

    # split the full path into directory (with trailing separator) and
    # bare file name
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'base_dir'} = $dir;
    $self->{'num_pages'} = 0;

    my $topsection = $doc_obj->get_top_section();
    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);

    # a header page needs some text in the top section
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }
}
818
# Finish the document for the XML item format: record the Plugin,
# FileFormat and NumPages metadata on the top section, then assign the OID.
sub close_document {
    my ($self) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $plugin_type = "$self->{'plugin_type'}";

    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", $plugin_type);
    $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "PagedImg");

    # add numpages metadata
    my $page_count = $self->{'num_pages'};
    $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(), 'NumPages', $page_count);

    # add an OID
    $doc_obj->set_OID();
}
833
# Build a document object from a plain-format .item file.
#
# Lines of the form "<Name>value" set document-level metadata.  Lines
# starting with "#" and lines with no word characters are ignored.  Every
# other line describes one page as "pagenum:imagefile:textfile:r" and
# becomes a child section whose Title is the page number.
#
# Arguments:
#   $filename  - the item file, with full path
#   $dir       - directory of the item file, prepended as-is to the image
#                and text file names
#   $file      - the item file name without path (stored as Source)
#   $processor - supplies the OIDtype and OIDmetadata settings
#
# Returns the new document object.  Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename, $dir, $file, $processor) = @_;

    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    my $topsection = $doc_obj->get_top_section();
    $doc_obj->add_utf8_metadata($topsection, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($topsection, "FileFormat", "PagedImg");

    if ($self->{'documenttype'} eq 'paged') {
        # set the gsdlthistype metadata to Paged - this ensures this document will
        # be treated as a Paged doc, even if Titles are not numeric
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    }

    $doc_obj->add_metadata ($topsection, "Source", $file);

    open (my $item_fh, '<', $filename) || die "couldn't open $filename\n";
    my $num = 0; # number of pages seen
    while (defined (my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # <metadata-name>metadata-value
            $doc_obj->set_utf8_metadata_element ($topsection, $1, $2);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r
        # - the r is optional -> means rotate the image 180 deg
        $line =~ s/^\s+//; # remove space at the front
        $line =~ s/\s+$//; # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);
        # add NoText metadata which can be used to suppress the dummy text
        $doc_obj->add_metadata($cursection, "NoText", "1");

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);

            # process_image signals failure with 0, not undef
            if (!$result1) {
                print "PagedImgPlug: couldn't process image \"$dir$imgname\" for item \"$filename\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);

            # process_text signals failure with 0, not undef
            if (!$result2) {
                print "PagedImgPlug: couldn't process text file \"$dir$txtname\" for item \"$filename\"\n";
            }
        } else {
            # otherwise add in some dummy text
            $doc_obj->add_text($cursection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
        }
    }
    close $item_fh;

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $doc_obj->add_text($topsection, &gsprintf::lookup_string("{BasPlug.dummy_text}"));
    }

    $doc_obj->set_OID ();
    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");
    return $doc_obj;
}
913
# Read a page's text file and add its contents, wrapped in <pre> tags, as
# the text of a section.
#
# Arguments:
#   $fullpath   - text file name with full path
#   $file       - text file name without path (not used here)
#   $doc_obj    - the document object being built
#   $cursection - the section that receives the text
#
# Returns 0 if the file does not exist, 1 otherwise (an empty file only
# produces a warning).
sub process_text {
    my $self = shift (@_);
    my ($fullpath, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $fullpath) {
        print "PagedImgPlug: ERROR: File $fullpath does not exist, skipping\n";
        return 0;
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($fullpath);

    my $text = "";
    &BasPlug::read_file($self, $fullpath, $encoding, $language, \$text);
    if (!length ($text)) {
        # It's a bit unusual but not out of the question to have no text,
        # so just give a warning
        print "PagedImgPlug: WARNING: $fullpath contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g;   # macro language

    # the text goes inside <pre> tags, so angle brackets must be turned
    # into entities or they would be treated as markup
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    # insert preformat tags and add text to document object
    $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");

    return 1;
}
946
# Plugin-specific processing of the document object.  There is nothing
# further to do for this plugin, so this always reports success (1).
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
    my $outhandle = $self->{'outhandle'};

    my $status = 1;
    return $status;
}
955
956	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: