Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm@ 27509

Last change on this file since 27509 was 27509, checked in by ak19, 11 years ago
Using the recommended FileUtils.pm methods in place of the deprecated utils.pm methods.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 28.5 KB

Line
1	###########################################################################
2	#
3	# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImagePlugin
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (stored as PageNum metadata, and used as the section Title when the page has no Title metadata), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-create_thumbnail' option.
116	# To have it create medium size images for display, use the '-create_screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -create_screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImagePlugin;
135
136	use Encode;
137	use ReadXMLFile;
138	use ReadTextFile;
139	use ImageConverter;
140	use MetadataRead;
141
142	use strict;
143	no strict 'refs'; # allow filehandles to be variables and viceversa
144
# Set up the inheritance chain for this package at compile time.
sub BEGIN {
    @PagedImagePlugin::ISA = qw(MetadataRead ReadXMLFile ReadTextFile ImageConverter);
}
148
# Values offered for the 'documenttype' option when new() does not detect
# Greenstone version 3.
my $gs2_type_list = [
    { 'name' => "auto",      'desc' => "{PagedImagePlugin.documenttype.auto2}" },
    { 'name' => "paged",     'desc' => "{PagedImagePlugin.documenttype.paged2}" },
    { 'name' => "hierarchy", 'desc' => "{PagedImagePlugin.documenttype.hierarchy}" },
];

# Values offered for the 'documenttype' option when new() detects
# Greenstone version 3 (adds 'pagedhierarchy').
my $gs3_type_list = [
    { 'name' => "auto",           'desc' => "{PagedImagePlugin.documenttype.auto3}" },
    { 'name' => "paged",          'desc' => "{PagedImagePlugin.documenttype.paged3}" },
    { 'name' => "hierarchy",      'desc' => "{PagedImagePlugin.documenttype.hierarchy}" },
    { 'name' => "pagedhierarchy", 'desc' => "{PagedImagePlugin.documenttype.pagedhierarchy}" },
];

# Plugin arguments known at load time.  The 'documenttype' argument is not
# listed here: its enum list depends on the Greenstone version, so it is
# described separately by $doc_type_opt and appended inside new().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "string",
        'deft' => &get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "headerpage",
        'desc' => "{PagedImagePlugin.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name'      => "processing_tmp_files",
        'desc'      => "{BasePlugin.processing_tmp_files}",
        'type'      => "flag",
        'hiddengli' => "yes",
    },
];

# The 'documenttype' argument; new() fills in its 'list' entry.
my $doc_type_opt = {
    'name' => "documenttype",
    'desc' => "{PagedImagePlugin.documenttype}",
    'type' => "enum",
    'deft' => "auto",
    'reqd' => "no",
};

# Top-level description of this plugin, registered in new() via "OptList".
my $options = {
    'name'     => "PagedImagePlugin",
    'desc'     => "{PagedImagePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
206
# Constructor.
#
# Arguments: the plugin list, the input arguments and the shared
# option/argument lists hash; all three are passed on to each parent
# constructor.  Returns the merged object blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $imc_self = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);

    # the ImageConverter object tells us which Greenstone version we are
    # running under, which decides the values 'documenttype' can take
    if ($imc_self->{'gs_version'} eq "3") {
        $doc_type_opt->{'list'} = $gs3_type_list;
    }
    else {
        $doc_type_opt->{'list'} = $gs2_type_list;
    }

    # $arguments is shared by every instance of this plugin, so only append
    # the documenttype option the first time through; otherwise each
    # construction would add another duplicate entry.
    if (!grep { $_ == $doc_type_opt } @$arguments) {
        push(@$arguments, $doc_type_opt);
    }
    # now we add the args to the list for parsing
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});

    # NOTE(review): the meaning of the trailing 1 passed to ReadTextFile is
    # not visible in this file - confirm against ReadTextFile::new.
    my $rtf_self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $rxf_self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BasePlugin::merge_inheritance($imc_self, $rtf_self, $rxf_self);

    # Update $self used by XML::Parser so it finds callback functions
    # such as start_document here and not in ReadXMLFile (which is what
    # $self was when new XML::Parser was done)
    #
    # If the $self returned by this constructor is the same as the one
    # used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step
    # isn't necessary
    #
    # Consider embedding this type of assignment into merge_inheritance
    # to help catch all cases?
    $rxf_self->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
246
247
# Initialise the plugin: run the inherited init with the caller's arguments
# (verbosity, output handle, failure handle), then the ImageConverter
# initialisation, which takes no arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    $self->ImageConverter::init();
}
255
# Start-of-import hook: forwards the caller's arguments (pluginfo, base
# directory, processor, maxdocs) first to the inherited begin and then to
# ImageConverter's begin.
sub begin {
    my ($self, @begin_args) = @_;

    $self->SUPER::begin(@begin_args);
    $self->ImageConverter::begin(@begin_args);
}
263
# Default value of the process_exp option: a regular expression (as a
# string) matching file names that end in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.item$';
}
269
# Name of the root element used by the XML flavour of .item files.
sub get_doctype {
    my ($self) = @_;

    return "PagedDocument";
}
275
276
# Delegate straight to BasePlugin's implementation; ReadXMLFile's version
# (which would otherwise be found through inheritance) is not wanted here.
sub can_process_this_file {
    my ($self, @args) = @_;

    return $self->BasePlugin::can_process_this_file(@args);
}
282
# Instead of a block expression, scan the item file and record every text
# and image file it mentions so that other plugins do not process them.
#
# $filename_full_path - full path of the .item file
# $block_hash         - hash passed through to the scan_* methods, which
#                       record the blocked files in it
sub store_block_files
{
    my $self = shift (@_);
    my ($filename_full_path, $block_hash) = @_;

    my $xml_version = $self->is_xml_item_file($filename_full_path);

    # do we need to do this?
    # does BOM interfere just with XML parsing? In that case don't need it here
    # if we do it here, we are modifying the file before we have worked out if
    # its new or not, so it will always be reimported.
    #$self->tidy_item_file($filename_full_path);

    # split into directory (including the trailing separator) and file name;
    # both / and \ are accepted as separators
    my ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

    if ($xml_version) {
        $self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);
    } else {
        $self->scan_item_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
}
308
# Use BasePlugin's read rather than the one ReadXMLFile would supply through
# inheritance; all arguments are passed through untouched.
sub read
{
    my ($self, @read_args) = @_;

    return $self->BasePlugin::read(@read_args);
}
316
317
318
# Build the document object for one .item file.
#
# Works out whether the item file is in the XML or the plain text format,
# tidies the file, builds the document with the matching reader and then
# adds the plugin-level metadata, associated files and OID.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
        if $verbosity > 1;
    print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);

    # reset the running maxima; they are written out as top-level metadata
    # once the document is complete
    $self->{'MaxImageWidth'} = 0;
    $self->{'MaxImageHeight'} = 0;

    # here we need to decide if we have an old text .item file, or a new xml
    # .item file
    my $xml_version = $self->is_xml_item_file($filename_full_path);

    $self->tidy_item_file($filename_full_path);

    my $doc_obj;
    if ($xml_version) {
        # careful checking needed here!! are we using local xml handlers or super ones
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    } else {
        # split into directory (including the trailing separator) and file
        # name; both / and \ are accepted as separators
        my ($dir, $item_file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

        #process the .item file
        $doc_obj = $self->process_item($filename_full_path, $dir, $item_file, $processor, $metadata);
    }

    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata ($doc_obj, $section, $metadata);
    $self->auto_extract_metadata ($doc_obj);
    $self->plugin_specific_process($base_dir, $file, $doc_obj, $gli);
    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);
    return (1, $doc_obj);
}
# Extension hook, intentionally empty here.  read_into_doc_obj calls it for
# every document, after the automatic metadata extraction and before the
# title fallback; a plugin inheriting from this one can override it to add
# extra metadata etc.
sub plugin_specific_process {
    my $self = shift(@_);

    # arguments available to an overriding implementation
    my ($base_dir, $file, $doc_obj, $gli) = @_;
}
377
# Decide which item file format $filename is in.
#
# For now the test is: if the first line containing a word character
# mentions <PagedDocument, the file is XML.
#
# Returns 1 for an XML item file, 0 otherwise.  Dies if the file cannot be
# opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $item_fh, '<', $filename) || die "couldn't open $filename\n";

    # skip ahead to the first line with some content on it
    my $first_line;
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        $first_line = $line;
        last;
    }
    close $item_fh;

    my $xml_version = 0;
    if (defined $first_line && $first_line =~ /<PagedDocument/) {
        $xml_version = 1;
    }
    return $xml_version;
}
404
# Clean up an item file in place:
#   * strip a UTF-8 byte order mark from the start of the first line
#   * remove vertical tab characters (some metadata titles contain them)
#   * escape bare ampersands as &amp;
#
# Ampersands that already start an entity (&amp; &lt; &#38; ...) are left
# alone, so running this again on an already tidied file changes nothing.
# The file is only rewritten when something actually changed, and no
# temporary file is created.
#
# Dies if the file cannot be read or written.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $in_fh, '<', $filename) || die "couldn't open $filename\n";
    my @lines = <$in_fh>;
    close $in_fh;

    # nothing to tidy in an empty file
    return unless @lines;

    my $changed = 0;
    $changed = 1 if $lines[0] =~ s/^\xEF\xBB\xBF//; # strip BOM

    foreach my $line (@lines) {
        $changed = 1 if $line =~ s/\x0B+//g;
        $changed = 1 if $line =~ s/&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);)/&amp;/g;
    }
    return unless $changed;

    open(my $out_fh, '>', $filename) || die "couldn't write to $filename\n";
    print $out_fh @lines;
    close($out_fh) || die "couldn't write to $filename\n";

    return;
}
430
# Rotate an image by 180 degrees.
#
# Asks $self->convert for a rotated copy of the same file type and pulls
# the new file name out of the text it returns (the part after "=>").
#
# Returns the name of the rotated file if it exists, otherwise the original
# file name.
sub rotate_image {
    my $self = shift (@_);
    my ($filename_full_path) = @_;

    my ($this_filetype) = $filename_full_path =~ /\.([^\.]*)$/;
    my $result = $self->convert($filename_full_path, $this_filetype, "-rotate 180", "ROTATE");

    # the extension comes from a file name, so quote it before using it in a
    # pattern; a name without an extension leaves it undefined
    my $filetype_re = defined $this_filetype ? quotemeta($this_filetype) : "";

    my $new_filename;
    if (defined $result) {
        ($new_filename) = ($result =~ /=>(.*\.$filetype_re)/);
    }
    if (defined $new_filename && -e $new_filename) {
        return $new_filename;
    }

    # somethings gone wrong
    return $filename_full_path;
}
445
# Attach one page image to a section of the document.
#
# $filename_full_path - full path of the image
# $filename_no_path   - image name as written in the item file
# $doc_obj, $section  - document and section the image belongs to
# $rotate             - "r" to rotate the image by 180 degrees first
#
# Returns 0 if there is no file name or the file does not exist, or if image
# conversion is not available; otherwise whatever generate_images returns.
sub process_image {
    my $self = shift(@_);
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # check the filenames
    return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

    # remember that this image file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # do rotation
    if ((defined $rotate) && ($rotate eq "r")) {
        # we get a new temporary file which is rotated
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    # do generate images
    my $result = 0;
    if ($self->{'image_conversion_available'} == 1) {
        # do we need to convert $filename_no_path to utf8/url encoded?
        # We are already reading in from a file, what encoding is it in???
        my $url_encoded_full_filename
            = &unicode::raw_filename_to_url_encoded($filename_full_path);
        $result = $self->generate_images($filename_full_path, $url_encoded_full_filename, $doc_obj, $section);
    }

    #overwrite one set in ImageConverter
    $doc_obj->set_metadata_element ($section, "FileFormat", "PagedImage");
    return $result;
}
476
477
# Start-tag handler for XML item files.
#
# PagedDocument  - makes the top section the current section
# Page/PageGroup - opens a new child section of the current section and
#                  attaches the page number, image and text named in the
#                  tag's attributes (dummy text if there is no text file)
# Metadata       - remembers the metadata name until the end tag arrives
#
# The tag's attributes are read from the global %_ hash.
# NOTE(review): presumably %_ is filled in by XML::Parser's Stream style
# before this handler runs - confirm against ReadXMLFile.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    } elsif ($element eq "PageGroup" || $element eq "Page") {
        if ($element eq "PageGroup") {
            $self->{'has_internal_structure'} = 1;
        }
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        # note that page groups are counted here as well as pages
        $self->{'num_pages'}++;

        # assign pagenum as what??
        my $pagenum = $_{'pagenum'}; #TODO!!
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        }

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            # *****
            # What about support for rotate image (e.g. old ':r' notation)?
            $self->process_image($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text ($self->{'xml_file_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        } else {
            $self->add_dummy_text($doc_obj, $self->{'current_section'});
        }
    } elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
514
# End-tag handler for XML item files.
#
# Page/PageGroup - if the section has no Title but does have a PageNum, the
#                  PageNum becomes the Title; the current section then moves
#                  back to the parent
# Metadata       - the name saved by the start tag and the text collected
#                  since are added to the current section as metadata
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # if Title hasn't been assigned, set PageNum as Title
        if (!defined $doc_obj->get_metadata_element ($section, "Title")
            && defined $doc_obj->get_metadata_element ($section, "PageNum")) {
            $doc_obj->add_utf8_metadata ($section, "Title", $doc_obj->get_metadata_element ($section, "PageNum"));
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    } elsif ($element eq "Metadata") {
        # an empty <Metadata> element never triggers the text handler, so
        # the collected value may still be undefined; treat it as empty
        my $raw_name  = defined $self->{'metadata_name'}  ? $self->{'metadata_name'}  : "";
        my $raw_value = defined $self->{'metadata_value'} ? $self->{'metadata_value'} : "";

        # text read in by XML::Parser is in Perl's binary byte value
        # form ... need to explicitly make it UTF-8
        my $meta_name = decode("utf-8", $raw_name);
        my $metadata_value = decode("utf-8", $raw_value);

        # names containing a dot get the ex. prefix
        if ($meta_name =~ /\./) {
            $meta_name = "ex.$meta_name";
        }

        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $meta_name, $metadata_value);
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
545
546
# Text handler for XML item files: while inside a Metadata element that has
# a name, append the current chunk of text (found in $_) to the metadata
# value being collected.  Text anywhere else is ignored.
sub xml_text {
    my ($self, $expat) = @_;

    my $inside_named_metadata
        = ($self->{'element'} eq "Metadata") && $self->{'metadata_name'};

    if ($inside_named_metadata) {
        $self->{'metadata_value'} = $self->{'metadata_value'} . $_;
    }
}
555
# Doctype handler: deliberately does nothing for item files.
sub xml_doctype {
    return;
}
558
# Called when an XML item file is opened: creates the document object,
# records the directory the item file lives in (image and text file names in
# the XML are resolved against it) and resets the per-document counters.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    # TODO is file filenmae_no_path??
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});

    # split into directory (including the trailing separator) and file name;
    # both / and \ are accepted as separators
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'xml_file_dir'} = $dir;
    $self->{'num_pages'} = 0;
    $self->{'has_internal_structure'} = 0;
}
573
# Called when an XML item file has been read completely: writes the
# document-level summary metadata (NumPages, gsdlthistype, MaxImageWidth and
# MaxImageHeight) onto the top section and clears the image size maxima.
sub close_document {
    my $self = shift(@_);

    my $doc_obj = $self->{'doc_obj'};
    my $top = $doc_obj->get_top_section();

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($top, 'NumPages', $self->{'num_pages'});

    # Work out the document type.  An explicit documenttype option is used
    # as given; "auto" picks a type from the structure that was seen.
    my $doc_type = $self->{'documenttype'};
    if ($doc_type eq "auto") {
        if (!$self->{'has_internal_structure'}) {
            $doc_type = "paged";
        }
        elsif ($self->{'gs_version'} eq "3") {
            $doc_type = "pagedhierarchy";
        }
        else {
            $doc_type = "hierarchy";
        }
    }
    # the value is stored in lower case; whether capitalised values
    # ("Paged"/"Hierarchy") are needed is an open question left by the
    # original author
    $doc_obj->set_utf8_metadata_element ($top, "gsdlthistype", $doc_type);

    my @size_fields = ("MaxImageWidth", "MaxImageHeight");
    foreach my $field (@size_fields) {
        $doc_obj->set_utf8_metadata_element($top, $field, $self->{$field});
    }
    foreach my $field (@size_fields) {
        $self->{$field} = undef;
    }
}
616
617
# Give a freshly created document its Source metadata and, when the
# headerpage option is set, some dummy text in the top section.
#
# $processor is accepted but not used here.
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_full_path, $processor, $metadata) = @_;

    my $top = $doc_obj->get_top_section();

    my $filename_encoding = $self->deduce_filename_encoding(
        $filename_full_path, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    # if we want a header page, the top section needs some text in it,
    # otherwise this section will become invisible
    $self->add_dummy_text($doc_obj, $top) if $self->{'headerpage'};
}
633
# Scan an XML item file and block every file named in an imgfile="..." or
# txtfile="..." attribute, so that no other plugin processes those files.
#
# $filename_full_path - the item file to scan
# $dir                - directory the named files are relative to
# $block_hash         - passed to util::block_filename for each file found
#
# Every occurrence on a line is picked up, so several tags may share one
# line.  Dies if the item file cannot be opened.
sub scan_xml_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        || die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;

        while ($line =~ /(?:imgfile|txtfile)=\"([^\"]+)\"/g) {
            &util::block_filename($block_hash, &FileUtils::filenameConcatenate($dir, $1));
        }
    }
    close $item_fh;
}
654
# Scan a plain text item file and block the image and text file named on
# each page line, so that no other plugin processes those files.
#
# $filename_full_path - the item file to scan
# $dir                - directory the named files are relative to
# $block_hash         - passed to util::block_filename for each file found
#
# Blank lines, comment lines (#...) and metadata lines (<name>value) are
# skipped.  Dies if the item file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        || die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines
        next if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/); # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//; #remove space at the front
        $line =~ s/\s+$//; #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # block the image file and the text file, whichever are present
        foreach my $name ($imgname, $txtname) {
            next unless defined $name && $name ne "";
            &util::block_filename($block_hash, &FileUtils::filenameConcatenate($dir, $name));
        }
    }
    close $item_fh;
}
684
# Build a document from a plain text item file.
#
# $filename_full_path - the item file
# $dir                - directory of the item file; image and text file
#                       names are appended to it directly
# $filename_no_path   - accepted but not used here
# $processor          - passed on to set_initial_doc_fields
# $metadata           - passed on to set_initial_doc_fields
#
# Metadata lines (<name>value) become top-level metadata; every other
# non-blank, non-comment line (pagenum:imagefile:textfile:r) becomes a child
# section.  Returns the document object.  Dies if the item file cannot be
# opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    # simple item files are always paged unless user specified
    if ($self->{'documenttype'} eq "auto") {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "paged");
    } else {
        $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $self->{'documenttype'});
    }

    open(my $item_fh, '<', $filename_full_path) || die "couldn't open $filename_full_path\n";
    my $num = 0;
    while (defined(my $line = <$item_fh>)) {

        # Since process_item is called not on an XML item file, but a text item file
        # don't decode into UTF8 the text that was read in, since it's already UTF-8
        #$line = decode("utf-8",$line);

        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/; # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            # document level metadata: <name>value
            my $meta_name = $1;
            my $meta_value = $2;
            if ($meta_name =~ /\./) {
                $meta_name = "ex.$meta_name";
            }
            $doc_obj->set_utf8_metadata_element ($topsection, $meta_name, $meta_value);
            next;
        }

        $num++;
        # line should be like page:imagefilename:textfilename:r - the r is
        # optional -> means rotate the image 180 deg
        $line =~ s/^\s+//; #remove space at the front
        $line =~ s/\s+$//; #remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # create a new section for each image file
        my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
        # the page number becomes the Title
        $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

        # process the image for this page if there is one
        if (defined $imgname && $imgname ne "") {
            my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
            # NOTE(review): process_image returns 0 both for a missing file
            # and when image conversion is unavailable, so this undef check
            # does not catch a missing image; left as is because 0 cannot
            # be told apart here - confirm the intended behaviour.
            if (!defined $result1) {
                print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
            }
        }

        # process the text file if one is there
        if (defined $txtname && $txtname ne "") {
            my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);

            # process_text returns 0 (not undef) when the file is missing;
            # fall back to dummy text so the section still has some content
            if (!$result2) {
                print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                $self->add_dummy_text($doc_obj, $cursection);
            }
        } else {
            # otherwise add in some dummy text
            $self->add_dummy_text($doc_obj, $cursection);
        }
    }
    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth", $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
766
# Add the contents of a text file to a section of the document.
#
# $filename_full_path - full path of the text file
# $file               - file name as written in the item file (not used here)
# $doc_obj            - document being built
# $cursection         - section that receives the text
#
# A file that looks like a complete HTML page contributes the contents of
# its <body>; anything else is escaped and wrapped in <pre> tags.
#
# Returns 1 on success, 0 if the file does not exist.
sub process_text {
    my $self = shift (@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this text file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    my $text = "";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text); # already decoded as utf8
    if (!length ($text)) {
        # It's a bit unusual but not out of the question to have no text, so just give a warning
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g; # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add text to document object
        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        # plain text: make the angle brackets safe to display
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
818
819
# Hook run once a document object has been dealt with: lets ImageConverter
# clean up its temporary files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    $self->ImageConverter::clean_up_temporary_files();
}
825
826	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: