Context Navigation

source: main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm@ 23363

Last change on this file since 23363 was 23363, checked in by davidb, 13 years ago
Plugin code upgrade to support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 25.7 KB

Line
1	###########################################################################
2	#
3	# PagedImagePlugin.pm -- plugin for sets of images and OCR text that
4	# make up a document
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright (C) 1999 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	# PagedImagePlugin
28	# processes sequences of images, with optional OCR text
29	#
30	# This plugin takes *.item files, which contain metadata and lists of image
31	# files, and produces a document containing sections, one for each page.
32	# The files should be named something.item, then you can have more than one
33	# book in a directory. You will need to create these files, one for each
34	# document/book.
35	#
36	#There are two formats for the item files: a plain text format, and an xml
37	#format. You can use either format, and can have both formats in the same
38	#collection if you like. If you use the plain format, you must not start the
39	#file off with <PagedDocument>
40
41	#### PLAIN FORMAT
42	# The format of the xxx.item file is as follows:
43	# The first lines contain any metadata for the whole document
44	# <metadata-name>metadata-value
45	# eg.
46	# <Title>Snail farming
47	# <Date>19230102
48	# Then comes a list of pages, one page per line, each line has the format
49	#
50	# pagenum:imagefile:textfile:r
51	#
52	# page num and imagefile are required. pagenum is used for the Title
53	# of the section, and in the display is shown as page <pagenum>.
54	# imagefile is the image for the page. textfile is an optional text
55	# file containing the OCR (or any) text for the page - this gets added
56	# as the text for the section. r is optional, and signals that the image
57	# should be rotated 180deg. Eg use this if the image has been made upside down.
58	# So an example item file looks like:
59	# <Title>Snail farming
60	# <Date>19960403
61	# 1:p1.gif:p1.txt:
62	# 2:p2.gif::
63	# 3:p3.gif:p3.txt:
64	# 3b:p3b.gif:p3b.txt:r
65	# The second page has no text, the fourth page is a back page, and
66	# should be rotated.
67	#
68
69	#### XML FORMAT
70	# The xml format looks like the following
71	#<PagedDocument>
72	#<Metadata name="Title">The Title of the entire document</Metadata>
73	#<Page pagenum="1" imgfile="xxx.jpg" txtfile="yyy.txt">
74	#<Metadata name="Title">The Title of this page</Metadata>
75	#</Page>
76	#... more pages
77	#</PagedDocument>
78	#PagedDocument contains a list of Pages, Metadata and PageGroups. Any metadata
79	#that is not inside another tag will belong to the document.
80	#Each Page has a pagenum (not used at the moment), an imgfile and/or a txtfile.
81	#These are both optional - if neither is used, the section will have no content.
82	#Pages can also have metadata associated with them.
83	#PageGroups can be introduced at any point - they can contain Metadata and Pages and other PageGroups. They are used to introduce hierarchical structure into the document.
84	#For example
85	#<PagedDocument>
86	#<PageGroup>
87	#<Page>
88	#<Page>
89	#</PageGroup>
90	#<Page>
91	#</PagedDocument>
92	#would generate a structure like
93	#X
94	#--X
95	# --X
96	# --X
97	#--X
98	#PageGroup tags can also have imgfile/textfile metadata if you like - this way they get some content themselves.
99
100	#Currently the XML structure doesn't work very well with the paged document type, unless you use numerical Titles for each section.
101	#There is still a bit of work to do on this format:
102	#* enable other text file types, eg html, pdf etc
103	#* make the document paging work properly
104	#* add pagenum as Title unless a Title is present?
105
106	# All the supplementary image and text files should be in the same folder as
107	# the .item file.
108	#
109	# To display the images instead of the document text, you can use [srcicon]
110	# in the DocumentText format statement.
111	# For example,
112	#
113	# format DocumentText "<center><table width=_pagewidth_><tr><td>[srcicon]</td></tr></table></center>"
114	#
115	# To have it create thumbnail size images, use the '-create_thumbnail' option.
116	# To have it create medium size images for display, use the '-create_screenview'
117	# option. As usual, running
118	# 'perl -S pluginfo.pl PagedImagePlugin' will list all the options.
119
120	# If you want the resulting documents to be presented with a table of
121	# contents, use '-documenttype hierarchy', otherwise they will have
122	# next and previous arrows, and a goto page X box.
123
124	# If you have used -create_screenview, you can also use [screenicon] in the format
125	# statement to display the smaller image. Here is an example that switches
126	# between the two:
127	#
128	# format DocumentText "<center><table width=_pagewidth_><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small'>Switch to small version.</a>,<a href='_httpdocument_&d=_cgiargd_&p=full'>Switch to fullsize version</a>}</td></tr><tr><td>{If}{_cgiargp_ eq full,<a href='_httpdocument_&d=_cgiargd_&p=small' title='Switch to small version'>[srcicon]</a>,<a href='_httpdocument_&d=_cgiargd_&p=full' title='Switch to fullsize version'>[screenicon]</a>}</td></tr></table></center>"
129	#
130	# Additional metadata can be added into the .item files, alternatively you can
131	# use normal metadata.xml files, with the name of the xxx.item file as the
132	# FileName (only for document level metadata).
133
134	package PagedImagePlugin;
135
136	use ReadXMLFile;
137	use ReadTextFile;
138	use ImageConverter;
139
140	use strict;
141	no strict 'refs'; # allow filehandles to be variables and viceversa
142
# Set up the inheritance chain at compile time.  The order of the list is
# the method-resolution order: ReadXMLFile is searched first, then
# ReadTextFile, then ImageConverter.
BEGIN {
    @PagedImagePlugin::ISA = qw(ReadXMLFile ReadTextFile ImageConverter);
}
146
# Allowed values for the 'documenttype' enum argument declared below.
my $type_list = [
    {
        'name' => 'paged',
        'desc' => '{PagedImagePlugin.documenttype.paged}',
    },
    {
        'name' => 'hierarchy',
        'desc' => '{PagedImagePlugin.documenttype.hierarchy}',
    },
];

# Argument descriptors contributed by this plugin.  The 'desc' values are
# brace-wrapped keys, not literal descriptions.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'string',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'headerpage',
        'desc' => '{PagedImagePlugin.headerpage}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'documenttype',
        'desc' => '{PagedImagePlugin.documenttype}',
        'type' => 'enum',
        'list' => $type_list,
        'deft' => 'paged',
        'reqd' => 'no',
    },
    {
        'name'      => 'processing_tmp_files',
        'desc'      => '{BasePlugin.processing_tmp_files}',
        'type'      => 'flag',
        'hiddengli' => 'yes',
    },
];

# Plugin-level option record; new() pushes it onto the shared "OptList".
my $options = {
    'name'     => 'PagedImagePlugin',
    'desc'     => '{PagedImagePlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
185
# Constructor.
#
# Registers this class and its argument/option tables in the caller-supplied
# lists, builds one object per parent class, and merges the three into a
# single object that is blessed into $class.
#
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are
#                      extended with this plugin's tables
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $image_part = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);
    my $text_part  = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $xml_part   = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BasePlugin::merge_inheritance($image_part, $text_part, $xml_part);

    # The XML parser was created while "self" was still the ReadXMLFile
    # object.  Point it at the merged object so that callbacks such as
    # start_document are looked up here rather than in ReadXMLFile.
    # (Consider folding this assignment into merge_inheritance to help
    # catch all cases.)
    $xml_part->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
214
215
# Initialise the plugin: run the inherited init with the caller's arguments
# (verbosity, outhandle, failhandle), then ImageConverter's init with no
# arguments.  The value of the ImageConverter call is what gets returned.
sub init {
    my $self = shift;

    $self->SUPER::init(@_);
    $self->ImageConverter::init();
}
223
# Start-of-import hook: forward the caller's arguments (pluginfo, base_dir,
# processor, maxdocs) first to the inherited begin and then to
# ImageConverter's begin.  The value of the latter call is returned.
sub begin {
    my $self = shift;

    $self->SUPER::begin(@_);
    $self->ImageConverter::begin(@_);
}
231
# Default value for the process_exp argument: a regular expression source
# string matching file names that end in ".item".
sub get_default_process_exp {
    my ($self) = @_;

    return '\.item$';
}
237
# Name of the root element used by XML-format item files.
sub get_doctype {
    my ($self) = @_;

    return 'PagedDocument';
}
243
244
# Delegate explicitly to BasePlugin's implementation; plain method
# resolution would otherwise pick up ReadXMLFile's version first.
sub can_process_this_file {
    my $self = shift;

    return $self->BasePlugin::can_process_this_file(@_);
}
250
# Instead of a block expression, scan the item file and record every text
# and image file it mentions in $block_hash, so those files are blocked.
#
#   $filename_full_path - full path of the .item file
#   $block_hash         - hash ref handed to the scanning helper, which
#                         fills in its 'file_blocks' entry
sub store_block_files
{
    my $self = shift(@_);
    my ($filename_full_path, $block_hash) = @_;

    my $xml_version = $self->is_xml_item_file($filename_full_path);

    # tidy_item_file is deliberately NOT called here: it would modify the
    # file before it has been worked out whether the file is new, so the
    # file would always be reimported.

    # Split into the directory part (including its trailing separator) and
    # the file name.  Both quantifiers are required; without them the
    # pattern can only match strings of at most two characters.
    my ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

    if ($xml_version) {
        $self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
    else {
        $self->scan_item_for_files_to_block($filename_full_path, $dir, $block_hash);
    }
}
276
# Use BasePlugin's read rather than ReadXMLFile's, which plain method
# resolution would find first.  All arguments are passed through.
sub read
{
    my $self = shift;

    return $self->BasePlugin::read(@_);
}
284
285
286
# Build the document object for one .item file.
#
# Decides whether the item file is in the XML or the plain-text format,
# tidies the file on disk, builds the document with the matching reader,
# and then adds the plugin-level metadata, associated files, a fallback
# Title and the OID.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    print $outhandle "PagedImagePlugin processing \"$filename_full_path\"\n"
        if $verbosity > 1;
    print STDERR "<Processing n='$file' p='PagedImagePlugin'>\n" if ($gli);

    # Reset the running maxima before any page of this document is handled.
    $self->{'MaxImageWidth'} = 0;
    $self->{'MaxImageHeight'} = 0;

    # Decide whether this is an old text .item file or a new XML .item file.
    my $xml_version = $self->is_xml_item_file($filename_full_path);

    $self->tidy_item_file($filename_full_path);

    my $doc_obj;
    if ($xml_version) {
        # Careful: this runs ReadXMLFile's read explicitly; the xml_*
        # callbacks it triggers are the ones defined in this plugin.
        $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
        $doc_obj = $self->{'doc_obj'};
    }
    else {
        # Split into directory (with trailing separator) and file name.
        # Both quantifiers are required for the pattern to match real paths.
        my ($dir);
        ($dir, $file) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

        # process the .item file
        $doc_obj = $self->process_item($filename_full_path, $dir, $file, $processor, $metadata);
    }

    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($section, "FileFormat", "PagedImage");

    # Include any metadata passed in from previous plugins.  Note that this
    # metadata is associated with the top level section.
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata ($doc_obj, $section, $metadata);
    $self->auto_extract_metadata ($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj, $section, $filename_no_path);

    $self->add_OID($doc_obj);
    return (1, $doc_obj);
}
339
# Report whether an item file is in the XML format.
#
# For now the test is: the first line containing a word character mentions
# "<PagedDocument".  Lines without any word character are skipped.
#
#   $filename - path of the item file
#
# Returns 1 for an XML item file, 0 otherwise (including an empty file).
# Dies if the file cannot be opened.
sub is_xml_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $item_fh, '<', $filename) or die "couldn't open $filename\n";

    my $xml_version = 0;
    while (defined(my $line = <$item_fh>)) {
        next if $line !~ /\w/;

        # Only the first line with content decides the format.
        $xml_version = 1 if $line =~ /<PagedDocument/;
        last;
    }

    close $item_fh;
    return $xml_version;
}
366
# Clean up an item file in place.
#
#   * strips a UTF-8 byte order mark from the start of the first line
#   * removes vertical-tab characters (some metadata titles contain them)
#   * escapes bare '&' characters as '&amp;'
#
# The '&' escape leaves the five predefined XML entities and numeric
# character references alone, so running this repeatedly on the same file
# does not keep adding another layer of escaping.
#
# The file is read fully into memory and is only rewritten when at least
# one line actually changed, so an already tidy file is not touched.
#
#   $filename - path of the item file
#
# Dies if the file cannot be opened for reading or, when a rewrite is
# needed, cannot be written.
sub tidy_item_file {
    my $self = shift(@_);
    my ($filename) = @_;

    open(my $in_fh, '<', $filename) or die "couldn't open $filename\n";
    my @lines = <$in_fh>;
    close $in_fh;

    my $changed = 0;
    my $is_first_line = 1;
    for my $line (@lines) {
        my $before = $line;

        if ($is_first_line) {
            $line =~ s/^\xEF\xBB\xBF//;    # strip BOM
            $is_first_line = 0;
        }
        $line =~ s/\x0B+//g;               # vertical tabs
        $line =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#[xX][0-9A-Fa-f]+);)/&amp;/g;

        $changed = 1 if $line ne $before;
    }

    return unless $changed;

    open(my $out_fh, '>', $filename) or die "couldn't write to $filename\n";
    print {$out_fh} @lines;
    close($out_fh) or die "couldn't write to $filename\n";

    return;
}
392
# Produce a copy of an image rotated by 180 degrees.
#
# Asks $self->convert for the rotation and pulls the name of the new file
# out of the text it returns (the part after "=>" that ends in the same
# extension as the input).
#
#   $filename_full_path - full path of the image to rotate
#
# Returns the path of the rotated file when it exists on disk, otherwise
# the original path unchanged.
sub rotate_image {
    my $self = shift (@_);
    my ($filename_full_path) = @_;

    my ($this_filetype) = $filename_full_path =~ /\.([^\.]*)$/;
    my $result = $self->convert($filename_full_path, $this_filetype, "-rotate 180", "ROTATE");

    # The extension is matched literally; quoting it stops characters such
    # as '+' in an extension from being read as regex syntax.
    my $type_pattern = defined $this_filetype ? quotemeta($this_filetype) : '';

    my $new_filename;
    if (defined $result) {
        ($new_filename) = ($result =~ /=>(.*\.$type_pattern)/);
    }

    if (defined $new_filename && -e $new_filename) {
        return $new_filename;
    }

    # something has gone wrong, fall back to the unrotated image
    return $filename_full_path;
}
407
# Attach one page image to a section of the document.
#
#   $filename_full_path - full path of the image file
#   $filename_no_path   - image name as written in the item file
#   $doc_obj            - document object being built
#   $section            - section the image belongs to
#   $rotate             - optional; the string "r" requests a 180 degree
#                         rotation before the image is processed
#
# Returns 0 when the name is empty or the file does not exist, and also
# when image conversion is not available; otherwise returns whatever
# generate_images returns.
sub process_image {
    my $self = shift(@_);
    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $rotate) = @_;

    # check the filenames
    return 0 if ($filename_no_path eq "" || !-f $filename_full_path);

    # Remember that this image file was one of our source files, but only
    # if we are not processing a tmp file.
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # do rotation
    if ((defined $rotate) && ($rotate eq "r")) {
        # from here on we work with the new, rotated temporary file
        $filename_full_path = $self->rotate_image($filename_full_path);
    }

    # do generate images
    my $result = 0;
    if ($self->{'image_conversion_available'} == 1) {
        # Open question carried over from the original author: does
        # $filename_no_path need converting to utf8/url encoding, and what
        # encoding is the name read from the item file in?
        my $url_encoded_full_filename
            = &unicode::raw_filename_to_url_encoded($filename_full_path);
        $result = $self->generate_images($filename_full_path, $url_encoded_full_filename, $doc_obj, $section);
    }

    # overwrite the FileFormat value set in ImageConverter
    $doc_obj->set_metadata_element ($section, "FileFormat", "PagedImage");
    return $result;
}
438
439
# XML start-tag callback.
#
# Element attributes are read from the global hash %_ (e.g. $_{'pagenum'}),
# not from the argument list.
#
#   PagedDocument   - makes the document's top section the current section
#   PageGroup, Page - inserts a new child section under the current one,
#                     makes it current, counts it in num_pages, and handles
#                     its pagenum / imgfile / txtfile attributes
#   Metadata        - remembers the 'name' attribute for xml_end_tag
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "PagedDocument") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif ($element eq "PageGroup" || $element eq "Page") {
        # create a new section as a child
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        # TODO (from the original author): assign pagenum as what?
        my $pagenum = $_{'pagenum'};
        if (defined $pagenum) {
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, 'PageNum', $pagenum);
        }

        my ($imgfile) = $_{'imgfile'};
        if (defined $imgfile) {
            # No rotate argument is passed here, so the old ':r' notation
            # has no equivalent in the XML format yet.
            $self->process_image($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
        }

        my ($txtfile) = $_{'txtfile'};
        if (defined($txtfile) && $txtfile ne "") {
            $self->process_text ($self->{'xml_file_dir'}.$txtfile, $txtfile, $doc_obj, $self->{'current_section'});
        }
        else {
            $self->add_dummy_text($doc_obj, $self->{'current_section'});
        }
    }
    elsif ($element eq "Metadata") {
        $self->{'metadata_name'} = $_{'name'};
    }
}
473
# XML end-tag callback.
#
#   Page, PageGroup - if the section has no Title but does have a PageNum,
#                     the PageNum is added as its Title; the current
#                     section then moves back up to the parent
#   Metadata        - stores the collected text under the remembered name
#                     (names containing a '.' get an "ex." prefix) and
#                     clears the name/value buffers
#
# Any other end tag is ignored.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};
    if ($element eq "Page" || $element eq "PageGroup") {
        my $section = $self->{'current_section'};

        # if Title hasn't been assigned, set PageNum as Title
        if (!defined $doc_obj->get_metadata_element($section, "Title")) {
            my $pagenum = $doc_obj->get_metadata_element($section, "PageNum");
            if (defined $pagenum) {
                $doc_obj->add_utf8_metadata($section, "Title", $pagenum);
            }
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($section);
    }
    elsif ($element eq "Metadata") {
        my $meta_name = $self->{'metadata_name'};
        if ($meta_name =~ /\./) {
            $meta_name = "ex.$meta_name";
        }
        $doc_obj->add_utf8_metadata ($self->{'current_section'}, $meta_name, $self->{'metadata_value'});
        $self->{'metadata_name'} = "";
        $self->{'metadata_value'} = "";
    }
    # otherwise we ignore the end tag
}
498
499
# XML character-data callback.  The text itself arrives in $_.  It is
# appended to the pending metadata value only while the current element is
# "Metadata" and a metadata name has been recorded.
sub xml_text {
    my ($self, $expat) = @_;

    if ($self->{'element'} eq "Metadata" && $self->{'metadata_name'}) {
        $self->{'metadata_value'} .= $_;
    }
}
508
# XML doctype callback: intentionally does nothing.
sub xml_doctype {
    return;
}
511
# Start a new document for the XML item file named in $self->{'filename'}.
#
# Creates the document object, applies the initial document fields, stores
# the directory part of the file name in 'xml_file_dir' (later prefixed to
# imgfile/txtfile names), and resets the page counter.
sub open_document {
    my $self = shift(@_);

    # create a new document
    $self->{'doc_obj'} = doc->new($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});

    # TODO (from the original author): is 'filename' the name without path?
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});

    # Split into directory (with trailing separator) and file name.  Both
    # quantifiers are required for the pattern to match real paths.
    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
    $self->{'xml_file_dir'} = $dir;
    $self->{'num_pages'} = 0;
}
525
# Finish the current XML document: record NumPages, MaxImageWidth and
# MaxImageHeight on the top section, then clear the two running maxima.
sub close_document {
    my ($self) = @_;

    my $doc = $self->{'doc_obj'};
    my $top = $doc->get_top_section();

    # add numpages metadata
    $doc->set_utf8_metadata_element($top, 'NumPages', $self->{'num_pages'});

    for my $dimension ("MaxImageWidth", "MaxImageHeight") {
        $doc->set_utf8_metadata_element($top, $dimension, $self->{$dimension});
    }

    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;
}
541
542
# Apply the fields every new document starts with.
#
#   $doc_obj            - document object to initialise
#   $filename_full_path - full path of the item file
#   $processor          - accepted but not used here
#   $metadata           - passed on to deduce_filename_encoding
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_full_path, $processor, $metadata) = @_;

    my $top = $doc_obj->get_top_section();

    # Setting gsdlthistype to Paged ensures the document is treated as a
    # Paged doc even if its Titles are not numeric; any other documenttype
    # value yields Hierarchy.
    my $this_type = ($self->{'documenttype'} eq 'paged') ? "Paged" : "Hierarchy";
    $doc_obj->set_utf8_metadata_element($top, "gsdlthistype", $this_type);

    my $encoding = $self->deduce_filename_encoding(
        $filename_full_path, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $encoding);

    # A header page needs some text in the top section, otherwise that
    # section becomes invisible.
    if ($self->{'headerpage'}) {
        $self->add_dummy_text($doc_obj, $top);
    }
}
566
# Record every file referenced by an XML-format item file as blocked.
#
# Each imgfile="..." and txtfile="..." value found in the file is prefixed
# with $dir and stored as a key (value 1) in $block_hash->{'file_blocks'}.
# All occurrences on a line are picked up, so several Page tags written on
# one line are handled.
#
#   $filename_full_path - path of the item file to scan
#   $dir                - directory prefix, including its trailing separator
#   $block_hash         - hash ref that receives the 'file_blocks' entries
#
# Dies if the item file cannot be opened.
sub scan_xml_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;

        while ($line =~ /(?:imgfile|txtfile)=\"([^\"]+)\"/g) {
            $block_hash->{'file_blocks'}->{$dir.$1} = 1;
        }
    }

    close $item_fh;
}
587
# Record every file referenced by a plain-format item file as blocked.
#
# Lines without a word character, comment lines (starting with '#') and
# metadata lines (<name>value) are skipped.  Every remaining line is
# expected to look like page:imagefilename:textfilename:r; its image and
# text names, when non-empty, are prefixed with $dir and stored as keys
# (value 1) in $block_hash->{'file_blocks'}.
#
#   $filename_full_path - path of the item file to scan
#   $dir                - directory prefix, including its trailing separator
#   $block_hash         - hash ref that receives the 'file_blocks' entries
#
# Dies if the item file cannot be opened.
sub scan_item_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    open(my $item_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;                        # ignore comment lines
        next if $line =~ /^<([^>]*)>\s*(.*?)\s*$/;    # ignore metadata lines

        # line should be like page:imagefilename:textfilename:r
        $line =~ s/^\s+//;    # remove space at the front
        $line =~ s/\s+$//;    # remove space at the end
        my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

        # find the image file if there is one
        if (defined $imgname && $imgname ne "") {
            $block_hash->{'file_blocks'}->{$dir.$imgname} = 1;
        }
        # find the text file if there is one
        if (defined $txtname && $txtname ne "") {
            $block_hash->{'file_blocks'}->{$dir.$txtname} = 1;
        }
    }

    close $item_fh;
}
617
# Build a document object from a plain-format item file.
#
# Metadata lines (<name>value) are set on the top section; names that
# contain a '.' get an "ex." prefix.  Every other non-blank, non-comment
# line is treated as page:imagefilename:textfilename:r and becomes a new
# child section whose Title is the page number.
#
#   $filename_full_path - full path of the item file
#   $dir                - directory prefix (with trailing separator) that is
#                         prepended to image and text file names
#   $filename_no_path   - item file name without directory (not used here)
#   $processor          - passed on to set_initial_doc_fields
#   $metadata           - passed on to set_initial_doc_fields
#
# Returns the new document object.  Dies if the item file cannot be opened.
sub process_item {
    my $self = shift (@_);
    my ($filename_full_path, $dir, $filename_no_path, $processor, $metadata) = @_;

    my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    my $topsection = $doc_obj->get_top_section();

    open(my $item_fh, '<', $filename_full_path) or die "couldn't open $filename_full_path\n";

    my $num = 0;    # number of page lines seen
    while (defined(my $line = <$item_fh>)) {
        next unless $line =~ /\w/;
        chomp $line;
        next if $line =~ /^#/;    # ignore comment lines

        if ($line =~ /^<([^>]*)>\s*(.*?)\s*$/) {
            my $meta_name = $1;
            my $meta_value = $2;
            if ($meta_name =~ /\./) {
                $meta_name = "ex.$meta_name";
            }
            $doc_obj->set_utf8_metadata_element ($topsection, $meta_name, $meta_value);
        }
        else {
            $num++;

            # line should be like page:imagefilename:textfilename:r
            # the r is optional -> means rotate the image 180 deg
            $line =~ s/^\s+//;    # remove space at the front
            $line =~ s/\s+$//;    # remove space at the end
            my ($pagenum, $imgname, $txtname, $rotate) = split /:/, $line;

            # create a new section for each image file
            my $cursection = $doc_obj->insert_section($doc_obj->get_end_child($topsection));
            # the page number becomes the Title
            $doc_obj->set_utf8_metadata_element($cursection, 'Title', $pagenum);

            # process the image for this page if there is one
            if (defined $imgname && $imgname ne "") {
                my $result1 = $self->process_image($dir.$imgname, $imgname, $doc_obj, $cursection, $rotate);
                if (!defined $result1) {
                    print "PagedImagePlugin: couldn't process image \"$dir$imgname\" for item \"$filename_full_path\"\n";
                }
            }

            # process the text file if one is there
            if (defined $txtname && $txtname ne "") {
                my $result2 = $self->process_text ($dir.$txtname, $txtname, $doc_obj, $cursection);

                if (!defined $result2) {
                    # The path is printed the way it was actually used,
                    # without a stray '.' between directory and file name.
                    print "PagedImagePlugin: couldn't process text file \"$dir$txtname\" for item \"$filename_full_path\"\n";
                    $self->add_dummy_text($doc_obj, $cursection);
                }
            }
            else {
                # otherwise add in some dummy text
                $self->add_dummy_text($doc_obj, $cursection);
            }
        }
    }

    close $item_fh;

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', "$num");

    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageWidth", $self->{'MaxImageWidth'});
    $doc_obj->set_utf8_metadata_element($topsection, "MaxImageHeight", $self->{'MaxImageHeight'});
    $self->{'MaxImageWidth'} = undef;
    $self->{'MaxImageHeight'} = undef;

    return $doc_obj;
}
688
# Add the contents of one text file as the text of a section.
#
#   $filename_full_path - full path of the text file
#   $file               - text file name as written in the item file
#                         (not used here)
#   $doc_obj            - document object being built
#   $cursection         - section that receives the text
#
# Input that looks like a complete HTML page contributes only the content
# of its <body>; anything else has '<' and '>' escaped and is wrapped in
# <pre> tags.
#
# Returns 0 (after printing an error) when the file does not exist,
# 1 otherwise.
sub process_text {
    my $self = shift (@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "PagedImagePlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # Remember that this text file was one of our source files, but only
    # if we are not processing a tmp file.
    if (!$self->{'processing_tmp_files'}) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    my $text = "";
    &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text);
    if (!length ($text)) {
        # It's a bit unusual but not out of the question to have no text,
        # so just give a warning.
        print "PagedImagePlugin: WARNING: $filename_full_path contains no text\n";
    }

    # We need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'.
    $text =~ s/\\/\\\\/g;    # macro language
    $text =~ s/_/\\_/g;      # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # Looks like HTML input: no need to escape < and > or to put in
        # <pre> tags.  Keep just the body content.
        $text = $1;

        # add text to document object
        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    return 1;
}
740
741
# Post-processing hook: hand over to ImageConverter to clean up its
# temporary files.  Returns whatever that call returns.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    return $self->ImageConverter::clean_up_temporary_files();
}
747
748	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: