Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HathiTrustMETSPlugin.pm@ 32186

Last change on this file since 32186 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:executable set to ``*
File size: 19.8 KB

Rev	Line
[31284]	1	###########################################################################
	2	#
	3	# HathiTrustMETSPlugin.pm -- plugin for sets of HathiTrust METS OCR'd
	4	# text that make up a document
	5	#
	6	# A component of the Greenstone digital library software
	7	# from the New Zealand Digital Library Project at the
	8	# University of Waikato, New Zealand.
	9	#
	10	# Copyright (C) 1999 New Zealand Digital Library Project
	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28	# HathiTrustMETSPlugin
	29	# processes HathiTrust METS files that are accompanied with page-by-page
	30	# OCR'd txt files
	31	#
	32	# All the supplementary text files should be in a subfolder of the same
	33	# name as the METS file
	34	#
	35	# As usual, running
	36	# 'perl -S pluginfo.pl HathiTrustMETSPlugin' will list all the options.
	37
	38
	39	package HathiTrustMETSPlugin;
	40
	41	use Encode;
	42	use ReadXMLFile;
	43	use ReadTextFile;
	44	# We don't currently work with the scanned image from HathiTrust METS
	45	# but leave it in for future proofing
	46	use ImageConverter;
	47	use MetadataRead;
	48
	49	use JSON;
	50
	51	use strict;
	52	no strict 'refs'; # allow filehandles to be variables and viceversa
	53
	54	sub BEGIN {
	55	@HathiTrustMETSPlugin::ISA = ('MetadataRead', 'ReadXMLFile', 'ReadTextFile', 'ImageConverter'
	56	);
	57	}
	58
	59	# One day HathiTrust might give more than page structure
	60	my $gs2_type_list =
	61	[
	62	# { 'name' => "auto",
	63	# 'desc' => "{PagedImagePlugin.documenttype.auto2}" },
	64	# { 'name' => "paged",
	65	# 'desc' => "{PagedImagePlugin.documenttype.paged2}" },
	66	{ 'name' => "hierarchy",
	67	'desc' => "{PagedImagePlugin.documenttype.hierarchy}" }
	68	];
	69
	70	my $gs3_type_list =
	71	[
	72	# { 'name' => "auto",
	73	# 'desc' => "{PagedImagePlugin.documenttype.auto3}" },
	74	# { 'name' => "paged",
	75	# 'desc' => "{PagedImagePlugin.documenttype.paged3}" },
	76	{ 'name' => "hierarchy",
	77	'desc' => "{PagedImagePlugin.documenttype.hierarchy}" }
	78	# { 'name' => "pagedhierarchy",
	79	# 'desc' => "{PagedImagePlugin.documenttype.pagedhierarchy}" }
	80	];
	81
	82	my $arguments =
	83	[ { 'name' => "process_exp",
[31492]	84	'desc' => "{BaseImporter.process_exp}",
[31284]	85	'type' => "string",
	86	'deft' => &get_default_process_exp(),
	87	'reqd' => "no" },
	88	{ 'name' => "title_sub",
	89	'desc' => "{HTMLPlugin.title_sub}",
	90	'type' => "string",
	91	'deft' => "" },
	92	{ 'name' => "headerpage",
	93	'desc' => "{HathiTrustMETSPlugin.headerpage}",
	94	'type' => "flag",
	95	'reqd' => "no" },
	96	# { 'name' => "documenttype",
	97	# 'desc' => "{HathiTrustMETSPlugin.documenttype}",
	98	# 'type' => "enum",
	99	# 'list' => $type_list,
	100	# 'deft' => "auto",
	101	# 'reqd' => "no" },
	102	{'name' => "processing_tmp_files",
[31492]	103	'desc' => "{BaseImporter.processing_tmp_files}",
[31284]	104	'type' => "flag",
	105	'hiddengli' => "yes"}
	106	];
	107
	108	my $doc_type_opt = { 'name' => "documenttype",
	109	'desc' => "{HathiTrustMETSPlugin.documenttype}",
	110	'type' => "enum",
	111	'deft' => "auto",
	112	'reqd' => "no" };
	113
	114	my $options = { 'name' => "HathiTrustMETSPlugin",
	115	'desc' => "{HathiTrustMETSPlugin.desc}",
	116	'abstract' => "no",
	117	'inherits' => "yes",
	118	'args' => $arguments };
	119
	120	sub new {
	121	my ($class) = shift (@_);
	122	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	123	push(@$pluginlist, $class);
	124
	125	push(@{$hashArgOptLists->{"OptList"}},$options);
	126
	127	my $imc_self = new ImageConverter($pluginlist, $inputargs, $hashArgOptLists);
	128
	129	# we can use this plugin to check gs3 version
	130	if ($imc_self->{'gs_version'} eq "3") {
	131	$doc_type_opt->{'list'} = $gs3_type_list;
	132	}
	133	else {
	134	$doc_type_opt->{'list'} = $gs2_type_list;
	135	}
	136	push(@$arguments,$doc_type_opt);
	137	# now we add the args to the list for parsing
	138	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	139
	140	my $rtf_self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
	141	my $rxf_self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
	142
[31492]	143	my $self = BaseImporter::merge_inheritance($imc_self,$rtf_self,$rxf_self);
[31284]	144
	145	# Update $self used by XML::Parser so it finds callback functions
	146	# such as start_document here and not in ReadXMLFile (which is what
	147	# $self was when new XML::Parser was done)
	148	#
	149	# If the $self returned by this constructor is the same as the one
	150	# used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step isn't necessary
	151	#
	152	# Consider embedding this type of assignment into merge_inheritance
	153	# to help catch all cases?
	154
	155	$rxf_self->{'parser'}->{'PluginObj'} = $self;
	156
	157	return bless $self, $class;
	158	}
	159
	160
	161	sub init {
	162	my $self = shift (@_);
	163	my ($verbosity, $outhandle, $failhandle) = @_;
	164
	165	$self->SUPER::init(@_);
	166	$self->ImageConverter::init();
	167	}
	168
	169	sub begin {
	170	my $self = shift (@_);
	171	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
	172
	173	$self->SUPER::begin(@_);
	174	$self->ImageConverter::begin(@_);
	175	}
	176
	177	sub get_default_process_exp {
	178	my $self = shift (@_);
	179
	180	return q^\.mets.xml$^;
	181	}
	182
	183	sub get_doctype {
	184	my $self = shift(@_);
	185
	186	return "METS:mets";
	187	}
	188
	189
# Delegate explicitly to BaseImporter's implementation; we do not want the
# ReadXMLFile version that would otherwise be found through @ISA.
sub can_process_this_file {
    my ($self, @args) = @_;

    return $self->BaseImporter::can_process_this_file(@args);
}
	195
	196	# instead of a block exp, now we scan the file and record all text and img files mentioned there for blocking.
	197	sub store_block_files
	198	{
	199	my $self = shift (@_);
	200	my ($filename_full_path, $block_hash) = @_;
	201
	202	# do we need to do this?
	203	# does BOM interfere just with XML parsing? In that case don't need it here
	204	# if we do it here, we are modifying the file before we have worked out if
	205	# its new or not, so it will always be reimported.
	206	#$self->tidy_item_file($filename_full_path);
	207
	208	my ($dir, $file) = $filename_full_path =~ /^(.?)([^\/\\])$/;
	209
	210	# do something
	211	$self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);
	212
	213	}
	214
[31492]	215	# we want to use BaseImporter's read, not ReadXMLFile's
[31284]	216	sub read
	217	{
	218	my $self = shift (@_);
	219
[31492]	220	$self->BaseImporter::read(@_);
[31284]	221	}
	222
	223
	224
	225	sub read_into_doc_obj {
	226	my $self = shift (@_);
	227	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	228	my $outhandle = $self->{'outhandle'};
	229	my $verbosity = $self->{'verbosity'};
	230
	231	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	232
	233	print $outhandle "HathiTrustMETSPlugin processing \"$filename_full_path\"\n"
	234	if $verbosity > 1;
	235	print STDERR "<Processing n='$file' p='HathiTrustMETSPlugin'>\n" if ($gli);
	236
	237	## $self->{'MaxImageWidth'} = 0;
	238	## $self->{'MaxImageHeight'} = 0;
	239
	240
	241	##$self->tidy_item_file($filename_full_path);
	242
	243	# careful checking needed here!! are we using local xml handlers or super ones
	244	$self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
	245	my $doc_obj = $self->{'doc_obj'};
	246
	247
	248	my $section = $doc_obj->get_top_section();
	249
	250	$doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
	251	$doc_obj->add_metadata($section, "FileFormat", "HathiTrustMETS");
	252
	253	# include any metadata passed in from previous plugins
	254	# note that this metadata is associated with the top level section
	255	$self->add_associated_files($doc_obj, $filename_full_path);
	256	$self->extra_metadata ($doc_obj, $section, $metadata);
	257	$self->auto_extract_metadata ($doc_obj);
	258	$self->plugin_specific_process($base_dir, $file, $doc_obj, $gli);
	259	# if we haven't found any Title so far, assign one
	260	$self->title_fallback($doc_obj,$section,$filename_no_path);
	261
	262	$self->add_OID($doc_obj);
	263	return (1,$doc_obj);
	264	}
	265
	266
	267	sub parse_aux_json_metadata {
	268	my $self = shift(@_);
	269	my ($base_dir, $file, $doc_obj, $gli) = @_;
	270
	271	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	272
	273	my $topsection = $doc_obj->get_top_section();
	274
	275	my $json_metadata_filename = $filename_full_path;
	276	$json_metadata_filename =~ s/\.mets.xml$/.json/;
	277
	278	my $json_text = "";
	279	$self->ReadTextFile::read_file($json_metadata_filename,"utf8",undef,\$json_text);
	280
	281	my $json_rec = decode_json $json_text;
	282	my $records = $json_rec->{'records'};
	283	my @keys = keys %{$records};
	284
	285	my $key = shift @keys; # there should only be one
	286	my $record = $records->{$key};
	287
	288	my @md_fields = ( "recordURL", "titles", "isbns", "issns", "oclcs", "lccns", "publishDates" );
	289
	290	foreach my $md_field (@md_fields) {
	291	my $value_array = $record->{$md_field};
	292
	293	my $md_name = $md_field;
	294	$md_name =~ s/s$//;
	295
	296	foreach my $md_value (@$value_array) {
	297
	298	if ($md_name eq "title") {
	299	$doc_obj->set_utf8_metadata_element ($topsection, "Title", $md_value);
	300	$doc_obj->set_utf8_metadata_element ($topsection, "dc.Title", $md_value);
	301	}
	302	else {
	303	$doc_obj->set_utf8_metadata_element ($topsection, $md_name, $md_value);
	304	}
	305	}
	306	}
	307
	308	my $htid = $json_rec->{'items'}->[0]->{'htid'};
	309	my $docName = $htid;
	310	my $docNameIE = $htid;
	311	$docNameIE =~ s/^.*?\.//;
	312
	313	$doc_obj->set_utf8_metadata_element ($topsection, "docName", $docName);
	314	$doc_obj->set_utf8_metadata_element ($topsection, "docNameIE", $docNameIE);
	315
	316	}
	317
	318
	319	# override this for an inheriting plugin to add extra metadata etc
	320	sub plugin_specific_process {
	321	my $self = shift(@_);
	322	my ($base_dir, $file, $doc_obj, $gli) = @_;
	323
	324	$self->parse_aux_json_metadata($base_dir,$file,$doc_obj,$gli);
	325	}
	326
	327	# sub tidy_item_file {
	328	# ... see PagedImagePlugin
	329	# }
	330
	331	# sub rotate_image {
	332	# ... see PagedImagePlugin
	333	# }
	334
	335	# sub process_image {
	336	# ... see PagedImagePlugin
	337	# }
	338
	339
	340
	341	sub xml_start_tag {
	342	my $self = shift(@_);
	343	my ($expat, $element) = @_;
	344	$self->{'element'} = $element;
	345
	346	my $doc_obj = $self->{'doc_obj'};
	347	if ($element eq "METS:mets") {
	348	$self->{'current_section'} = $doc_obj->get_top_section();
	349	# } elsif ($element eq "PageGroup" \|\| $element eq "Page") {
	350	## if ($element eq "PageGroup") {
	351	## $self->{'has_internal_structure'} = 1;
	352	}
	353	elsif (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
	354	# e.g. <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>
	355
	356	# create a new section as a child
	357	$self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
	358	$self->{'num_pages'}++;
	359	# assign pagenum as ... what?? => use page sequence number
	360	my $txtfile = $_{'xlink:href'};
	361	my ($pagenum) = ($txtfile =~ m/^(\d+)/);
	362
	363	if (defined $pagenum) {
	364	my $pagenum_int = int($pagenum);
	365	$doc_obj->set_utf8_metadata_element($self->{'current_section'}, "Title", "Page $pagenum_int");
	366	}
	367	## my ($imgfile) = $_{'imgfile'};
	368	## if (defined $imgfile) {
	369	## # *****
	370	## # What about support for rotate image (e.g. old ':r' notation)?
	371	## $self->process_image($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
	372	## }
	373
	374	## my ($txtfile) = $_{'txtfile'};
	375	if (defined($txtfile)&& $txtfile ne "") {
	376	my $full_txt_filename = &FileUtils::filenameConcatenate($self->{'xml_file_dir'},$txtfile);
	377	$self->process_text ($full_txt_filename, $txtfile, $doc_obj, $self->{'current_section'});
	378	} else {
	379	$self->add_dummy_text($doc_obj, $self->{'current_section'});
	380	}
	381	}
	382	## elsif ($element eq "Metadata") {
	383	## $self->{'metadata_name'} = $_{'name'};
	384	## }
	385	}
	386
	387	sub xml_end_tag {
	388	my $self = shift(@_);
	389	my ($expat, $element) = @_;
	390
	391	my $doc_obj = $self->{'doc_obj'};
	392	## if ($element eq "Page" \|\| $element eq "PageGroup") {
	393	if (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
	394	# if Title hasn't been assigned, set PageNum as Title
	395	if (!defined $doc_obj->get_metadata_element ($self->{'current_section'}, "Title") && defined $doc_obj->get_metadata_element ($self->{'current_section'}, "PageNum" )) {
	396	$doc_obj->add_utf8_metadata ($self->{'current_section'}, "Title", $doc_obj->get_metadata_element ($self->{'current_section'}, "PageNum" ));
	397	}
	398	# move the current section back to the parent
	399	$self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
	400	} elsif ($element eq "Metadata") {
	401
	402	# text read in by XML::Parser is in Perl's binary byte value
	403	# form ... need to explicitly make it UTF-8
	404	my $meta_name = decode("utf-8",$self->{'metadata_name'});
	405	my $metadata_value = decode("utf-8",$self->{'metadata_value'});
	406
	407	if ($meta_name =~ /\./) {
	408	$meta_name = "ex.$meta_name";
	409	}
	410
	411	$doc_obj->add_utf8_metadata ($self->{'current_section'}, $meta_name, $metadata_value);
	412	$self->{'metadata_name'} = "";
	413	$self->{'metadata_value'} = "";
	414
	415	}
	416	# otherwise we ignore the end tag
	417	}
	418
	419
	420	sub xml_text {
	421	my $self = shift(@_);
	422	my ($expat) = @_;
	423
	424	if ($self->{'element'} eq "Metadata" && $self->{'metadata_name'}) {
	425	$self->{'metadata_value'} .= $_;
	426	}
	427	}
	428
	429	sub xml_doctype {
	430	}
	431
	432	sub open_document {
	433	my $self = shift(@_);
	434
	435	# create a new document
	436	$self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
	437	# TODO is file filenmae_no_path??
	438	$self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});
	439
	440	## my ($dir, $file) = $self->{'filename'} =~ /^(.?)([^\/\\])$/;
	441	my ($dir, $file_ext) = $self->{'filename'} =~ /^(.*?)(\.mets\.xml)$/;
	442
	443	$self->{'xml_file_dir'} = $dir;
	444	$self->{'num_pages'} = 0;
	445	## $self->{'has_internal_structure'} = 0;
	446
	447	}
	448
	449	sub close_document {
	450	my $self = shift(@_);
	451	my $doc_obj = $self->{'doc_obj'};
	452
	453	my $topsection = $doc_obj->get_top_section();
	454
	455	# add numpages metadata
	456	$doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', $self->{'num_pages'}); # ##### !!!!
	457
	458	# set the document type
	459	my $final_doc_type = "";
	460	## if ($self->{'documenttype'} eq "auto") {
	461	### if ($self->{'has_internal_structure'}) {
	462	### if ($self->{'gs_version'} eq "3") {
	463	### $final_doc_type = "pagedhierarchy";
	464	### }
	465	### else {
	466	### $final_doc_type = "hierarchy";
	467	### }
	468	### } else {
	469	### $final_doc_type = "paged";
	470	### }
	471	### } else {
	472	## # set to what doc type option was set to
	473	## $final_doc_type = $self->{'documenttype'};
	474	## }
	475	# $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $final_doc_type); # #### !!!!!
	476	### capiatalisation????
	477	# if ($self->{'documenttype'} eq 'paged') {
	478	# set the gsdlthistype metadata to Paged - this ensures this document will
	479	# be treated as a Paged doc, even if Titles are not numeric
	480	# $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
	481	# } else {
	482	# $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
	483	# }
	484
	485	## $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
	486	## $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
	487	## $self->{'MaxImageWidth'} = undef;
	488	## $self->{'MaxImageHeight'} = undef;
	489
	490	}
	491
	492
	493	sub set_initial_doc_fields {
	494	my $self = shift(@_);
	495	my ($doc_obj, $filename_full_path, $processor, $metadata) = @_;
	496
	497	my $topsection = $doc_obj->get_top_section();
	498
	499	my $plugin_filename_encoding = $self->{'filename_encoding'};
	500	my $filename_encoding = $self->deduce_filename_encoding($filename_full_path,$metadata,$plugin_filename_encoding);
	501	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
	502
	503	# if we want a header page, we need to add some text into the top section, otherwise this section will become invisible
	504	if ($self->{'headerpage'}) {
	505	$self->add_dummy_text($doc_obj, $topsection);
	506	}
	507	}
	508
	509	sub scan_xml_for_files_to_block
	510	{
	511	my $self = shift (@_);
	512	my ($filename_full_path, $dir, $block_hash) = @_;
	513
	514	my ($file_root) = ($filename_full_path =~ m/^(.*)\.mets\.xml$/);
	515
[31480]	516	$self->block_raw_filename($block_hash,"$file_root.zip");
	517	$self->block_raw_filename($block_hash,"$file_root.json");
[31284]	518
	519	my $page_dir = $file_root;
	520
	521	open (METSFILE, $filename_full_path) \|\| die "couldn't open $filename_full_path to work out which files to block\n";
	522	my $line = "";
	523	while (defined ($line = <METSFILE>)) {
	524	next unless $line =~ /\w/;
	525
	526	# Exaple of what we are looking for
	527	# <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>
	528
	529	if ($line =~ /xlink:href=\"([^\"]+)\"/) {
	530	my $txt_filename = &FileUtils::filenameConcatenate($page_dir,$1);
	531	my $topics_filename = $txt_filename . ".topics";
[31480]	532	$self->block_raw_filename($block_hash,$txt_filename);
	533	$self->block_raw_filename($block_hash,$topics_filename);
[31284]	534	}
	535	}
	536	close METSFILE;
	537
	538	}
	539
	540
	541	sub process_text {
	542	my $self = shift (@_);
	543	my ($filename_full_path, $file, $doc_obj, $cursection) = @_;
	544
	545	# check that the text file exists!!
	546	if (!-f $filename_full_path) {
	547	print "HathiTrustMETSPlugin: ERROR: File $filename_full_path does not exist, skipping\n";
	548	return 0;
	549	}
	550
	551	# remember that this text file was one of our source files, but only
	552	# if we are not processing a tmp file
	553	if (!$self->{'processing_tmp_files'} ) {
	554	$doc_obj->associate_source_file($filename_full_path);
	555	}
	556	# Do encoding stuff
	557	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
	558
	559	my $text="";
	560	if ( -s $filename_full_path > 0 ) {
	561	&ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text); # already decoded as utf8
	562	}
	563
	564	# HathiTrust often has empty files
	565	## if (!length ($text)) {
	566	## # It's a bit unusual but not out of the question to have no text, so just give a warning
	567	## print "HathiTrustMETSPlugin: WARNING: $filename_full_path contains no text\n";
	568	## }
	569
	570	# we need to escape the escape character, or else mg will convert into
	571	# eg literal newlines, instead of leaving the text as '\n'
	572	$text =~ s/\\/\\\\/g; # macro language
	573	$text =~ s/_/\\_/g; # macro language
	574
	575
	576	if ($text =~ m/<html.?>\s<head.?>.<\/head>\s<body.?>(.)<\/body>\s<\/html>\s*$/is) {
	577	# looks like HTML input
	578	# no need to escape < and > or put in <pre> tags
	579
	580	$text = $1;
	581
	582	# add text to document object
	583	$doc_obj->add_utf8_text($cursection, "$text");
	584	}
	585	else {
	586	$text =~ s/</</g;
	587	$text =~ s/>/>/g;
	588
	589	# insert preformat tags and add text to document object
	590	$doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
	591	}
	592
	593	my $topics_filename = $filename_full_path . ".topics";
	594	if ( -s $topics_filename > 0 ) {
	595
	596	my $topics_text = "";
	597	$self->ReadTextFile::read_file($topics_filename,"utf8",undef,\$topics_text);
	598
	599	my @topics_array = split(/\\|/,$topics_text);
	600	foreach my $topic (@topics_array) {
	601	if ($topic ne "") {
	602	$doc_obj->set_utf8_metadata_element ($cursection, "concept", $topic);
	603	}
	604	}
	605	}
	606
	607	return 1;
	608	}
	609
	610
	611	sub clean_up_after_doc_obj_processing {
	612	my $self = shift(@_);
	613
	614	$self->ImageConverter::clean_up_temporary_files();
	615	}
	616
	617	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: