Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HathiTrustMETSPlugin.pm@ 32783

Last change on this file since 32783 was 32783, checked in by kjdon, 5 years ago
adding missing strings and tidying up some mislabelling
Property svn:executable set to ``*
File size: 19.6 KB

Rev	Line
[31284]	1	###########################################################################
	2	#
	3	# HathiTrustMETSPlugin.pm -- plugin for sets of HathiTrust METS OCR'd
	4	# text that make up a document
	5	#
	6	# A component of the Greenstone digital library software
	7	# from the New Zealand Digital Library Project at the
	8	# University of Waikato, New Zealand.
	9	#
	10	# Copyright (C) 1999 New Zealand Digital Library Project
	11	#
	12	# This program is free software; you can redistribute it and/or modify
	13	# it under the terms of the GNU General Public License as published by
	14	# the Free Software Foundation; either version 2 of the License, or
	15	# (at your option) any later version.
	16	#
	17	# This program is distributed in the hope that it will be useful,
	18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	# GNU General Public License for more details.
	21	#
	22	# You should have received a copy of the GNU General Public License
	23	# along with this program; if not, write to the Free Software
	24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	25	#
	26	###########################################################################
	27
	28	# HathiTrustMETSPlugin
	29	# processes HathiTrust METS files that are accompanied with page-by-page
	30	# OCR'd txt files
	31	#
	32	# All the supplementary text files should be in a subfolder of the same
	33	# name as the METS file
	34	#
	35	# As usual, running
	36	# 'perl -S pluginfo.pl HathiTrustMETSPlugin' will list all the options.
	37
	38
	39	package HathiTrustMETSPlugin;
	40
	41	use Encode;
	42	use ReadXMLFile;
	43	use ReadTextFile;
	44	# We don't currently work with the scanned image from HathiTrust METS
	45	# but leave it in for future proofing
	46	use ImageConverter;
	47	use MetadataRead;
	48
	49	use JSON;
	50
	51	use strict;
	52	no strict 'refs'; # allow filehandles to be variables and viceversa
	53
	54	sub BEGIN {
	55	@HathiTrustMETSPlugin::ISA = ('MetadataRead', 'ReadXMLFile', 'ReadTextFile', 'ImageConverter'
	56	);
	57	}
	58
	59	# One day HathiTrust might give more than page structure
	60	my $gs2_type_list =
	61	[
	62	# { 'name' => "auto",
	63	# 'desc' => "{PagedImagePlugin.documenttype.auto2}" },
	64	# { 'name' => "paged",
	65	# 'desc' => "{PagedImagePlugin.documenttype.paged2}" },
	66	{ 'name' => "hierarchy",
	67	'desc' => "{PagedImagePlugin.documenttype.hierarchy}" }
	68	];
	69
	70	my $gs3_type_list =
	71	[
	72	# { 'name' => "auto",
	73	# 'desc' => "{PagedImagePlugin.documenttype.auto3}" },
	74	# { 'name' => "paged",
	75	# 'desc' => "{PagedImagePlugin.documenttype.paged3}" },
	76	{ 'name' => "hierarchy",
	77	'desc' => "{PagedImagePlugin.documenttype.hierarchy}" }
	78	# { 'name' => "pagedhierarchy",
	79	# 'desc' => "{PagedImagePlugin.documenttype.pagedhierarchy}" }
	80	];
	81
	82	my $arguments =
	83	[ { 'name' => "process_exp",
[31492]	84	'desc' => "{BaseImporter.process_exp}",
[31284]	85	'type' => "string",
	86	'deft' => &get_default_process_exp(),
	87	'reqd' => "no" },
	88	{ 'name' => "title_sub",
	89	'desc' => "{HTMLPlugin.title_sub}",
	90	'type' => "string",
	91	'deft' => "" },
	92	{ 'name' => "headerpage",
	93	'desc' => "{HathiTrustMETSPlugin.headerpage}",
	94	'type' => "flag",
	95	'reqd' => "no" },
	96	{'name' => "processing_tmp_files",
[31492]	97	'desc' => "{BaseImporter.processing_tmp_files}",
[31284]	98	'type' => "flag",
	99	'hiddengli' => "yes"}
	100	];
	101
	102	my $doc_type_opt = { 'name' => "documenttype",
[32783]	103	'desc' => "{PagedImagePlugin.documenttype}",
[31284]	104	'type' => "enum",
[32783]	105	'deft' => "hierarchy",
[31284]	106	'reqd' => "no" };
	107
	108	my $options = { 'name' => "HathiTrustMETSPlugin",
	109	'desc' => "{HathiTrustMETSPlugin.desc}",
	110	'abstract' => "no",
	111	'inherits' => "yes",
	112	'args' => $arguments };
	113
	114	sub new {
	115	my ($class) = shift (@_);
	116	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	117	push(@$pluginlist, $class);
	118
	119	push(@{$hashArgOptLists->{"OptList"}},$options);
	120
	121	my $imc_self = new ImageConverter($pluginlist, $inputargs, $hashArgOptLists);
	122
	123	# we can use this plugin to check gs3 version
	124	if ($imc_self->{'gs_version'} eq "3") {
	125	$doc_type_opt->{'list'} = $gs3_type_list;
	126	}
	127	else {
	128	$doc_type_opt->{'list'} = $gs2_type_list;
	129	}
	130	push(@$arguments,$doc_type_opt);
	131	# now we add the args to the list for parsing
	132	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	133
	134	my $rtf_self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists, 1);
	135	my $rxf_self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
	136
[31492]	137	my $self = BaseImporter::merge_inheritance($imc_self,$rtf_self,$rxf_self);
[31284]	138
	139	# Update $self used by XML::Parser so it finds callback functions
	140	# such as start_document here and not in ReadXMLFile (which is what
	141	# $self was when new XML::Parser was done)
	142	#
	143	# If the $self returned by this constructor is the same as the one
	144	# used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step isn't necessary
	145	#
	146	# Consider embedding this type of assignment into merge_inheritance
	147	# to help catch all cases?
	148
	149	$rxf_self->{'parser'}->{'PluginObj'} = $self;
	150
	151	return bless $self, $class;
	152	}
	153
	154
	155	sub init {
	156	my $self = shift (@_);
	157	my ($verbosity, $outhandle, $failhandle) = @_;
	158
	159	$self->SUPER::init(@_);
	160	$self->ImageConverter::init();
	161	}
	162
	163	sub begin {
	164	my $self = shift (@_);
	165	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
	166
	167	$self->SUPER::begin(@_);
	168	$self->ImageConverter::begin(@_);
	169	}
	170
	171	sub get_default_process_exp {
	172	my $self = shift (@_);
	173
	174	return q^\.mets.xml$^;
	175	}
	176
	177	sub get_doctype {
	178	my $self = shift(@_);
	179
	180	return "METS:mets";
	181	}
	182
	183
# File acceptance is decided by BaseImporter's implementation, explicitly
# bypassing the version this plugin would otherwise inherit from ReadXMLFile.
sub can_process_this_file {
    my $self = shift;

    return $self->BaseImporter::can_process_this_file(@_);
}
	189
	190	# instead of a block exp, now we scan the file and record all text and img files mentioned there for blocking.
	191	sub store_block_files
	192	{
	193	my $self = shift (@_);
	194	my ($filename_full_path, $block_hash) = @_;
	195
	196	# do we need to do this?
	197	# does BOM interfere just with XML parsing? In that case don't need it here
	198	# if we do it here, we are modifying the file before we have worked out if
	199	# its new or not, so it will always be reimported.
	200	#$self->tidy_item_file($filename_full_path);
	201
	202	my ($dir, $file) = $filename_full_path =~ /^(.?)([^\/\\])$/;
	203
	204	# do something
	205	$self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);
	206
	207	}
	208
# Reading is driven by BaseImporter's read, not by the version this plugin
# would otherwise inherit from ReadXMLFile.  All arguments pass through
# unchanged.
sub read
{
    my $self = shift;

    return $self->BaseImporter::read(@_);
}
	216
	217
	218
	219	sub read_into_doc_obj {
	220	my $self = shift (@_);
	221	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
	222	my $outhandle = $self->{'outhandle'};
	223	my $verbosity = $self->{'verbosity'};
	224
	225	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	226
	227	print $outhandle "HathiTrustMETSPlugin processing \"$filename_full_path\"\n"
	228	if $verbosity > 1;
	229	print STDERR "<Processing n='$file' p='HathiTrustMETSPlugin'>\n" if ($gli);
	230
	231	## $self->{'MaxImageWidth'} = 0;
	232	## $self->{'MaxImageHeight'} = 0;
	233
	234
	235	##$self->tidy_item_file($filename_full_path);
	236
	237	# careful checking needed here!! are we using local xml handlers or super ones
	238	$self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
	239	my $doc_obj = $self->{'doc_obj'};
	240
	241
	242	my $section = $doc_obj->get_top_section();
	243
	244	$doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
	245	$doc_obj->add_metadata($section, "FileFormat", "HathiTrustMETS");
	246
	247	# include any metadata passed in from previous plugins
	248	# note that this metadata is associated with the top level section
	249	$self->add_associated_files($doc_obj, $filename_full_path);
	250	$self->extra_metadata ($doc_obj, $section, $metadata);
	251	$self->auto_extract_metadata ($doc_obj);
	252	$self->plugin_specific_process($base_dir, $file, $doc_obj, $gli);
	253	# if we haven't found any Title so far, assign one
	254	$self->title_fallback($doc_obj,$section,$filename_no_path);
	255
	256	$self->add_OID($doc_obj);
	257	return (1,$doc_obj);
	258	}
	259
	260
	261	sub parse_aux_json_metadata {
	262	my $self = shift(@_);
	263	my ($base_dir, $file, $doc_obj, $gli) = @_;
	264
	265	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	266
	267	my $topsection = $doc_obj->get_top_section();
	268
	269	my $json_metadata_filename = $filename_full_path;
	270	$json_metadata_filename =~ s/\.mets.xml$/.json/;
	271
	272	my $json_text = "";
	273	$self->ReadTextFile::read_file($json_metadata_filename,"utf8",undef,\$json_text);
	274
	275	my $json_rec = decode_json $json_text;
	276	my $records = $json_rec->{'records'};
	277	my @keys = keys %{$records};
	278
	279	my $key = shift @keys; # there should only be one
	280	my $record = $records->{$key};
	281
	282	my @md_fields = ( "recordURL", "titles", "isbns", "issns", "oclcs", "lccns", "publishDates" );
	283
	284	foreach my $md_field (@md_fields) {
	285	my $value_array = $record->{$md_field};
	286
	287	my $md_name = $md_field;
	288	$md_name =~ s/s$//;
	289
	290	foreach my $md_value (@$value_array) {
	291
	292	if ($md_name eq "title") {
	293	$doc_obj->set_utf8_metadata_element ($topsection, "Title", $md_value);
	294	$doc_obj->set_utf8_metadata_element ($topsection, "dc.Title", $md_value);
	295	}
	296	else {
	297	$doc_obj->set_utf8_metadata_element ($topsection, $md_name, $md_value);
	298	}
	299	}
	300	}
	301
	302	my $htid = $json_rec->{'items'}->[0]->{'htid'};
	303	my $docName = $htid;
	304	my $docNameIE = $htid;
	305	$docNameIE =~ s/^.*?\.//;
	306
	307	$doc_obj->set_utf8_metadata_element ($topsection, "docName", $docName);
	308	$doc_obj->set_utf8_metadata_element ($topsection, "docNameIE", $docNameIE);
	309
	310	}
	311
	312
	313	# override this for an inheriting plugin to add extra metadata etc
	314	sub plugin_specific_process {
	315	my $self = shift(@_);
	316	my ($base_dir, $file, $doc_obj, $gli) = @_;
	317
	318	$self->parse_aux_json_metadata($base_dir,$file,$doc_obj,$gli);
	319	}
	320
	321	# sub tidy_item_file {
	322	# ... see PagedImagePlugin
	323	# }
	324
	325	# sub rotate_image {
	326	# ... see PagedImagePlugin
	327	# }
	328
	329	# sub process_image {
	330	# ... see PagedImagePlugin
	331	# }
	332
	333
	334
	335	sub xml_start_tag {
	336	my $self = shift(@_);
	337	my ($expat, $element) = @_;
	338	$self->{'element'} = $element;
	339
	340	my $doc_obj = $self->{'doc_obj'};
	341	if ($element eq "METS:mets") {
	342	$self->{'current_section'} = $doc_obj->get_top_section();
	343	# } elsif ($element eq "PageGroup" \|\| $element eq "Page") {
	344	## if ($element eq "PageGroup") {
	345	## $self->{'has_internal_structure'} = 1;
	346	}
	347	elsif (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
	348	# e.g. <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>
	349
	350	# create a new section as a child
	351	$self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
	352	$self->{'num_pages'}++;
	353	# assign pagenum as ... what?? => use page sequence number
	354	my $txtfile = $_{'xlink:href'};
	355	my ($pagenum) = ($txtfile =~ m/^(\d+)/);
	356
	357	if (defined $pagenum) {
	358	my $pagenum_int = int($pagenum);
	359	$doc_obj->set_utf8_metadata_element($self->{'current_section'}, "Title", "Page $pagenum_int");
	360	}
	361	## my ($imgfile) = $_{'imgfile'};
	362	## if (defined $imgfile) {
	363	## # *****
	364	## # What about support for rotate image (e.g. old ':r' notation)?
	365	## $self->process_image($self->{'xml_file_dir'}.$imgfile, $imgfile, $doc_obj, $self->{'current_section'});
	366	## }
	367
	368	## my ($txtfile) = $_{'txtfile'};
	369	if (defined($txtfile)&& $txtfile ne "") {
	370	my $full_txt_filename = &FileUtils::filenameConcatenate($self->{'xml_file_dir'},$txtfile);
	371	$self->process_text ($full_txt_filename, $txtfile, $doc_obj, $self->{'current_section'});
	372	} else {
	373	$self->add_dummy_text($doc_obj, $self->{'current_section'});
	374	}
	375	}
	376	## elsif ($element eq "Metadata") {
	377	## $self->{'metadata_name'} = $_{'name'};
	378	## }
	379	}
	380
	381	sub xml_end_tag {
	382	my $self = shift(@_);
	383	my ($expat, $element) = @_;
	384
	385	my $doc_obj = $self->{'doc_obj'};
	386	## if ($element eq "Page" \|\| $element eq "PageGroup") {
	387	if (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
	388	# if Title hasn't been assigned, set PageNum as Title
	389	if (!defined $doc_obj->get_metadata_element ($self->{'current_section'}, "Title") && defined $doc_obj->get_metadata_element ($self->{'current_section'}, "PageNum" )) {
	390	$doc_obj->add_utf8_metadata ($self->{'current_section'}, "Title", $doc_obj->get_metadata_element ($self->{'current_section'}, "PageNum" ));
	391	}
	392	# move the current section back to the parent
	393	$self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
	394	} elsif ($element eq "Metadata") {
	395
	396	# text read in by XML::Parser is in Perl's binary byte value
	397	# form ... need to explicitly make it UTF-8
	398	my $meta_name = decode("utf-8",$self->{'metadata_name'});
	399	my $metadata_value = decode("utf-8",$self->{'metadata_value'});
	400
	401	if ($meta_name =~ /\./) {
	402	$meta_name = "ex.$meta_name";
	403	}
	404
	405	$doc_obj->add_utf8_metadata ($self->{'current_section'}, $meta_name, $metadata_value);
	406	$self->{'metadata_name'} = "";
	407	$self->{'metadata_value'} = "";
	408
	409	}
	410	# otherwise we ignore the end tag
	411	}
	412
	413
	414	sub xml_text {
	415	my $self = shift(@_);
	416	my ($expat) = @_;
	417
	418	if ($self->{'element'} eq "Metadata" && $self->{'metadata_name'}) {
	419	$self->{'metadata_value'} .= $_;
	420	}
	421	}
	422
	423	sub xml_doctype {
	424	}
	425
	426	sub open_document {
	427	my $self = shift(@_);
	428
	429	# create a new document
	430	$self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
	431	# TODO is file filenmae_no_path??
	432	$self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});
	433
	434	## my ($dir, $file) = $self->{'filename'} =~ /^(.?)([^\/\\])$/;
	435	my ($dir, $file_ext) = $self->{'filename'} =~ /^(.*?)(\.mets\.xml)$/;
	436
	437	$self->{'xml_file_dir'} = $dir;
	438	$self->{'num_pages'} = 0;
	439	## $self->{'has_internal_structure'} = 0;
	440
	441	}
	442
	443	sub close_document {
	444	my $self = shift(@_);
	445	my $doc_obj = $self->{'doc_obj'};
	446
	447	my $topsection = $doc_obj->get_top_section();
	448
	449	# add numpages metadata
	450	$doc_obj->set_utf8_metadata_element ($topsection, 'NumPages', $self->{'num_pages'}); # ##### !!!!
	451
	452	# set the document type
	453	my $final_doc_type = "";
	454	## if ($self->{'documenttype'} eq "auto") {
	455	### if ($self->{'has_internal_structure'}) {
	456	### if ($self->{'gs_version'} eq "3") {
	457	### $final_doc_type = "pagedhierarchy";
	458	### }
	459	### else {
	460	### $final_doc_type = "hierarchy";
	461	### }
	462	### } else {
	463	### $final_doc_type = "paged";
	464	### }
	465	### } else {
	466	## # set to what doc type option was set to
	467	## $final_doc_type = $self->{'documenttype'};
	468	## }
	469	# $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $final_doc_type); # #### !!!!!
	470	### capiatalisation????
	471	# if ($self->{'documenttype'} eq 'paged') {
	472	# set the gsdlthistype metadata to Paged - this ensures this document will
	473	# be treated as a Paged doc, even if Titles are not numeric
	474	# $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
	475	# } else {
	476	# $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
	477	# }
	478
	479	## $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
	480	## $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
	481	## $self->{'MaxImageWidth'} = undef;
	482	## $self->{'MaxImageHeight'} = undef;
	483
	484	}
	485
	486
	487	sub set_initial_doc_fields {
	488	my $self = shift(@_);
	489	my ($doc_obj, $filename_full_path, $processor, $metadata) = @_;
	490
	491	my $topsection = $doc_obj->get_top_section();
	492
	493	my $plugin_filename_encoding = $self->{'filename_encoding'};
	494	my $filename_encoding = $self->deduce_filename_encoding($filename_full_path,$metadata,$plugin_filename_encoding);
	495	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
	496
	497	# if we want a header page, we need to add some text into the top section, otherwise this section will become invisible
	498	if ($self->{'headerpage'}) {
	499	$self->add_dummy_text($doc_obj, $topsection);
	500	}
	501	}
	502
	503	sub scan_xml_for_files_to_block
	504	{
	505	my $self = shift (@_);
	506	my ($filename_full_path, $dir, $block_hash) = @_;
	507
	508	my ($file_root) = ($filename_full_path =~ m/^(.*)\.mets\.xml$/);
	509
[31480]	510	$self->block_raw_filename($block_hash,"$file_root.zip");
	511	$self->block_raw_filename($block_hash,"$file_root.json");
[31284]	512
	513	my $page_dir = $file_root;
	514
	515	open (METSFILE, $filename_full_path) \|\| die "couldn't open $filename_full_path to work out which files to block\n";
	516	my $line = "";
	517	while (defined ($line = <METSFILE>)) {
	518	next unless $line =~ /\w/;
	519
	520	# Exaple of what we are looking for
	521	# <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>
	522
	523	if ($line =~ /xlink:href=\"([^\"]+)\"/) {
	524	my $txt_filename = &FileUtils::filenameConcatenate($page_dir,$1);
	525	my $topics_filename = $txt_filename . ".topics";
[31480]	526	$self->block_raw_filename($block_hash,$txt_filename);
	527	$self->block_raw_filename($block_hash,$topics_filename);
[31284]	528	}
	529	}
	530	close METSFILE;
	531
	532	}
	533
	534
	535	sub process_text {
	536	my $self = shift (@_);
	537	my ($filename_full_path, $file, $doc_obj, $cursection) = @_;
	538
	539	# check that the text file exists!!
	540	if (!-f $filename_full_path) {
	541	print "HathiTrustMETSPlugin: ERROR: File $filename_full_path does not exist, skipping\n";
	542	return 0;
	543	}
	544
	545	# remember that this text file was one of our source files, but only
	546	# if we are not processing a tmp file
	547	if (!$self->{'processing_tmp_files'} ) {
	548	$doc_obj->associate_source_file($filename_full_path);
	549	}
	550	# Do encoding stuff
	551	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
	552
	553	my $text="";
	554	if ( -s $filename_full_path > 0 ) {
	555	&ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text); # already decoded as utf8
	556	}
	557
	558	# HathiTrust often has empty files
	559	## if (!length ($text)) {
	560	## # It's a bit unusual but not out of the question to have no text, so just give a warning
	561	## print "HathiTrustMETSPlugin: WARNING: $filename_full_path contains no text\n";
	562	## }
	563
	564	# we need to escape the escape character, or else mg will convert into
	565	# eg literal newlines, instead of leaving the text as '\n'
	566	$text =~ s/\\/\\\\/g; # macro language
	567	$text =~ s/_/\\_/g; # macro language
	568
	569
	570	if ($text =~ m/<html.?>\s<head.?>.<\/head>\s<body.?>(.)<\/body>\s<\/html>\s*$/is) {
	571	# looks like HTML input
	572	# no need to escape < and > or put in <pre> tags
	573
	574	$text = $1;
	575
	576	# add text to document object
	577	$doc_obj->add_utf8_text($cursection, "$text");
	578	}
	579	else {
	580	$text =~ s/</</g;
	581	$text =~ s/>/>/g;
	582
	583	# insert preformat tags and add text to document object
	584	$doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
	585	}
	586
	587	my $topics_filename = $filename_full_path . ".topics";
	588	if ( -s $topics_filename > 0 ) {
	589
	590	my $topics_text = "";
	591	$self->ReadTextFile::read_file($topics_filename,"utf8",undef,\$topics_text);
	592
	593	my @topics_array = split(/\\|/,$topics_text);
	594	foreach my $topic (@topics_array) {
	595	if ($topic ne "") {
	596	$doc_obj->set_utf8_metadata_element ($cursection, "concept", $topic);
	597	}
	598	}
	599	}
	600
	601	return 1;
	602	}
	603
	604
	605	sub clean_up_after_doc_obj_processing {
	606	my $self = shift(@_);
	607
	608	$self->ImageConverter::clean_up_temporary_files();
	609	}
	610
	611	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: