Context Navigation

source: main/trunk/greenstone2/perllib/plugins/HathiTrustMETSPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:executable set to ``*
File size: 19.8 KB

Line
1	###########################################################################
2	#
3	# HathiTrustMETSPlugin.pm -- plugin for sets of HathiTrust METS OCR'd
4	# text that make up a document
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	# HathiTrustMETSPlugin
29	# processes HathiTrust METS files that are accompanied with page-by-page
30	# OCR'd txt files
31	#
32	# All the supplementary text files should be in a subfolder of the same
33	# name as the METS file
34	#
35	# As usual, running
36	# 'perl -S pluginfo.pl HathiTrustMETSPlugin' will list all the options.
37
38
39	package HathiTrustMETSPlugin;
40
41	use Encode;
42	use ReadXMLFile;
43	use ReadTextFile;
44	# We don't currently work with the scanned image from HathiTrust METS
45	# but leave it in for future proofing
46	use ImageConverter;
47	use MetadataRead;
48
49	use JSON;
50
51	use strict;
52	no strict 'refs'; # allow filehandles to be variables and viceversa
53
# Set up the inheritance chain at compile time.  Method resolution walks
# the parents in the order listed here.
BEGIN {
    @HathiTrustMETSPlugin::ISA = qw(
        MetadataRead
        ReadXMLFile
        ReadTextFile
        ImageConverter
    );
}
58
59	# One day HathiTrust might give more than page structure
# Document types offered for the "documenttype" option under Greenstone 2.
# Only "hierarchy" is active; the other PagedImagePlugin-style entries are
# kept commented out for reference.
my $gs2_type_list = [
    # { 'name' => "auto",
    #   'desc' => "{PagedImagePlugin.documenttype.auto2}" },
    # { 'name' => "paged",
    #   'desc' => "{PagedImagePlugin.documenttype.paged2}" },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImagePlugin.documenttype.hierarchy}",
    },
];

# Document types offered for the "documenttype" option under Greenstone 3.
my $gs3_type_list = [
    # { 'name' => "auto",
    #   'desc' => "{PagedImagePlugin.documenttype.auto3}" },
    # { 'name' => "paged",
    #   'desc' => "{PagedImagePlugin.documenttype.paged3}" },
    {
        'name' => "hierarchy",
        'desc' => "{PagedImagePlugin.documenttype.hierarchy}",
    },
    # { 'name' => "pagedhierarchy",
    #   'desc' => "{PagedImagePlugin.documenttype.pagedhierarchy}" }
];

# Options this plugin declares itself.  The "documenttype" option is not
# listed here; it is described separately by $doc_type_opt below.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "string",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "title_sub",
        'desc' => "{HTMLPlugin.title_sub}",
        'type' => "string",
        'deft' => "",
    },
    {
        'name' => "headerpage",
        'desc' => "{HathiTrustMETSPlugin.headerpage}",
        'type' => "flag",
        'reqd' => "no",
    },
    # { 'name' => "documenttype",
    #   'desc' => "{HathiTrustMETSPlugin.documenttype}",
    #   'type' => "enum",
    #   'list' => $type_list,
    #   'deft' => "auto",
    #   'reqd' => "no" },
    {
        'name'      => "processing_tmp_files",
        'desc'      => "{BaseImporter.processing_tmp_files}",
        'type'      => "flag",
        'hiddengli' => "yes",
    },
];

# The "documenttype" option.  It has no 'list' entry here; the constructor
# fills that in from one of the two type lists above.
# NOTE(review): the default "auto" is not a member of either type list as
# they currently stand -- confirm whether that is intended.
my $doc_type_opt = {
    'name' => "documenttype",
    'desc' => "{HathiTrustMETSPlugin.documenttype}",
    'type' => "enum",
    'deft' => "auto",
    'reqd' => "no",
};

# Top-level description of the plugin, pointing at its argument list.
my $options = {
    'name'     => "HathiTrustMETSPlugin",
    'desc'     => "{HathiTrustMETSPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
119
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays receive
#                      this plugin's option description and arguments
#
# Builds one object per parent class (ImageConverter, ReadTextFile,
# ReadXMLFile), merges them with BaseImporter::merge_inheritance and
# returns the merged hash blessed into $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $imc_self = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);

    # the ImageConverter object tells us which Greenstone version we run
    # under, which decides the set of document types on offer
    if ($imc_self->{'gs_version'} eq "3") {
        $doc_type_opt->{'list'} = $gs3_type_list;
    }
    else {
        $doc_type_opt->{'list'} = $gs2_type_list;
    }

    # $arguments is file-level state shared by every instance, so only add
    # the documenttype option the first time round; otherwise each further
    # construction would register the same option again
    if (!grep { $_ == $doc_type_opt } @$arguments) {
        push(@$arguments, $doc_type_opt);
    }

    # now we add the args to the list for parsing
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});

    my $rtf_self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $rxf_self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BaseImporter::merge_inheritance($imc_self, $rtf_self, $rxf_self);

    # Update the object used by XML::Parser so it finds callback functions
    # such as start_document here and not in ReadXMLFile (which is what
    # the parser was given when it was created).
    #
    # If the object returned by this constructor is the same as the one
    # used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step
    # isn't necessary.
    #
    # Consider embedding this type of assignment into merge_inheritance
    # to help catch all cases?
    $rxf_self->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
159
160
# Initialise the plugin.  All arguments (verbosity, output handle, failure
# handle) are forwarded to the parent init; ImageConverter's init is then
# run without arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    $self->ImageConverter::init();
}
168
# Called at the start of processing.  The arguments (plugin info, base
# directory, processor, maxdocs) are forwarded unchanged, first to the
# parent begin and then to ImageConverter's begin.
sub begin {
    my ($self, @begin_args) = @_;

    $self->SUPER::begin(@begin_args);
    $self->ImageConverter::begin(@begin_args);
}
176
# Returns the default regular expression (as a string) that selects the
# files this plugin handles: names ending in ".mets.xml".
# Both dots are escaped so that only a literal "." is accepted before
# "xml", matching the \.mets\.xml patterns used elsewhere in this file.
sub get_default_process_exp {
    my $self = shift(@_);

    return q^\.mets\.xml$^;
}
182
# Returns the name of the XML root element this plugin handles.
sub get_doctype {
    my ($self) = @_;

    return 'METS:mets';
}
188
189
190	# want to use BaseImporter's version of this, not ReadXMLFile's
# Explicitly delegates to BaseImporter's implementation, passing all
# arguments through, so that ReadXMLFile's version is not picked up.
sub can_process_this_file {
    my ($self, @args) = @_;

    return $self->BaseImporter::can_process_this_file(@args);
}
195
196	# instead of a block exp, now we scan the file and record all text and img files mentioned there for blocking.
# Records, in $block_hash, the files that belong to the given METS file so
# that they are blocked.  The actual work is done by
# scan_xml_for_files_to_block; this wrapper only works out the directory
# part of the path first.
#
# Parameters:
#   $filename_full_path - full path of the METS file
#   $block_hash         - hash ref handed on to scan_xml_for_files_to_block
sub store_block_files
{
    my $self = shift (@_);
    my ($filename_full_path, $block_hash) = @_;

    # do we need to do this?
    # does BOM interfere just with XML parsing? In that case don't need it here
    # if we do it here, we are modifying the file before we have worked out if
    # its new or not, so it will always be reimported.
    #$self->tidy_item_file($filename_full_path);

    # split off everything up to and including the last path separator
    # (either / or \); the remainder is the file name, which is not needed
    my ($dir) = $filename_full_path =~ /^(.*?)[^\/\\]*$/;

    $self->scan_xml_for_files_to_block($filename_full_path, $dir, $block_hash);

}
214
215	# we want to use BaseImporter's read, not ReadXMLFile's
# Explicitly delegates to BaseImporter's read, passing all arguments
# through, so that ReadXMLFile's read is not used as the entry point.
sub read
{
    my ($self, @read_args) = @_;

    return $self->BaseImporter::read(@read_args);
}
222
223
224
# Builds the document object for one METS file.
#
# The XML is parsed through ReadXMLFile::read, which leaves the document
# in $self->{'doc_obj'}.  Plugin and FileFormat metadata, associated
# files, inherited metadata, auto-extracted metadata, plugin-specific
# metadata, a fallback title and finally an OID are then added.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my $out       = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    my ($full_path, $leaf_name) = &util::get_full_filenames($base_dir, $file);

    if ($verbosity > 1) {
        print $out "HathiTrustMETSPlugin processing \"$full_path\"\n";
    }
    if ($gli) {
        print STDERR "<Processing n='$file' p='HathiTrustMETSPlugin'>\n";
    }

    # careful checking needed here!! are we using local xml handlers or super ones
    $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash,
                             $metadata, $processor, $maxdocs, $total_count, $gli);

    my $doc_obj = $self->{'doc_obj'};
    my $top     = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($top, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($top, "FileFormat", "HathiTrustMETS");

    # metadata passed in from previous plugins is attached to the top
    # level section
    $self->add_associated_files($doc_obj, $full_path);
    $self->extra_metadata($doc_obj, $top, $metadata);
    $self->auto_extract_metadata($doc_obj);
    $self->plugin_specific_process($base_dir, $file, $doc_obj, $gli);

    # if no Title has been found so far, assign one
    $self->title_fallback($doc_obj, $top, $leaf_name);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
265
266
# Reads the JSON file that sits next to the METS file (same name with
# ".mets.xml" replaced by ".json") and copies selected fields into the top
# section of $doc_obj.
#
# From the first entry under "records": recordURL, titles, isbns, issns,
# oclcs, lccns and publishDates.  A trailing "s" is stripped from the
# field name to form the metadata name; titles are stored as both "Title"
# and "dc.Title".  From the first entry under "items": htid, stored as
# "docName", and htid without its leading "prefix." stored as "docNameIE".
#
# A missing or unparsable JSON file is reported and skipped rather than
# aborting the import of the document.
#
# Parameters: $base_dir, $file (used to locate the METS file), $doc_obj,
# and $gli (unused here).
sub parse_aux_json_metadata {
    my $self = shift(@_);
    my ($base_dir, $file, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'} || \*STDERR;

    my ($filename_full_path) = &util::get_full_filenames($base_dir, $file);

    my $topsection = $doc_obj->get_top_section();

    my $json_metadata_filename = $filename_full_path;
    $json_metadata_filename =~ s/\.mets\.xml$/.json/;

    if (!-f $json_metadata_filename) {
        print $outhandle "HathiTrustMETSPlugin: WARNING: JSON metadata file $json_metadata_filename does not exist, skipping\n";
        return;
    }

    my $json_text = "";
    $self->ReadTextFile::read_file($json_metadata_filename, "utf8", undef, \$json_text);

    # decode_json expects UTF-8 encoded bytes.  If read_file has already
    # decoded the text into characters, non-ASCII content makes that call
    # fail, so fall back to the character-string decoder.
    my $json_rec = eval { decode_json($json_text) };
    if (!defined $json_rec) {
        $json_rec = eval { JSON->new->decode($json_text) };
    }
    if (ref($json_rec) ne 'HASH') {
        print $outhandle "HathiTrustMETSPlugin: WARNING: could not parse JSON metadata in $json_metadata_filename, skipping\n";
        return;
    }

    my $records = $json_rec->{'records'};
    # there should only be one record; sort so the choice is repeatable
    # should there ever be more
    my ($key) = (ref($records) eq 'HASH') ? sort(keys %{$records}) : ();
    my $record = defined $key ? $records->{$key} : undef;
    $record = {} unless ref($record) eq 'HASH';

    my @md_fields = ( "recordURL", "titles", "isbns", "issns", "oclcs", "lccns", "publishDates" );

    foreach my $md_field (@md_fields) {
        my $value = $record->{$md_field};
        next unless defined $value;

        # a field may hold a list of values or a single plain value
        my @md_values = (ref($value) eq 'ARRAY') ? @$value : ($value);

        my $md_name = $md_field;
        $md_name =~ s/s$//;

        # NOTE(review): every value goes through set_utf8_metadata_element;
        # if that replaces an existing value, only the last value of a
        # multi-valued field survives -- confirm this is intended.
        foreach my $md_value (@md_values) {
            next unless defined $md_value;

            if ($md_name eq "title") {
                $doc_obj->set_utf8_metadata_element ($topsection, "Title", $md_value);
                $doc_obj->set_utf8_metadata_element ($topsection, "dc.Title", $md_value);
            }
            else {
                $doc_obj->set_utf8_metadata_element ($topsection, $md_name, $md_value);
            }
        }
    }

    my $items = $json_rec->{'items'};
    my $htid;
    if (ref($items) eq 'ARRAY' && ref($items->[0]) eq 'HASH') {
        $htid = $items->[0]->{'htid'};
    }

    if (defined $htid) {
        my $docName   = $htid;
        my $docNameIE = $htid;
        # drop everything up to and including the first "."
        $docNameIE =~ s/^.*?\.//;

        $doc_obj->set_utf8_metadata_element ($topsection, "docName", $docName);
        $doc_obj->set_utf8_metadata_element ($topsection, "docNameIE", $docNameIE);
    }

    return;
}
317
318
319	# override this for an inheriting plugin to add extra metadata etc
# Hook for plugin-specific processing; an inheriting plugin can override
# it to add extra metadata etc.  This implementation hands all four
# arguments on to parse_aux_json_metadata.
sub plugin_specific_process {
    my ($self, $base_dir, $file, $doc_obj, $gli) = @_;

    return $self->parse_aux_json_metadata($base_dir, $file, $doc_obj, $gli);
}
326
327	# sub tidy_item_file {
328	# ... see PagedImagePlugin
329	# }
330
331	# sub rotate_image {
332	# ... see PagedImagePlugin
333	# }
334
335	# sub process_image {
336	# ... see PagedImagePlugin
337	# }
338
339
340
# Start-tag callback for the XML parser.  The attributes of the element
# being opened are read from the global hash %_.
#
#  - <METS:mets>: the current section becomes the document's top section.
#  - <METS:FLocat> whose xlink:href ends in ".txt", e.g.
#      <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>
#    a new child section is appended for the page, the page counter is
#    incremented, the section is titled "Page N" when the file name starts
#    with digits, and the page text is loaded via process_text.
#
# Every other element only updates $self->{'element'}.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};

    if ($element eq "METS:mets") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
        # add the page as the last child of the current section
        my $last_child   = $doc_obj->get_end_child($self->{'current_section'});
        my $page_section = $doc_obj->insert_section($last_child);
        $self->{'current_section'} = $page_section;
        $self->{'num_pages'}++;

        # the page sequence number is taken from the leading digits of the
        # text file name
        my $txtfile = $_{'xlink:href'};
        my ($pagenum) = ($txtfile =~ m/^(\d+)/);

        if (defined $pagenum) {
            my $pagenum_int = int($pagenum);
            $doc_obj->set_utf8_metadata_element($self->{'current_section'}, "Title", "Page $pagenum_int");
        }

        if (defined($txtfile) && $txtfile ne "") {
            my $full_txt_filename = &FileUtils::filenameConcatenate($self->{'xml_file_dir'}, $txtfile);
            $self->process_text($full_txt_filename, $txtfile, $doc_obj, $self->{'current_section'});
        }
        else {
            $self->add_dummy_text($doc_obj, $self->{'current_section'});
        }
    }
}
386
# End-tag callback for the XML parser.
#
#  - </METS:FLocat> for a ".txt" page (the href is read from the global
#    attribute hash %_): if the page section has no Title but does have
#    PageNum metadata, the PageNum becomes the Title; the current section
#    then moves back to the parent.
#  - </Metadata>: the collected name/value pair is decoded from UTF-8 and
#    added to the current section (a name containing "." gets an "ex."
#    prefix), and the collection buffers are cleared.
#
# Any other end tag is ignored.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};

    if (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
        my $page_section = $self->{'current_section'};

        if (!defined $doc_obj->get_metadata_element($page_section, "Title")) {
            if (defined $doc_obj->get_metadata_element($page_section, "PageNum")) {
                $doc_obj->add_utf8_metadata($page_section, "Title",
                    $doc_obj->get_metadata_element($page_section, "PageNum"));
            }
        }

        # move the current section back to the parent
        $self->{'current_section'} = $doc_obj->get_parent_section($page_section);
    }
    elsif ($element eq "Metadata") {
        # text read in by XML::Parser is in Perl's binary byte value
        # form ... need to explicitly make it UTF-8
        my $name  = decode("utf-8", $self->{'metadata_name'});
        my $value = decode("utf-8", $self->{'metadata_value'});

        if ($name =~ /\./) {
            $name = "ex.$name";
        }

        $doc_obj->add_utf8_metadata($self->{'current_section'}, $name, $value);

        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
}
418
419
# Character-data callback for the XML parser; the text arrives in $_.
# Text is appended to the pending metadata value only while inside a
# <Metadata> element for which a metadata name has been recorded.
sub xml_text {
    my ($self, $expat) = @_;

    my $collecting = ($self->{'element'} eq "Metadata") && $self->{'metadata_name'};
    if ($collecting) {
        $self->{'metadata_value'} .= $_;
    }
}
428
# Doctype callback for the XML parser: deliberately does nothing.
sub xml_doctype {
    return;
}
431
# Called when parsing of a METS file starts.  Creates the document object,
# sets its initial fields and resets the per-document state:
#   xml_file_dir - the METS file path with its ".mets.xml" suffix removed
#                  (page text files are looked up relative to this)
#   num_pages    - page counter, reset to 0
sub open_document {
    my ($self) = @_;

    my $source_path = $self->{'filename'};

    # create a new document
    $self->{'doc_obj'} = 'doc'->new($source_path, "indexed_doc", $self->{'file_rename_method'});

    # TODO is file filename_no_path??
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'},
                                  $self->{'processor'}, $self->{'metadata'});

    my ($page_dir, $suffix) = $self->{'filename'} =~ /^(.*?)(\.mets\.xml)$/;

    $self->{'xml_file_dir'} = $page_dir;
    $self->{'num_pages'}    = 0;
}
448
# Called when parsing of a METS file has finished.  Stores the number of
# pages counted during the parse as "NumPages" metadata on the top section.
#
# Setting the document type ("gsdlthistype") and the maximum image
# dimensions is currently disabled; the disabled code is kept below.
sub close_document {
    my ($self) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $top     = $doc_obj->get_top_section();

    # add numpages metadata
    $doc_obj->set_utf8_metadata_element($top, 'NumPages', $self->{'num_pages'});

    # set the document type (disabled)
    ## if ($self->{'documenttype'} eq "auto") {
    ###     if ($self->{'has_internal_structure'}) {
    ###         if ($self->{'gs_version'} eq "3") {
    ###             $final_doc_type = "pagedhierarchy";
    ###         }
    ###         else {
    ###             $final_doc_type = "hierarchy";
    ###         }
    ###     } else {
    ###         $final_doc_type = "paged";
    ###     }
    ### } else {
    ##     # set to what doc type option was set to
    ##     $final_doc_type = $self->{'documenttype'};
    ## }
    # $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", $final_doc_type);
    ### capitalisation????
    # if ($self->{'documenttype'} eq 'paged') {
    #     # set the gsdlthistype metadata to Paged - this ensures this document will
    #     # be treated as a Paged doc, even if Titles are not numeric
    #     $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Paged");
    # } else {
    #     $doc_obj->set_utf8_metadata_element ($topsection, "gsdlthistype", "Hierarchy");
    # }

    ## $doc_obj->set_utf8_metadata_element($topsection,"MaxImageWidth",$self->{'MaxImageWidth'});
    ## $doc_obj->set_utf8_metadata_element($topsection,"MaxImageHeight",$self->{'MaxImageHeight'});
    ## $self->{'MaxImageWidth'} = undef;
    ## $self->{'MaxImageHeight'} = undef;
}
491
492
# Sets the fields a freshly created document needs: its Source metadata
# (using the deduced filename encoding) and, when the "headerpage" option
# is on, dummy text in the top section.
#
# Parameters: $doc_obj, $filename_full_path, $processor (unused here),
# $metadata (passed to the encoding deduction).
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_full_path, $processor, $metadata) = @_;

    my $top = $doc_obj->get_top_section();

    my $encoding = $self->deduce_filename_encoding(
        $filename_full_path, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $encoding);

    # if we want a header page, we need to add some text into the top
    # section, otherwise this section will become invisible
    $self->add_dummy_text($doc_obj, $top) if $self->{'headerpage'};
}
508
# Scans a METS file line by line and blocks every file it refers to, so
# those files are not picked up separately.
#
# With <root> being the METS path minus ".mets.xml", the following are
# blocked through block_raw_filename:
#   <root>.zip and <root>.json
#   for every xlink:href="X" found: <root>/X and <root>/X.topics
#
# Parameters:
#   $filename_full_path - full path of the METS file
#   $dir                - directory of the METS file (not used here)
#   $block_hash         - hash ref passed on to block_raw_filename
#
# Dies if the METS file cannot be opened.
sub scan_xml_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    my ($file_root) = ($filename_full_path =~ m/^(.*)\.mets\.xml$/);

    # without the expected suffix there is no sensible root to derive the
    # companion file names from
    return unless defined $file_root;

    $self->block_raw_filename($block_hash, "$file_root.zip");
    $self->block_raw_filename($block_hash, "$file_root.json");

    # the page files live in a folder named after the METS file
    my $page_dir = $file_root;

    open(my $mets_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block: $!\n";

    while (defined(my $line = <$mets_fh>)) {
        next unless $line =~ /\w/;

        # Example of what we are looking for
        # <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>

        if ($line =~ /xlink:href=\"([^\"]+)\"/) {
            my $txt_filename    = &FileUtils::filenameConcatenate($page_dir, $1);
            my $topics_filename = $txt_filename . ".topics";
            $self->block_raw_filename($block_hash, $txt_filename);
            $self->block_raw_filename($block_hash, $topics_filename);
        }
    }
    close($mets_fh);

}
539
540
# Loads the OCR text for one page into section $cursection of $doc_obj.
#
# Parameters:
#   $filename_full_path - full path of the page's text file
#   $file               - the file name as given in the METS file (unused)
#   $doc_obj            - document object receiving the text
#   $cursection         - section of $doc_obj the text belongs to
#
# Text that looks like a complete HTML page contributes only its <body>
# content; anything else has < and > escaped and is wrapped in <pre> tags.
# If a non-empty "<text file>.topics" file exists, its content is split on
# "|" and each non-empty piece is stored as "concept" metadata.
#
# Returns 0 if the text file does not exist, 1 otherwise.
sub process_text {
    my $self = shift (@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "HathiTrustMETSPlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this text file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'} ) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    # HathiTrust often has empty files, so having no text is not reported
    my $text = "";
    if (-s $filename_full_path) {
        &ReadTextFile::read_file($self, $filename_full_path, $encoding, $language, \$text); # already decoded as utf8
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g; # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input
        # no need to escape < and > or put in <pre> tags
        $text = $1;

        # add text to document object
        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        # plain text: make sure angle brackets are shown literally
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    my $topics_filename = $filename_full_path . ".topics";
    if (-s $topics_filename) {

        my $topics_text = "";
        $self->ReadTextFile::read_file($topics_filename, "utf8", undef, \$topics_text);

        # topics are separated by a literal "|"
        # NOTE(review): each topic goes through set_utf8_metadata_element;
        # if that replaces an existing value only the last topic is kept --
        # confirm whether add_utf8_metadata was intended.
        my @topics_array = split(/\|/, $topics_text);
        foreach my $topic (@topics_array) {
            if ($topic ne "") {
                $doc_obj->set_utf8_metadata_element ($cursection, "concept", $topic);
            }
        }
    }

    return 1;
}
609
610
# Runs once a document object has been processed; hands over to
# ImageConverter to remove its temporary files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    return $self->ImageConverter::clean_up_temporary_files();
}
616
617	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: