source: main/trunk/greenstone2/perllib/plugins/HathiTrustMETSPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 4 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:executable set to *
File size: 19.8 KB
Line 
1###########################################################################
2#
3# HathiTrustMETSPlugin.pm -- plugin for sets of HathiTrust METS OCR'd
4# text that make up a document
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# HathiTrustMETSPlugin
29# processes HathiTrust METS files that are accompanied with page-by-page
30# OCR'd txt files
31#
32# All the supplementary text files should be in a subfolder of the same
33# name as the METS file
34#
35# As usual, running
36# 'perl -S pluginfo.pl HathiTrustMETSPlugin' will list all the options.
37
38
39package HathiTrustMETSPlugin;
40
41use Encode;
42use ReadXMLFile;
43use ReadTextFile;
44# We don't currently work with the scanned image from HathiTrust METS
45# but leave it in for future proofing
46use ImageConverter;
47use MetadataRead;
48
49use JSON;
50
51use strict;
52no strict 'refs'; # allow filehandles to be variables and viceversa
53
# Establish the inheritance chain at compile time.
BEGIN {
    @HathiTrustMETSPlugin::ISA = qw(MetadataRead ReadXMLFile ReadTextFile ImageConverter);
}
58
59# One day HathiTrust might give more than page structure
# Document types offered for Greenstone 2. Only "hierarchy" is enabled;
# the "auto" and "paged" entries (PagedImagePlugin.documenttype.auto2 /
# .paged2) are deliberately left out for now.
my $gs2_type_list = [
    {
        'name' => 'hierarchy',
        'desc' => '{PagedImagePlugin.documenttype.hierarchy}',
    },
];
69
# Document types offered for Greenstone 3. Only "hierarchy" is enabled;
# the "auto", "paged" and "pagedhierarchy" entries
# (PagedImagePlugin.documenttype.auto3 / .paged3 / .pagedhierarchy) are
# deliberately left out for now.
my $gs3_type_list = [
    {
        'name' => 'hierarchy',
        'desc' => '{PagedImagePlugin.documenttype.hierarchy}',
    },
];
81
# Options specific to this plugin. The "documenttype" enum is not listed
# here: new() appends it once the Greenstone version (and therefore the
# list of allowed values) is known.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'string',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'title_sub',
        'desc' => '{HTMLPlugin.title_sub}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'headerpage',
        'desc' => '{HathiTrustMETSPlugin.headerpage}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'      => 'processing_tmp_files',
        'desc'      => '{BaseImporter.processing_tmp_files}',
        'type'      => 'flag',
        'hiddengli' => 'yes',
    },
];
107
# The "documenttype" option. Its 'list' of allowed values is filled in by
# new(), which picks the Greenstone 2 or Greenstone 3 type list.
#
# The default must be one of the values those lists actually offer; both
# currently contain only "hierarchy" (the "auto" entry is commented out),
# so "hierarchy" is used as the default rather than "auto".
my $doc_type_opt = {
    'name' => "documenttype",
    'desc' => "{HathiTrustMETSPlugin.documenttype}",
    'type' => "enum",
    'deft' => "hierarchy",
    'reqd' => "no",
};
113
# Top-level description of this plugin, registered on the "OptList" in new().
my $options = {
    'name'     => 'HathiTrustMETSPlugin',
    'desc'     => '{HathiTrustMETSPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
119
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "OptList" and "ArgList" arrays collect
#                      the option descriptions of this plugin and its parents
#
# Returns the merged ImageConverter/ReadTextFile/ReadXMLFile object blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $imc_self = ImageConverter->new($pluginlist, $inputargs, $hashArgOptLists);

    # the ImageConverter object tells us which Greenstone version we run under
    if ($imc_self->{'gs_version'} eq "3") {
        $doc_type_opt->{'list'} = $gs3_type_list;
    }
    else {
        $doc_type_opt->{'list'} = $gs2_type_list;
    }

    # $arguments is a file-scoped list shared by every instance, so only
    # append the documenttype option the first time round; otherwise each
    # additional construction would add another duplicate entry.
    if (!grep { $_ == $doc_type_opt } @$arguments) {
        push(@$arguments, $doc_type_opt);
    }

    # now we add the args to the list for parsing
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});

    my $rtf_self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $rxf_self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    my $self = BaseImporter::merge_inheritance($imc_self, $rtf_self, $rxf_self);

    # Update the object used by XML::Parser so it finds callback functions
    # such as start_document here and not in ReadXMLFile (which is what
    # the object was when the XML::Parser was created).
    #
    # If the $self returned by this constructor is the same as the one
    # used in ReadXMLFile (e.g. in the GreenstoneXMLPlugin) then this step
    # isn't necessary.
    #
    # Consider embedding this type of assignment into merge_inheritance
    # to help catch all cases?
    $rxf_self->{'parser'}->{'PluginObj'} = $self;

    return bless $self, $class;
}
159
160
# Initialise the plugin: run the inherited init with the caller's
# arguments (verbosity, outhandle, failhandle), then ImageConverter's
# own init without arguments.
sub init {
    my ($self, @init_args) = @_;

    $self->SUPER::init(@init_args);
    return $self->ImageConverter::init();
}
168
# Start-of-import hook: forward the arguments (pluginfo, base_dir,
# processor, maxdocs) to the inherited begin and then to
# ImageConverter's begin.
sub begin {
    my ($self, @begin_args) = @_;

    $self->SUPER::begin(@begin_args);
    return $self->ImageConverter::begin(@begin_args);
}
176
# Returns the default regular expression (as a string) selecting the files
# this plugin processes: names ending in the literal suffix ".mets.xml".
#
# Both dots are escaped so that only a real ".mets.xml" suffix matches;
# this agrees with the \.mets\.xml patterns used elsewhere in this plugin.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^\.mets\.xml$^;
}
182
# Name of the root element of the XML documents this plugin handles.
sub get_doctype {
    my ($self) = @_;

    return 'METS:mets';
}
188
189
190# want to use BaseImporter's version of this, not ReadXMLFile's
# Delegate explicitly to BaseImporter's implementation, bypassing the one
# that would otherwise be found through ReadXMLFile.
sub can_process_this_file {
    my ($self, @file_args) = @_;

    return $self->BaseImporter::can_process_this_file(@file_args);
}
195
196# instead of a block exp, now we scan the file and record all text and img files mentioned there for blocking.
# Record the files that belong to this METS document so that they are
# blocked, by scanning the METS file itself instead of using a block
# expression.
#
# Parameters:
#   $filename_full_path - full path of the METS file
#   $block_hash         - hash ref handed on to scan_xml_for_files_to_block
#
# tidy_item_file() is deliberately not called here: modifying the file
# before it has been decided whether it is new would cause it to always
# be reimported.
sub store_block_files
{
    my ($self, $filename_full_path, $block_hash) = @_;

    # everything up to and including the last path separator
    my ($containing_dir) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;

    $self->scan_xml_for_files_to_block($filename_full_path, $containing_dir, $block_hash);
}
214
215# we want to use BaseImporter's read, not ReadXMLFile's
# Use BaseImporter's read rather than the one inherited from ReadXMLFile.
sub read
{
    my ($self, @read_args) = @_;

    return $self->BaseImporter::read(@read_args);
}
222
223
224
# Parse one METS file into a document object and decorate it with the
# standard and plugin-specific metadata.
#
# Returns the list (1, $doc_obj).
sub read_into_doc_obj {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my ($filename_full_path, $filename_no_path) = util::get_full_filenames($base_dir, $file);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print {$outhandle} "HathiTrustMETSPlugin processing \"$filename_full_path\"\n";
    }
    if ($gli) {
        print STDERR "<Processing n='$file' p='HathiTrustMETSPlugin'>\n";
    }

    # careful checking needed here!! are we using local xml handlers or super ones
    $self->ReadXMLFile::read($pluginfo, $base_dir, $file, $block_hash, $metadata,
                             $processor, $maxdocs, $total_count, $gli);

    # the XML callbacks leave the document they built in $self->{'doc_obj'}
    my $doc_obj = $self->{'doc_obj'};
    my $top     = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($top, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($top, "FileFormat", "HathiTrustMETS");

    # metadata handed down from previous plugins is attached to the
    # top-level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $top, $metadata);
    $self->auto_extract_metadata($doc_obj);
    $self->plugin_specific_process($base_dir, $file, $doc_obj, $gli);

    # if no Title has been found so far, assign one
    $self->title_fallback($doc_obj, $top, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
265
266
# Read the JSON file that accompanies the METS file (same name with
# ".mets.xml" replaced by ".json") and copy selected bibliographic fields
# onto the top section of the document.
#
# Parameters:
#   $base_dir, $file - location of the METS file being processed
#   $doc_obj         - document object that receives the metadata
#   $gli             - unused here
#
# Fields copied from the (single) entry under "records": recordURL, titles,
# isbns, issns, oclcs, lccns and publishDates, stored under the name with
# a trailing "s" removed ("titles" is stored as both Title and dc.Title).
# The "htid" of the first entry under "items" is stored as docName, and as
# docNameIE with everything up to the first "." removed.
#
# If the JSON file is missing or cannot be parsed, a warning is printed and
# the document is left without this metadata instead of aborting the import.
sub parse_aux_json_metadata {
    my $self = shift(@_);
    my ($base_dir, $file, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'} || \*STDERR;

    my ($filename_full_path, $filename_no_path) = util::get_full_filenames($base_dir, $file);

    my $topsection = $doc_obj->get_top_section();

    my $json_metadata_filename = $filename_full_path;
    $json_metadata_filename =~ s/\.mets\.xml$/.json/;

    if (!-f $json_metadata_filename) {
        print {$outhandle} "HathiTrustMETSPlugin: WARNING: JSON metadata file $json_metadata_filename does not exist, skipping\n";
        return;
    }

    my $json_text = "";
    $self->ReadTextFile::read_file($json_metadata_filename, "utf8", undef, \$json_text);

    # NOTE(review): decode_json expects UTF-8 encoded bytes; confirm what
    # ReadTextFile::read_file hands back for non-ASCII content.
    my $json_rec = eval { decode_json($json_text) };
    if (ref($json_rec) ne 'HASH') {
        my $reason = $@ || "top-level value is not an object\n";
        print {$outhandle} "HathiTrustMETSPlugin: WARNING: could not parse $json_metadata_filename: $reason";
        return;
    }

    my $records = $json_rec->{'records'};
    $records = {} unless ref($records) eq 'HASH';

    my ($key) = keys %{$records}; # there should only be one
    my $record = defined $key ? $records->{$key} : undef;
    $record = {} unless ref($record) eq 'HASH';

    my @md_fields = ( "recordURL", "titles", "isbns", "issns", "oclcs", "lccns", "publishDates" );

    foreach my $md_field (@md_fields) {
        my $raw_value = $record->{$md_field};
        next unless defined $raw_value;

        # Most fields are arrays, but a field may also be a plain string
        # (recordURL). Dereferencing a string as an array would silently
        # yield nothing because this file runs with "no strict 'refs'".
        my @md_values = (ref($raw_value) eq 'ARRAY') ? @$raw_value : ($raw_value);

        my $md_name = $md_field;
        $md_name =~ s/s$//;

        # NOTE(review): each value is stored with set_utf8_metadata_element,
        # as before; if that call replaces an existing value, only the last
        # value of a multi-valued field is kept - confirm this is intended.
        foreach my $md_value (@md_values) {
            if ($md_name eq "title") {
                $doc_obj->set_utf8_metadata_element ($topsection, "Title", $md_value);
                $doc_obj->set_utf8_metadata_element ($topsection, "dc.Title", $md_value);
            }
            else {
                $doc_obj->set_utf8_metadata_element ($topsection, $md_name, $md_value);
            }
        }
    }

    my $items = $json_rec->{'items'};
    my $htid;
    if (ref($items) eq 'ARRAY' && ref($items->[0]) eq 'HASH') {
        $htid = $items->[0]->{'htid'};
    }

    if (defined $htid) {
        my $docName = $htid;
        my $docNameIE = $htid;
        $docNameIE =~ s/^.*?\.//;

        $doc_obj->set_utf8_metadata_element ($topsection, "docName", $docName);
        $doc_obj->set_utf8_metadata_element ($topsection, "docNameIE", $docNameIE);
    }

    return;
}
317
318
319# override this for an inheriting plugin to add extra metadata etc
# Hook for plugin-specific work on a freshly read document; an inheriting
# plugin can override it to add extra metadata etc. Here it loads the
# metadata from the auxiliary JSON file.
sub plugin_specific_process {
    my ($self, $base_dir, $file, $doc_obj, $gli) = @_;

    return $self->parse_aux_json_metadata($base_dir, $file, $doc_obj, $gli);
}
326
327# sub tidy_item_file {
328# ... see PagedImagePlugin
329# }
330
331# sub rotate_image {
332# ... see PagedImagePlugin
333# }
334
335# sub process_image {
336# ... see PagedImagePlugin
337# }
338
339
340
# XML start-tag callback. The element's attributes are read from the
# global %_.
#
#   METS:mets   - the top section of the document becomes the current section
#   METS:FLocat - when its xlink:href ends in ".txt", e.g.
#                 <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>
#                 a new child section is created for that page and the
#                 text file is loaded into it
#
# Image handling (imgfile / process_image) and <Metadata name="..."> start
# tags are not handled here.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;
    $self->{'element'} = $element;

    my $doc_obj = $self->{'doc_obj'};

    if ($element eq "METS:mets") {
        $self->{'current_section'} = $doc_obj->get_top_section();
    }
    elsif (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
        my $txtfile = $_{'xlink:href'};

        # create a new section as a child of the current one
        $self->{'current_section'} = $doc_obj->insert_section($doc_obj->get_end_child($self->{'current_section'}));
        $self->{'num_pages'}++;

        my $page_section = $self->{'current_section'};

        # the leading digits of the file name are the page sequence number
        if ($txtfile =~ m/^(\d+)/) {
            my $page_number = int($1);
            $doc_obj->set_utf8_metadata_element($page_section, "Title", "Page $page_number");
        }

        if (!defined($txtfile) || $txtfile eq "") {
            $self->add_dummy_text($doc_obj, $page_section);
        }
        else {
            my $txt_path = FileUtils::filenameConcatenate($self->{'xml_file_dir'}, $txtfile);
            $self->process_text($txt_path, $txtfile, $doc_obj, $page_section);
        }
    }
}
386
# XML end-tag callback.
#
#   METS:FLocat (xlink:href in %_ ending in ".txt") - close the page
#       section: fall back to PageNum as Title when no Title was set,
#       then make the parent section current again
#   Metadata - store the accumulated name/value pair on the current
#       section and reset the accumulators
#
# Every other end tag is ignored.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $doc_obj = $self->{'doc_obj'};

    if (($element eq "METS:FLocat") && ($_{'xlink:href'} =~ m/\.txt$/)) {
        my $page_section = $self->{'current_section'};

        my $lacks_title = !defined $doc_obj->get_metadata_element($page_section, "Title");
        if ($lacks_title && defined $doc_obj->get_metadata_element($page_section, "PageNum")) {
            $doc_obj->add_utf8_metadata($page_section, "Title", $doc_obj->get_metadata_element($page_section, "PageNum"));
        }

        # step back up to the parent section
        $self->{'current_section'} = $doc_obj->get_parent_section($self->{'current_section'});
    }
    elsif ($element eq "Metadata") {
        # text read in by XML::Parser is in Perl's binary byte value
        # form ... need to explicitly make it UTF-8
        my $name  = decode("utf-8", $self->{'metadata_name'});
        my $value = decode("utf-8", $self->{'metadata_value'});

        # names that contain a dot get an "ex." prefix
        $name = "ex.$name" if $name =~ /\./;

        $doc_obj->add_utf8_metadata($self->{'current_section'}, $name, $value);

        $self->{'metadata_name'}  = "";
        $self->{'metadata_value'} = "";
    }
}
418
419
# XML character-data callback: the text arrives in $_. It is appended to
# the pending metadata value, but only while inside a Metadata element
# for which a metadata name has been recorded.
sub xml_text {
    my ($self, $expat) = @_;

    my $collecting = ($self->{'element'} eq "Metadata") && $self->{'metadata_name'};
    $self->{'metadata_value'} .= $_ if $collecting;
}
428
# DOCTYPE callback: intentionally does nothing.
sub xml_doctype {
    return;
}
431
# Called when parsing of a METS file starts: create the document object
# and reset the per-document state.
sub open_document {
    my ($self) = @_;

    my $mets_filename = $self->{'filename'};

    # create a new document
    # TODO is file filename_no_path??
    $self->{'doc_obj'} = 'doc'->new($mets_filename, "indexed_doc", $self->{'file_rename_method'});
    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});

    # the page files are looked up relative to the METS file name with its
    # ".mets.xml" suffix removed
    my ($page_dir, $suffix) = $self->{'filename'} =~ /^(.*?)(\.mets\.xml)$/;

    $self->{'xml_file_dir'} = $page_dir;
    $self->{'num_pages'} = 0;
}
448
# Called when parsing of a METS file has finished: record the number of
# pages on the top section of the document.
#
# Choosing a document type (gsdlthistype "Paged" / "Hierarchy", driven by
# the documenttype option) and recording MaxImageWidth / MaxImageHeight
# are currently disabled in this plugin.
sub close_document {
    my ($self) = @_;

    my $document     = $self->{'doc_obj'};
    my $root_section = $document->get_top_section();

    # add numpages metadata
    $document->set_utf8_metadata_element($root_section, 'NumPages', $self->{'num_pages'});

    # placeholder for the (disabled) document type selection
    my $final_doc_type = "";
}
491
492
# Give a freshly created document its Source metadata and, when the
# headerpage option is set, some dummy text in the top section.
#
# Parameters:
#   $doc_obj            - the new document object
#   $filename_full_path - full path of the METS file
#   $processor          - unused here
#   $metadata           - passed to deduce_filename_encoding
sub set_initial_doc_fields {
    my ($self, $doc_obj, $filename_full_path, $processor, $metadata) = @_;

    my $root_section = $doc_obj->get_top_section();

    my $filename_encoding = $self->deduce_filename_encoding(
        $filename_full_path, $metadata, $self->{'filename_encoding'});
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    # if we want a header page, the top section needs some text,
    # otherwise this section will become invisible
    if ($self->{'headerpage'}) {
        $self->add_dummy_text($doc_obj, $root_section);
    }
}
508
# Scan a METS file and block every file that belongs to it, so that those
# files are not picked up separately.
#
# Parameters:
#   $filename_full_path - full path of the METS file (expected to end in
#                         ".mets.xml")
#   $dir                - unused; kept for interface compatibility
#   $block_hash         - hash ref passed on to block_raw_filename
#
# Blocked files, where <root> is the METS path without ".mets.xml":
#   <root>.zip and <root>.json
#   <root>/<href> and <root>/<href>.topics for every xlink:href="..."
#   attribute found in the file
#
# Nothing is blocked when the file name does not end in ".mets.xml", since
# the names of the companion files cannot be derived then.
# Dies if the METS file cannot be opened.
sub scan_xml_for_files_to_block
{
    my $self = shift (@_);
    my ($filename_full_path, $dir, $block_hash) = @_;

    my ($file_root) = ($filename_full_path =~ m/^(.*)\.mets\.xml$/);
    return unless defined $file_root;

    $self->block_raw_filename($block_hash, "$file_root.zip");
    $self->block_raw_filename($block_hash, "$file_root.json");

    # the page files live in a folder named after the METS file
    my $page_dir = $file_root;

    # three-argument open with a lexical handle, so that unusual characters
    # in the file name cannot be mistaken for an open mode
    open(my $mets_fh, '<', $filename_full_path)
        or die "couldn't open $filename_full_path to work out which files to block\n";

    while (defined(my $line = <$mets_fh>)) {
        next unless $line =~ /\w/;

        # Example of what we are looking for
        # <METS:FLocat LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM" xlink:href="00000000.txt"/>
        # A line may hold more than one such attribute, hence the /g loop.
        while ($line =~ /xlink:href=\"([^\"]+)\"/g) {
            my $txt_filename = FileUtils::filenameConcatenate($page_dir, $1);
            my $topics_filename = $txt_filename . ".topics";
            $self->block_raw_filename($block_hash, $txt_filename);
            $self->block_raw_filename($block_hash, $topics_filename);
        }
    }

    return close($mets_fh);
}
539
540
# Load one page's OCR text file into a section of the document.
#
# Parameters:
#   $filename_full_path - full path of the page's text file
#   $file               - the file name as given in the METS file (unused here)
#   $doc_obj            - document object being built
#   $cursection         - section that receives the text
#
# Returns 0 (after printing an error) when the text file does not exist,
# 1 otherwise.
#
# If a non-empty "<text file>.topics" file exists, its "|"-separated
# entries are added to the section as "concept" metadata, one value per
# entry.
sub process_text {
    my $self = shift (@_);
    my ($filename_full_path, $file, $doc_obj, $cursection) = @_;

    # check that the text file exists!!
    if (!-f $filename_full_path) {
        print "HathiTrustMETSPlugin: ERROR: File $filename_full_path does not exist, skipping\n";
        return 0;
    }

    # remember that this text file was one of our source files, but only
    # if we are not processing a tmp file
    if (!$self->{'processing_tmp_files'} ) {
        $doc_obj->associate_source_file($filename_full_path);
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);

    # HathiTrust often has empty files, so an empty page is not reported
    my $text = "";
    if ( -s $filename_full_path > 0 ) {
        $self->ReadTextFile::read_file($filename_full_path, $encoding, $language, \$text); # already decoded as utf8
    }

    # we need to escape the escape character, or else mg will convert into
    # eg literal newlines, instead of leaving the text as '\n'
    $text =~ s/\\/\\\\/g; # macro language
    $text =~ s/_/\\_/g; # macro language

    if ($text =~ m/<html.*?>\s*<head.*?>.*<\/head>\s*<body.*?>(.*)<\/body>\s*<\/html>\s*$/is) {
        # looks like HTML input: keep just the body, with no need to
        # escape < and > or to put in <pre> tags
        $text = $1;

        $doc_obj->add_utf8_text($cursection, "$text");
    }
    else {
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;

        # insert preformat tags and add text to document object
        $doc_obj->add_utf8_text($cursection, "<pre>\n$text\n</pre>");
    }

    my $topics_filename = $filename_full_path . ".topics";
    if ( -s $topics_filename > 0 ) {

        my $topics_text = "";
        $self->ReadTextFile::read_file($topics_filename, "utf8", undef, \$topics_text);

        foreach my $raw_topic (split(/\|/, $topics_text)) {
            # drop surrounding whitespace, e.g. the newline that ends the file
            my $topic = $raw_topic;
            $topic =~ s/^\s+//;
            $topic =~ s/\s+$//;
            next if $topic eq "";

            # add (rather than set) so that every topic is kept; setting
            # the element on each pass would leave only the last topic
            $doc_obj->add_utf8_metadata($cursection, "concept", $topic);
        }
    }

    return 1;
}
609
610
# Post-processing hook: let ImageConverter remove its temporary files.
sub clean_up_after_doc_obj_processing {
    my ($self) = @_;

    return $self->ImageConverter::clean_up_temporary_files();
}
616
6171;
Note: See TracBrowser for help on using the repository browser.