source: main/trunk/greenstone2/perllib/plugins/LOMPlugin.pm@ 23484

Last change on this file since 23484 was 23212, checked in by kjdon, 14 years ago

metadata_read no longer takes maxdocs args - metadata_read must process all docs, so that whatever few are actually processed by read will get their metadata

  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1###########################################################################
2#
3# LOMPlugin.pm -- plugin for import the collection from LOM
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2005 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27### Note this plugin currently can't download source documents from outside if you are behind a firewall.
28# Unless, you set the http_proxy environment variable to be your proxy server,
29# and set proxy_user and proxy_password in .wgetrc file in home directory.
30# (does that work on windows??)
31
32package LOMPlugin;
33
34use ReadTextFile;
35use MetadataPass;
36use XMLParser;
37use Cwd;
38
39sub BEGIN {
40 @ISA = ('ReadTextFile', 'MetadataPass');
41}
42
43use strict; # every perl program should have this!
44no strict 'refs'; # make an exception so we can use variables as filehandles
45
46
47my $arguments =
48 [ { 'name' => "process_exp",
49 'desc' => "{BasePlugin.process_exp}",
50 'type' => "string",
51 'deft' => &get_default_process_exp(),
52 'reqd' => "no" },
53 { 'name' => "root_tag",
54 'desc' => "{LOMPlugin.root_tag}",
55 'type' => "regexp",
56 'deft' => q/^(?i)lom$/,
57 'reqd' => "no" },
58 { 'name' => "check_timestamp",
59 'desc' => "{LOMPlugin.check_timestamp}",
60 'type' => "flag" },
61 { 'name' => "download_srcdocs",
62 'desc' => "{LOMPlugin.download_srcdocs}",
63 'type' => "regexp",
64 'deft' => "",
65 'reqd' => "no" }];
66
67my $options = { 'name' => "LOMPlugin",
68 'desc' => "{LOMPlugin.desc}",
69 'abstract' => "no",
70 'inherits' => "yes",
71 'args' => $arguments };
72
73
74
75my ($self);
76sub new {
77 my $class = shift (@_);
78 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
79 push(@$pluginlist, $class);
80
81 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
82 push(@{$hashArgOptLists->{"OptList"}},$options);
83
84 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
85
86 if ($self->{'info_only'}) {
87 # don't worry about creating the XML parser as all we want is the
88 # list of plugin options
89 return bless $self, $class;
90 }
91
92 #create XML::Parser object for parsing dublin_core.xml files
93 my $parser = new XML::Parser('Style' => 'Stream',
94 'Handlers' => {'Char' => \&Char,
95 'Doctype' => \&Doctype
96 });
97 $self->{'parser'} = $parser;
98
99 $self->{'extra_blocks'} = {};
100
101 return bless $self, $class;
102}
103
104sub get_default_process_exp {
105 my $self = shift (@_);
106
107 return q^(?i)\.xml$^;
108}
109
110
111sub can_process_this_file {
112 my $self = shift(@_);
113 my ($filename) = @_;
114
115 if ($self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
116 return 1; # its a file for us
117 }
118 return 0;
119}
120
121sub metadata_read {
122 my $self = shift (@_);
123 my ($pluginfo, $base_dir, $file, $block_hash,
124 $extrametakeys, $extrametadata, $extrametafile,
125 $processor, $gli, $aux) = @_;
126
127 my $outhandle = $self->{'outhandle'};
128
129 # can we process this file??
130 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
131 return undef unless $self->can_process_this_file($filename_full_path);
132
133 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
134
135 print $outhandle "LOMPlugin: extracting metadata from $file\n"
136 if $self->{'verbosity'} > 1;
137
138 my ($dir,$tail) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;
139 $self->{'output_dir'} = $dir;
140
141 eval {
142 $self->{'parser'}->parsefile($filename_full_path);
143 };
144
145 if ($@) {
146 print $outhandle "LOMPlugin: skipping $filename_full_path as not conformant to LOM syntax\n" if ($self->{'verbosity'} > 1);
147 print $outhandle "\n Perl Error:\n $@\n" if ($self->{'verbosity'}>2);
148 return 0;
149 }
150
151 $self->{'output_dir'} = undef;
152
153 my $file_re;
154 my $lom_srcdoc = $self->{'lom_srcdoc'};
155
156 if (defined $lom_srcdoc) {
157 my $dirsep = &util::get_re_dirsep();
158 $lom_srcdoc =~ s/^$base_dir($dirsep)//;
159 $self->{'extra_blocks'}->{$file}++;
160 $file_re = $lom_srcdoc;
161 }
162 else {
163 $file_re = $tail;
164 }
165 $file_re = &util::filename_to_regex($file_re);
166 $self->{'lom_srcdoc'} = undef; # reset for next file to be processed
167
168 push(@$extrametakeys,$file_re);
169 $extrametadata->{$file_re} = $self->{'saved_metadata'};
170 if (defined $lom_srcdoc) {
171 # copied from oaiplugin
172 if (!defined $extrametafile->{$file_re}) {
173 $extrametafile->{$file_re} = {};
174 }
175 #maps the file to full path
176 $extrametafile->{$file_re}->{$file} = $filename_full_path;
177 }
178
179 return 1;
180}
181
182sub check_doctype {
183 $self = shift (@_);
184
185 my ($filename) = @_;
186
187 if (open(XMLIN,"<$filename")) {
188 my $doctype = $self->{'root_tag'};
189 ## check whether the doctype has the same name as the root element tag
190 while (defined (my $line = <XMLIN>)) {
191 ## find the root element
192 if ($line =~ /<([\w\d:]+)[\s>]/){
193 my $root = $1;
194 if ($root !~ $doctype){
195 close(XMLIN);
196 return 0;
197 }
198 else {
199 close(XMLIN);
200 return 1;
201 }
202 }
203 }
204 close(XMLIN);
205 }
206
207 return undef; # haven't found a valid line
208
209}
210
211sub read_file {
212 my $self = shift (@_);
213 my ($filename, $encoding, $language, $textref) = @_;
214
215 my $metadata_table = $self->{'metadata_table'};
216
217 my $rawtext = $metadata_table->{'rawtext'};
218
219 delete $metadata_table->{'rawtext'};
220
221 $$textref = $rawtext;
222}
223
224sub read {
225 my $self = shift (@_);
226 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
227
228 my $outhandle = $self->{'outhandle'};
229
230 return 0 if (defined $self->{'extra_blocks'}->{$file});
231
232 # can we process this file??
233 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
234 return undef unless $self->can_process_this_file($filename_full_path);
235
236 $self->{'metadata_table'} = $metadata;
237
238 my $lom_language = $metadata->{'lom_language'};
239
240 my $store_input_encoding;
241 my $store_extract_language;
242 my $store_default_language;
243 my $store_default_encoding;
244
245 if (defined $lom_language) {
246 delete $metadata->{'lom_language'};
247
248 $store_input_encoding = $self->{'input_encoding'};
249 $store_extract_language = $self->{'extract_language'};
250 $store_default_language = $self->{'default_language'};
251 $store_default_encoding = $self->{'default_encoding'};
252
253 $self->{'input_encoding'} = "utf8";
254 $self->{'extract_language'} = 0;
255 $self->{'default_language'} = $lom_language;
256 $self->{'default_encoding'} = "utf8";
257 }
258
259 my $rv = $self->SUPER::read(@_);
260
261 if (defined $lom_language) {
262 $self->{'input_encoding'} = $store_input_encoding;
263 $self->{'extract_language'} = $store_extract_language;
264 $self->{'default_language'} = $store_default_language;
265 $self->{'default_encoding'} = $store_default_encoding;
266 }
267
268 $self->{'metadata_table'} = undef;
269
270 return $rv;
271}
272
273# do plugin specific processing of doc_obj
274sub process {
275 my $self = shift (@_);
276 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
277 my $outhandle = $self->{'outhandle'};
278
279 my $cursection = $doc_obj->get_top_section();
280 $doc_obj->add_utf8_text($cursection, $$textref);
281
282 return 1;
283}
284
285sub Doctype {
286 my ($expat, $name, $sysid, $pubid, $internal) = @_;
287
288 my $root_tag = $self->{'root_tag'};
289
290 if ($name !~ /$root_tag/) {
291 die "Root tag $name does not match regular expression $root_tag";
292 }
293}
294
295sub StartTag {
296 my ($expat, $element) = @_;
297
298 my %attr = %_;
299
300 my $raw_tag = "&lt;$element";
301 map { $raw_tag .= " $_=\"$attr{$_}\""; } keys %attr;
302 $raw_tag .= "&gt;";
303
304 if ($element =~ m/$self->{'root_tag'}/) {
305 $self->{'raw_text'} = $raw_tag;
306
307 $self->{'saved_metadata'} = {};
308 $self->{'metaname_stack'} = [];
309 $self->{'lom_datatype'} = "";
310 $self->{'lom_language'} = undef;
311 $self->{'metadatatext'} = "<table class=\"metadata\" width=\"_pagewidth_\" >\n";
312 }
313 else {
314 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
315 $self->{'raw_text'} .= "\n";
316 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
317 $self->{'raw_text'} .= $raw_tag;
318
319 my $metaname_stack = $self->{'metaname_stack'};
320 push(@$metaname_stack,$element);
321 if (scalar(@$metaname_stack)==1) {
322 # top level LOM category
323 my $style = "class=\"metadata\"";
324 my $open_close
325 = "<a id=\"${element}opencloselink\" href=\"javascript:hideTBodyArea('$element')\">\n";
326 $open_close
327 .= "<img id=\"${element}openclose\" border=\"0\" src=\"_httpopenmdicon_\"></a>\n";
328
329 my $header_line = " <tr $style ><th $style colspan=\"3\">$open_close \u$element</th></tr>\n";
330 my $md_tbody = "<tbody id=\"$element\">\n";
331
332 $self->{'mdheader'} = $header_line;
333 $self->{'mdtbody'} = $md_tbody;
334 $self->{'mdtbody_text'} = "";
335 }
336 }
337}
338
339sub EndTag {
340 my ($expat, $element) = @_;
341
342 my $raw_tag = "&lt;/$element&gt;";
343
344 if ($element =~ m/$self->{'root_tag'}/) {
345 $self->{'raw_text'} .= $raw_tag;
346
347 my $metadatatext = $self->{'metadatatext'};
348 $metadatatext .= "</table>";
349
350 my $raw_text = $self->{'raw_text'};
351
352 $self->{'saved_metadata'}->{'MetadataTable'} = $metadatatext;
353 $self->{'metadatatext'} = "";
354
355 $self->{'saved_metadata'}->{'rawtext'} = $raw_text;
356 $self->{'raw_text'} = "";
357
358 if (defined $self->{'lom_language'}) {
359 $self->{'saved_metadata'}->{'lom_language'} = $self->{'lom_language'};
360 $self->{'lom_language'} = undef;
361 }
362 }
363 else {
364 my $metaname_stack = $self->{'metaname_stack'};
365
366 if (scalar(@$metaname_stack)==1) {
367 my $header_line = $self->{'mdheader'};
368 my $tbody_start = $self->{'mdtbody'};
369 my $tbody_text = $self->{'mdtbody_text'};
370 if ($tbody_text !~ m/^\s*$/s) {
371 my $tbody_end = "</tbody>\n";
372 my $table_chunk
373 = $header_line.$tbody_start.$tbody_text.$tbody_end;
374
375 $self->{'metadatatext'} .= $table_chunk;
376 }
377 $self->{'mdtheader'} = "";
378 $self->{'mdtbody'} = "";
379 $self->{'mdtbody_text'} = "";
380 }
381
382 pop(@$metaname_stack);
383
384 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
385 $self->{'raw_text'} .= "\n";
386 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
387 $self->{'raw_text'} .= $raw_tag;
388 }
389}
390
391sub process_datatype_info
392{
393 my $self = shift(@_);
394 my ($metaname_stack,$md_content) = @_;
395
396 my @without_dt_stack = @$metaname_stack; # without datatype stack
397
398 my $innermost_element = $without_dt_stack[$#without_dt_stack];
399
400 # Loose last item if encoding datatype information
401 if ($innermost_element =~ m/^(lang)?string$/) {
402 $self->{'lom_datatype'} = $innermost_element;
403
404 pop @without_dt_stack;
405 $innermost_element = $without_dt_stack[$#without_dt_stack];
406 }
407 elsif ($innermost_element =~ m/^date(Time)?$/i) {
408 if ($innermost_element =~ m/^date$/i) {
409 $self->{'lom_datatype'} = "dateTime";
410 }
411 else {
412 $self->{'lom_datatype'} = $innermost_element;
413
414 pop @without_dt_stack;
415 $innermost_element = $without_dt_stack[$#without_dt_stack];
416 }
417
418 if ($md_content =~ m/^(\d{1,2})\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s*(\d{4})/i) {
419 my ($day,$mon,$year) = ($1,$2,$3);
420
421 my %month_lookup = ( 'jan' => 1, 'feb' => 2, 'mar' => 3,
422 'apr' => 4, 'may' => 5, 'jun' => 6,
423 'jul' => 7, 'aug' => 8, 'sep' => 9,
424 'oct' => 10, 'nov' => 11, 'dec' => 12 );
425
426 my $mon_num = $month_lookup{lc($mon)};
427
428 $md_content = sprintf("%d%02d%02d",$year,$mon_num,$day);
429 }
430
431 $md_content =~ s/\-//g;
432 }
433
434 if ($innermost_element eq "source") {
435 $self->{'lom_source'} = $md_content;
436 }
437 elsif ($innermost_element eq "value") {
438 $self->{'lom_value'} = $md_content;
439 }
440
441 return (\@without_dt_stack,$innermost_element,$md_content);
442}
443
444sub reset_datatype_info
445{
446 my $self = shift(@_);
447
448 $self->{'lom_datatype'} = "";
449}
450
451
452sub pretty_print_text
453{
454 my $self = shift(@_);
455
456 my ($pretty_print_text) = @_;
457
458## $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
459 $pretty_print_text = &util::hyperlink_text($pretty_print_text);
460
461#### $pretty_print_text =~ s/(BEGIN:vCard.*END:vCard)/<pre>$1<\/pre>/sg;
462
463 if ($self->{'lom_datatype'} eq "dateTime") {
464 if ($pretty_print_text =~ m/^(\d{4})(\d{2})(\d{2})$/) {
465 $pretty_print_text = "$1-$2-$3";
466 }
467 }
468
469 return $pretty_print_text;
470}
471
472sub pretty_print_table_tr
473{
474 my $self = shift (@_);
475 my ($without_dt_stack) = @_;
476
477 my $style = "class=\"metadata\"";
478
479 my $innermost_element = $without_dt_stack->[scalar(@$without_dt_stack)-1];
480 my $outermost_element = $without_dt_stack->[0];
481
482 # Loose top level stack item (already named in pretty print table)
483 my @pretty_print_stack = @$without_dt_stack;
484 shift @pretty_print_stack;
485
486 if ($innermost_element eq "source") {
487 return if (!defined $self->{'lom_value'});
488 }
489
490 if ($innermost_element eq "value") {
491 return if (!defined $self->{'lom_source'});
492 }
493
494 my $pretty_print_text = "";
495
496 if (($innermost_element eq "value") || ($innermost_element eq "source")) {
497 my $source = $self->{'lom_source'};
498 my $value = $self->pretty_print_text($self->{'lom_value'});
499
500 $self->{'lom_source'} = undef;
501 $self->{'lom_value'} = undef;
502
503 pop @pretty_print_stack;
504
505 $pretty_print_text = "<td $style>$source</td><td $style>$value</td>";
506 }
507 else {
508 $pretty_print_text = $self->pretty_print_text($_);
509 $pretty_print_text = "<td $style colspan=2>$pretty_print_text</td>";
510 }
511 my $pretty_print_fmn = join(' : ',map { "\u$_"; } @pretty_print_stack);
512
513
514 # my $tr_attr = "id=\"$outermost_element\" style=\"display:block;\"";
515 my $tr_attr = "$style id=\"$outermost_element\"";
516
517 my $mdtext_line = " <tr $tr_attr><td $style><nobr>$pretty_print_fmn</nobr></td>$pretty_print_text</tr>\n";
518 $self->{'mdtbody_text'} .= $mdtext_line;
519}
520
521
522sub check_for_language
523{
524 my $self = shift(@_);
525 my ($innermost_element,$md_content) = @_;
526
527 # Look for 'language' tag
528 if ($innermost_element eq "language") {
529 my $lom_lang = $self->{'lom_language'};
530
531 if (defined $lom_lang) {
532 my $new_lom_lang = $md_content;
533 $new_lom_lang =~ s/-.*//; # remove endings like -US or -GB
534
535 if ($lom_lang ne $new_lom_lang) {
536 my $outhandle = $self->{'outhandle'};
537
538 print $outhandle "Warning: Conflicting general language in record\n";
539 print $outhandle " $new_lom_lang (previous value for language = $lom_lang)\n";
540 }
541 # otherwise, existing value OK => do nothing
542 }
543 else {
544 $lom_lang = $md_content;
545 $lom_lang =~ s/-.*//; # remove endings like -US or -GB
546
547 $self->{'lom_language'} = $lom_lang;
548 }
549 }
550}
551
552sub found_specific_identifier
553{
554 my $self = shift(@_);
555 my ($specific_id,$full_mname,$md_content) = @_;
556
557 my $found_id = 0;
558 if ($full_mname eq $specific_id) {
559 if ($md_content =~ m/^(http|ftp):/) {
560 $found_id = 1;
561 }
562 }
563
564 return $found_id;
565}
566
567sub download_srcdoc
568{
569 my $self = shift(@_);
570 my ($doc_url) = @_;
571
572 my $outhandle = $self->{'outhandle'};
573 my $output_dir = $self->{'output_dir'};
574
575 $output_dir = &util::filename_cat($output_dir,"_gsdldown.all");
576
577 if (! -d $output_dir) {
578 mkdir $output_dir;
579 }
580
581 my $re_dirsep = &util::get_re_dirsep();
582 my $os_dirsep = &util::get_dirsep();
583
584 my $file_url = $doc_url;
585 $file_url =~ s/$re_dirsep/$os_dirsep/g;
586 $file_url =~ s/^(http|ftp):\/\///;
587 $file_url .= "index.html" if ($file_url =~ m/\/$/);
588
589 my $full_file_url = &util::filename_cat($output_dir,$file_url);
590 # the path to srcdoc will be used later in extrametadata to associate
591 # the lom metadata with the document. Needs to be relative to current
592 # directory.
593 my $srcdoc_path = &util::filename_cat("_gsdldown.all", $file_url);
594 my $check_timestamp = $self->{'check_timestamp'};
595 my $status;
596
597 if (($check_timestamp) || (!$check_timestamp && !-e $full_file_url)) {
598 if (!-e $full_file_url) {
599 print $outhandle "Mirroring $doc_url\n";
600 }
601 else {
602 print $outhandle "Checking to see if update needed for $doc_url\n";
603 }
604
605 # on linux, if we pass an absolute path as -P arg to wget, then it
606 # stuffs up the
607 # URL rewriting in the file. Need a relative path or none, so now
608 # we change working directory first.
609 my $changed_dir = 0;
610 my $current_dir = cwd();
611 my $wget_cmd = "";
612 if ($ENV{'GSDLOS'} ne "windows") {
613 $changed_dir = 1;
614
615 chdir "$output_dir";
616 $wget_cmd = "wget -nv --timestamping -k -p \"$doc_url\"";
617 } else {
618 $wget_cmd = "wget -nv -P \"$output_dir\" --timestamping -k -p \"$doc_url\"";
619 }
620 ##print STDERR "**** wget = $wget_cmd\n";
621
622
623 $status = system($wget_cmd);
624 if ($changed_dir) {
625 chdir $current_dir;
626 }
627 if ($status==0) {
628 $self->{'lom_srcdoc'} = $srcdoc_path;
629 }
630 else {
631 $self->{'lom_srcdoc'} = undef;
632 print $outhandle "Error: failed to execute $wget_cmd\n";
633 }
634 }
635 else {
636 # not time-stamping and file already exists
637 $status=0;
638 $self->{'lom_srcdoc'} = $srcdoc_path;
639 }
640
641 return $status==0;
642
643}
644
645
646sub check_for_identifier
647{
648 my $self = shift(@_);
649 my ($full_mname,$md_content) = @_;
650
651 my $success = 0;
652
653 my $download_re = $self->{'download_srcdocs'};
654 if (($download_re ne "") && $md_content =~ m/$download_re/) {
655
656 if ($self->found_specific_identifier("general^identifier^entry",$full_mname,$md_content)) {
657 $success = $self->download_srcdoc($md_content);
658 }
659
660 if (!$success) {
661 if ($self->found_specific_identifier("technical^location",$full_mname,$md_content)) {
662 $success = $self->download_srcdoc($md_content);
663 }
664 }
665 }
666
667 return $success;
668}
669
670
671sub Text {
672 if ($_ !~ m/^\s*$/) {
673 #
674 # Work out indentations and line wraps for raw XML
675 #
676 my $xml_depth = scalar(@{$self->{'metaname_stack'}})+1;
677 my $indent = "&nbsp;&nbsp;" x $xml_depth;
678
679 my $formatted_text = "\n".$_;
680
681 # break into lines < 80 chars on space
682 $formatted_text =~ s/(.{50,80})\s+/$1\n/mg;
683 $formatted_text =~ s/^/$indent/mg;
684 ## $formatted_text =~ s/\s+$//s;
685
686 $self->{'raw_text'} .= $formatted_text;
687 }
688
689 my $metaname_stack = $self->{'metaname_stack'};
690 if (($_ !~ /^\s*$/) && (scalar(@$metaname_stack)>0)) {
691
692 my ($without_dt_stack,$innermost_element,$md_content)
693 = $self->process_datatype_info($metaname_stack,$_);
694
695 $self->pretty_print_table_tr($without_dt_stack);
696
697 my $full_mname = join('^',@{$without_dt_stack});
698 $self->set_filere_metadata(lc($full_mname),$md_content);
699
700 $self->check_for_language($innermost_element,$md_content);
701 $self->check_for_identifier($full_mname,$md_content); # source doc
702
703 $self->reset_datatype_info();
704 }
705}
706
707# This Char function overrides the one in XML::Parser::Stream to overcome a
708# problem where $expat->{Text} is treated as the return value, slowing
709# things down significantly in some cases.
710sub Char {
711 $_[0]->{'Text'} .= $_[1];
712 return undef;
713}
714
7151;
Note: See TracBrowser for help on using the repository browser.