source: gsdl/trunk/perllib/plugins/LOMPlugin.pm@ 19493

Last change on this file since 19493 was 19493, checked in by davidb, 15 years ago

Introduction of new extrametafile to track which metadata.xml file a piece of metadata came from

  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
Line 
1###########################################################################
2#
3# LOMPlugin.pm -- plugin for import the collection from LOM
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2005 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27### Note this plugin currently can't download source documents from outside if you are behind a firewall.
28
29package LOMPlugin;
30
31use ReadTextFile;
32use MetadataPass;
33use XMLParser;
34use Cwd;
35
36sub BEGIN {
37 @ISA = ('ReadTextFile', 'MetadataPass');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{BasePlugin.process_exp}",
47 'type' => "string",
48 'deft' => &get_default_process_exp(),
49 'reqd' => "no" },
50 { 'name' => "root_tag",
51 'desc' => "{LOMPlugin.root_tag}",
52 'type' => "regexp",
53 'deft' => q/^(?i)lom$/,
54 'reqd' => "no" },
55 { 'name' => "check_timestamp",
56 'desc' => "{LOMPlugin.check_timestamp}",
57 'type' => "flag" },
58 { 'name' => "download_srcdocs",
59 'desc' => "{LOMPlugin.download_srcdocs}",
60 'type' => "regexp",
61 'deft' => "",
62 'reqd' => "no" }];
63
64my $options = { 'name' => "LOMPlugin",
65 'desc' => "{LOMPlugin.desc}",
66 'abstract' => "no",
67 'inherits' => "yes",
68 'args' => $arguments };
69
70
71
72my ($self);
73sub new {
74 my $class = shift (@_);
75 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
76 push(@$pluginlist, $class);
77
78 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
79 push(@{$hashArgOptLists->{"OptList"}},$options);
80
81 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
82
83 if ($self->{'info_only'}) {
84 # don't worry about creating the XML parser as all we want is the
85 # list of plugin options
86 return bless $self, $class;
87 }
88
89 #create XML::Parser object for parsing dublin_core.xml files
90 my $parser = new XML::Parser('Style' => 'Stream',
91 'Handlers' => {'Char' => \&Char,
92 'Doctype' => \&Doctype
93 });
94 $self->{'parser'} = $parser;
95
96 $self->{'extra_blocks'} = {};
97
98 return bless $self, $class;
99}
100
101sub get_default_process_exp {
102 my $self = shift (@_);
103
104 return q^(?i)\.xml$^;
105}
106
107
108sub can_process_this_file {
109 my $self = shift(@_);
110 my ($filename) = @_;
111
112 if ($self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
113 return 1; # its a file for us
114 }
115 return 0;
116}
117
118sub metadata_read {
119 my $self = shift (@_);
120 my ($pluginfo, $base_dir, $file, $block_hash,
121 $extrametakeys, $extrametadata, $extrametafile,
122 $processor, $maxdocs, $gli) = @_;
123
124 my $outhandle = $self->{'outhandle'};
125
126 # can we process this file??
127 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
128 return undef unless $self->can_process_this_file($filename_full_path);
129
130 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
131
132 print $outhandle "LOMPlugin: extracting metadata from $file\n"
133 if $self->{'verbosity'} > 1;
134
135 my ($dir,$tail) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;
136 $self->{'output_dir'} = $dir;
137
138 eval {
139 $self->{'parser'}->parsefile($filename_full_path);
140 };
141
142 if ($@) {
143 print $outhandle "LOMPlugin: skipping $filename_full_path as not conformant to LOM syntax\n" if ($self->{'verbosity'} > 1);
144 print $outhandle "\n Perl Error:\n $@\n" if ($self->{'verbosity'}>2);
145 return 0;
146 }
147
148 $self->{'output_dir'} = undef;
149
150 my $file_re;
151 my $lom_srcdoc = $self->{'lom_srcdoc'};
152
153 if (defined $lom_srcdoc) {
154 my $dirsep = &util::get_re_dirsep();
155 $lom_srcdoc =~ s/^$base_dir($dirsep)//;
156 $self->{'extra_blocks'}->{$file}++;
157
158 $file_re = $lom_srcdoc;
159 }
160 else {
161 $file_re = $tail;
162 }
163 $file_re =~ s/\./\\\./g;
164
165 $self->{'lom_srcdoc'} = undef; # reset for next file to be processed
166
167 push(@$extrametakeys,$file_re);
168 $extrametadata->{$file_re} = $self->{'saved_metadata'};
169
170 return 1;
171}
172
173sub check_doctype {
174 $self = shift (@_);
175
176 my ($filename) = @_;
177
178 if (open(XMLIN,"<$filename")) {
179 my $doctype = $self->{'root_tag'};
180 ## check whether the doctype has the same name as the root element tag
181 while (defined (my $line = <XMLIN>)) {
182 ## find the root element
183 if ($line =~ /<([\w\d:]+)[\s>]/){
184 my $root = $1;
185 if ($root !~ $doctype){
186 close(XMLIN);
187 return 0;
188 }
189 else {
190 close(XMLIN);
191 return 1;
192 }
193 }
194 }
195 close(XMLIN);
196 }
197
198 return undef; # haven't found a valid line
199
200}
201
202sub read_file {
203 my $self = shift (@_);
204 my ($filename, $encoding, $language, $textref) = @_;
205
206 my $metadata_table = $self->{'metadata_table'};
207
208 my $rawtext = $metadata_table->{'rawtext'};
209
210 delete $metadata_table->{'rawtext'};
211
212 $$textref = $rawtext;
213}
214
215sub read {
216 my $self = shift (@_);
217 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
218
219 my $outhandle = $self->{'outhandle'};
220
221 return 0 if (defined $self->{'extra_blocks'}->{$file});
222
223 # can we process this file??
224 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
225 return undef unless $self->can_process_this_file($filename_full_path);
226
227 $self->{'metadata_table'} = $metadata;
228
229 my $lom_language = $metadata->{'lom_language'};
230
231 my $store_input_encoding;
232 my $store_extract_language;
233 my $store_default_language;
234 my $store_default_encoding;
235
236 if (defined $lom_language) {
237 delete $metadata->{'lom_language'};
238
239 $store_input_encoding = $self->{'input_encoding'};
240 $store_extract_language = $self->{'extract_language'};
241 $store_default_language = $self->{'default_language'};
242 $store_default_encoding = $self->{'default_encoding'};
243
244 $self->{'input_encoding'} = "utf8";
245 $self->{'extract_language'} = 0;
246 $self->{'default_language'} = $lom_language;
247 $self->{'default_encoding'} = "utf8";
248 }
249
250 my $rv = $self->SUPER::read(@_);
251
252 if (defined $lom_language) {
253 $self->{'input_encoding'} = $store_input_encoding;
254 $self->{'extract_language'} = $store_extract_language;
255 $self->{'default_language'} = $store_default_language;
256 $self->{'default_encoding'} = $store_default_encoding;
257 }
258
259 $self->{'metadata_table'} = undef;
260
261 return $rv;
262}
263
264# do plugin specific processing of doc_obj
265sub process {
266 my $self = shift (@_);
267 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
268 my $outhandle = $self->{'outhandle'};
269
270 print STDERR "<Processing n='$file' p='LOMPlugin'>\n" if ($gli);
271
272 print $outhandle "LOMPLug: processing $file\n";
273
274 my $cursection = $doc_obj->get_top_section();
275 $doc_obj->add_utf8_text($cursection, $$textref);
276
277 return 1;
278}
279
280sub Doctype {
281 my ($expat, $name, $sysid, $pubid, $internal) = @_;
282
283 my $root_tag = $self->{'root_tag'};
284
285 if ($name !~ /$root_tag/) {
286 die "Root tag $name does not match regular expression $root_tag";
287 }
288}
289
290sub StartTag {
291 my ($expat, $element) = @_;
292
293 my %attr = %_;
294
295 my $raw_tag = "&lt;$element";
296 map { $raw_tag .= " $_=\"$attr{$_}\""; } keys %attr;
297 $raw_tag .= "&gt;";
298
299 if ($element =~ m/$self->{'root_tag'}/) {
300 $self->{'raw_text'} = $raw_tag;
301
302 $self->{'saved_metadata'} = {};
303 $self->{'metaname_stack'} = [];
304 $self->{'lom_datatype'} = "";
305 $self->{'lom_language'} = undef;
306 $self->{'metadatatext'} = "<table class=\"metadata\" width=\"_pagewidth_\" >\n";
307 }
308 else {
309 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
310 $self->{'raw_text'} .= "\n";
311 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
312 $self->{'raw_text'} .= $raw_tag;
313
314 my $metaname_stack = $self->{'metaname_stack'};
315 push(@$metaname_stack,$element);
316 if (scalar(@$metaname_stack)==1) {
317 # top level LOM category
318 my $style = "class=\"metadata\"";
319 my $open_close
320 = "<a id=\"${element}opencloselink\" href=\"javascript:hideTBodyArea('$element')\">\n";
321 $open_close
322 .= "<img id=\"${element}openclose\" border=\"0\" src=\"_httpopenmdicon_\"></a>\n";
323
324 my $header_line = " <tr $style ><th $style colspan=\"3\">$open_close \u$element</th></tr>\n";
325 my $md_tbody = "<tbody id=\"$element\">\n";
326
327 $self->{'mdheader'} = $header_line;
328 $self->{'mdtbody'} = $md_tbody;
329 $self->{'mdtbody_text'} = "";
330 }
331 }
332}
333
334sub EndTag {
335 my ($expat, $element) = @_;
336
337 my $raw_tag = "&lt;/$element&gt;";
338
339 if ($element =~ m/$self->{'root_tag'}/) {
340 $self->{'raw_text'} .= $raw_tag;
341
342 my $metadatatext = $self->{'metadatatext'};
343 $metadatatext .= "</table>";
344
345 my $raw_text = $self->{'raw_text'};
346
347 $self->{'saved_metadata'}->{'MetadataTable'} = $metadatatext;
348 $self->{'metadatatext'} = "";
349
350 $self->{'saved_metadata'}->{'rawtext'} = $raw_text;
351 $self->{'raw_text'} = "";
352
353 if (defined $self->{'lom_language'}) {
354 $self->{'saved_metadata'}->{'lom_language'} = $self->{'lom_language'};
355 $self->{'lom_language'} = undef;
356 }
357 }
358 else {
359 my $metaname_stack = $self->{'metaname_stack'};
360
361 if (scalar(@$metaname_stack)==1) {
362 my $header_line = $self->{'mdheader'};
363 my $tbody_start = $self->{'mdtbody'};
364 my $tbody_text = $self->{'mdtbody_text'};
365 if ($tbody_text !~ m/^\s*$/s) {
366 my $tbody_end = "</tbody>\n";
367 my $table_chunk
368 = $header_line.$tbody_start.$tbody_text.$tbody_end;
369
370 $self->{'metadatatext'} .= $table_chunk;
371 }
372 $self->{'mdtheader'} = "";
373 $self->{'mdtbody'} = "";
374 $self->{'mdtbody_text'} = "";
375 }
376
377 pop(@$metaname_stack);
378
379 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
380 $self->{'raw_text'} .= "\n";
381 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
382 $self->{'raw_text'} .= $raw_tag;
383 }
384}
385
386sub process_datatype_info
387{
388 my $self = shift(@_);
389 my ($metaname_stack,$md_content) = @_;
390
391 my @without_dt_stack = @$metaname_stack; # without datatype stack
392
393 my $innermost_element = $without_dt_stack[$#without_dt_stack];
394
395 # Loose last item if encoding datatype information
396 if ($innermost_element =~ m/^(lang)?string$/) {
397 $self->{'lom_datatype'} = $innermost_element;
398
399 pop @without_dt_stack;
400 $innermost_element = $without_dt_stack[$#without_dt_stack];
401 }
402 elsif ($innermost_element =~ m/^date(Time)?$/i) {
403 if ($innermost_element =~ m/^date$/i) {
404 $self->{'lom_datatype'} = "dateTime";
405 }
406 else {
407 $self->{'lom_datatype'} = $innermost_element;
408
409 pop @without_dt_stack;
410 $innermost_element = $without_dt_stack[$#without_dt_stack];
411 }
412
413 if ($md_content =~ m/^(\d{1,2})\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s*(\d{4})/i) {
414 my ($day,$mon,$year) = ($1,$2,$3);
415
416 my %month_lookup = ( 'jan' => 1, 'feb' => 2, 'mar' => 3,
417 'apr' => 4, 'may' => 5, 'jun' => 6,
418 'jul' => 7, 'aug' => 8, 'sep' => 9,
419 'oct' => 10, 'nov' => 11, 'dec' => 12 );
420
421 my $mon_num = $month_lookup{lc($mon)};
422
423 $md_content = sprintf("%d%02d%02d",$year,$mon_num,$day);
424 }
425
426 $md_content =~ s/\-//g;
427 }
428
429 if ($innermost_element eq "source") {
430 $self->{'lom_source'} = $md_content;
431 }
432 elsif ($innermost_element eq "value") {
433 $self->{'lom_value'} = $md_content;
434 }
435
436 return (\@without_dt_stack,$innermost_element,$md_content);
437}
438
439sub reset_datatype_info
440{
441 my $self = shift(@_);
442
443 $self->{'lom_datatype'} = "";
444}
445
446
447sub pretty_print_text
448{
449 my $self = shift(@_);
450
451 my ($pretty_print_text) = @_;
452
453## $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
454 $pretty_print_text = &util::hyperlink_text($pretty_print_text);
455
456#### $pretty_print_text =~ s/(BEGIN:vCard.*END:vCard)/<pre>$1<\/pre>/sg;
457
458 if ($self->{'lom_datatype'} eq "dateTime") {
459 if ($pretty_print_text =~ m/^(\d{4})(\d{2})(\d{2})$/) {
460 $pretty_print_text = "$1-$2-$3";
461 }
462 }
463
464 return $pretty_print_text;
465}
466
467sub pretty_print_table_tr
468{
469 my $self = shift (@_);
470 my ($without_dt_stack) = @_;
471
472 my $style = "class=\"metadata\"";
473
474 my $innermost_element = $without_dt_stack->[scalar(@$without_dt_stack)-1];
475 my $outermost_element = $without_dt_stack->[0];
476
477 # Loose top level stack item (already named in pretty print table)
478 my @pretty_print_stack = @$without_dt_stack;
479 shift @pretty_print_stack;
480
481 if ($innermost_element eq "source") {
482 return if (!defined $self->{'lom_value'});
483 }
484
485 if ($innermost_element eq "value") {
486 return if (!defined $self->{'lom_source'});
487 }
488
489 my $pretty_print_text = "";
490
491 if (($innermost_element eq "value") || ($innermost_element eq "source")) {
492 my $source = $self->{'lom_source'};
493 my $value = $self->pretty_print_text($self->{'lom_value'});
494
495 $self->{'lom_source'} = undef;
496 $self->{'lom_value'} = undef;
497
498 pop @pretty_print_stack;
499
500 $pretty_print_text = "<td $style>$source</td><td $style>$value</td>";
501 }
502 else {
503 $pretty_print_text = $self->pretty_print_text($_);
504 $pretty_print_text = "<td $style colspan=2>$pretty_print_text</td>";
505 }
506 my $pretty_print_fmn = join(' : ',map { "\u$_"; } @pretty_print_stack);
507
508
509 # my $tr_attr = "id=\"$outermost_element\" style=\"display:block;\"";
510 my $tr_attr = "$style id=\"$outermost_element\"";
511
512 my $mdtext_line = " <tr $tr_attr><td $style><nobr>$pretty_print_fmn</nobr></td>$pretty_print_text</tr>\n";
513 $self->{'mdtbody_text'} .= $mdtext_line;
514}
515
516
517sub check_for_language
518{
519 my $self = shift(@_);
520 my ($innermost_element,$md_content) = @_;
521
522 # Look for 'language' tag
523 if ($innermost_element eq "language") {
524 my $lom_lang = $self->{'lom_language'};
525
526 if (defined $lom_lang) {
527 my $new_lom_lang = $md_content;
528 $new_lom_lang =~ s/-.*//; # remove endings like -US or -GB
529
530 if ($lom_lang ne $new_lom_lang) {
531 my $outhandle = $self->{'outhandle'};
532
533 print $outhandle "Warning: Conflicting general language in record\n";
534 print $outhandle " $new_lom_lang (previous value for language = $lom_lang)\n";
535 }
536 # otherwise, existing value OK => do nothing
537 }
538 else {
539 $lom_lang = $md_content;
540 $lom_lang =~ s/-.*//; # remove endings like -US or -GB
541
542 $self->{'lom_language'} = $lom_lang;
543 }
544 }
545}
546
547sub found_specific_identifier
548{
549 my $self = shift(@_);
550 my ($specific_id,$full_mname,$md_content) = @_;
551
552 my $found_id = 0;
553 if ($full_mname eq $specific_id) {
554 if ($md_content =~ m/^(http|ftp):/) {
555 $found_id = 1;
556 }
557 }
558
559 return $found_id;
560}
561
562sub download_srcdoc
563{
564 my $self = shift(@_);
565 my ($doc_url) = @_;
566
567 my $outhandle = $self->{'outhandle'};
568 my $output_dir = $self->{'output_dir'};
569
570 $output_dir = &util::filename_cat($output_dir,"_gsdldown.all");
571
572 if (! -d $output_dir) {
573 mkdir $output_dir;
574 }
575
576 my $re_dirsep = &util::get_re_dirsep();
577 my $os_dirsep = &util::get_dirsep();
578
579 my $file_url = $doc_url;
580 $file_url =~ s/$re_dirsep/$os_dirsep/g;
581 $file_url =~ s/^(http|ftp):\/\///;
582 $file_url .= "index.html" if ($file_url =~ m/\/$/);
583
584 my $full_file_url = &util::filename_cat($output_dir,$file_url);
585
586 my $check_timestamp = $self->{'check_timestamp'};
587 my $status;
588
589 if (($check_timestamp) || (!$check_timestamp && !-e $full_file_url)) {
590 if (!-e $full_file_url) {
591 print $outhandle "Mirroring $doc_url\n";
592 }
593 else {
594 print $outhandle "Checking to see if update needed for $doc_url\n";
595 }
596
597 # on linux, if we pass an absolute path as -P arg to wget, then it
598 # stuffs up the
599 # URL rewriting in the file. Need a relative path or none, so now
600 # we change working directory first.
601 my $changed_dir = 0;
602 my $current_dir = cwd();
603 my $wget_cmd = "";
604 if ($ENV{'GSDLOS'} ne "windows") {
605 $changed_dir = 1;
606
607 chdir "$output_dir";
608 $wget_cmd = "wget -nv --timestamping -k -p \"$doc_url\"";
609 } else {
610 $wget_cmd = "wget -nv -P \"$output_dir\" --timestamping -k -p \"$doc_url\"";
611 }
612 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -k -p \"$doc_url\"";
613 ### my $full_dir_url = &File::Basename::dirname($full_file_url);
614
615 ## my $wget_cmd = "wget -O $full_file_url --timestamping -k \"$doc_url\"";
616 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -x -k \"$doc_url\"";
617 print STDERR "**** wget = $wget_cmd\n";
618
619
620 $status = system($wget_cmd);
621 if ($changed_dir) {
622 chdir $current_dir;
623 }
624 if ($status==0) {
625 $self->{'lom_srcdoc'} = $full_file_url;
626 }
627 else {
628 $self->{'lom_srcdoc'} = undef;
629 print $outhandle "Error: failed to execute $wget_cmd\n";
630 }
631 }
632 else {
633 # not time-stamping and file already exists
634 $status=0;
635 $self->{'lom_srcdoc'} = $full_file_url;
636 }
637
638 return $status==0;
639
640}
641
642
643sub check_for_identifier
644{
645 my $self = shift(@_);
646 my ($full_mname,$md_content) = @_;
647
648 my $success = 0;
649
650 my $download_re = $self->{'download_srcdocs'};
651 if (($download_re ne "") && $md_content =~ m/$download_re/) {
652
653 if ($self->found_specific_identifier("general^identifier^entry",$full_mname,$md_content)) {
654 $success = $self->download_srcdoc($md_content);
655 }
656
657 if (!$success) {
658 if ($self->found_specific_identifier("technical^location",$full_mname,$md_content)) {
659 $success = $self->download_srcdoc($md_content);
660 }
661 }
662 }
663
664 return $success;
665}
666
667
668sub Text {
669 if ($_ !~ m/^\s*$/) {
670 #
671 # Work out indentations and line wraps for raw XML
672 #
673 my $xml_depth = scalar(@{$self->{'metaname_stack'}})+1;
674 my $indent = "&nbsp;&nbsp;" x $xml_depth;
675
676 my $formatted_text = "\n".$_;
677
678 # break into lines < 80 chars on space
679 $formatted_text =~ s/(.{50,80})\s+/$1\n/mg;
680 $formatted_text =~ s/^/$indent/mg;
681 ## $formatted_text =~ s/\s+$//s;
682
683 $self->{'raw_text'} .= $formatted_text;
684 }
685
686 my $metaname_stack = $self->{'metaname_stack'};
687 if (($_ !~ /^\s*$/) && (scalar(@$metaname_stack)>0)) {
688
689 my ($without_dt_stack,$innermost_element,$md_content)
690 = $self->process_datatype_info($metaname_stack,$_);
691
692 $self->pretty_print_table_tr($without_dt_stack);
693
694 my $full_mname = join('^',@{$without_dt_stack});
695 $self->set_filere_metadata(lc($full_mname),$md_content);
696
697 $self->check_for_language($innermost_element,$md_content);
698 $self->check_for_identifier($full_mname,$md_content); # source doc
699
700 $self->reset_datatype_info();
701 }
702}
703
704# This Char function overrides the one in XML::Parser::Stream to overcome a
705# problem where $expat->{Text} is treated as the return value, slowing
706# things down significantly in some cases.
707sub Char {
708 $_[0]->{'Text'} .= $_[1];
709 return undef;
710}
711
7121;
Note: See TracBrowser for help on using the repository browser.