source: gsdl/trunk/perllib/plugins/LOMPlugin.pm@ 18327

Last change on this file since 18327 was 17300, checked in by kjdon, 16 years ago

removed the metadata argument from metadata_read as its not used and just confuses things when implementing this

  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
Line 
1###########################################################################
2#
3# LOMPlugin.pm -- plugin for import the collection from LOM
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2005 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27### Note this plugin currently can't download source documents from outside if you are behind a firewall.
28
29package LOMPlugin;
30
31use ReadTextFile;
32use MetadataPass;
33use XMLParser;
34use Cwd;
35
36sub BEGIN {
37 @ISA = ('ReadTextFile', 'MetadataPass');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{BasePlugin.process_exp}",
47 'type' => "string",
48 'deft' => &get_default_process_exp(),
49 'reqd' => "no" },
50 { 'name' => "root_tag",
51 'desc' => "{LOMPlugin.root_tag}",
52 'type' => "regexp",
53 'deft' => q/^(?i)lom$/,
54 'reqd' => "no" },
55 { 'name' => "check_timestamp",
56 'desc' => "{LOMPlugin.check_timestamp}",
57 'type' => "flag" },
58 { 'name' => "download_srcdocs",
59 'desc' => "{LOMPlugin.download_srcdocs}",
60 'type' => "regexp",
61 'deft' => "",
62 'reqd' => "no" }];
63
64my $options = { 'name' => "LOMPlugin",
65 'desc' => "{LOMPlugin.desc}",
66 'abstract' => "no",
67 'inherits' => "yes",
68 'args' => $arguments };
69
70
71
72my ($self);
73sub new {
74 my $class = shift (@_);
75 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
76 push(@$pluginlist, $class);
77
78 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
79 push(@{$hashArgOptLists->{"OptList"}},$options);
80
81 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
82
83 if ($self->{'info_only'}) {
84 # don't worry about creating the XML parser as all we want is the
85 # list of plugin options
86 return bless $self, $class;
87 }
88
89 #create XML::Parser object for parsing dublin_core.xml files
90 my $parser = new XML::Parser('Style' => 'Stream',
91 'Handlers' => {'Char' => \&Char,
92 'Doctype' => \&Doctype
93 });
94 $self->{'parser'} = $parser;
95
96 $self->{'extra_blocks'} = {};
97
98 return bless $self, $class;
99}
100
101sub get_default_process_exp {
102 my $self = shift (@_);
103
104 return q^(?i)\.xml$^;
105}
106
107
108sub can_process_this_file {
109 my $self = shift(@_);
110 my ($filename) = @_;
111
112 if ($self->SUPER::can_process_this_file($filename) && $self->check_doctype($filename)) {
113 return 1; # its a file for us
114 }
115 return 0;
116}
117
118sub metadata_read {
119 my $self = shift (@_);
120 my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
121
122 my $outhandle = $self->{'outhandle'};
123
124 # can we process this file??
125 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
126 return undef unless $self->can_process_this_file($filename_full_path);
127
128 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
129
130 print $outhandle "LOMPlugin: extracting metadata from $file\n"
131 if $self->{'verbosity'} > 1;
132
133 my ($dir,$tail) = $filename_full_path =~ /^(.*?)([^\/\\]*)$/;
134 $self->{'output_dir'} = $dir;
135
136 eval {
137 $self->{'parser'}->parsefile($filename_full_path);
138 };
139
140 if ($@) {
141 print $outhandle "LOMPlugin: skipping $filename_full_path as not conformant to LOM syntax\n" if ($self->{'verbosity'} > 1);
142 print $outhandle "\n Perl Error:\n $@\n" if ($self->{'verbosity'}>2);
143 return 0;
144 }
145
146 $self->{'output_dir'} = undef;
147
148 my $file_re;
149 my $lom_srcdoc = $self->{'lom_srcdoc'};
150
151 if (defined $lom_srcdoc) {
152 my $dirsep = &util::get_re_dirsep();
153 $lom_srcdoc =~ s/^$base_dir($dirsep)//;
154 $self->{'extra_blocks'}->{$file}++;
155
156 $file_re = $lom_srcdoc;
157 }
158 else {
159 $file_re = $tail;
160 }
161 $file_re =~ s/\./\\\./g;
162
163 $self->{'lom_srcdoc'} = undef; # reset for next file to be processed
164
165 push(@$extrametakeys,$file_re);
166 $extrametadata->{$file_re} = $self->{'saved_metadata'};
167
168 return 1;
169}
170
171sub check_doctype {
172 $self = shift (@_);
173
174 my ($filename) = @_;
175
176 if (open(XMLIN,"<$filename")) {
177 my $doctype = $self->{'root_tag'};
178 ## check whether the doctype has the same name as the root element tag
179 while (defined (my $line = <XMLIN>)) {
180 ## find the root element
181 if ($line =~ /<([\w\d:]+)[\s>]/){
182 my $root = $1;
183 if ($root !~ $doctype){
184 close(XMLIN);
185 return 0;
186 }
187 else {
188 close(XMLIN);
189 return 1;
190 }
191 }
192 }
193 close(XMLIN);
194 }
195
196 return undef; # haven't found a valid line
197
198}
199
200sub read_file {
201 my $self = shift (@_);
202 my ($filename, $encoding, $language, $textref) = @_;
203
204 my $metadata_table = $self->{'metadata_table'};
205
206 my $rawtext = $metadata_table->{'rawtext'};
207
208 delete $metadata_table->{'rawtext'};
209
210 $$textref = $rawtext;
211}
212
213sub read {
214 my $self = shift (@_);
215 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
216
217 my $outhandle = $self->{'outhandle'};
218
219 return 0 if (defined $self->{'extra_blocks'}->{$file});
220
221 # can we process this file??
222 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
223 return undef unless $self->can_process_this_file($filename_full_path);
224
225 $self->{'metadata_table'} = $metadata;
226
227 my $lom_language = $metadata->{'lom_language'};
228
229 my $store_input_encoding;
230 my $store_extract_language;
231 my $store_default_language;
232 my $store_default_encoding;
233
234 if (defined $lom_language) {
235 delete $metadata->{'lom_language'};
236
237 $store_input_encoding = $self->{'input_encoding'};
238 $store_extract_language = $self->{'extract_language'};
239 $store_default_language = $self->{'default_language'};
240 $store_default_encoding = $self->{'default_encoding'};
241
242 $self->{'input_encoding'} = "utf8";
243 $self->{'extract_language'} = 0;
244 $self->{'default_language'} = $lom_language;
245 $self->{'default_encoding'} = "utf8";
246 }
247
248 my $rv = $self->SUPER::read(@_);
249
250 if (defined $lom_language) {
251 $self->{'input_encoding'} = $store_input_encoding;
252 $self->{'extract_language'} = $store_extract_language;
253 $self->{'default_language'} = $store_default_language;
254 $self->{'default_encoding'} = $store_default_encoding;
255 }
256
257 $self->{'metadata_table'} = undef;
258
259 return $rv;
260}
261
262# do plugin specific processing of doc_obj
263sub process {
264 my $self = shift (@_);
265 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
266 my $outhandle = $self->{'outhandle'};
267
268 print STDERR "<Processing n='$file' p='LOMPlugin'>\n" if ($gli);
269
270 print $outhandle "LOMPLug: processing $file\n";
271
272 my $cursection = $doc_obj->get_top_section();
273 $doc_obj->add_utf8_text($cursection, $$textref);
274
275 return 1;
276}
277
278sub Doctype {
279 my ($expat, $name, $sysid, $pubid, $internal) = @_;
280
281 my $root_tag = $self->{'root_tag'};
282
283 if ($name !~ /$root_tag/) {
284 die "Root tag $name does not match regular expression $root_tag";
285 }
286}
287
288sub StartTag {
289 my ($expat, $element) = @_;
290
291 my %attr = %_;
292
293 my $raw_tag = "&lt;$element";
294 map { $raw_tag .= " $_=\"$attr{$_}\""; } keys %attr;
295 $raw_tag .= "&gt;";
296
297 if ($element =~ m/$self->{'root_tag'}/) {
298 $self->{'raw_text'} = $raw_tag;
299
300 $self->{'saved_metadata'} = {};
301 $self->{'metaname_stack'} = [];
302 $self->{'lom_datatype'} = "";
303 $self->{'lom_language'} = undef;
304 $self->{'metadatatext'} = "<table class=\"metadata\" width=\"_pagewidth_\" >\n";
305 }
306 else {
307 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
308 $self->{'raw_text'} .= "\n";
309 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
310 $self->{'raw_text'} .= $raw_tag;
311
312 my $metaname_stack = $self->{'metaname_stack'};
313 push(@$metaname_stack,$element);
314 if (scalar(@$metaname_stack)==1) {
315 # top level LOM category
316 my $style = "class=\"metadata\"";
317 my $open_close
318 = "<a id=\"${element}opencloselink\" href=\"javascript:hideTBodyArea('$element')\">\n";
319 $open_close
320 .= "<img id=\"${element}openclose\" border=\"0\" src=\"_httpopenmdicon_\"></a>\n";
321
322 my $header_line = " <tr $style ><th $style colspan=\"3\">$open_close \u$element</th></tr>\n";
323 my $md_tbody = "<tbody id=\"$element\">\n";
324
325 $self->{'mdheader'} = $header_line;
326 $self->{'mdtbody'} = $md_tbody;
327 $self->{'mdtbody_text'} = "";
328 }
329 }
330}
331
332sub EndTag {
333 my ($expat, $element) = @_;
334
335 my $raw_tag = "&lt;/$element&gt;";
336
337 if ($element =~ m/$self->{'root_tag'}/) {
338 $self->{'raw_text'} .= $raw_tag;
339
340 my $metadatatext = $self->{'metadatatext'};
341 $metadatatext .= "</table>";
342
343 my $raw_text = $self->{'raw_text'};
344
345 $self->{'saved_metadata'}->{'MetadataTable'} = $metadatatext;
346 $self->{'metadatatext'} = "";
347
348 $self->{'saved_metadata'}->{'rawtext'} = $raw_text;
349 $self->{'raw_text'} = "";
350
351 if (defined $self->{'lom_language'}) {
352 $self->{'saved_metadata'}->{'lom_language'} = $self->{'lom_language'};
353 $self->{'lom_language'} = undef;
354 }
355 }
356 else {
357 my $metaname_stack = $self->{'metaname_stack'};
358
359 if (scalar(@$metaname_stack)==1) {
360 my $header_line = $self->{'mdheader'};
361 my $tbody_start = $self->{'mdtbody'};
362 my $tbody_text = $self->{'mdtbody_text'};
363 if ($tbody_text !~ m/^\s*$/s) {
364 my $tbody_end = "</tbody>\n";
365 my $table_chunk
366 = $header_line.$tbody_start.$tbody_text.$tbody_end;
367
368 $self->{'metadatatext'} .= $table_chunk;
369 }
370 $self->{'mdtheader'} = "";
371 $self->{'mdtbody'} = "";
372 $self->{'mdtbody_text'} = "";
373 }
374
375 pop(@$metaname_stack);
376
377 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
378 $self->{'raw_text'} .= "\n";
379 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
380 $self->{'raw_text'} .= $raw_tag;
381 }
382}
383
384sub process_datatype_info
385{
386 my $self = shift(@_);
387 my ($metaname_stack,$md_content) = @_;
388
389 my @without_dt_stack = @$metaname_stack; # without datatype stack
390
391 my $innermost_element = $without_dt_stack[$#without_dt_stack];
392
393 # Loose last item if encoding datatype information
394 if ($innermost_element =~ m/^(lang)?string$/) {
395 $self->{'lom_datatype'} = $innermost_element;
396
397 pop @without_dt_stack;
398 $innermost_element = $without_dt_stack[$#without_dt_stack];
399 }
400 elsif ($innermost_element =~ m/^date(Time)?$/i) {
401 if ($innermost_element =~ m/^date$/i) {
402 $self->{'lom_datatype'} = "dateTime";
403 }
404 else {
405 $self->{'lom_datatype'} = $innermost_element;
406
407 pop @without_dt_stack;
408 $innermost_element = $without_dt_stack[$#without_dt_stack];
409 }
410
411 if ($md_content =~ m/^(\d{1,2})\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s*(\d{4})/i) {
412 my ($day,$mon,$year) = ($1,$2,$3);
413
414 my %month_lookup = ( 'jan' => 1, 'feb' => 2, 'mar' => 3,
415 'apr' => 4, 'may' => 5, 'jun' => 6,
416 'jul' => 7, 'aug' => 8, 'sep' => 9,
417 'oct' => 10, 'nov' => 11, 'dec' => 12 );
418
419 my $mon_num = $month_lookup{lc($mon)};
420
421 $md_content = sprintf("%d%02d%02d",$year,$mon_num,$day);
422 }
423
424 $md_content =~ s/\-//g;
425 }
426
427 if ($innermost_element eq "source") {
428 $self->{'lom_source'} = $md_content;
429 }
430 elsif ($innermost_element eq "value") {
431 $self->{'lom_value'} = $md_content;
432 }
433
434 return (\@without_dt_stack,$innermost_element,$md_content);
435}
436
437sub reset_datatype_info
438{
439 my $self = shift(@_);
440
441 $self->{'lom_datatype'} = "";
442}
443
444
445sub pretty_print_text
446{
447 my $self = shift(@_);
448
449 my ($pretty_print_text) = @_;
450
451## $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
452 $pretty_print_text = &util::hyperlink_text($pretty_print_text);
453
454#### $pretty_print_text =~ s/(BEGIN:vCard.*END:vCard)/<pre>$1<\/pre>/sg;
455
456 if ($self->{'lom_datatype'} eq "dateTime") {
457 if ($pretty_print_text =~ m/^(\d{4})(\d{2})(\d{2})$/) {
458 $pretty_print_text = "$1-$2-$3";
459 }
460 }
461
462 return $pretty_print_text;
463}
464
465sub pretty_print_table_tr
466{
467 my $self = shift (@_);
468 my ($without_dt_stack) = @_;
469
470 my $style = "class=\"metadata\"";
471
472 my $innermost_element = $without_dt_stack->[scalar(@$without_dt_stack)-1];
473 my $outermost_element = $without_dt_stack->[0];
474
475 # Loose top level stack item (already named in pretty print table)
476 my @pretty_print_stack = @$without_dt_stack;
477 shift @pretty_print_stack;
478
479 if ($innermost_element eq "source") {
480 return if (!defined $self->{'lom_value'});
481 }
482
483 if ($innermost_element eq "value") {
484 return if (!defined $self->{'lom_source'});
485 }
486
487 my $pretty_print_text = "";
488
489 if (($innermost_element eq "value") || ($innermost_element eq "source")) {
490 my $source = $self->{'lom_source'};
491 my $value = $self->pretty_print_text($self->{'lom_value'});
492
493 $self->{'lom_source'} = undef;
494 $self->{'lom_value'} = undef;
495
496 pop @pretty_print_stack;
497
498 $pretty_print_text = "<td $style>$source</td><td $style>$value</td>";
499 }
500 else {
501 $pretty_print_text = $self->pretty_print_text($_);
502 $pretty_print_text = "<td $style colspan=2>$pretty_print_text</td>";
503 }
504 my $pretty_print_fmn = join(' : ',map { "\u$_"; } @pretty_print_stack);
505
506
507 # my $tr_attr = "id=\"$outermost_element\" style=\"display:block;\"";
508 my $tr_attr = "$style id=\"$outermost_element\"";
509
510 my $mdtext_line = " <tr $tr_attr><td $style><nobr>$pretty_print_fmn</nobr></td>$pretty_print_text</tr>\n";
511 $self->{'mdtbody_text'} .= $mdtext_line;
512}
513
514
515sub check_for_language
516{
517 my $self = shift(@_);
518 my ($innermost_element,$md_content) = @_;
519
520 # Look for 'language' tag
521 if ($innermost_element eq "language") {
522 my $lom_lang = $self->{'lom_language'};
523
524 if (defined $lom_lang) {
525 my $new_lom_lang = $md_content;
526 $new_lom_lang =~ s/-.*//; # remove endings like -US or -GB
527
528 if ($lom_lang ne $new_lom_lang) {
529 my $outhandle = $self->{'outhandle'};
530
531 print $outhandle "Warning: Conflicting general language in record\n";
532 print $outhandle " $new_lom_lang (previous value for language = $lom_lang)\n";
533 }
534 # otherwise, existing value OK => do nothing
535 }
536 else {
537 $lom_lang = $md_content;
538 $lom_lang =~ s/-.*//; # remove endings like -US or -GB
539
540 $self->{'lom_language'} = $lom_lang;
541 }
542 }
543}
544
545sub found_specific_identifier
546{
547 my $self = shift(@_);
548 my ($specific_id,$full_mname,$md_content) = @_;
549
550 my $found_id = 0;
551 if ($full_mname eq $specific_id) {
552 if ($md_content =~ m/^(http|ftp):/) {
553 $found_id = 1;
554 }
555 }
556
557 return $found_id;
558}
559
560sub download_srcdoc
561{
562 my $self = shift(@_);
563 my ($doc_url) = @_;
564
565 my $outhandle = $self->{'outhandle'};
566 my $output_dir = $self->{'output_dir'};
567
568 $output_dir = &util::filename_cat($output_dir,"_gsdldown.all");
569
570 if (! -d $output_dir) {
571 mkdir $output_dir;
572 }
573
574 my $re_dirsep = &util::get_re_dirsep();
575 my $os_dirsep = &util::get_dirsep();
576
577 my $file_url = $doc_url;
578 $file_url =~ s/$re_dirsep/$os_dirsep/g;
579 $file_url =~ s/^(http|ftp):\/\///;
580 $file_url .= "index.html" if ($file_url =~ m/\/$/);
581
582 my $full_file_url = &util::filename_cat($output_dir,$file_url);
583
584 my $check_timestamp = $self->{'check_timestamp'};
585 my $status;
586
587 if (($check_timestamp) || (!$check_timestamp && !-e $full_file_url)) {
588 if (!-e $full_file_url) {
589 print $outhandle "Mirroring $doc_url\n";
590 }
591 else {
592 print $outhandle "Checking to see if update needed for $doc_url\n";
593 }
594
595 # on linux, if we pass an absolute path as -P arg to wget, then it
596 # stuffs up the
597 # URL rewriting in the file. Need a relative path or none, so now
598 # we change working directory first.
599 my $changed_dir = 0;
600 my $current_dir = cwd();
601 my $wget_cmd = "";
602 if ($ENV{'GSDLOS'} ne "windows") {
603 $changed_dir = 1;
604
605 chdir "$output_dir";
606 $wget_cmd = "wget -nv --timestamping -k -p \"$doc_url\"";
607 } else {
608 $wget_cmd = "wget -nv -P \"$output_dir\" --timestamping -k -p \"$doc_url\"";
609 }
610 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -k -p \"$doc_url\"";
611 ### my $full_dir_url = &File::Basename::dirname($full_file_url);
612
613 ## my $wget_cmd = "wget -O $full_file_url --timestamping -k \"$doc_url\"";
614 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -x -k \"$doc_url\"";
615 print STDERR "**** wget = $wget_cmd\n";
616
617
618 $status = system($wget_cmd);
619 if ($changed_dir) {
620 chdir $current_dir;
621 }
622 if ($status==0) {
623 $self->{'lom_srcdoc'} = $full_file_url;
624 }
625 else {
626 $self->{'lom_srcdoc'} = undef;
627 print $outhandle "Error: failed to execute $wget_cmd\n";
628 }
629 }
630 else {
631 # not time-stamping and file already exists
632 $status=0;
633 $self->{'lom_srcdoc'} = $full_file_url;
634 }
635
636 return $status==0;
637
638}
639
640
641sub check_for_identifier
642{
643 my $self = shift(@_);
644 my ($full_mname,$md_content) = @_;
645
646 my $success = 0;
647
648 my $download_re = $self->{'download_srcdocs'};
649 if (($download_re ne "") && $md_content =~ m/$download_re/) {
650
651 if ($self->found_specific_identifier("general^identifier^entry",$full_mname,$md_content)) {
652 $success = $self->download_srcdoc($md_content);
653 }
654
655 if (!$success) {
656 if ($self->found_specific_identifier("technical^location",$full_mname,$md_content)) {
657 $success = $self->download_srcdoc($md_content);
658 }
659 }
660 }
661
662 return $success;
663}
664
665
666sub Text {
667 if ($_ !~ m/^\s*$/) {
668 #
669 # Work out indentations and line wraps for raw XML
670 #
671 my $xml_depth = scalar(@{$self->{'metaname_stack'}})+1;
672 my $indent = "&nbsp;&nbsp;" x $xml_depth;
673
674 my $formatted_text = "\n".$_;
675
676 # break into lines < 80 chars on space
677 $formatted_text =~ s/(.{50,80})\s+/$1\n/mg;
678 $formatted_text =~ s/^/$indent/mg;
679 ## $formatted_text =~ s/\s+$//s;
680
681 $self->{'raw_text'} .= $formatted_text;
682 }
683
684 my $metaname_stack = $self->{'metaname_stack'};
685 if (($_ !~ /^\s*$/) && (scalar(@$metaname_stack)>0)) {
686
687 my ($without_dt_stack,$innermost_element,$md_content)
688 = $self->process_datatype_info($metaname_stack,$_);
689
690 $self->pretty_print_table_tr($without_dt_stack);
691
692 my $full_mname = join('^',@{$without_dt_stack});
693 $self->set_filere_metadata(lc($full_mname),$md_content);
694
695 $self->check_for_language($innermost_element,$md_content);
696 $self->check_for_identifier($full_mname,$md_content); # source doc
697
698 $self->reset_datatype_info();
699 }
700}
701
702# This Char function overrides the one in XML::Parser::Stream to overcome a
703# problem where $expat->{Text} is treated as the return value, slowing
704# things down significantly in some cases.
705sub Char {
706 $_[0]->{'Text'} .= $_[1];
707 return undef;
708}
709
7101;
Note: See TracBrowser for help on using the repository browser.