source: gsdl/trunk/perllib/plugins/LOMPlug.pm@ 14661

Last change on this file since 14661 was 13334, checked in by kjdon, 17 years ago

added a comment about home it doesn't work behind a firewall

  • Property svn:keywords set to Author Date Id Revision
File size: 18.5 KB
Line 
1###########################################################################
2#
3# LOMPlug.pm -- plugin for import the collection from LOM
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2005 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27### Note this plugin currently can't download source documents from outside if you are behind a firewall.
28
29package LOMPlug;
30
31use BasPlug;
32use MetadataPass;
33use XMLParser;
34use Cwd;
35
36sub BEGIN {
37 @ISA = ('BasPlug', 'MetadataPass');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{BasPlug.process_exp}",
47 'type' => "string",
48 'deft' => &get_default_process_exp(),
49 'reqd' => "no" },
50 { 'name' => "root_tag",
51 'desc' => "{LOMPlug.root_tag}",
52 'type' => "regexp",
53 'deft' => q/^(?i)lom$/,
54 'reqd' => "no" },
55 { 'name' => "check_timestamp",
56 'desc' => "{LOMPlug.check_timestamp}",
57 'type' => "flag" },
58 { 'name' => "download_srcdocs",
59 'desc' => "{LOMPlug.download_srcdocs}",
60 'type' => "regexp",
61 'deft' => "",
62 'reqd' => "no" }];
63
64my $options = { 'name' => "LOMPlug",
65 'desc' => "{LOMPlug.desc}",
66 'inherits' => "yes",
67 'args' => $arguments };
68
69
70
71my ($self);
72sub new {
73 my $class = shift (@_);
74 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75 push(@$pluginlist, $class);
76
77 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
78 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
79
80 $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
81
82 #create XML::Parser object for parsing dublin_core.xml files
83 my $parser = new XML::Parser('Style' => 'Stream',
84 'Handlers' => {'Char' => \&Char,
85 'Doctype' => \&Doctype
86 });
87 $self->{'parser'} = $parser;
88
89 $self->{'extra_blocks'} = {};
90
91 return bless $self, $class;
92}
93
94sub get_default_process_exp {
95 my $self = shift (@_);
96
97 return q^(?i)\.xml$^;
98}
99
100
101
102
103sub metadata_read {
104 my $self = shift (@_);
105 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
106
107 my $outhandle = $self->{'outhandle'};
108
109 my $filename = $file;
110 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
111
112 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
113 return undef; # can't recognise
114 }
115 if (!$self->check_doctype($filename)) {
116 # this file is not for us
117 return undef;
118 }
119
120 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
121
122 print $outhandle "LOMPlug: extracting metadata from $file\n"
123 if $self->{'verbosity'} > 1;
124
125 my ($dir,$tail) = $filename =~ /^(.*?)([^\/\\]*)$/;
126 $self->{'output_dir'} = $dir;
127
128 eval {
129 $self->{'parser'}->parsefile($filename);
130 };
131
132 if ($@) {
133 print $outhandle "LOMPlug: skipping $filename as not conformant to LOM syntax\n" if ($self->{'verbosity'} > 1);
134 print $outhandle "\n Perl Error:\n $@\n" if ($self->{'verbosity'}>2);
135 return 0;
136 }
137
138 $self->{'output_dir'} = undef;
139
140 my $file_re;
141 my $lom_srcdoc = $self->{'lom_srcdoc'};
142
143 if (defined $lom_srcdoc) {
144 my $dirsep = &util::get_re_dirsep();
145 $lom_srcdoc =~ s/^$base_dir($dirsep)//;
146 $self->{'extra_blocks'}->{$file}++;
147
148 $file_re = $lom_srcdoc;
149 }
150 else {
151 $file_re = $tail;
152 }
153 $file_re =~ s/\./\\\./g;
154
155 $self->{'lom_srcdoc'} = undef; # reset for next file to be processed
156
157 push(@$extrametakeys,$file_re);
158 $extrametadata->{$file_re} = $self->{'saved_metadata'};
159
160 return 1;
161}
162
163sub check_doctype {
164 $self = shift (@_);
165
166 my ($filename) = @_;
167
168 if (open(XMLIN,"<$filename")) {
169 my $doctype = $self->{'root_tag'};
170 ## check whether the doctype has the same name as the root element tag
171 while (defined (my $line = <XMLIN>)) {
172 ## find the root element
173 if ($line =~ /<([\w\d:]+)[\s>]/){
174 my $root = $1;
175 if ($root !~ $doctype){
176 close(XMLIN);
177 return 0;
178 }
179 else {
180 close(XMLIN);
181 return 1;
182 }
183 }
184 }
185 close(XMLIN);
186 }
187
188 return undef; # haven't found a valid line
189
190}
191
192sub read_file {
193 my $self = shift (@_);
194 my ($filename, $encoding, $language, $textref) = @_;
195
196 my $metadata_table = $self->{'metadata_table'};
197
198 my $rawtext = $metadata_table->{'rawtext'};
199
200 delete $metadata_table->{'rawtext'};
201
202 $$textref = $rawtext;
203}
204
205sub read {
206 my $self = shift (@_);
207 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
208
209 my $outhandle = $self->{'outhandle'};
210
211 return 0 if (defined $self->{'extra_blocks'}->{$file});
212
213 # need to check whether this file is for us
214 my ($block_status,$filename) = $self->read_block(@_);
215 return $block_status if ((!defined $block_status) || ($block_status==0));
216 if (!$self->check_doctype($filename)) {
217 # this file is not for us
218 return undef;
219 }
220
221 $self->{'metadata_table'} = $metadata;
222
223 my $lom_language = $metadata->{'lom_language'};
224
225 my $store_input_encoding;
226 my $store_extract_language;
227 my $store_default_language;
228 my $store_default_encoding;
229
230 if (defined $lom_language) {
231 delete $metadata->{'lom_language'};
232
233 $store_input_encoding = $self->{'input_encoding'};
234 $store_extract_language = $self->{'extract_language'};
235 $store_default_language = $self->{'default_language'};
236 $store_default_encoding = $self->{'default_encoding'};
237
238 $self->{'input_encoding'} = "utf8";
239 $self->{'extract_language'} = 0;
240 $self->{'default_language'} = $lom_language;
241 $self->{'default_encoding'} = "utf8";
242 }
243
244 my $rv = $self->SUPER::read(@_);
245
246 if (defined $lom_language) {
247 $self->{'input_encoding'} = $store_input_encoding;
248 $self->{'extract_language'} = $store_extract_language;
249 $self->{'default_language'} = $store_default_language;
250 $self->{'default_encoding'} = $store_default_encoding;
251 }
252
253 $self->{'metadata_table'} = undef;
254
255 return $rv;
256}
257
258# do plugin specific processing of doc_obj
259sub process {
260 my $self = shift (@_);
261 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
262 my $outhandle = $self->{'outhandle'};
263
264 print STDERR "<Processing n='$file' p='LOMPlug'>\n" if ($gli);
265
266 print $outhandle "LOMPLug: processing $file\n";
267
268 my $cursection = $doc_obj->get_top_section();
269 $doc_obj->add_utf8_text($cursection, $$textref);
270
271 return 1;
272}
273
274sub Doctype {
275 my ($expat, $name, $sysid, $pubid, $internal) = @_;
276
277 my $root_tag = $self->{'root_tag'};
278
279 if ($name !~ /$root_tag/) {
280 die "Root tag $name does not match regular expression $root_tag";
281 }
282}
283
284sub StartTag {
285 my ($expat, $element) = @_;
286
287 my %attr = %_;
288
289 my $raw_tag = "&lt;$element";
290 map { $raw_tag .= " $_=\"$attr{$_}\""; } keys %attr;
291 $raw_tag .= "&gt;";
292
293 if ($element =~ m/$self->{'root_tag'}/) {
294 $self->{'raw_text'} = $raw_tag;
295
296 $self->{'saved_metadata'} = {};
297 $self->{'metaname_stack'} = [];
298 $self->{'lom_datatype'} = "";
299 $self->{'lom_language'} = undef;
300 $self->{'metadatatext'} = "<table class=\"metadata\" width=\"_pagewidth_\" >\n";
301 }
302 else {
303 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
304 $self->{'raw_text'} .= "\n";
305 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
306 $self->{'raw_text'} .= $raw_tag;
307
308 my $metaname_stack = $self->{'metaname_stack'};
309 push(@$metaname_stack,$element);
310 if (scalar(@$metaname_stack)==1) {
311 # top level LOM category
312 my $style = "class=\"metadata\"";
313 my $open_close
314 = "<a id=\"${element}opencloselink\" href=\"javascript:hideTBodyArea('$element')\">\n";
315 $open_close
316 .= "<img id=\"${element}openclose\" border=\"0\" src=\"_httpopenmdicon_\"></a>\n";
317
318 my $header_line = " <tr $style ><th $style colspan=\"3\">$open_close \u$element</th></tr>\n";
319 my $md_tbody = "<tbody id=\"$element\">\n";
320
321 $self->{'mdheader'} = $header_line;
322 $self->{'mdtbody'} = $md_tbody;
323 $self->{'mdtbody_text'} = "";
324 }
325 }
326}
327
328sub EndTag {
329 my ($expat, $element) = @_;
330
331 my $raw_tag = "&lt;/$element&gt;";
332
333 if ($element =~ m/$self->{'root_tag'}/) {
334 $self->{'raw_text'} .= $raw_tag;
335
336 my $metadatatext = $self->{'metadatatext'};
337 $metadatatext .= "</table>";
338
339 my $raw_text = $self->{'raw_text'};
340
341 $self->{'saved_metadata'}->{'MetadataTable'} = $metadatatext;
342 $self->{'metadatatext'} = "";
343
344 $self->{'saved_metadata'}->{'rawtext'} = $raw_text;
345 $self->{'raw_text'} = "";
346
347 if (defined $self->{'lom_language'}) {
348 $self->{'saved_metadata'}->{'lom_language'} = $self->{'lom_language'};
349 $self->{'lom_language'} = undef;
350 }
351 }
352 else {
353 my $metaname_stack = $self->{'metaname_stack'};
354
355 if (scalar(@$metaname_stack)==1) {
356 my $header_line = $self->{'mdheader'};
357 my $tbody_start = $self->{'mdtbody'};
358 my $tbody_text = $self->{'mdtbody_text'};
359 if ($tbody_text !~ m/^\s*$/s) {
360 my $tbody_end = "</tbody>\n";
361 my $table_chunk
362 = $header_line.$tbody_start.$tbody_text.$tbody_end;
363
364 $self->{'metadatatext'} .= $table_chunk;
365 }
366 $self->{'mdtheader'} = "";
367 $self->{'mdtbody'} = "";
368 $self->{'mdtbody_text'} = "";
369 }
370
371 pop(@$metaname_stack);
372
373 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
374 $self->{'raw_text'} .= "\n";
375 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
376 $self->{'raw_text'} .= $raw_tag;
377 }
378}
379
380sub process_datatype_info
381{
382 my $self = shift(@_);
383 my ($metaname_stack,$md_content) = @_;
384
385 my @without_dt_stack = @$metaname_stack; # without datatype stack
386
387 my $innermost_element = $without_dt_stack[$#without_dt_stack];
388
389 # Loose last item if encoding datatype information
390 if ($innermost_element =~ m/^(lang)?string$/) {
391 $self->{'lom_datatype'} = $innermost_element;
392
393 pop @without_dt_stack;
394 $innermost_element = $without_dt_stack[$#without_dt_stack];
395 }
396 elsif ($innermost_element =~ m/^date(Time)?$/i) {
397 if ($innermost_element =~ m/^date$/i) {
398 $self->{'lom_datatype'} = "dateTime";
399 }
400 else {
401 $self->{'lom_datatype'} = $innermost_element;
402
403 pop @without_dt_stack;
404 $innermost_element = $without_dt_stack[$#without_dt_stack];
405 }
406
407 if ($md_content =~ m/^(\d{1,2})\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s*(\d{4})/i) {
408 my ($day,$mon,$year) = ($1,$2,$3);
409
410 my %month_lookup = ( 'jan' => 1, 'feb' => 2, 'mar' => 3,
411 'apr' => 4, 'may' => 5, 'jun' => 6,
412 'jul' => 7, 'aug' => 8, 'sep' => 9,
413 'oct' => 10, 'nov' => 11, 'dec' => 12 );
414
415 my $mon_num = $month_lookup{lc($mon)};
416
417 $md_content = sprintf("%d%02d%02d",$year,$mon_num,$day);
418 }
419
420 $md_content =~ s/\-//g;
421 }
422
423 if ($innermost_element eq "source") {
424 $self->{'lom_source'} = $md_content;
425 }
426 elsif ($innermost_element eq "value") {
427 $self->{'lom_value'} = $md_content;
428 }
429
430 return (\@without_dt_stack,$innermost_element,$md_content);
431}
432
433sub reset_datatype_info
434{
435 my $self = shift(@_);
436
437 $self->{'lom_datatype'} = "";
438}
439
440
441sub pretty_print_text
442{
443 my $self = shift(@_);
444
445 my ($pretty_print_text) = @_;
446
447## $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
448 $pretty_print_text = &util::hyperlink_text($pretty_print_text);
449
450#### $pretty_print_text =~ s/(BEGIN:vCard.*END:vCard)/<pre>$1<\/pre>/sg;
451
452 if ($self->{'lom_datatype'} eq "dateTime") {
453 if ($pretty_print_text =~ m/^(\d{4})(\d{2})(\d{2})$/) {
454 $pretty_print_text = "$1-$2-$3";
455 }
456 }
457
458 return $pretty_print_text;
459}
460
461sub pretty_print_table_tr
462{
463 my $self = shift (@_);
464 my ($without_dt_stack) = @_;
465
466 my $style = "class=\"metadata\"";
467
468 my $innermost_element = $without_dt_stack->[scalar(@$without_dt_stack)-1];
469 my $outermost_element = $without_dt_stack->[0];
470
471 # Loose top level stack item (already named in pretty print table)
472 my @pretty_print_stack = @$without_dt_stack;
473 shift @pretty_print_stack;
474
475 if ($innermost_element eq "source") {
476 return if (!defined $self->{'lom_value'});
477 }
478
479 if ($innermost_element eq "value") {
480 return if (!defined $self->{'lom_source'});
481 }
482
483 my $pretty_print_text = "";
484
485 if (($innermost_element eq "value") || ($innermost_element eq "source")) {
486 my $source = $self->{'lom_source'};
487 my $value = $self->pretty_print_text($self->{'lom_value'});
488
489 $self->{'lom_source'} = undef;
490 $self->{'lom_value'} = undef;
491
492 pop @pretty_print_stack;
493
494 $pretty_print_text = "<td $style>$source</td><td $style>$value</td>";
495 }
496 else {
497 $pretty_print_text = $self->pretty_print_text($_);
498 $pretty_print_text = "<td $style colspan=2>$pretty_print_text</td>";
499 }
500 my $pretty_print_fmn = join(' : ',map { "\u$_"; } @pretty_print_stack);
501
502
503 # my $tr_attr = "id=\"$outermost_element\" style=\"display:block;\"";
504 my $tr_attr = "$style id=\"$outermost_element\"";
505
506 my $mdtext_line = " <tr $tr_attr><td $style><nobr>$pretty_print_fmn</nobr></td>$pretty_print_text</tr>\n";
507 $self->{'mdtbody_text'} .= $mdtext_line;
508}
509
510
511sub check_for_language
512{
513 my $self = shift(@_);
514 my ($innermost_element,$md_content) = @_;
515
516 # Look for 'language' tag
517 if ($innermost_element eq "language") {
518 my $lom_lang = $self->{'lom_language'};
519
520 if (defined $lom_lang) {
521 my $new_lom_lang = $md_content;
522 $new_lom_lang =~ s/-.*//; # remove endings like -US or -GB
523
524 if ($lom_lang ne $new_lom_lang) {
525 my $outhandle = $self->{'outhandle'};
526
527 print $outhandle "Warning: Conflicting general language in record\n";
528 print $outhandle " $new_lom_lang (previous value for language = $lom_lang)\n";
529 }
530 # otherwise, existing value OK => do nothing
531 }
532 else {
533 $lom_lang = $md_content;
534 $lom_lang =~ s/-.*//; # remove endings like -US or -GB
535
536 $self->{'lom_language'} = $lom_lang;
537 }
538 }
539}
540
541sub found_specific_identifier
542{
543 my $self = shift(@_);
544 my ($specific_id,$full_mname,$md_content) = @_;
545
546 my $found_id = 0;
547 if ($full_mname eq $specific_id) {
548 if ($md_content =~ m/^(http|ftp):/) {
549 $found_id = 1;
550 }
551 }
552
553 return $found_id;
554}
555
556sub download_srcdoc
557{
558 my $self = shift(@_);
559 my ($doc_url) = @_;
560
561 my $outhandle = $self->{'outhandle'};
562 my $output_dir = $self->{'output_dir'};
563
564 $output_dir = &util::filename_cat($output_dir,"_gsdldown.all");
565
566 if (! -d $output_dir) {
567 mkdir $output_dir;
568 }
569
570 my $re_dirsep = &util::get_re_dirsep();
571 my $os_dirsep = &util::get_dirsep();
572
573 my $file_url = $doc_url;
574 $file_url =~ s/$re_dirsep/$os_dirsep/g;
575 $file_url =~ s/^(http|ftp):\/\///;
576 $file_url .= "index.html" if ($file_url =~ m/\/$/);
577
578 my $full_file_url = &util::filename_cat($output_dir,$file_url);
579
580 my $check_timestamp = $self->{'check_timestamp'};
581 my $status;
582
583 if (($check_timestamp) || (!$check_timestamp && !-e $full_file_url)) {
584 if (!-e $full_file_url) {
585 print $outhandle "Mirroring $doc_url\n";
586 }
587 else {
588 print $outhandle "Checking to see if update needed for $doc_url\n";
589 }
590
591 # on linux, if we pass an absolute path as -P arg to wget, then it
592 # stuffs up the
593 # URL rewriting in the file. Need a relative path or none, so now
594 # we change working directory first.
595 my $changed_dir = 0;
596 my $current_dir = cwd();
597 my $wget_cmd = "";
598 if ($ENV{'GSDLOS'} ne "windows") {
599 $changed_dir = 1;
600
601 chdir "$output_dir";
602 $wget_cmd = "wget -nv --timestamping -k -p \"$doc_url\"";
603 } else {
604 $wget_cmd = "wget -nv -P \"$output_dir\" --timestamping -k -p \"$doc_url\"";
605 }
606 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -k -p \"$doc_url\"";
607 ### my $full_dir_url = &File::Basename::dirname($full_file_url);
608
609 ## my $wget_cmd = "wget -O $full_file_url --timestamping -k \"$doc_url\"";
610 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -x -k \"$doc_url\"";
611 print STDERR "**** wget = $wget_cmd\n";
612
613
614 $status = system($wget_cmd);
615 if ($changed_dir) {
616 chdir $current_dir;
617 }
618 if ($status==0) {
619 $self->{'lom_srcdoc'} = $full_file_url;
620 }
621 else {
622 $self->{'lom_srcdoc'} = undef;
623 print $outhandle "Error: failed to execute $wget_cmd\n";
624 }
625 }
626 else {
627 # not time-stamping and file already exists
628 $status=0;
629 $self->{'lom_srcdoc'} = $full_file_url;
630 }
631
632 return $status==0;
633
634}
635
636
637sub check_for_identifier
638{
639 my $self = shift(@_);
640 my ($full_mname,$md_content) = @_;
641
642 my $success = 0;
643
644 my $download_re = $self->{'download_srcdocs'};
645 if (($download_re ne "") && $md_content =~ m/$download_re/) {
646
647 if ($self->found_specific_identifier("general^identifier^entry",$full_mname,$md_content)) {
648 $success = $self->download_srcdoc($md_content);
649 }
650
651 if (!$success) {
652 if ($self->found_specific_identifier("technical^location",$full_mname,$md_content)) {
653 $success = $self->download_srcdoc($md_content);
654 }
655 }
656 }
657
658 return $success;
659}
660
661
662sub Text {
663 if ($_ !~ m/^\s*$/) {
664 #
665 # Work out indentations and line wraps for raw XML
666 #
667 my $xml_depth = scalar(@{$self->{'metaname_stack'}})+1;
668 my $indent = "&nbsp;&nbsp;" x $xml_depth;
669
670 my $formatted_text = "\n".$_;
671
672 # break into lines < 80 chars on space
673 $formatted_text =~ s/(.{50,80})\s+/$1\n/mg;
674 $formatted_text =~ s/^/$indent/mg;
675 ## $formatted_text =~ s/\s+$//s;
676
677 $self->{'raw_text'} .= $formatted_text;
678 }
679
680 my $metaname_stack = $self->{'metaname_stack'};
681 if (($_ !~ /^\s*$/) && (scalar(@$metaname_stack)>0)) {
682
683 my ($without_dt_stack,$innermost_element,$md_content)
684 = $self->process_datatype_info($metaname_stack,$_);
685
686 $self->pretty_print_table_tr($without_dt_stack);
687
688 my $full_mname = join('^',@{$without_dt_stack});
689 $self->set_filere_metadata(lc($full_mname),$md_content);
690
691 $self->check_for_language($innermost_element,$md_content);
692 $self->check_for_identifier($full_mname,$md_content); # source doc
693
694 $self->reset_datatype_info();
695 }
696}
697
698# This Char function overrides the one in XML::Parser::Stream to overcome a
699# problem where $expat->{Text} is treated as the return value, slowing
700# things down significantly in some cases.
701sub Char {
702 $_[0]->{'Text'} .= $_[1];
703 return undef;
704}
705
7061;
Note: See TracBrowser for help on using the repository browser.