source: gsdl/trunk/perllib/plugins/LOMPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1###########################################################################
2#
3# LOMPlugin.pm -- plugin for import the collection from LOM
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2005 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27### Note this plugin currently can't download source documents from outside if you are behind a firewall.
28
29package LOMPlugin;
30
31use ReadTextFile;
32use MetadataPass;
33use XMLParser;
34use Cwd;
35
36sub BEGIN {
37 @ISA = ('ReadTextFile', 'MetadataPass');
38}
39
40use strict; # every perl program should have this!
41no strict 'refs'; # make an exception so we can use variables as filehandles
42
43
44my $arguments =
45 [ { 'name' => "process_exp",
46 'desc' => "{ReadTextFile.process_exp}",
47 'type' => "string",
48 'deft' => &get_default_process_exp(),
49 'reqd' => "no" },
50 { 'name' => "root_tag",
51 'desc' => "{LOMPlugin.root_tag}",
52 'type' => "regexp",
53 'deft' => q/^(?i)lom$/,
54 'reqd' => "no" },
55 { 'name' => "check_timestamp",
56 'desc' => "{LOMPlugin.check_timestamp}",
57 'type' => "flag" },
58 { 'name' => "download_srcdocs",
59 'desc' => "{LOMPlugin.download_srcdocs}",
60 'type' => "regexp",
61 'deft' => "",
62 'reqd' => "no" }];
63
64my $options = { 'name' => "LOMPlugin",
65 'desc' => "{LOMPlugin.desc}",
66 'inherits' => "yes",
67 'args' => $arguments };
68
69
70
71my ($self);
72sub new {
73 my $class = shift (@_);
74 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
75 push(@$pluginlist, $class);
76
77 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
78 push(@{$hashArgOptLists->{"OptList"}},$options);
79
80 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
81
82 if ($self->{'info_only'}) {
83 # don't worry about creating the XML parser as all we want is the
84 # list of plugin options
85 return bless $self, $class;
86 }
87
88 #create XML::Parser object for parsing dublin_core.xml files
89 my $parser = new XML::Parser('Style' => 'Stream',
90 'Handlers' => {'Char' => \&Char,
91 'Doctype' => \&Doctype
92 });
93 $self->{'parser'} = $parser;
94
95 $self->{'extra_blocks'} = {};
96
97 return bless $self, $class;
98}
99
100sub get_default_process_exp {
101 my $self = shift (@_);
102
103 return q^(?i)\.xml$^;
104}
105
106
107
108
109sub metadata_read {
110 my $self = shift (@_);
111 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
112
113 my $outhandle = $self->{'outhandle'};
114
115 my $filename = $file;
116 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
117
118 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
119 return undef; # can't recognise
120 }
121 if (!$self->check_doctype($filename)) {
122 # this file is not for us
123 return undef;
124 }
125
126 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
127
128 print $outhandle "LOMPlugin: extracting metadata from $file\n"
129 if $self->{'verbosity'} > 1;
130
131 my ($dir,$tail) = $filename =~ /^(.*?)([^\/\\]*)$/;
132 $self->{'output_dir'} = $dir;
133
134 eval {
135 $self->{'parser'}->parsefile($filename);
136 };
137
138 if ($@) {
139 print $outhandle "LOMPlugin: skipping $filename as not conformant to LOM syntax\n" if ($self->{'verbosity'} > 1);
140 print $outhandle "\n Perl Error:\n $@\n" if ($self->{'verbosity'}>2);
141 return 0;
142 }
143
144 $self->{'output_dir'} = undef;
145
146 my $file_re;
147 my $lom_srcdoc = $self->{'lom_srcdoc'};
148
149 if (defined $lom_srcdoc) {
150 my $dirsep = &util::get_re_dirsep();
151 $lom_srcdoc =~ s/^$base_dir($dirsep)//;
152 $self->{'extra_blocks'}->{$file}++;
153
154 $file_re = $lom_srcdoc;
155 }
156 else {
157 $file_re = $tail;
158 }
159 $file_re =~ s/\./\\\./g;
160
161 $self->{'lom_srcdoc'} = undef; # reset for next file to be processed
162
163 push(@$extrametakeys,$file_re);
164 $extrametadata->{$file_re} = $self->{'saved_metadata'};
165
166 return 1;
167}
168
169sub check_doctype {
170 $self = shift (@_);
171
172 my ($filename) = @_;
173
174 if (open(XMLIN,"<$filename")) {
175 my $doctype = $self->{'root_tag'};
176 ## check whether the doctype has the same name as the root element tag
177 while (defined (my $line = <XMLIN>)) {
178 ## find the root element
179 if ($line =~ /<([\w\d:]+)[\s>]/){
180 my $root = $1;
181 if ($root !~ $doctype){
182 close(XMLIN);
183 return 0;
184 }
185 else {
186 close(XMLIN);
187 return 1;
188 }
189 }
190 }
191 close(XMLIN);
192 }
193
194 return undef; # haven't found a valid line
195
196}
197
198sub read_file {
199 my $self = shift (@_);
200 my ($filename, $encoding, $language, $textref) = @_;
201
202 my $metadata_table = $self->{'metadata_table'};
203
204 my $rawtext = $metadata_table->{'rawtext'};
205
206 delete $metadata_table->{'rawtext'};
207
208 $$textref = $rawtext;
209}
210
211sub read {
212 my $self = shift (@_);
213 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
214
215 my $outhandle = $self->{'outhandle'};
216
217 return 0 if (defined $self->{'extra_blocks'}->{$file});
218
219 # need to check whether this file is for us
220 my ($block_status,$filename) = $self->read_block(@_);
221 return $block_status if ((!defined $block_status) || ($block_status==0));
222 if (!$self->check_doctype($filename)) {
223 # this file is not for us
224 return undef;
225 }
226
227 $self->{'metadata_table'} = $metadata;
228
229 my $lom_language = $metadata->{'lom_language'};
230
231 my $store_input_encoding;
232 my $store_extract_language;
233 my $store_default_language;
234 my $store_default_encoding;
235
236 if (defined $lom_language) {
237 delete $metadata->{'lom_language'};
238
239 $store_input_encoding = $self->{'input_encoding'};
240 $store_extract_language = $self->{'extract_language'};
241 $store_default_language = $self->{'default_language'};
242 $store_default_encoding = $self->{'default_encoding'};
243
244 $self->{'input_encoding'} = "utf8";
245 $self->{'extract_language'} = 0;
246 $self->{'default_language'} = $lom_language;
247 $self->{'default_encoding'} = "utf8";
248 }
249
250 my $rv = $self->SUPER::read(@_);
251
252 if (defined $lom_language) {
253 $self->{'input_encoding'} = $store_input_encoding;
254 $self->{'extract_language'} = $store_extract_language;
255 $self->{'default_language'} = $store_default_language;
256 $self->{'default_encoding'} = $store_default_encoding;
257 }
258
259 $self->{'metadata_table'} = undef;
260
261 return $rv;
262}
263
264# do plugin specific processing of doc_obj
265sub process {
266 my $self = shift (@_);
267 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
268 my $outhandle = $self->{'outhandle'};
269
270 print STDERR "<Processing n='$file' p='LOMPlugin'>\n" if ($gli);
271
272 print $outhandle "LOMPLug: processing $file\n";
273
274 my $cursection = $doc_obj->get_top_section();
275 $doc_obj->add_utf8_text($cursection, $$textref);
276
277 return 1;
278}
279
280sub Doctype {
281 my ($expat, $name, $sysid, $pubid, $internal) = @_;
282
283 my $root_tag = $self->{'root_tag'};
284
285 if ($name !~ /$root_tag/) {
286 die "Root tag $name does not match regular expression $root_tag";
287 }
288}
289
290sub StartTag {
291 my ($expat, $element) = @_;
292
293 my %attr = %_;
294
295 my $raw_tag = "&lt;$element";
296 map { $raw_tag .= " $_=\"$attr{$_}\""; } keys %attr;
297 $raw_tag .= "&gt;";
298
299 if ($element =~ m/$self->{'root_tag'}/) {
300 $self->{'raw_text'} = $raw_tag;
301
302 $self->{'saved_metadata'} = {};
303 $self->{'metaname_stack'} = [];
304 $self->{'lom_datatype'} = "";
305 $self->{'lom_language'} = undef;
306 $self->{'metadatatext'} = "<table class=\"metadata\" width=\"_pagewidth_\" >\n";
307 }
308 else {
309 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
310 $self->{'raw_text'} .= "\n";
311 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
312 $self->{'raw_text'} .= $raw_tag;
313
314 my $metaname_stack = $self->{'metaname_stack'};
315 push(@$metaname_stack,$element);
316 if (scalar(@$metaname_stack)==1) {
317 # top level LOM category
318 my $style = "class=\"metadata\"";
319 my $open_close
320 = "<a id=\"${element}opencloselink\" href=\"javascript:hideTBodyArea('$element')\">\n";
321 $open_close
322 .= "<img id=\"${element}openclose\" border=\"0\" src=\"_httpopenmdicon_\"></a>\n";
323
324 my $header_line = " <tr $style ><th $style colspan=\"3\">$open_close \u$element</th></tr>\n";
325 my $md_tbody = "<tbody id=\"$element\">\n";
326
327 $self->{'mdheader'} = $header_line;
328 $self->{'mdtbody'} = $md_tbody;
329 $self->{'mdtbody_text'} = "";
330 }
331 }
332}
333
334sub EndTag {
335 my ($expat, $element) = @_;
336
337 my $raw_tag = "&lt;/$element&gt;";
338
339 if ($element =~ m/$self->{'root_tag'}/) {
340 $self->{'raw_text'} .= $raw_tag;
341
342 my $metadatatext = $self->{'metadatatext'};
343 $metadatatext .= "</table>";
344
345 my $raw_text = $self->{'raw_text'};
346
347 $self->{'saved_metadata'}->{'MetadataTable'} = $metadatatext;
348 $self->{'metadatatext'} = "";
349
350 $self->{'saved_metadata'}->{'rawtext'} = $raw_text;
351 $self->{'raw_text'} = "";
352
353 if (defined $self->{'lom_language'}) {
354 $self->{'saved_metadata'}->{'lom_language'} = $self->{'lom_language'};
355 $self->{'lom_language'} = undef;
356 }
357 }
358 else {
359 my $metaname_stack = $self->{'metaname_stack'};
360
361 if (scalar(@$metaname_stack)==1) {
362 my $header_line = $self->{'mdheader'};
363 my $tbody_start = $self->{'mdtbody'};
364 my $tbody_text = $self->{'mdtbody_text'};
365 if ($tbody_text !~ m/^\s*$/s) {
366 my $tbody_end = "</tbody>\n";
367 my $table_chunk
368 = $header_line.$tbody_start.$tbody_text.$tbody_end;
369
370 $self->{'metadatatext'} .= $table_chunk;
371 }
372 $self->{'mdtheader'} = "";
373 $self->{'mdtbody'} = "";
374 $self->{'mdtbody_text'} = "";
375 }
376
377 pop(@$metaname_stack);
378
379 my $xml_depth = scalar(@{$self->{'metaname_stack'}});
380 $self->{'raw_text'} .= "\n";
381 $self->{'raw_text'} .= "&nbsp;&nbsp;" x $xml_depth;
382 $self->{'raw_text'} .= $raw_tag;
383 }
384}
385
386sub process_datatype_info
387{
388 my $self = shift(@_);
389 my ($metaname_stack,$md_content) = @_;
390
391 my @without_dt_stack = @$metaname_stack; # without datatype stack
392
393 my $innermost_element = $without_dt_stack[$#without_dt_stack];
394
395 # Loose last item if encoding datatype information
396 if ($innermost_element =~ m/^(lang)?string$/) {
397 $self->{'lom_datatype'} = $innermost_element;
398
399 pop @without_dt_stack;
400 $innermost_element = $without_dt_stack[$#without_dt_stack];
401 }
402 elsif ($innermost_element =~ m/^date(Time)?$/i) {
403 if ($innermost_element =~ m/^date$/i) {
404 $self->{'lom_datatype'} = "dateTime";
405 }
406 else {
407 $self->{'lom_datatype'} = $innermost_element;
408
409 pop @without_dt_stack;
410 $innermost_element = $without_dt_stack[$#without_dt_stack];
411 }
412
413 if ($md_content =~ m/^(\d{1,2})\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s*(\d{4})/i) {
414 my ($day,$mon,$year) = ($1,$2,$3);
415
416 my %month_lookup = ( 'jan' => 1, 'feb' => 2, 'mar' => 3,
417 'apr' => 4, 'may' => 5, 'jun' => 6,
418 'jul' => 7, 'aug' => 8, 'sep' => 9,
419 'oct' => 10, 'nov' => 11, 'dec' => 12 );
420
421 my $mon_num = $month_lookup{lc($mon)};
422
423 $md_content = sprintf("%d%02d%02d",$year,$mon_num,$day);
424 }
425
426 $md_content =~ s/\-//g;
427 }
428
429 if ($innermost_element eq "source") {
430 $self->{'lom_source'} = $md_content;
431 }
432 elsif ($innermost_element eq "value") {
433 $self->{'lom_value'} = $md_content;
434 }
435
436 return (\@without_dt_stack,$innermost_element,$md_content);
437}
438
439sub reset_datatype_info
440{
441 my $self = shift(@_);
442
443 $self->{'lom_datatype'} = "";
444}
445
446
447sub pretty_print_text
448{
449 my $self = shift(@_);
450
451 my ($pretty_print_text) = @_;
452
453## $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
454 $pretty_print_text = &util::hyperlink_text($pretty_print_text);
455
456#### $pretty_print_text =~ s/(BEGIN:vCard.*END:vCard)/<pre>$1<\/pre>/sg;
457
458 if ($self->{'lom_datatype'} eq "dateTime") {
459 if ($pretty_print_text =~ m/^(\d{4})(\d{2})(\d{2})$/) {
460 $pretty_print_text = "$1-$2-$3";
461 }
462 }
463
464 return $pretty_print_text;
465}
466
467sub pretty_print_table_tr
468{
469 my $self = shift (@_);
470 my ($without_dt_stack) = @_;
471
472 my $style = "class=\"metadata\"";
473
474 my $innermost_element = $without_dt_stack->[scalar(@$without_dt_stack)-1];
475 my $outermost_element = $without_dt_stack->[0];
476
477 # Loose top level stack item (already named in pretty print table)
478 my @pretty_print_stack = @$without_dt_stack;
479 shift @pretty_print_stack;
480
481 if ($innermost_element eq "source") {
482 return if (!defined $self->{'lom_value'});
483 }
484
485 if ($innermost_element eq "value") {
486 return if (!defined $self->{'lom_source'});
487 }
488
489 my $pretty_print_text = "";
490
491 if (($innermost_element eq "value") || ($innermost_element eq "source")) {
492 my $source = $self->{'lom_source'};
493 my $value = $self->pretty_print_text($self->{'lom_value'});
494
495 $self->{'lom_source'} = undef;
496 $self->{'lom_value'} = undef;
497
498 pop @pretty_print_stack;
499
500 $pretty_print_text = "<td $style>$source</td><td $style>$value</td>";
501 }
502 else {
503 $pretty_print_text = $self->pretty_print_text($_);
504 $pretty_print_text = "<td $style colspan=2>$pretty_print_text</td>";
505 }
506 my $pretty_print_fmn = join(' : ',map { "\u$_"; } @pretty_print_stack);
507
508
509 # my $tr_attr = "id=\"$outermost_element\" style=\"display:block;\"";
510 my $tr_attr = "$style id=\"$outermost_element\"";
511
512 my $mdtext_line = " <tr $tr_attr><td $style><nobr>$pretty_print_fmn</nobr></td>$pretty_print_text</tr>\n";
513 $self->{'mdtbody_text'} .= $mdtext_line;
514}
515
516
517sub check_for_language
518{
519 my $self = shift(@_);
520 my ($innermost_element,$md_content) = @_;
521
522 # Look for 'language' tag
523 if ($innermost_element eq "language") {
524 my $lom_lang = $self->{'lom_language'};
525
526 if (defined $lom_lang) {
527 my $new_lom_lang = $md_content;
528 $new_lom_lang =~ s/-.*//; # remove endings like -US or -GB
529
530 if ($lom_lang ne $new_lom_lang) {
531 my $outhandle = $self->{'outhandle'};
532
533 print $outhandle "Warning: Conflicting general language in record\n";
534 print $outhandle " $new_lom_lang (previous value for language = $lom_lang)\n";
535 }
536 # otherwise, existing value OK => do nothing
537 }
538 else {
539 $lom_lang = $md_content;
540 $lom_lang =~ s/-.*//; # remove endings like -US or -GB
541
542 $self->{'lom_language'} = $lom_lang;
543 }
544 }
545}
546
547sub found_specific_identifier
548{
549 my $self = shift(@_);
550 my ($specific_id,$full_mname,$md_content) = @_;
551
552 my $found_id = 0;
553 if ($full_mname eq $specific_id) {
554 if ($md_content =~ m/^(http|ftp):/) {
555 $found_id = 1;
556 }
557 }
558
559 return $found_id;
560}
561
562sub download_srcdoc
563{
564 my $self = shift(@_);
565 my ($doc_url) = @_;
566
567 my $outhandle = $self->{'outhandle'};
568 my $output_dir = $self->{'output_dir'};
569
570 $output_dir = &util::filename_cat($output_dir,"_gsdldown.all");
571
572 if (! -d $output_dir) {
573 mkdir $output_dir;
574 }
575
576 my $re_dirsep = &util::get_re_dirsep();
577 my $os_dirsep = &util::get_dirsep();
578
579 my $file_url = $doc_url;
580 $file_url =~ s/$re_dirsep/$os_dirsep/g;
581 $file_url =~ s/^(http|ftp):\/\///;
582 $file_url .= "index.html" if ($file_url =~ m/\/$/);
583
584 my $full_file_url = &util::filename_cat($output_dir,$file_url);
585
586 my $check_timestamp = $self->{'check_timestamp'};
587 my $status;
588
589 if (($check_timestamp) || (!$check_timestamp && !-e $full_file_url)) {
590 if (!-e $full_file_url) {
591 print $outhandle "Mirroring $doc_url\n";
592 }
593 else {
594 print $outhandle "Checking to see if update needed for $doc_url\n";
595 }
596
597 # on linux, if we pass an absolute path as -P arg to wget, then it
598 # stuffs up the
599 # URL rewriting in the file. Need a relative path or none, so now
600 # we change working directory first.
601 my $changed_dir = 0;
602 my $current_dir = cwd();
603 my $wget_cmd = "";
604 if ($ENV{'GSDLOS'} ne "windows") {
605 $changed_dir = 1;
606
607 chdir "$output_dir";
608 $wget_cmd = "wget -nv --timestamping -k -p \"$doc_url\"";
609 } else {
610 $wget_cmd = "wget -nv -P \"$output_dir\" --timestamping -k -p \"$doc_url\"";
611 }
612 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -k -p \"$doc_url\"";
613 ### my $full_dir_url = &File::Basename::dirname($full_file_url);
614
615 ## my $wget_cmd = "wget -O $full_file_url --timestamping -k \"$doc_url\"";
616 #my $wget_cmd = "wget -nv -P$output_dir --timestamping -x -k \"$doc_url\"";
617 print STDERR "**** wget = $wget_cmd\n";
618
619
620 $status = system($wget_cmd);
621 if ($changed_dir) {
622 chdir $current_dir;
623 }
624 if ($status==0) {
625 $self->{'lom_srcdoc'} = $full_file_url;
626 }
627 else {
628 $self->{'lom_srcdoc'} = undef;
629 print $outhandle "Error: failed to execute $wget_cmd\n";
630 }
631 }
632 else {
633 # not time-stamping and file already exists
634 $status=0;
635 $self->{'lom_srcdoc'} = $full_file_url;
636 }
637
638 return $status==0;
639
640}
641
642
643sub check_for_identifier
644{
645 my $self = shift(@_);
646 my ($full_mname,$md_content) = @_;
647
648 my $success = 0;
649
650 my $download_re = $self->{'download_srcdocs'};
651 if (($download_re ne "") && $md_content =~ m/$download_re/) {
652
653 if ($self->found_specific_identifier("general^identifier^entry",$full_mname,$md_content)) {
654 $success = $self->download_srcdoc($md_content);
655 }
656
657 if (!$success) {
658 if ($self->found_specific_identifier("technical^location",$full_mname,$md_content)) {
659 $success = $self->download_srcdoc($md_content);
660 }
661 }
662 }
663
664 return $success;
665}
666
667
668sub Text {
669 if ($_ !~ m/^\s*$/) {
670 #
671 # Work out indentations and line wraps for raw XML
672 #
673 my $xml_depth = scalar(@{$self->{'metaname_stack'}})+1;
674 my $indent = "&nbsp;&nbsp;" x $xml_depth;
675
676 my $formatted_text = "\n".$_;
677
678 # break into lines < 80 chars on space
679 $formatted_text =~ s/(.{50,80})\s+/$1\n/mg;
680 $formatted_text =~ s/^/$indent/mg;
681 ## $formatted_text =~ s/\s+$//s;
682
683 $self->{'raw_text'} .= $formatted_text;
684 }
685
686 my $metaname_stack = $self->{'metaname_stack'};
687 if (($_ !~ /^\s*$/) && (scalar(@$metaname_stack)>0)) {
688
689 my ($without_dt_stack,$innermost_element,$md_content)
690 = $self->process_datatype_info($metaname_stack,$_);
691
692 $self->pretty_print_table_tr($without_dt_stack);
693
694 my $full_mname = join('^',@{$without_dt_stack});
695 $self->set_filere_metadata(lc($full_mname),$md_content);
696
697 $self->check_for_language($innermost_element,$md_content);
698 $self->check_for_identifier($full_mname,$md_content); # source doc
699
700 $self->reset_datatype_info();
701 }
702}
703
704# This Char function overrides the one in XML::Parser::Stream to overcome a
705# problem where $expat->{Text} is treated as the return value, slowing
706# things down significantly in some cases.
707sub Char {
708 $_[0]->{'Text'} .= $_[1];
709 return undef;
710}
711
7121;
Note: See TracBrowser for help on using the repository browser.