Changeset 9958
- Timestamp:
- 2005-05-25T17:31:13+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/OAIPlug.pm
r9853 r9958 32 32 use parsargv; 33 33 34 use XMLPlug; 35 34 36 sub BEGIN { 35 @OAIPlug::ISA = ('BasPlug'); 36 } 37 @OAIPlug::ISA = ('XMLPlug'); 38 } 39 37 40 38 41 my $arguments = … … 52 55 sub new { 53 56 my $class = shift (@_); 54 my $self = new BasPlug ($class, @_);57 my $self = new XMLPlug ($class, @_); 55 58 $self->{'plugin_type'} = "OAIPlug"; 56 59 # 14-05-02 To allow for proper inheritance of arguments - John Thompson … … 75 78 } 76 79 80 sub xml_start_document { 81 $self->{'in_metadata_node'} = 0; 82 $self->{'rawxml'} = ""; 83 } 84 85 sub xml_end_document { 86 } 87 88 sub xml_doctype { 89 my $self = shift(@_); 90 91 my ($expat, $name, $sysid, $pubid, $internal) = @_; 92 93 # allow the short-lived and badly named "GreenstoneArchive" files to be processed 94 # as well as the "Archive" files which should now be created by import.pl 95 die "" if ($name !~ /^OAI-PMH$/); 96 97 my $outhandle = $self->{'outhandle'}; 98 print $outhandle "OAIPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1; 99 print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'}; 100 101 } 102 103 104 sub xml_start_tag { 105 my $self = shift(@_); 106 my ($expat,$element) = @_; 107 108 my %attr_hash = %_; 109 110 my $attr = ""; 111 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash; 112 113 $self->{'rawxml'} .= "<$element$attr>"; 114 115 if ($element eq "metadata") { 116 $self->{'in_metadata_node'} = 1; 117 $self->{'metadata_xml'} = ""; 118 } 119 120 if ($self->{'in_metadata_node'}) { 121 $self->{'metadata_xml'} .= "<$element$attr>"; 122 } 123 } 124 125 sub xml_end_tag { 126 my $self = shift(@_); 127 my ($expat, $element) = @_; 128 129 $self->{'rawxml'} .= "</$element>"; 130 131 if ($self->{'in_metadata_node'}) { 132 $self->{'metadata_xml'} .= "</$element>"; 133 } 134 135 if ($element eq "metadata") { 136 my $textref = \$self->{'metadata_xml'}; 137 my $metadata = $self->{'metadata'}; 138 $self->extract_oai_metadata($textref,$metadata); 139 140 $self->{'in_metadata_node'} = 0; 141 } 142 143 144 } 145 146 sub xml_text { 147 my $self = shift(@_); 148 my ($expat) = @_; 149 150 $self->{'rawxml'} .= $_; 151 152 if ($self->{'in_metadata_node'}) { 153 $self->{'metadata_xml'} .= $_; 154 } 155 } 156 157 158 77 159 78 160 sub read { … … 87 169 88 170 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/)); 171 172 if ($self->SUPER::read(@_)) { 173 174 # Do encoding stuff 175 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 89 176 90 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) { 91 $self->{'num_blocked'} ++; 92 return 0; 93 } 94 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 177 my $url_array = $metadata->{'URL'}; 178 my $num_urls = scalar(@$url_array); 179 180 my $srcdoc_exists = 0; 181 my $srcdoc_pos = 0; 182 my $filename_dir = &util::filename_head($filename); 183 184 for (my $i=0; $i<$num_urls; $i++) { 185 186 if ($url_array->[$i] !~ m/^(http|ftp):/) { 187 188 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]); 189 190 if (-e $src_filename) { 191 $srcdoc_pos = $i; 192 $srcdoc_exists = 1; 193 } 194 } 195 } 196 197 if ($srcdoc_exists) 198 { 199 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n" 200 if ($self->{'verbosity'}>1); 201 202 203 # Make pretty print metadata table stick with src filename 204 my $ppmd_table = $self->{'ppmd_table'}; 205 $metadata->{'prettymd'} = [ $ppmd_table ]; 206 $self->{'ppmd_table'} = undef; 207 208 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0], 209 $metadata, $processor, $maxdocs, $total_count, $gli); 210 } 211 else 212 { 213 # create a new document 214 my $doc_obj = new doc ($filename, "indexed_doc"); 215 my $top_section = $doc_obj->get_top_section; 216 my $plugin_type = $self->{'plugin_type'}; 217 218 $doc_obj->add_utf8_metadata($top_section, "Language", $language); 219 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 220 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type); 221 $doc_obj->add_metadata($top_section, "FileFormat", "OAI"); 222 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename)); 223 224 # include any metadata passed in from previous plugins 225 # note that this metadata is associated with the top level section 226 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 227 228 # do plugin specific processing of doc_obj 229 my $textref = \$self->{'rawxml'}; 230 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) { 231 print STDERR "<ProcessingError n='$file'>\n" if ($gli); 232 return -1; 233 } 234 235 # do any automatic metadata extraction 236 $self->auto_extract_metadata ($doc_obj); 237 238 # add an OID 239 $doc_obj->set_OID(); 240 241 my $ppmd_table = $self->{'ppmd_table'}; 242 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table); 243 $self->{'ppmd_table'} = undef; 244 245 # process the document 246 $processor->process($doc_obj); 247 248 $self->{'num_processed'} ++; 249 250 return 1; # processed the file 251 } 252 } 253 else { 95 254 return undef; 96 }97 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up98 99 # Do encoding stuff100 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);101 102 ####103 # Above code exactly the same as in BasPlug104 # => consider making supporting function?105 ###106 107 # read in file ($text will be in utf8)108 my $text = "";109 $self->read_file ($filename, $encoding, $language, \$text);110 111 if (!length ($text)) {112 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};113 return 0;114 }115 116 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);117 print $outhandle "OAIPlug: extracting metadata from $file\n"118 if ($self->{'verbosity'}>1);119 120 $self->extract_oai_metadata(\$text,$metadata);121 122 my $url_array = $metadata->{'URL'};123 124 if (defined $url_array && ($url_array->[0] !~ m/^http:/))125 {126 ## my $source_file = &util::filename_cat($base_dir, $file);127 128 my $url_base_dir = &util::filename_head($filename);129 130 ## print STDERR "*** url base dir = $url_base_dir/$url_array->[0]\n";131 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"132 if ($self->{'verbosity'}>1);133 134 return &plugin::read ($pluginfo, $url_base_dir, $url_array->[0],135 $metadata, $processor, $maxdocs, $total_count, $gli);136 }137 else138 {139 # create a new document140 my $doc_obj = new doc ($filename, "indexed_doc");141 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);142 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);143 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");144 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "OAI");145 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));146 147 148 # include any metadata passed in from previous plugins149 # note that this metadata is associated with the top level section150 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);151 152 153 # do plugin specific processing of doc_obj154 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {155 print STDERR "<ProcessingError n='$file'>\n" if ($gli);156 return -1;157 }158 159 # do any automatic metadata extraction160 $self->auto_extract_metadata ($doc_obj);161 162 # add an OID163 $doc_obj->set_OID();164 165 # process the document166 $processor->process($doc_obj);167 168 return 1; # processed the file169 255 } 170 256 } … … 199 285 200 286 287 # Improvement is to merge this with newer version in MetadataPass 288 289 sub open_prettyprint_metadata_table 290 { 291 my $self = shift(@_); 292 293 my $att = "width=100% cellspacing=2"; 294 my $style = "style=\'border-bottom: 4px solid #000080\'"; 295 296 $self->{'ppmd_table'} = "\n<table $att $style>"; 297 } 298 299 sub add_prettyprint_metadata_line 300 { 301 my $self = shift(@_); 302 my ($metaname, $metavalue_utf8) = @_; 303 304 $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/; 305 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8); 306 307 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n"; 308 $self->{'ppmd_table'} .= " <td width=30%>\n"; 309 $self->{'ppmd_table'} .= " $metaname\n"; 310 $self->{'ppmd_table'} .= " </td>\n"; 311 $self->{'ppmd_table'} .= " <td>\n"; 312 $self->{'ppmd_table'} .= " $metavalue_utf8\n"; 313 $self->{'ppmd_table'} .= " </td>\n"; 314 $self->{'ppmd_table'} .= " </tr>\n"; 315 316 } 317 318 sub close_prettyprint_metadata_table 319 { 320 my $self = shift(@_); 321 322 $self->{'ppmd_table'} .= "</table>\n"; 323 } 324 325 326 201 327 202 328 sub extract_oai_metadata { … … 205 331 my $outhandle = $self->{'outhandle'}; 206 332 207 208 if ($$textref =~ m/<metadata>(.*?)<\/metadata>/s) 333 # Only handles DC metadata 334 335 $self->open_prettyprint_metadata_table(); 336 337 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s) 209 338 { 210 339 $metadata_text = $1; … … 215 344 # if URL given for document as identifier metadata, store it ... 216 345 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 346 217 347 my $metaname = $1; 218 348 my $metavalue = $2; … … 237 367 } 238 368 239 369 $self->add_prettyprint_metadata_line($metaname, $metavalue); 370 240 371 } 241 372 } 373 374 $self->close_prettyprint_metadata_table(); 242 375 } 243 376
Note:
See TracChangeset
for help on using the changeset viewer.