Changeset 7902


Ignore:
Timestamp:
2004-08-06T16:34:02+12:00 (20 years ago)
Author:
chi
Message:

Saving of documents (in archive format) extended to generate METS format
as alternative to GreenstoneArchive (GA) format. Controlled through
'import.pl -saveas METS ...'

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/doc.pm

    r7798 r7902  
    216216}
    217217
     218sub buffer_txt_section_xml {
     219    my $self = shift(@_);
     220    my ($section) = @_;
     221 
     222    my $section_ptr = $self->_lookup_section ($section);
     223   
     224    return "" unless defined $section_ptr;
     225   
     226    my $all_text = "<Section>\n";
     227   
     228    ##output the text
     229    #$all_text .= "   <Content>";
     230    $all_text .= &_escape_text($section_ptr->{'text'});
     231    #$all_text .= "   </Content>\n";
     232   
     233   
     234    #output all the subsections
     235    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
     236       $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
     237     }
     238
     239     $all_text .= "</Section>\n";
     240
     241     
     242     $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
     243     return $all_text;
     244}
     245
     246sub buffer_mets_fileSection_section_xml() {
     247    my $self = shift(@_);
     248    my ($section, $doc_Dir) = @_;
     249
     250    $section="" unless defined $section;
     251   
     252
     253    my $section_ptr=$self->_lookup_section($section);
     254    return "" unless defined $section_ptr;
     255 
     256
     257    #**output fileSection by sections
     258    my $section_num ="1". $section;
     259 
     260   
     261    my $filePath = $doc_Dir . '/doctxt.xml';
     262   
     263    #**output the fileSection details
     264    my $all_text = '  <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
     265    $all_text .= '    <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
     266    $all_text .= '      <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
     267   
     268    my $xpath = "1".$section;
     269 
     270    $xpath =~ s/\./]\/Section[/g;
     271   
     272    $all_text .=  $xpath;
     273
     274    $all_text .= ']/text())" ID="FILE'. $section_num. '" />' . "\n";
     275    $all_text .= "    </mets:file>\n";
     276    $all_text .= "  </mets:fileGrp>\n";
     277
     278
     279    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
     280       $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$doc_Dir);
     281    }
     282   
     283    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
     284
     285    return $all_text;
     286}
     287
     288sub buffer_mets_fileWhole_section_xml(){
     289    my $self = shift(@_);
     290    my ($section) = @_;
     291
     292    my $section_ptr = $self-> _lookup_section($section);
     293    return "" unless defined $section_ptr;
     294   
     295    my $all_text="" unless defined $all_txt;
     296
     297    my ($dirPath)="" unless defined $dirPath;
     298    my $fileID=0;
     299
     300    #** output the fileSection for the whole section
     301    #*** get the sourcefile and associative file
     302
     303    foreach my $data (@{$section_ptr->{'metadata'}}){
     304       my $escaped_value = &_escape_text($data->[1]);
     305       if ($data->[0] eq "gsdlsourcefilename") {
     306          ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
     307         
     308      $all_text .= '  <mets:fileGrp ID="default">'."\n";
     309          ++$fileID;
     310          $all_text .= '    <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
     311          $all_text .= '      <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$data->[1].'" ID="default.'.$fileID.'" />'."\n";
     312
     313          $all_text .= "    </mets:file>\n";
     314       }
     315       
     316       if ($data->[0] eq "gsdlassocfile"){
     317      $escaped_value =~ m/^(.*?):(.*):$/;
     318
     319      my $assfilePath = $dirPath . '/'. $1;
     320          ++$fileID;
     321      $all_text .= '    <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
     322      $all_text .= '      <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" ID="default.'. $fileID.'" />'."\n";
     323         
     324      $all_text .= "    </mets:file>\n";
     325       }
     326    }
     327    $all_text .= "  </mets:fileGrp>\n";
     328                                           
     329                                     
     330    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
     331   
     332    return $all_text;
     333}
     334
     335sub buffer_mets_StruMapSection_section_xml(){
     336    my $self = shift(@_);
     337    my ($section, $order_numref) = @_;
     338
     339    $section="" unless defined $section;
     340   
     341
     342    my $section_ptr=$self->_lookup_section($section);
     343    return "" unless defined $section_ptr;
     344
     345    #**output fileSection by sections
     346    my $section_num ="1". $section;
     347   
     348    #**output the StruMap details
     349 
     350    my $all_text = '  <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
     351    $all_text .= $section_num . '" DMDID="DM'.$section_num.'">'. "\n";
     352   
     353    $all_text .= '    <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
     354
     355
     356    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
     357       $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
     358    }
     359   
     360    $all_text .= "  </mets:div>\n";
     361
     362    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
     363
     364    return $all_text;
     365}
     366
     367
     368sub buffer_mets_StruMapWhole_section_xml(){
     369    my $self = shift(@_);
     370    my ($section) = @_;
     371   
     372    my $section_ptr = $self-> _lookup_section($section);
     373    return "" unless defined $section_ptr;
     374   
     375    my $all_text="" unless defined $all_txt;   
     376    my $fileID=0;
     377
     378    $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
     379    $all_text .= '  <mets:div ID="All" TYPE="Document" ORDER="All" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
     380 
     381
     382    #** output the StruMapSection for the whole section
     383    #*** get the sourcefile and associative file
     384
     385    foreach my $data (@{$section_ptr->{'metadata'}}){
     386       my $escaped_value = &_escape_text($data->[1]);
     387   
     388       if ($data->[0] eq "gsdlsourcefilename") {
     389          ++$fileID;
     390      $all_text .= '    <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
     391       }
     392       
     393       if ($data->[0] eq "gsdlassocfile"){
     394          ++$fileID;
     395      $all_text .= '    <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
     396       }
     397    }
     398    $all_text .= "  </mets:div>\n";
     399   
     400    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
     401   
     402    return $all_text;
     403}
     404
     405
     406sub buffer_mets_dmdSection_section_xml(){
     407    my $self = shift(@_);
     408    my ($section) = @_;
     409   
     410    $section="" unless defined $section;
     411   
     412    my $section_ptr=$self->_lookup_section($section);
     413    return "" unless defined $section_ptr;
     414
     415    #***convert section number
     416    my $section_num ="1". $section;
     417     
     418    #**output the dmdSection details 
     419    my $all_text = '<mets:dmdSec ID="DM'.$section_num.'" GROUPID="'.$section_num.'">'. "\n";
     420    $all_text .= '  <mets:mdWrap MDType="gsdl" ID="'.$section_num.'">'."\n";
     421    $all_text .= "    <mets:xmlData>\n";
     422    foreach my $data (@{$section_ptr->{'metadata'}}){
     423       my $escaped_value = &_escape_text($data->[1]);
     424       $all_text .= '      <gsdl:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl:Metadata>\n";
     425    }
     426    $all_text .= "    </mets:xmlData>\n";
     427    $all_text .= "  </mets:mdWrap>\n";
     428    $all_text .= "</mets:dmdSec>\n";
     429
     430    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
     431       $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
     432    }
     433   
     434    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
     435
     436    return $all_text;
     437}
     438
    218439sub output_section {
    219440    my $self = shift (@_);
     
    222443    print $handle $self->buffer_section_xml($section);
    223444}
     445
     446
     447#*** print out doctxt.xml file
     448sub output_txt_section {
     449    my $self = shift (@_);
     450    my ($handle, $section) = @_;
     451
     452    print $handle $self->buffer_txt_section_xml($section);
     453}
     454
     455#*** print out docmets.xml file
     456sub output_mets_section {
     457    my $self = shift(@_);
     458    my ($handle, $section, $doc_Dir) = @_;
     459   
     460    #***print out the dmdSection
     461    print $handle $self->buffer_mets_dmdSection_section_xml($section);
     462
     463    #***print out the fileSection by sections
     464    print $handle "<mets:fileSec>\n";
     465    print $handle $self->buffer_mets_fileSection_section_xml($section,$doc_Dir);
     466    #***print out the whole fileSection
     467    print $handle $self->buffer_mets_fileWhole_section_xml($section);
     468    print $handle "</mets:fileSec>\n";
     469   
     470    #***print out the StruMapSection by sections
     471    print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
     472    my $order_num=0;
     473    print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
     474    print $handle "</mets:structMap>\n";
     475    print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
     476    print $handle "</mets:structMap>\n";
     477}
     478
    224479
    225480# look up the reference to a particular section
  • trunk/gsdl/perllib/docproc.pm

    r2327 r7902  
    5757}
    5858
     59sub set_saveas {
     60    my $self = shift (@_);
     61    my ($saveas) = @_;
     62
     63    $self->{'saveas'} = $saveas;
     64
     65}
     66
    5967
    60681;
    6169
     70
     71
     72
     73
     74
     75
  • trunk/gsdl/perllib/docsave.pm

    r3834 r7902  
    9696    my $OID = $doc_obj->get_OID();
    9797    $OID = "NULL" unless defined $OID;
    98    
     98
     99
    99100    # get document's directory
    100101    my $doc_dir = $self->get_doc_dir ($OID);
    101    
     102     
     103   
    102104    # copy all the associated files, add this information as metadata
    103105    # to the document
     
    106108    my $doc_file
    107109        = &util::filename_cat ($self->{'archive_dir'}, $doc_dir, "doc.xml");
    108     my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
    109    
     110     
     111        #***define doctxt.xml file
     112        my $doc_txt_file
     113            = &util::filename_cat ($self->{'archive_dir'}, $doc_dir,"doctxt.xml");
     114    my $working_dir
     115            =&util::filename_cat ($self->{'archive_dir'}, $doc_dir);
     116       
     117        #***define docmets.xmlfile
     118        my $doc_mets_file
     119            = &util::filename_cat ($self->{'archive_dir'},$doc_dir, "docmets.xml");
     120 
     121        my $short_doc_file = &util::filename_cat ($doc_dir, "doc.xml");
     122    #my $short_txt_doc_file=&util::filename_cat ($doc_dir, "doctxt.xml");
     123        my $short_mets_doc_file=&util::filename_cat ($doc_dir, "docmets.xml");
     124   
     125
    110126    if (!open (OUTDOC, ">$doc_file")) {
    111         print $outhandle "docsave::process could not write to file $doc_file\n";
     127        print $outhandle "docsave::process could not write to file $doc_file\n";
    112128        return;
    113129    }
     
    117133    $doc_obj->output_section('docsave::OUTDOC', $doc_obj->get_top_section());
    118134    $self->output_xml_footer('docsave::OUTDOC');
    119     close OUTDOC;
     135
     136        #***save the document without metadata:doctxt.xml: Chi-Yu Huang
     137
     138    if (!open(OUTDOC_TXT, ">$doc_txt_file")){
     139            print $outhandle "docsave::process could not write to file $doc_mets_file\n";
     140            return;
     141        }
     142
     143        $self->output_txt_xml_header('docsave::OUTDOC_TXT');
     144        $doc_obj->output_txt_section('docsave::OUTDOC_TXT', $doc_obj->get_top_section());
     145    #$self->output_txt_xml_footer('docsave::OUTDOC_TXT');
     146               
     147        #***Convert doctxt.xml file to docmets.xml: Chi-Yu Huang
     148        if (!open(OUTDOC_METS,">$doc_mets_file")){
     149        print $outhandle "docsave::process could not write to file $doc_mets_file\n";
     150            return;
     151    }
     152         
     153        $self->output_mets_xml_header('docsave::OUTDOC_METS', $OID);
     154        $doc_obj->output_mets_section('docsave::OUTDOC_METS',$doc_obj->get_top_section(), $working_dir);
     155        $self->output_mets_xml_footer('docsave::OUTDOC_METS');
     156
     157        close OUTDOC;
     158        close OUTDOC_TXT;
     159    close OUTDOC_METS;
    120160
    121161    if ($self->{'gzip'}) {
     
    131171
    132172    # do the sortmeta thing
    133     my ($metadata);
    134     if (defined ($self->{'sortmeta'})) {
     173    my ($metadata); if (defined ($self->{'sortmeta'})) {
    135174        $metadata = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'sortmeta'});
    136175    }
    137176
    138177    # store reference in the archive_info
    139     $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
    140     }
    141 }
    142 
     178    if ($self->{'saveas'} eq "METS"){
     179        $self->{'archive_info'}->add_info($OID, $short_mets_doc_file, $metadata);
     180    } else {
     181        $self->{'archive_info'}->add_info($OID, $short_doc_file, $metadata);
     182    }
     183    }
     184}
     185
     186     
    143187sub group_process {
    144188    my $self = shift (@_);
     
    283327
    284328    # store reference in the archive_info
    285     $self->{'archive_info'}->add_info($OID, $short_doc_file);
    286 
     329    if ($self->{'saveas'} eq "METS"){
     330    $self->{'archive_info'}->add_info($OID, $short_mets_doc_file);
     331    } else {
     332    $self->{'archive_info'}->add_info($OID, $short_doc_file);
     333    }
     334   
    287335    return 1;
    288336}
     
    292340    my ($handle) = @_;
    293341
     342    print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
     343   
     344    print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
     345    print $handle "<Archive>\n";
     346}
     347
     348sub output_xml_footer {
     349    my $self = shift (@_);
     350    my ($handle) = @_;
     351
     352    print $handle "</Archive>\n";
     353}
     354
     355sub output_txt_xml_header{
     356    my $self = shift (@_);
     357    my ($handle) = @_;
    294358    print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
    295359    print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
    296     print $handle "<Archive>\n";
    297 }
    298 
    299 sub output_xml_footer {
    300     my $self = shift (@_);
    301     my ($handle) = @_;
    302 
    303     print $handle "</Archive>\n";
     360}
     361
     362sub output_txt_xml_footer{
     363    my $self = shift(@_);
     364    my ($handle) = @_;
     365    print $handle "<the end of the file>\n";
     366}
     367
     368sub output_mets_xml_header(){
     369    my $self = shift(@_);
     370    my ($handle, $OID) = @_;
     371    print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
     372    print $handle '<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">' . "\n";
     373    print $handle '<mets:mets OBJID="'. $OID. ':2">' . "\n";
     374}
     375
     376sub output_mets_xml_footer() {
     377    my $self = shift(@_);
     378    my ($handle) = @_;
     379    print $handle '</mets:mets>' . "\n";
    304380}
    305381
    3063821;
     383
     384
     385
Note: See TracChangeset for help on using the changeset viewer.