Changeset 26183 for main

Show
Ignore:
Timestamp:
13.09.2012 10:12:59 (8 years ago)
Author:
ak19
Message:

Committing changes that Dr Bainbridge and I made to g2f-import.pl and g2f-buildcol.pl long ago. Note that g2f-import.pl now runs both export.pl and import.pl.

Location:
main/trunk/greenstone2/bin/script
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/g2f-buildcol.pl

    r21687 r26183  
    2121    $ENV{'FEDORA_PROTOCOL'} = "http" if (!defined $ENV{'FEDORA_PROTOCOL'}); 
    2222    $ENV{'FEDORA_PID_NAMESPACE'} = "greenstone" if (!defined $ENV{'FEDORA_PID_NAMESPACE'}); 
     23    $ENV{'FEDORA_PREFIX'} = "/fedora" if (!defined $ENV{'FEDORA_PREFIX'}); 
    2324 
    2425    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/"); 
     
    3536use printusage; 
    3637use parse2; 
     38use cfgread; 
     39use colcfg; 
    3740 
    3841use g2futil; 
    3942 
     43use dbutil; 
    4044 
    4145my $arguments =  
     
    213217    foreach my $hd (@hash_dirs) { 
    214218 
     219        my $hash_id = &g2futil::get_hash_id($hd); 
     220         
     221        if (defined $hash_id) { 
     222 
     223        my $pid = "$pid_namespace:$gs_col-$hash_id"; 
     224         
     225 
     226        my $dsinfo_status = &g2futil::run_datastore_info($pid,$options); 
     227         
     228        if ($dsinfo_status == 0) { 
     229            print "  $pid being updated.\n";         
     230            &g2futil::run_purge($pid,$options); 
     231        } 
     232        else { 
     233            print "  $pid not present.\n"; 
     234        } 
     235        } 
     236 
    215237        my $docmets_filename  
    216238        = &util::filename_cat($hd,"docmets.xml"); 
     
    227249    else { 
    228250    print STDERR "Error: Unable to open directory $export_dir: $!\n"; 
    229     exit; 
    230     } 
    231  
    232  
     251    exit 1; 
     252    } 
     253 
     254 
     255# can possibly use inexport instead of running buildcol.pl through system() 
     256    print STDERR "**** Just for now, also run Greenstone's buildcol.pl\n"; 
     257 
     258    my $gs_opts = " -verbosity $verbosity"; 
     259    $gs_opts .= " -gli" if ($gli); 
     260    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir); 
     261    $gs_opts .= " -mode infodb"; 
     262 
     263    my $gs_buildcol_arguments = "$gs_opts $gs_col"; 
     264 
     265    &g2futil::run_cmd("buildcol.pl", $gs_buildcol_arguments, $options); 
     266 
     267    # read in collect cfg file to work out db type 
     268    my $collectcfg = &util::filename_cat ($collectdir, $gs_col, "etc", "collectionConfig.xml"); 
     269    #print STDERR "**** collectcfg file: $collectcfg\n"; 
     270    unless(open(FIN, "<$collectcfg")) {  
     271    print STDERR "g2f-buildcol.pl: Unable to open $collectcfg...ERROR: $!\n"; 
     272    exit 1;  
     273    } 
     274    close(FIN); 
     275 
     276    # for now we assume GS3, since that's what the following gets implemented for 
     277    my $collect_cfg = &colcfg::read_collection_cfg ($collectcfg, "gs3"); 
     278    # get the database type for this collection from its configuration file (may be undefined) 
     279    my $infodbtype = $collect_cfg->{'infodbtype'} || &dbutil::get_default_infodb_type(); 
     280  
     281    # open .gdbm database file in building/text/$colname.gdb, using dbutil 
     282    my $colname = $gs_col; 
     283    $colname =~ s/(:?\\|\/)(.*)$/$1/; # remove any collect group from collection name to get tailname 
     284 
     285    my $building_txt_dir = &util::filename_cat ($collectdir, $gs_col, "building", "text"); 
     286    my $building_txt_db = &dbutil::get_infodb_file_path($infodbtype, "$colname", $building_txt_dir); 
     287 
     288    # foreach key that matches http://dir1/dir2/....file.xxx 
     289    my $db_keys = {}; 
     290    &dbutil::read_infodb_keys($infodbtype,$building_txt_db, $db_keys); 
     291 
     292    foreach my $key (keys %$db_keys) { 
     293    if($key =~ m@^http://@) { 
     294 
     295        # get value for the key 
     296        my $src_rec_string = &dbutil::read_infodb_entry($infodbtype,$building_txt_db, $key); 
     297        my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string); 
     298        my $OID_hash_value = $src_rec->{'section'}->[0]; 
     299        $OID_hash_value = "$pid_namespace:$gs_col-".$OID_hash_value; # convert to fedoraPID 
     300 
     301        #   its fedora pid = "greenstone-http:$colname-http:||dir|file.xxx" 
     302        # except that fedorapids don't like extra colons and don't like | 
     303        my $fedora_identifier = "$pid_namespace-http:$gs_col-$key"; 
     304        # CAN'T HAVE | OR : (as in "http:||one|two.html") in fedoraPID 
     305        $key =~ s@/@_@g;  
     306        $key =~ s@:@-@g; 
     307        my $fedora_pid = "$pid_namespace-http:$gs_col-$key"; 
     308 
     309        #   To run fedora ingest on the new file need to have sensible 
     310        #   filenames that won't offend windows      
     311        my $fedora_key_file_name = "$fedora_pid"; 
     312        $fedora_key_file_name =~ s@\.@-@g; 
     313        $fedora_key_file_name =~ s/\:/=/g; 
     314        $fedora_key_file_name .= ".xml"; 
     315        print STDERR "+++++ fpid: $fedora_pid, fedora-key filename: $fedora_key_file_name\n"; 
     316 
     317        #   write out a FedoraMets File for this key (in /tmp) 
     318        #   -> it has one metadata value, which is 'dc:title' = HASHxxxxxx 
     319         
     320         # The HASHID shouldn't be the title: we would then have 
     321         # duplicate titles, making it hard to search for 
     322         # unique ones. What about making the filename the 
     323         # dc.title and the HASHID the dc.identifier? 
     324 
     325        my $contents = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n"; 
     326        $contents .= "<mets:mets xmlns:mets=\"http://www.loc.gov/METS/\"\n"; 
     327        $contents .= " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"; 
     328        $contents .= " xmlns:gsdl3=\"http://www.greenstone.org/namespace/gsdlmetadata/1.0/\"\n"; 
     329        $contents .= " xmlns:xlink=\"http://www.w3.org/1999/xlink\"\n"; 
     330        $contents .= " xsi:schemaLocation=\"http://www.loc.gov/METS/\n"; 
     331        $contents .= " http://www.loc.gov/standards/mets/mets.xsd\n"; 
     332        $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/\n"; 
     333        $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd\"\n"; 
     334        $contents .= " OBJID=\"$fedora_pid\"\n"; 
     335#       $contents .= " OBJID=\"greenstone:$gs_col-HASH1f814d07252c354039ee11\"\n"; 
     336        $contents .= " TYPE=\"FedoraObject\" LABEL=\"$fedora_pid\" EXT_VERSION=\"1.1\">\n"; 
     337        $contents .= "<mets:metsHdr RECORDSTATUS=\"A\"/>\n"; 
     338        $contents .= "   <mets:amdSec ID=\"DC\" >\n"; 
     339        $contents .= "      <mets:techMD ID=\"DC.0\">\n"; 
     340        $contents .= "         <mets:mdWrap LABEL=\"Metadata\" MDTYPE=\"OTHER\" OTHERMDTYPE=\"gsdl3\" ID=\"DCgsdl1\">\n"; 
     341        $contents .= "            <mets:xmlData>\n"; 
     342        $contents .= "               <oai_dc:dc xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\" >\n"; 
     343        $contents .= "                  <dc:title>$OID_hash_value</dc:title>\n"; 
     344#       $contents .= "                  <dc:identifier>$fedora_identifier</dc:identifier>\n"; 
     345        $contents .= "               </oai_dc:dc>\n"; 
     346        $contents .= "            </mets:xmlData>\n"; 
     347        $contents .= "         </mets:mdWrap>\n"; 
     348        $contents .= "      </mets:techMD>\n"; 
     349        $contents .= "   </mets:amdSec>\n"; 
     350        $contents .= "</mets:mets>\n";       
     351 
     352    
     353        #   write out the file and then run fedora ingest on that file 
     354        #   The file gets purged in g2f-import.pl, so don't remove it from export dir now 
     355        my $fedora_key_file_path = &util::filename_cat($export_dir, $fedora_key_file_name); 
     356        unless(open(FOUT, ">$fedora_key_file_path")) {  
     357        print STDERR "g2f-buildcol.pl: Unable to open $fedora_key_file_path...ERROR: $!\n"; 
     358        exit 1;  
     359        } 
     360        print FOUT $contents; 
     361        close(FOUT); 
     362 
     363        print STDERR "<Build>\n" if $gli; 
     364        print STDERR "Ingesting $fedora_key_file_name\n"; 
     365        print STDERR "#### ".join(",", %$options)."\n"; 
     366 
     367        &g2futil::run_ingest($fedora_key_file_path,$options); 
     368        print STDERR "</Build>\n" if $gli; 
     369    } 
     370     
     371    } 
     372 
     373 
     374    # If successful!!! Then need to think about: 
     375    #    [CLX] nodes 
     376    #    Doing this with FedoraMETSPlugin 
     377 
     378     
    233379} 
    234380 
  • main/trunk/greenstone2/bin/script/g2f-import.pl

    r22338 r26183  
    212212    # readdir 
    213213    if (opendir(DIR, $export_dir)) { 
    214  
     214        my @xml_files = grep { $_ =~ m/^greenstone-http.*\.xml$/ } readdir(DIR); 
    215215        closedir DIR; 
     216 
     217        # purge all the (URL,hashID) metadata files that we inserted 
     218        # into fedora at the end of g2f-buildcol.pl 
     219        # convert the filenames into fedora-pids 
     220        # filename = greenstone-http=tmpcol-http-__test1-html.xml -> fpid = greenstone-http:tmpcol-http-__test1.html 
     221        foreach my $file (@xml_files) { 
     222        my $fedora_pid = $file; 
     223        $fedora_pid =~ s/\.xml$//; 
     224        $fedora_pid =~ s/\=/:/; 
     225        $fedora_pid =~ s/(.*)-(.*)$/$1.$2/; 
     226         
     227        print STDERR "#### fedora_pid: $fedora_pid\n"; 
     228        &g2futil::run_purge($fedora_pid,$options); # displays error message if first time (nothing to purge) 
     229        } 
     230 
    216231        my @hash_dirs = &g2futil::get_all_hash_dirs($export_dir,$maxdocs); 
    217232 
     
    246261    print "***\n"; 
    247262 
    248     my $gs_export_opts = "-saveas FedoraMETS -fedora_namespace $pid_namespace -verbosity $verbosity"; 
    249  
    250     $gs_export_opts .= " -gli" if ($gli); 
    251  
    252     $gs_export_opts .= " -language $language" if ($language); 
    253     $gs_export_opts .= " -collectdir \"$collectdir\"" if ($collectdir); 
    254     $gs_export_opts .= " -removeold" if ($removeold); 
    255     $gs_export_opts .= " -maxdocs $maxdocs" if ($maxdocs); 
    256  
    257     $gs_export_opts .= " -exportdir \"$export_dir\""; 
     263    my $gs_export_opts = "-saveas FedoraMETS -fedora_namespace $pid_namespace"; 
     264 
     265    my $gs_opts = " -verbosity $verbosity"; 
     266    $gs_opts .= " -gli" if ($gli); 
     267 
     268    $gs_opts .= " -language $language" if ($language); 
     269    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir); 
     270    $gs_opts .= " -removeold" if ($removeold); 
     271    $gs_opts .= " -maxdocs $maxdocs" if ($maxdocs); 
     272 
     273    $gs_export_opts .= " $gs_opts -exportdir \"$export_dir\""; 
    258274 
    259275    my $gs_export_arguments = "$gs_export_opts $gs_col"; 
    260276 
    261277    &g2futil::run_cmd("export.pl", $gs_export_arguments, $options); 
     278 
     279    print STDERR "**** Just for now, also run Greenstone's import.pl\n"; 
     280# if we have the FedoraMETSPlugIN then we wouldn't have to run import anymore 
     281    my $gs_import_arguments = "$gs_opts $gs_col"; 
     282 
     283    &g2futil::run_cmd("import.pl", $gs_import_arguments, $options); 
    262284} 
    263285