Changeset 26183


Ignore:
Timestamp:
2012-09-13T10:12:59+12:00 (12 years ago)
Author:
ak19
Message:

Committing changes that Dr Bainbridge and I made to the g2f-import.pl and g2f-buildcol.pl scripts long ago. Note that g2f-import.pl now runs both export.pl and import.pl.

Location:
main/trunk/greenstone2/bin/script
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/g2f-buildcol.pl

    r21687 r26183  
    2121    $ENV{'FEDORA_PROTOCOL'} = "http" if (!defined $ENV{'FEDORA_PROTOCOL'});
    2222    $ENV{'FEDORA_PID_NAMESPACE'} = "greenstone" if (!defined $ENV{'FEDORA_PID_NAMESPACE'});
     23    $ENV{'FEDORA_PREFIX'} = "/fedora" if (!defined $ENV{'FEDORA_PREFIX'});
    2324
    2425    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/");
     
    3536use printusage;
    3637use parse2;
     38use cfgread;
     39use colcfg;
    3740
    3841use g2futil;
    3942
     43use dbutil;
    4044
    4145my $arguments =
     
    213217    foreach my $hd (@hash_dirs) {
    214218
     219        my $hash_id = &g2futil::get_hash_id($hd);
     220       
     221        if (defined $hash_id) {
     222
     223        my $pid = "$pid_namespace:$gs_col-$hash_id";
     224       
     225
     226        my $dsinfo_status = &g2futil::run_datastore_info($pid,$options);
     227       
     228        if ($dsinfo_status == 0) {
     229            print "  $pid being updated.\n";       
     230            &g2futil::run_purge($pid,$options);
     231        }
     232        else {
     233            print "  $pid not present.\n";
     234        }
     235        }
     236
    215237        my $docmets_filename
    216238        = &util::filename_cat($hd,"docmets.xml");
     
    227249    else {
    228250    print STDERR "Error: Unable to open directory $export_dir: $!\n";
    229     exit;
    230     }
    231 
    232 
     251    exit 1;
     252    }
     253
     254
     255# can possibly use inexport instead of running buildcol.pl through system()
     256    print STDERR "**** Just for now, also run Greenstone's buildcol.pl\n";
     257
     258    my $gs_opts = " -verbosity $verbosity";
     259    $gs_opts .= " -gli" if ($gli);
     260    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
     261    $gs_opts .= " -mode infodb";
     262
     263    my $gs_buildcol_arguments = "$gs_opts $gs_col";
     264
     265    &g2futil::run_cmd("buildcol.pl", $gs_buildcol_arguments, $options);
     266
     267    # read in collect cfg file to work out db type
     268    my $collectcfg = &util::filename_cat ($collectdir, $gs_col, "etc", "collectionConfig.xml");
     269    #print STDERR "**** collectcfg file: $collectcfg\n";
     270    unless(open(FIN, "<$collectcfg")) {
     271    print STDERR "g2f-buildcol.pl: Unable to open $collectcfg...ERROR: $!\n";
     272    exit 1;
     273    }
     274    close(FIN);
     275
     276    # for now we assume GS3, since that's what the following gets implemented for
     277    my $collect_cfg = &colcfg::read_collection_cfg ($collectcfg, "gs3");
     278    # get the database type for this collection from its configuration file (may be undefined)
     279    my $infodbtype = $collect_cfg->{'infodbtype'} || &dbutil::get_default_infodb_type();
     280 
     281    # open .gdbm database file in building/text/$colname.gdb, using dbutil
     282    my $colname = $gs_col;
     283    $colname =~ s/(:?\\|\/)(.*)$/$1/; # remove any collect group from collection name to get tailname
     284
     285    my $building_txt_dir = &util::filename_cat ($collectdir, $gs_col, "building", "text");
     286    my $building_txt_db = &dbutil::get_infodb_file_path($infodbtype, "$colname", $building_txt_dir);
     287
     288    # foreach key that matches http://dir1/dir2/....file.xxx
     289    my $db_keys = {};
     290    &dbutil::read_infodb_keys($infodbtype,$building_txt_db, $db_keys);
     291
     292    foreach my $key (keys %$db_keys) {
     293    if($key =~ m@^http://@) {
     294
     295        # get value for the key
     296        my $src_rec_string = &dbutil::read_infodb_entry($infodbtype,$building_txt_db, $key);
     297        my $src_rec = &dbutil::convert_infodb_string_to_hash($src_rec_string);
     298        my $OID_hash_value = $src_rec->{'section'}->[0];
     299        $OID_hash_value = "$pid_namespace:$gs_col-".$OID_hash_value; # convert to fedoraPID
     300
     301        #   its fedora pid = "greenstone-http:$colname-http:||dir|file.xxx"
     302        # except that fedorapids don't like extra colons and don't like |
     303        my $fedora_identifier = "$pid_namespace-http:$gs_col-$key";
     304        # CAN'T HAVE | OR : (as in "http:||one|two.html") in fedoraPID
     305        $key =~ s@/@_@g;
     306        $key =~ s@:@-@g;
     307        my $fedora_pid = "$pid_namespace-http:$gs_col-$key";
     308
     309        #   To run fedora ingest on the new file, we need sensible
     310        #   filenames that won't offend Windows
     311        my $fedora_key_file_name = "$fedora_pid";
     312        $fedora_key_file_name =~ s@\.@-@g;
     313        $fedora_key_file_name =~ s/\:/=/g;
     314        $fedora_key_file_name .= ".xml";
     315        print STDERR "+++++ fpid: $fedora_pid, fedora-key filename: $fedora_key_file_name\n";
     316
     317        #   write out a FedoraMets File for this key (in /tmp)
     318        #   -> it has one metadata value, which is 'dc:title' = HASHxxxxxx
     319       
     320         # The HASHID shouldn't be the title: we would then have
     321         # duplicate titles, and it would be hard to search for
     322         # unique ones. What about making the filename the
     323         # dc.title and the HASHID the dc.identifier?
     324
     325        my $contents = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
     326        $contents .= "<mets:mets xmlns:mets=\"http://www.loc.gov/METS/\"\n";
     327        $contents .= " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n";
     328        $contents .= " xmlns:gsdl3=\"http://www.greenstone.org/namespace/gsdlmetadata/1.0/\"\n";
     329        $contents .= " xmlns:xlink=\"http://www.w3.org/1999/xlink\"\n";
     330        $contents .= " xsi:schemaLocation=\"http://www.loc.gov/METS/\n";
     331        $contents .= " http://www.loc.gov/standards/mets/mets.xsd\n";
     332        $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/\n";
     333        $contents .= " http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd\"\n";
     334        $contents .= " OBJID=\"$fedora_pid\"\n";
     335#       $contents .= " OBJID=\"greenstone:$gs_col-HASH1f814d07252c354039ee11\"\n";
     336        $contents .= " TYPE=\"FedoraObject\" LABEL=\"$fedora_pid\" EXT_VERSION=\"1.1\">\n";
     337        $contents .= "<mets:metsHdr RECORDSTATUS=\"A\"/>\n";
     338        $contents .= "   <mets:amdSec ID=\"DC\" >\n";
     339        $contents .= "      <mets:techMD ID=\"DC.0\">\n";
     340        $contents .= "         <mets:mdWrap LABEL=\"Metadata\" MDTYPE=\"OTHER\" OTHERMDTYPE=\"gsdl3\" ID=\"DCgsdl1\">\n";
     341        $contents .= "            <mets:xmlData>\n";
     342        $contents .= "               <oai_dc:dc xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\" >\n";
     343        $contents .= "                  <dc:title>$OID_hash_value</dc:title>\n";
     344#       $contents .= "                  <dc:identifier>$fedora_identifier</dc:identifier>\n";
     345        $contents .= "               </oai_dc:dc>\n";
     346        $contents .= "            </mets:xmlData>\n";
     347        $contents .= "         </mets:mdWrap>\n";
     348        $contents .= "      </mets:techMD>\n";
     349        $contents .= "   </mets:amdSec>\n";
     350        $contents .= "</mets:mets>\n";     
     351
     352   
     353        #   write out the file and then run fedora ingest on that file
     354        #   The file gets purged in g2f-import.pl, so don't remove it from export dir now
     355        my $fedora_key_file_path = &util::filename_cat($export_dir, $fedora_key_file_name);
     356        unless(open(FOUT, ">$fedora_key_file_path")) {
     357        print STDERR "g2f-buildcol.pl: Unable to open $fedora_key_file_path...ERROR: $!\n";
     358        exit 1;
     359        }
     360        print FOUT $contents;
     361        close(FOUT);
     362
     363        print STDERR "<Build>\n" if $gli;
     364        print STDERR "Ingesting $fedora_key_file_name\n";
     365        print STDERR "#### ".join(",", %$options)."\n";
     366
     367        &g2futil::run_ingest($fedora_key_file_path,$options);
     368        print STDERR "</Build>\n" if $gli;
     369    }
     370   
     371    }
     372
     373
     374    # If successful!!! Then need to think about:
     375    #    [CLX] nodes
     376    #    Doing this with FedoraMETSPlugin
     377
     378   
    233379}
    234380
  • main/trunk/greenstone2/bin/script/g2f-import.pl

    r22338 r26183  
    212212    # readdir
    213213    if (opendir(DIR, $export_dir)) {
    214 
     214        my @xml_files = grep { $_ =~ m/^greenstone-http.*\.xml$/ } readdir(DIR);
    215215        closedir DIR;
     216
     217        # purge all the (URL,hashID) metadata files that we inserted
     218        # into fedora at the end of g2f-buildcol.pl
     219        # convert the filenames into fedora-pids
     220        # filename = greenstone-http=tmpcol-http-__test1-html.xml -> fpid = greenstone-http:tmpcol-http-__test1.html
     221        foreach my $file (@xml_files) {
     222        my $fedora_pid = $file;
     223        $fedora_pid =~ s/\.xml$//;
     224        $fedora_pid =~ s/\=/:/;
     225        $fedora_pid =~ s/(.*)-(.*)$/$1.$2/;
     226       
     227        print STDERR "#### fedora_pid: $fedora_pid\n";
     228        &g2futil::run_purge($fedora_pid,$options); # displays error message if first time (nothing to purge)
     229        }
     230
    216231        my @hash_dirs = &g2futil::get_all_hash_dirs($export_dir,$maxdocs);
    217232
     
    246261    print "***\n";
    247262
    248     my $gs_export_opts = "-saveas FedoraMETS -fedora_namespace $pid_namespace -verbosity $verbosity";
    249 
    250     $gs_export_opts .= " -gli" if ($gli);
    251 
    252     $gs_export_opts .= " -language $language" if ($language);
    253     $gs_export_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
    254     $gs_export_opts .= " -removeold" if ($removeold);
    255     $gs_export_opts .= " -maxdocs $maxdocs" if ($maxdocs);
    256 
    257     $gs_export_opts .= " -exportdir \"$export_dir\"";
     263    my $gs_export_opts = "-saveas FedoraMETS -fedora_namespace $pid_namespace";
     264
     265    my $gs_opts = " -verbosity $verbosity";
     266    $gs_opts .= " -gli" if ($gli);
     267
     268    $gs_opts .= " -language $language" if ($language);
     269    $gs_opts .= " -collectdir \"$collectdir\"" if ($collectdir);
     270    $gs_opts .= " -removeold" if ($removeold);
     271    $gs_opts .= " -maxdocs $maxdocs" if ($maxdocs);
     272
     273    $gs_export_opts .= " $gs_opts -exportdir \"$export_dir\"";
    258274
    259275    my $gs_export_arguments = "$gs_export_opts $gs_col";
    260276
    261277    &g2futil::run_cmd("export.pl", $gs_export_arguments, $options);
     278
     279    print STDERR "**** Just for now, also run Greenstone's import.pl\n";
     280# If we had the FedoraMETSPlugin, we would no longer have to run import.pl
     281    my $gs_import_arguments = "$gs_opts $gs_col";
     282
     283    &g2futil::run_cmd("import.pl", $gs_import_arguments, $options);
    262284}
    263285
Note: See TracChangeset for help on using the changeset viewer.