Changeset 33302 for main/trunk


Ignore:
Timestamp:
2019-07-05T22:23:48+12:00 (5 years ago)
Author:
ak19
Message:
  1. Adding GPSMapOverlayLabel extracted from GPS.mapOverlay meta to text indexes for searching, as with Coordinate and CoordShort. 2. Added a shortname for this index, ML for MapLabel. 3. On testing the indexing of the GPSMapOverlayLabel text, the old problem of increasingly duplicated Coordinate/CoordShort and now also GPSMapOverlayLabel meta in the infodb reappeared. Dr Bainbridge explained why this was (documented as comments in this commit) and fixed the problem by not processing GPS.mapOverlay meta into Coordinate and Label meta during the infodb pass (and dummy pass, so specifically the non-text passes) of buildcol. A natural consequence is that to check whether Coord and Label meta have been indexed, one can no longer check the index/text/col.jdb but needs to use Luke (if a lucene collection) to check contents of index/sidx and index/didx. 4. An important change needed for the bugfix in 3 is reordering the call to &classify::reconstruct_doc_objs_metadata() in basebuilder.pm to take place AFTER build_proc->set_mode(infodb) has taken place. 5. Changed cross-files global variables declared in doc.pm from our to my variables and tested this works.
Location:
main/trunk/greenstone2/perllib
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuilder.pm

    r32539 r33302  
    480480    }
    481481
    482     if ($self->{'incremental'}) {
    483     # reconstruct doc_obj metadata from database for all docs
    484     $reconstructed_docs
    485         = &classify::reconstruct_doc_objs_metadata($infodb_type,
    486                                $infodb_file_path,
    487                                $database_recs);
    488     }
    489 
    490482    # set up the document processor
    491483
     
    498490    $self->{'buildproc'}->set_store_text(1);
    499491
     492    if ($self->{'incremental'}) {
     493    # reconstruct doc_obj metadata from database for all docs
     494    $reconstructed_docs
     495        = &classify::reconstruct_doc_objs_metadata($infodb_type,
     496                               $infodb_file_path,
     497                               $database_recs);
     498    }
     499   
    500500    # make_infodatabase needs full reset even for incremental build
    501501    # as incremental works by reconstructing all docs from the database and
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r27917 r33302  
    212212
    213213    $self->{'mode'} = $mode;
     214    $doc::processor_mode = $mode; # doc.pm needs to know what buildcol pass we're at
    214215}
    215216
  • main/trunk/greenstone2/perllib/doc.pm

    r33293 r33302  
    6262# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
    6363# actually accessible only for those packages that import this one (with 'use doc')
    64 our $cmd_line_mode = undef;
     64my $cmd_line_mode = undef;
     65
     66# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
     67my $processor_mode = undef;
    6568
    6669# rename_method can be 'url', 'none', 'base64'
     
    11601163        }
    11611164    }
    1162 
    1163     elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
    1164 
    1165       if($cmd_line_mode eq "buildcol") {
    1166        #my $metaMap = $self->get_metadata_hashmap($section); ## TODO: Check if necessary to avoid duplication of <Coordinate> meta in index\text\<coll>.jdb
    1167      
    1168        #if(!$metaMap->{'Coordinate'}) {
    1169         #print STDERR "@@@@@@@@@@@@@@ cmd line mode (build phase) is now: $doc::cmd_line_mode\n";
    1170    
     1165   
     1166    elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
     1167
     1168        # In order to allow searching map data enriched documents by map shape descriptions,
     1169        # and to run rawquery searches for other docs by proximity based on their map data,
     1170        # need to store the shape descriptions and Coordinate info for shapes into the text index.
     1171        # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
     1172        # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
     1173        # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
     1174        # decimal places which can range from 0 and 2-4 digits.
     1175       
     1176        # However, we want to process GPS.mapOverlay only during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
     1177        # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel for
     1178        # when rebuilding with the online doc editor as that runs incremental-rebuild which then calls classify::reconstruct_doc_objs_metadata() (from basebuilder.pm)
     1179        # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
     1180        # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in when the collection
     1181        # is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the same meta into
     1182        # the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode eq "infodb".
     1183
     1184        # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
     1185        # So don't check for exact string match
     1186
     1187        if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/text/) {# && $doc::processor_mode !~ m/infodb/) # if dummy pass important
    11711188       
    1172         print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
     1189        ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
    11731190       
    1174         # TODO html decode?
     1191        # TODO: html decode?
    11751192        $value =~ s@&#091;@[@g;
    11761193        $value =~ s@&#093;@]@g;
    11771194        $value =~ s@&quot;@"@g;
    1178         print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
     1195        ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
    11791196
    11801197        my $json_array = decode_json $value;
    1181         #my $json = JSON->new->allow_nonref;
    1182         #&printAllShapes($json, $json_array);
    1183 
    1184         foreach my $shape (@$json_array) {     
    1185 
     1198       
     1199        foreach my $shape (@$json_array) {
     1200
     1201            # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
     1202            # Just as for Coordinate meta, don't need to know which shape a label belongs to. This is just so each label
     1203            # will be indexed, and therefore can be searched.
     1204           
     1205            my $description = $shape->{"description"};
     1206            if($description) {
     1207            push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
     1208            ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
     1209            }
     1210           
    11861211            my $type = $shape->{"type"};
    1187             print STDERR "Type : " . $type . "\n";
     1212            ###print STDERR "Shape type : " . $type . "\n";
    11881213       
    11891214            if($type eq "circle") {
    1190                 #print STDERR "Found a circle:\n" . &printShape($json, $shape);
     1215                ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
    11911216       
    11921217                # work out bounding box
     
    12061231                my $radius = $shape->{"radius"}; # in metres!
    12071232
    1208                 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
     1233                ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
    12091234
    12101235                my $lat_north = $centre_lat + ($radius/111111);
    12111236                my $lat_south = $centre_lat - ($radius/111111);
    12121237               
    1213                 print STDERR "### lat_north:  $lat_north\n";
    1214                 print STDERR "### lat_south:  $lat_south\n";
     1238                ###print STDERR "### lat_north:  $lat_north\n";
     1239                ###print STDERR "### lat_south:  $lat_south\n";
    12151240
    12161241                # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
     
    12181243                my $centre_lat_radians = $self->degreesToRadians($centre_lat);
    12191244                my $cos_in_radians = cos($centre_lat_radians);             
    1220                 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
     1245                ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
    12211246                my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
    12221247                my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
    1223                 print STDERR "### lng_east  $lng_east\n";
    1224                 print STDERR "### lng_west  $lng_west\n";
     1248                ###print STDERR "### lng_east  $lng_east\n";
     1249                ###print STDERR "### lng_west  $lng_west\n";
    12251250
    12261251                my $cos_lat = cos($centre_lat);             
    1227                 print STDERR "cos $centre_lat is $cos_lat\n";
     1252                ###print STDERR "cos $centre_lat is $cos_lat\n";
    12281253
    12291254                $self->processCoordinate($section, $lat_north, $lng_east);
     
    12341259            }
    12351260            elsif ($type eq "marker") {
    1236                 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
    1237                 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
     1261                ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
     1262                ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
    12381263                $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});               
    12391264            }
     
    12541279       
    12551280        } # end for on each shape in GPS.mapOverlay
    1256        #}
    1257       }
     1281        } # end if(buildcol and text pass)
    12581282    } # end GPS.mapOverlay meta
    12591283
     
    12781302}
    12791303
     1304# Call as:
     1305# my $json = JSON->new->allow_nonref;
     1306# &printAllShapes($json, $json_array);
    12801307sub printAllShapes {
    12811308    my ($json, $json_array) = @_;
  • main/trunk/greenstone2/perllib/mgppbuildproc.pm

    r33144 r33302  
    7373              'text'=>'TX',
    7474              'TX'=>1,
     75              'GPSMapOverlayLabel'=> 'ML',
     76              'ML'=>1,
    7577              'Coordinate'=>'CD',
    7678              'CD'=>1,
Note: See TracChangeset for help on using the changeset viewer.