Changeset 33302

Show
Ignore:
Timestamp:
05.07.2019 22:23:48 (3 weeks ago)
Author:
ak19
Message:

1. Adding GPSMapOverlayLabel extracted from GPS.mapOverlay meta to text indexes for searching, as with Coordinate and CoordShort. 2. Added a shortname for this index, ML for MapLabel. 3. On testing the indexing of the GPSMapOverlayLabel text, the old problem of increasingly duplicated Coordinate/CoordShort and now also GPSMapOverlayLabel meta in the infodb reappeared. Dr Bainbridge explained why this was (documented as comments in this commit) and fixed the problem by not processing GPS.mapOverlay meta into Coordinate and Label meta during the infodb pass (and dummy pass, so specifically the non-text passes) of buildcol. A natural consequence is that to check whether Coord and Label meta have been indexed, one can no longer check the index/text/col.jdb but needs to use Luke (if a lucene collection) to check the contents of index/sidx and index/didx. 4. An important change needed for the bugfix in 3 is reordering the call to &classify::reconstruct_doc_objs_metadata() in basebuilder.pm to take place AFTER build_proc->set_mode(infodb) has taken place. 5. Changed cross-files global variables declared in doc.pm from our to my variables and tested this works.

Location:
main/trunk/greenstone2/perllib
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuilder.pm

    r32539 r33302  
    480480    } 
    481481 
    482     if ($self->{'incremental'}) { 
    483     # reconstruct doc_obj metadata from database for all docs 
    484     $reconstructed_docs  
    485         = &classify::reconstruct_doc_objs_metadata($infodb_type,  
    486                                $infodb_file_path, 
    487                                $database_recs); 
    488     } 
    489  
    490482    # set up the document processor 
    491483 
     
    498490    $self->{'buildproc'}->set_store_text(1); 
    499491 
     492    if ($self->{'incremental'}) { 
     493    # reconstruct doc_obj metadata from database for all docs 
     494    $reconstructed_docs  
     495        = &classify::reconstruct_doc_objs_metadata($infodb_type,  
     496                               $infodb_file_path, 
     497                               $database_recs); 
     498    } 
     499     
    500500    # make_infodatabase needs full reset even for incremental build 
    501501    # as incremental works by reconstructing all docs from the database and 
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r27917 r33302  
    212212 
    213213    $self->{'mode'} = $mode; 
     214    $doc::processor_mode = $mode; # doc.pm needs to know what buildcol pass we're at 
    214215} 
    215216 
  • main/trunk/greenstone2/perllib/doc.pm

    r33293 r33302  
    6262# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible 
    6363# actually accessible only for those packages that import this one (with 'use doc') 
    64 our $cmd_line_mode = undef; 
     64my $cmd_line_mode = undef; 
     65 
     66# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb 
     67my $processor_mode = undef; 
    6568 
    6669# rename_method can be 'url', 'none', 'base64' 
     
    11601163        } 
    11611164    } 
    1162  
    1163     elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string 
    1164  
    1165       if($cmd_line_mode eq "buildcol") { 
    1166        #my $metaMap = $self->get_metadata_hashmap($section); ## TODO: Check if necessary to avoid duplication of <Coordinate> meta in index\text\<coll>.jdb 
    1167        
    1168        #if(!$metaMap->{'Coordinate'}) { 
    1169         #print STDERR "@@@@@@@@@@@@@@ cmd line mode (build phase) is now: $doc::cmd_line_mode\n"; 
    1170      
     1165     
     1166    elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string 
     1167 
     1168        # In order to allow searching map data enriched documents by map shape descriptions, 
     1169        # and to run rawquery searches for other docs by proximity based on their map data, 
     1170        # need to store the shape descriptions and Coordinate info for shapes into the text index. 
     1171        # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel 
     1172        # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342") 
     1173        # where the digits before the N/S/E/W direction represent the whole number. And the digits after the direction are the 
     1174        # decimal places which can range from 0 and 2-4 digits. 
     1175         
     1176        # However, we want to process GPS.mapOverlay only during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene) 
     1177        # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel for 
     1178        # when rebuilding with the online doc editor as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata() 
     1179        # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs 
     1180        # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in when the collection 
     1181        # is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the same meta into 
     1182        # the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode eq "infodb". 
     1183 
     1184        # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb). 
     1185        # So don't check for exact string match 
     1186 
     1187        if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/text/) {# && $doc::processor_mode !~ m/infodb/) # if dummy pass important 
    11711188         
    1172         print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n"; 
     1189        ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n"; 
    11731190         
    1174         # TODO html decode? 
     1191        # TODO: html decode? 
    11751192        $value =~ s@&#091;@[@g; 
    11761193        $value =~ s@&#093;@]@g; 
    11771194        $value =~ s@&quot;@"@g; 
    1178         print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n"; 
     1195        ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n"; 
    11791196 
    11801197        my $json_array = decode_json $value; 
    1181         #my $json = JSON->new->allow_nonref; 
    1182         #&printAllShapes($json, $json_array); 
    1183  
    1184         foreach my $shape (@$json_array) {       
    1185  
     1198         
     1199        foreach my $shape (@$json_array) { 
     1200 
     1201            # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname. 
     1202            # Just as for Coordinate meta, don't need to know which shape a label belongs to. This is just so each label 
     1203            # will be indexed, and therefore can be searched. 
     1204             
     1205            my $description = $shape->{"description"}; 
     1206            if($description) { 
     1207            push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]); 
     1208            ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n"; 
     1209            } 
     1210             
    11861211            my $type = $shape->{"type"}; 
    1187             print STDERR "Type : " . $type . "\n"; 
     1212            ###print STDERR "Shape type : " . $type . "\n"; 
    11881213         
    11891214            if($type eq "circle") { 
    1190                 #print STDERR "Found a circle:\n" . &printShape($json, $shape); 
     1215                ###print STDERR "Found a circle:\n" . &printShape($json, $shape); 
    11911216         
    11921217                # work out bounding box 
     
    12061231                my $radius = $shape->{"radius"}; # in metres! 
    12071232 
    1208                 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n"; 
     1233                ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n"; 
    12091234 
    12101235                my $lat_north = $centre_lat + ($radius/111111); 
    12111236                my $lat_south = $centre_lat - ($radius/111111); 
    12121237                 
    1213                 print STDERR "### lat_north:  $lat_north\n";  
    1214                 print STDERR "### lat_south:  $lat_south\n"; 
     1238                ###print STDERR "### lat_north:  $lat_north\n";  
     1239                ###print STDERR "### lat_south:  $lat_south\n"; 
    12151240 
    12161241                # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages 
     
    12181243                my $centre_lat_radians = $self->degreesToRadians($centre_lat); 
    12191244                my $cos_in_radians = cos($centre_lat_radians);               
    1220                 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n"; 
     1245                ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n"; 
    12211246                my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians)); 
    12221247                my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians)); 
    1223                 print STDERR "### lng_east  $lng_east\n";  
    1224                 print STDERR "### lng_west  $lng_west\n"; 
     1248                ###print STDERR "### lng_east  $lng_east\n";  
     1249                ###print STDERR "### lng_west  $lng_west\n"; 
    12251250 
    12261251                my $cos_lat = cos($centre_lat);              
    1227                 print STDERR "cos $centre_lat is $cos_lat\n"; 
     1252                ###print STDERR "cos $centre_lat is $cos_lat\n"; 
    12281253 
    12291254                $self->processCoordinate($section, $lat_north, $lng_east); 
     
    12341259            } 
    12351260            elsif ($type eq "marker") { 
    1236                 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n"; 
    1237                 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n"; 
     1261                ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n"; 
     1262                ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n"; 
    12381263                $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});                
    12391264            } 
     
    12541279         
    12551280        } # end for on each shape in GPS.mapOverlay 
    1256        #} 
    1257       } 
     1281        } # end if(buildcol and text pass) 
    12581282    } # end GPS.mapOverlay meta 
    12591283 
     
    12781302} 
    12791303 
     1304# Call as: 
     1305# my $json = JSON->new->allow_nonref; 
     1306# &printAllShapes($json, $json_array); 
    12801307sub printAllShapes { 
    12811308    my ($json, $json_array) = @_;  
  • main/trunk/greenstone2/perllib/mgppbuildproc.pm

    r33144 r33302  
    7373              'text'=>'TX', 
    7474              'TX'=>1, 
     75              'GPSMapOverlayLabel'=> 'ML', 
     76              'ML'=>1, 
    7577              'Coordinate'=>'CD', 
    7678              'CD'=>1,