Show
Ignore:
Timestamp:
11.04.2017 23:41:07 (3 years ago)
Author:
davidb
Message:

Added `_s` and `_ss` fields (alongside the existing `_t` fields) to help with faceting. Temporarily commented out the full-text page part.

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

    r31505 r31597  
    134134                    } 
    135135                } 
    136                  
     136/*               
    137137                // 
    138138                // Now move on to POS extracted features per-page 
     
    185185 
    186186                } 
     187                */ 
    187188            } 
    188189        } 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31502 r31597  
    124124                                                       solr_endpoints,_output_dir,_verbosity, 
    125125                                                       icu_tokenize,strict_file_io); 
    126  
    127126         
    128127        JavaRDD<Integer> per_volume_page_count = json_text_rdd.map(per_vol_json); 
    129128         
    130         Integer num_page_ids = per_volume_page_count.reduce((a, b) -> a + b); 
    131          
    132         System.out.println(""); 
    133         System.out.println("############"); 
    134         System.out.println("# Number of page ids: " + num_page_ids); 
     129        //Integer num_page_ids = per_volume_page_count.reduce((a, b) -> a + b); 
     130        long num_vol_ids = per_volume_page_count.count(); 
     131         
     132        System.out.println(""); 
     133        System.out.println("############"); 
     134        //System.out.println("# Number of page ids: " + num_page_ids); 
     135        System.out.println("# Number of volume ids: " + num_vol_ids); 
    135136        System.out.println("############"); 
    136137        System.out.println(""); 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

    r31510 r31597  
    7373                if (metavalue != null) { 
    7474                    solr_doc_json.put(metaname+"_t",metavalue); 
     75                    solr_doc_json.put(metaname+"_s",metavalue); 
    7576                } 
    7677            } 
     
    8081                if (metavalues != null) { 
    8182                    solr_doc_json.put(metaname+"_t",metavalues); 
    82                 } 
    83             } 
    84              
    85             solr_add_json.put("commitWithin", 5000); 
     83                    solr_doc_json.put(metaname+"_ss",metavalues); 
     84                } 
     85            } 
     86             
     87            solr_add_json.put("commitWithin", 60000); // used to be 5000 
    8688            solr_add_json.put("doc", solr_doc_json); 
    8789