- Timestamp:
- 2017-07-07T16:11:22+12:00 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java
r31677 r31779 339 339 } 340 340 341 protected static ArrayList<String> lowerCaseTerms(String word) 342 { 343 ArrayList<String> words_out = new ArrayList<String>(); 344 345 Reader reader = new StringReader(word); 346 347 Tokenizer tokenizer = new StandardTokenizer(); 348 tokenizer.setReader(reader); 349 CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); 350 351 TokenStream token_stream = new LowerCaseFilter(tokenizer); 352 353 try { 354 token_stream.reset(); 355 356 while (token_stream.incrementToken()) { 357 String term = charTermAttribute.toString(); 358 359 words_out.add(term); 360 } 361 362 token_stream.end(); 363 token_stream.close(); 364 } 365 catch (IOException e) { 366 e.printStackTrace(); 367 } 368 369 return words_out; 370 } 371 341 372 protected static ArrayList<POSString> getTokenPosCountWordsMapWhitelist(ArrayList<POSString> words_in, 342 373 WhitelistBloomFilter whitelist_bloomfilter) 343 374 { 344 ArrayList<POSString> words_out = new ArrayList<POSString>();375 ArrayList<POSString> pos_words_out = new ArrayList<POSString>(); 345 376 346 377 for (POSString pos_word: words_in) { 347 378 String word = pos_word.getString(); 379 String[] pos_tags = pos_word.getPOSTags(); 380 348 381 if (whitelist_bloomfilter.contains(word)) { 349 words_out.add(pos_word); 350 } 351 } 352 353 return words_out; 382 383 ArrayList<String> word_terms = lowerCaseTerms(word); 384 for (String term: word_terms) { 385 POSString pos_term = new POSString(term, pos_tags); 386 387 pos_words_out.add(pos_term); 388 } 389 390 // The old, direct way of adding the value in 391 //pos_words_out.add(pos_word); 392 } 393 else { 394 // else clause won't happen so often 395 // (has to be an 'obscure' word *not* be in the whitelist to get here) 396 // break down the word into terms, and see if any of them are in the whitelist instead 397 398 ArrayList<String> word_terms = lowerCaseTerms(word); 399 for (String term: word_terms) { 400 401 if 
(whitelist_bloomfilter.contains(term)) { 402 POSString pos_term = new POSString(term, pos_tags); 403 404 pos_words_out.add(pos_term); 405 } 406 } 407 408 409 } 410 } 411 412 return pos_words_out; 354 413 } 355 414 … … 435 494 { 436 495 ArrayList<POSString> cs_tokens = getTokenPosCountWordsArrayList(ef_token_pos_count, page_id,icu_tokenize); 437 ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens);496 //ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens); 438 497 439 498 ArrayList<POSString> tokens = null; 440 499 if (whitelist_bloomfilter != null) { 441 tokens = getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter); 442 } 443 else { 500 tokens = getTokenPosCountWordsMapWhitelist(cs_tokens,whitelist_bloomfilter); 501 //tokens = getTokenPosCountWordsMapWhitelist(lc_tokens,whitelist_bloomfilter); 502 } 503 else { 504 ArrayList<POSString> lc_tokens = getTokenPosCountWordsMapCaseInsensitive(cs_tokens); 444 505 tokens = lc_tokens; 445 506 }
Note: See TracChangeset for help on using the changeset viewer.