Changeset 33608
- Timestamp:
- 2019-10-30T23:02:26+13:00 (4 years ago)
- Location:
- gs3-extensions/maori-lang-detection
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/maori-lang-detection/hdfs-cc-work/GS_README.TXT
r33598 r33608 665 665 666 666 667 -------------------------------------------------------- 668 K. Reading data from hbase tables and backing up hbase 669 -------------------------------------------------------- 670 671 * Backing up HBase database: 672 https://blogs.msdn.microsoft.com/data_otaku/2016/12/21/working-with-the-hbase-import-and-export-utility/ 673 674 * From an image at http://dwgeek.com/read-hbase-table-using-hbase-shell-get-command.html/ 675 to see the contents of a table, inside hbase shell, type: 676 677 scan 'tablename' 678 679 e.g. scan '01066_webpage' and hit enter. 680 681 682 To list tables and see their "column families" (I don't yet understand what this is): 683 684 hbase shell 685 hbase(main):001:0> list 686 687 hbase(main):002:0> describe '01066_webpage' 688 Table 01066_webpage is ENABLED 689 01066_webpage 690 COLUMN FAMILIES DESCRIPTION 691 {NAME => 'f', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLOCK 692 CACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 693 {NAME => 'h', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLOCK 694 CACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 695 {NAME => 'il', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLOC 696 KCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 697 {NAME => 'mk', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLOC 698 KCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 699 {NAME 
=> 'mtdt', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BL 700 OCKCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 701 {NAME => 'ol', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLOC 702 KCACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 703 {NAME => 'p', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLOCK 704 CACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 705 {NAME => 's', BLOOMFILTER => 'ROW', VERSIONS => '1', IN_MEMORY => 'false', KEEP_DELETED_CELLS => 'FALSE', DATA_BLOCK_ENCODING => 'NONE', TTL => 'FOREVER', COMPRESSION => 'NONE', MIN_VERSIONS => '0', BLOCK 706 CACHE => 'true', BLOCKSIZE => '65536', REPLICATION_SCOPE => '0'} 707 8 row(s) in 0.1180 seconds 708 667 709 668 710 -----------------------EOF------------------------ -
gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh
r33574 r33608 64 64 echo "2. copy the regex-urlfilter file:" 2>&1 | tee -a ${siteDir}UNFINISHED 65 65 echo " cp $NUTCH_URLFILTER_TEMPLATE $NUTCH_URLFILTER_FILE" 2>&1 | tee -a ${siteDir}UNFINISHED 66 echo "3. Adjust # crawl iterations in old crawl command:\n$crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED 66 echo "3. Adjust # crawl iterations in old crawl command:" 2>&1 | tee -a ${siteDir}UNFINISHED 67 echo " $crawl_cmd" 2>&1 | tee -a ${siteDir}UNFINISHED 67 68 fi 68 69 -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/MaoriTextDetector.java
r33587 r33608 81 81 82 82 83 // we'll be storing just those sentences in t ext that are in Māori.83 // we'll be storing just those sentences in the text that are in Māori. 84 84 85 85 // OpenNLP language detection works best with a minimum of 2 sentences
Note:
See TracChangeset
for help on using the changeset viewer.