#!/bin/bash

# This script exports the HBase tables for the crawled websites' webpages to files called part-m-*
# which can later be reimported into HBase.
# See https://blogs.msdn.microsoft.com/data_otaku/2016/12/21/working-with-the-hbase-import-and-export-utility/

# We want to back up our HBase tables that were created by crawling, so we can finally retire the
# vagrant Virtual Machines we used to crawl web sites.

# When doing this on the command line:
#
#   hdfs dfs -mkdir /user/vagrant
#
# EXPORT CMD:
#   vagrant@node5:~/crawled$ hbase org.apache.hadoop.hbase.mapreduce.Export "01066_webpage" "/user/vagrant/01066"
#
# Note that when this script runs the above with double quotes around the tablename or outputdir,
# hadoop doesn't like it (the quotes get passed through literally), even though the same quoted
# command is fine typed directly on the command line.
#
# OUTPUT:
#   vagrant@node5:~/crawled$ hdfs dfs -ls /user/vagrant/01066
#   Found 2 items
#   -rw-r--r--   1 vagrant supergroup          0 2019-10-30 08:29 /user/vagrant/01066/_SUCCESS
#   -rw-r--r--   1 vagrant supergroup  184788017 2019-10-30 08:29 /user/vagrant/01066/part-m-00000

sitesDir=crawled
echo "SITES DIR (INPUT): $sitesDir"

logOut=$sitesDir/exportHBase.log

# output dir is on hdfs (i.e. hdfs:///user/vagrant)
outputDir=/user/vagrant

exportCmd="hbase org.apache.hadoop.hbase.mapreduce.Export"

function exportAll() {
    if hdfs dfs -test -d "$outputDir"; then
        echo "Output directory on hdfs $outputDir already exists."
    else
        echo "Creating directory on hdfs $outputDir..."
        hdfs dfs -mkdir "$outputDir"
    fi

    # Start a fresh log for this run; everything below appends to it with tee -a.
    : > "$logOut"

    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in "$sitesDir"/*/; do
        # To get a crawl_id like 00001 from $siteDir like to_crawl/sites/00001/:
        # remove the $sitesDir prefix (to_crawl/sites followed by /),
        # then remove the trailing / that remains.
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        echo "$siteDir $crawlId"

        fullExportCmd="$exportCmd ${crawlId}_webpage $outputDir/$crawlId"

        echo "Going to run command:"
        #echo "    $exportCmd \"${crawlId}_webpage\" \"$outputDir/$crawlId\"" 2>&1 | tee -a "$logOut"
        echo "    $fullExportCmd" 2>&1 | tee -a "$logOut"

        # Run the export from hbase - produces a file called part-m-00000
        # (perhaps multiple files called part-m-*).
        $fullExportCmd 2>&1 | tee -a "$logOut"

        # If the export was successful, the output dir should contain a file named _SUCCESS.
        # In that case, copy the output file(s) named part-m-* into the crawl folder.
        if hdfs dfs -test -f "$outputDir/$crawlId/_SUCCESS"; then
            hdfs dfs -get "$outputDir/$crawlId/part-m-*" "$sitesDir/$crawlId/."
        else
            echo "ERROR: EXPORT OF $crawlId FAILED" 2>&1 | tee -a "$logOut"
        fi
    done
}

# script begins here
exportAll
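
# RE-IMPORT (sketch only, not run by this script):
# The part-m-* files written by Export should be loadable back into HBase with the matching
# Import utility from the blog post linked above. The crawl id and paths below are illustrative,
# and presumably the target table (e.g. 01066_webpage, with its column families) must already
# exist in HBase before importing.
#
#   hdfs dfs -mkdir -p /user/vagrant/01066
#   hdfs dfs -put crawled/01066/part-m-* /user/vagrant/01066/
#   hbase org.apache.hadoop.hbase.mapreduce.Import 01066_webpage /user/vagrant/01066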