#!/bin/bash

# This script exports the HBase tables for the crawled websites' webpages to files called part-m-*
# which can later be reimported into HBase.
# See https://blogs.msdn.microsoft.com/data_otaku/2016/12/21/working-with-the-hbase-import-and-export-utility/

# We want to back up our HBase tables that were created by crawling, so we can finally retire the
# vagrant Virtual Machines we used to crawl web sites.

# When doing this on the command line:
#
#   hdfs dfs -mkdir /user/vagrant
#
# EXPORT CMD:
#   vagrant@node5:~/crawled$ hbase org.apache.hadoop.hbase.mapreduce.Export "01066_webpage" "/user/vagrant/01066"
#
# Note that when this script runs the above with double quotes around the tablename or outputdir,
# hadoop doesn't like it (the quotes get passed through literally), even though the same quoted
# command is fine typed directly on the command line.
#
# OUTPUT:
#   vagrant@node5:~/crawled$ hdfs dfs -ls /user/vagrant/01066
#   Found 2 items
#   -rw-r--r--   1 vagrant supergroup          0 2019-10-30 08:29 /user/vagrant/01066/_SUCCESS
#   -rw-r--r--   1 vagrant supergroup  184788017 2019-10-30 08:29 /user/vagrant/01066/part-m-00000

sitesDir=crawled
echo "SITES DIR (INPUT): $sitesDir"

logOut=$sitesDir/exportHBase.log

# output dir is on hdfs (i.e. hdfs:///user/vagrant)
outputDir=/user/vagrant

exportCmd="hbase org.apache.hadoop.hbase.mapreduce.Export"

function exportAll() {
    if hdfs dfs -test -d "$outputDir"; then
        echo "Output directory on hdfs $outputDir already exists."
    else
        echo "Creating directory on hdfs $outputDir..."
        hdfs dfs -mkdir "$outputDir"
    fi

    # Start a fresh log for this run; everything below appends to it with tee -a.
    : > "$logOut"

    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in "$sitesDir"/*/; do
        # To get a crawl_id like 00001 from $siteDir like to_crawl/sites/00001/:
        # remove the $sitesDir prefix (to_crawl/sites followed by /),
        # then remove the trailing / that remains.
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}

        echo "$siteDir $crawlId"

        fullExportCmd="$exportCmd ${crawlId}_webpage $outputDir/$crawlId"

        echo "Going to run command:"
        #echo "    $exportCmd \"${crawlId}_webpage\" \"$outputDir/$crawlId\"" 2>&1 | tee -a "$logOut"
        echo "    $fullExportCmd" 2>&1 | tee -a "$logOut"

        # Run the export from hbase - produces a file called part-m-00000
        # (perhaps multiple files called part-m-*).
        $fullExportCmd 2>&1 | tee -a "$logOut"

        # If the export was successful, the output dir should contain a file named _SUCCESS.
        # In that case, copy the output file(s) named part-m-* into the crawl folder.
        if hdfs dfs -test -f "$outputDir/$crawlId/_SUCCESS"; then
            hdfs dfs -get "$outputDir/$crawlId/part-m-*" "$sitesDir/$crawlId/."
        else
            echo "ERROR: EXPORT OF $crawlId FAILED" 2>&1 | tee -a "$logOut"
        fi
    done
}

# script begins here
exportAll
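
# RE-IMPORT (sketch only, not run by this script):
# The part-m-* files written by Export should be loadable back into HBase with the matching
# Import utility from the blog post linked above. The crawl id and paths below are illustrative,
# and presumably the target table (e.g. 01066_webpage, with its column families) must already
# exist in HBase before importing.
#
#   hdfs dfs -mkdir -p /user/vagrant/01066
#   hdfs dfs -put crawled/01066/part-m-* /user/vagrant/01066/
#   hbase org.apache.hadoop.hbase.mapreduce.Import 01066_webpage /user/vagrant/01066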