#!/bin/bash

# This script exports the HBase tables holding the crawled websites' webpages to files named
# part-m-*, which can presumably be reimported into HBase later (see the Import note below).
# See https://blogs.msdn.microsoft.com/data_otaku/2016/12/21/working-with-the-hbase-import-and-export-utility/
# We want to back up the HBase tables that were created by crawling, so we can finally retire
# the vagrant virtual machines we used to crawl web sites.

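# Reimport (a sketch only, not verified on this cluster): the companion Import MapReduce job from
# the same package should restore a table from these exported files, e.g.
#
#   hbase org.apache.hadoop.hbase.mapreduce.Import "01066_webpage" "/user/vagrant/01066"
#
# The target table (with its column families) has to exist before Import is run.
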
# When doing this by hand on the command line:
#
#   hdfs dfs -mkdir /user/vagrant
#
# EXPORT CMD:
#   vagrant@node5:~/crawled$ hbase org.apache.hadoop.hbase.mapreduce.Export "01066_webpage" "/user/vagrant/01066"
#
# Note that when this script runs the command above with double quotes around the table name or
# output dir, hadoop doesn't like it: the command is stored in a plain string and expanded by word
# splitting, so the quotes reach hadoop as literal characters in the arguments. The same quoted
# command is fine when typed directly on the command line, where the shell strips the quotes.
#
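# A possible alternative (a sketch only, not used by this script): building the command as a bash
# array keeps quoting intact even if a table name or path ever contained spaces. The variable name
# exportCmdArr below is hypothetical:
#
#   exportCmdArr=(hbase org.apache.hadoop.hbase.mapreduce.Export "${crawlId}_webpage" "$outputDir/$crawlId")
#   "${exportCmdArr[@]}" 2>&1 | tee -a "$logOut"
#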
# OUTPUT:
#   vagrant@node5:~/crawled$ hdfs dfs -ls /user/vagrant/01066
#   Found 2 items
#   -rw-r--r--   1 vagrant supergroup          0 2019-10-30 08:29 /user/vagrant/01066/_SUCCESS
#   -rw-r--r--   1 vagrant supergroup  184788017 2019-10-30 08:29 /user/vagrant/01066/part-m-00000


sitesDir=crawled
echo "SITES DIR (INPUT): $sitesDir"

logOut=$sitesDir/exportHBase.log

# output dir is on hdfs (i.e. hdfs:///user/vagrant)
outputDir=/user/vagrant

exportCmd="hbase org.apache.hadoop.hbase.mapreduce.Export"

function exportAll() {

    if hdfs dfs -test -d "$outputDir"; then
        echo "Output directory on hdfs $outputDir already exists."
    else
        echo "Creating directory on hdfs $outputDir..."
        hdfs dfs -mkdir "$outputDir"
    fi
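    # (Alternative, left as a note: "hdfs dfs -mkdir -p" would also create any missing parent
    # directories, e.g.
    #   hdfs dfs -mkdir -p "$outputDir"
    # )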

    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in "$sitesDir"/*/; do

        # To get a crawl_id like 00001 from a $siteDir like crawled/00001/:
        # first remove the $sitesDir prefix followed by /,
        # then remove the trailing / that remains.
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}
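        # e.g. siteDir="crawled/01066/" gives crawlId="01066"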

        echo "$siteDir $crawlId"

        fullExportCmd="$exportCmd ${crawlId}_webpage $outputDir/$crawlId"

        echo "Going to run command:"
        #echo " $exportCmd \"${crawlId}_webpage\" \"$outputDir/$crawlId\"" 2>&1 | tee $logOut
        echo " $fullExportCmd" 2>&1 | tee "$logOut"

        # run the export from hbase - produces a file called part-m-00000 (perhaps multiple files called part-m-*)
        $fullExportCmd 2>&1 | tee -a "$logOut"


        # If the export was successful, there should be a _SUCCESS file in the output dir.
        # In that case, copy the output file(s) named part-m-* into the crawl folder.
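        # (From the header example, the local copy would then be e.g. crawled/01066/part-m-00000.)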
        if hdfs dfs -test -f "$outputDir/$crawlId/_SUCCESS"; then
            hdfs dfs -get "$outputDir/$crawlId"/part-m-* "${sitesDir}/$crawlId/."
        else
            echo "ERROR: EXPORT OF $crawlId FAILED" 2>&1 | tee -a "$logOut"
        fi

    done
}


# script begins here
exportAll
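
# Usage (assumed; the script name below is hypothetical): run from the directory that contains
# ./crawled, on a node where the hbase and hdfs clients are on the PATH, e.g.
#   vagrant@node5:~$ ./exportHBase.sh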