source: gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/exportHBase.sh@33608

Last change on this file since 33608 was 33608, checked in by ak19, 4 years ago
  1. New script to export from HBase so that we could in theory reimport into HBase. I've not tried the reimport out, but I followed the instructions to export and got a non-empty output file, so I am assuming it worked.
  2. Committing today's new crawls in crawledNode4.tar. Each crawled site's folder inside it now includes a file called part-m-* which is the exported HBase table from that node's VM.
  3. Updated the hdfs-related GS_README.txt with instructions on viewing the contents of a table in HBase and a link on exporting from and importing into HBase.
  4. Minor changes, such as that the tar files shouldn't be called tar.gz.
  • Property svn:executable set to *
File size: 2.6 KB
#!/bin/bash

# This script exports the HBase tables for the crawled websites' webpages to files called part-m-*
# which presumably can be reimported into HBase (a sketch of the Import counterpart appears further below).
# See https://blogs.msdn.microsoft.com/data_otaku/2016/12/21/working-with-the-hbase-import-and-export-utility/
# We want to back up our HBase tables that were created by crawling, so that we can finally retire the
# vagrant Virtual Machines we used to crawl the web sites.


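# To sanity-check a table's contents before exporting (see the updated GS_README.txt for
# the full instructions), the HBase shell can be used. A sketch, with the table name only
# as an example:
#   echo "scan '01066_webpage', {LIMIT => 1}" | hbase shell
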
# When doing this on the command line:
#
# hdfs dfs -mkdir /user/vagrant
# EXPORT CMD:
# vagrant@node5:~/crawled$ hbase org.apache.hadoop.hbase.mapreduce.Export "01066_webpage" "/user/vagrant/01066"
#
# Note that when the script runs the above with double quotes around the tablename or outputdir,
# hadoop doesn't like it, even though the same quoted command works fine when typed directly
# on the command line.
#
# OUTPUT:
# vagrant@node5:~/crawled$ hdfs dfs -ls /user/vagrant/01066
# Found 2 items
# -rw-r--r-- 1 vagrant supergroup 0 2019-10-30 08:29 /user/vagrant/01066/_SUCCESS
# -rw-r--r-- 1 vagrant supergroup 184788017 2019-10-30 08:29 /user/vagrant/01066/part-m-00000

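# REIMPORT (untested, as the commit note says): the Export utility has an Import counterpart
# that reads the part-m-* files back into an existing table of the same name and schema.
# A sketch, assuming the 01066_webpage table already exists in HBase:
#   hbase org.apache.hadoop.hbase.mapreduce.Import "01066_webpage" "/user/vagrant/01066"
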
sitesDir=crawled
echo "SITES DIR (INPUT): $sitesDir"

logOut=$sitesDir/exportHBase.log
# start this run's log afresh; the per-site loop below only appends to it
> "$logOut"

# output dir is on hdfs (i.e. hdfs:///user/vagrant)
outputDir=/user/vagrant

exportCmd="hbase org.apache.hadoop.hbase.mapreduce.Export"

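# Note: besides the mandatory <tablename> and <outputdir> arguments used below, the Export
# utility can also take optional <versions> [<starttime> [<endtime>]] arguments; this script
# does not use them.
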
function exportAll() {

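    # hdfs dfs -test -d exits with status 0 if the directory exists and non-zero otherwise,
    # so it can be used directly as the if-condition.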
    if hdfs dfs -test -d "$outputDir"; then
        echo "Output directory on hdfs $outputDir already exists."
    else
        echo "Creating directory on hdfs $outputDir..."
        hdfs dfs -mkdir "$outputDir"
    fi

    # https://stackoverflow.com/questions/4000613/perform-an-action-in-every-sub-directory-using-bash
    for siteDir in $sitesDir/*/; do

        # To get a crawl_id like 00001 from a $siteDir like crawled/00001/:
        # first remove the "$sitesDir/" prefix, then remove the trailing / that remains.
        crawlId=${siteDir#"$sitesDir/"}
        crawlId=${crawlId%/}
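        # e.g. siteDir=crawled/00001/ ends up as crawlId=00001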

        echo "$siteDir $crawlId"

        fullExportCmd="$exportCmd ${crawlId}_webpage $outputDir/$crawlId"
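        # e.g. for crawlId 01066 this expands to:
        #   hbase org.apache.hadoop.hbase.mapreduce.Export 01066_webpage /user/vagrant/01066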

        echo "Going to run command:"
        #echo " $exportCmd \"${crawlId}_webpage\" \"$outputDir/$crawlId\"" 2>&1 | tee $logOut
        echo " $fullExportCmd" 2>&1 | tee -a "$logOut"

        # run the export from hbase - produces a file called part-m-00000 (perhaps multiple files called part-m-*)
        $fullExportCmd 2>&1 | tee -a "$logOut"


        # If the export was successful, there should be a file named _SUCCESS in the output dir.
        # In that case, copy the output file(s) named part-m-* into the crawl folder.
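        # (The _SUCCESS marker is the empty file that the export's MapReduce job writes on
        # successful completion, as in the example output near the top of this script.)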
        if hdfs dfs -test -f "$outputDir/$crawlId/_SUCCESS"; then
            hdfs dfs -get $outputDir/$crawlId/part-m-* ${sitesDir}/$crawlId/.
        else
            echo "ERROR: EXPORT OF $crawlId FAILED" 2>&1 | tee -a "$logOut"
        fi

    done
}


# script begins here
exportAll
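
# Usage sketch (an assumption, not part of the original script): run this from the directory
# that contains the crawled/ folder, on a node where the hbase and hdfs commands are available:
#   ./exportHBase.sh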