Changeset 32963 for other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-FORMAT.sh
- Timestamp: 2019-04-01T17:45:14+13:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-FORMAT.sh
r32962  r32963
     2       2
     3       3  input=${1:-'hathi_full_20190301.txt.gz'}
             4  output=${2:-'hathi_brief_20190301.txt'}
     4       5
             6  echo ""
             7  echo "===="
             8  echo " Script to extract Format (and related fields, such as copyright)"
             9  echo " from HathiTrust tab-delimited metadata dump"
            10  echo "===="
            11
            12  echo ""
            13  echo "Reading in : $input"
            14  echo "Writing out : $output"
            15  echo ""
            16
            17  echo "Processing ..."
     5      18  zcat "$input" \
     6          | awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} '
            19  | awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} ' \
            20  > "$output"
            21
            22  echo "... Done"
            23  echo ""
Note: See TracChangeset for help on using the changeset viewer.