Changeset 32963
- Timestamp:
- 2019-04-01T17:45:14+13:00 (5 years ago)
- Location:
- other-projects/is-sheet-music-encore/trunk/gen-corpus-ids
- Files:
-
- 1 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-FORMAT.sh
--- r32962
+++ r32963
@@ -2,5 +2,22 @@
 
 input=${1:-'hathi_full_20190301.txt.gz'}
+output=${2:-'hathi_brief_20190301.txt'}
 
+echo ""
+echo "===="
+echo " Script to extract Format (and related fields, such as copyright)"
+echo " from HathiTrust tab-delimited metadata dump"
+echo "===="
+
+echo ""
+echo "Reading in : $input"
+echo "Writing out : $output"
+echo ""
+
+echo "Processing ..."
 zcat "$input" \
-| awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} '
+| awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} ' \
+> "$output"
+
+echo "... Done"
+echo ""
other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-PD-NON-GOOGLE.sh
--- r32962
+++ r32963
@@ -2,7 +2,24 @@
 
 input=${1:-'hathi_brief_20190301.txt'}
+output=${2:-'hathi_pd_MU_Not-Google_20190301.txt'}
+
+echo ""
+echo "===="
+echo " Script to filter down the extracted Music Format data entries that at"
+echo " publicly available: public domain and NOT scanned by Google"
+echo "===="
+
+echo ""
+echo "Reading in : $input"
+echo "Writing out : $output"
+echo ""
+
+echo "Processing ..."
 
 cat "$input" \
-| awk -F '\t' '$2 == "pd" && $3 == "MU" && $4 != "google" { print $0 }'
+| awk -F '\t' '$2 == "pd" && $3 == "MU" && $4 != "google" { print $0 }' \
+> "$output"
 
-# grep MU | grep -v google
+echo "... Done"
+echo ""
+
Note:
See TracChangeset
for help on using the changeset viewer.