Changeset 32963

Show
Ignore:
Timestamp:
01.04.2019 17:45:14 (7 months ago)
Author:
davidb
Message:

Added text and some refinement of scripts to make things easier to run

Location:
other-projects/is-sheet-music-encore/trunk/gen-corpus-ids
Files:
1 added
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-FORMAT.sh

    r32962 r32963  
    22 
    33input=${1:-'hathi_full_20190301.txt.gz'} 
     4output=${2:-'hathi_brief_20190301.txt'} 
    45 
     6echo "" 
     7echo "====" 
     8echo "  Script to extract Format (and related fields, such as copyright)" 
     9echo "  from HathiTrust tab-delimited metadata dump" 
     10echo "====" 
     11 
     12echo "" 
     13echo "Reading in  : $input" 
     14echo "Writing out : $output" 
     15echo "" 
     16 
     17echo "Processing ..." 
    518zcat "$input" \ 
    6      | awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} ' 
     19     | awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} ' \ 
     20     > "$output" 
     21 
     22echo "... Done" 
     23echo "" 
  • other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-PD-NON-GOOGLE.sh

    r32962 r32963  
    22 
    33input=${1:-'hathi_brief_20190301.txt'} 
     4output=${2:-'hathi_pd_MU_Not-Google_20190301.txt'} 
     5 
     6echo "" 
     7echo "====" 
     8echo "  Script to filter down the extracted Music Format data entries that are" 
     9echo "  publicly available: public domain and NOT scanned by Google" 
     10echo "====" 
     11 
     12echo "" 
     13echo "Reading in  : $input" 
     14echo "Writing out : $output" 
     15echo "" 
     16 
     17echo "Processing ..." 
    418 
    519cat "$input" \ 
    6     | awk -F '\t' '$2 == "pd" && $3 == "MU" && $4 != "google" { print $0 }'  
     20    | awk -F '\t' '$2 == "pd" && $3 == "MU" && $4 != "google" { print $0 }' \ 
     21    > "$output" 
    722 
    8 #  grep MU | grep -v google  
     23echo "... Done" 
     24echo "" 
     25