Changeset 32963


Ignore:
Timestamp:
2019-04-01T17:45:14+13:00 (5 years ago)
Author:
davidb
Message:

Added text and some refinement of scripts to make things easier to run

Location:
other-projects/is-sheet-music-encore/trunk/gen-corpus-ids
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-FORMAT.sh

    r32962 r32963  
    22
    33input=${1:-'hathi_full_20190301.txt.gz'}
     4output=${2:-'hathi_brief_20190301.txt'}
    45
     6echo ""
     7echo "===="
     8echo "  Script to extract Format (and related fields, such as copyright)"
     9echo "  from HathiTrust tab-delimited metadata dump"
     10echo "===="
     11
     12echo ""
     13echo "Reading in  : $input"
     14echo "Writing out : $output"
     15echo ""
     16
     17echo "Processing ..."
    518zcat "$input" \
    6      | awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} '
     19     | awk -F '\t' '{print $1 "\t" $3 "\t" $20 "\t" $24} ' \
     20     > "$output"
     21
     22echo "... Done"
     23echo ""
  • other-projects/is-sheet-music-encore/trunk/gen-corpus-ids/HATHI-EXTRACT-PD-NON-GOOGLE.sh

    r32962 r32963  
    22
    33input=${1:-'hathi_brief_20190301.txt'}
     4output=${2:-'hathi_pd_MU_Not-Google_20190301.txt'}
     5
     6echo ""
     7echo "===="
     8echo "  Script to filter down the extracted Music Format data entries that at"
     9echo "  publicly available: public domain and NOT scanned by Google"
     10echo "===="
     11
     12echo ""
     13echo "Reading in  : $input"
     14echo "Writing out : $output"
     15echo ""
     16
     17echo "Processing ..."
    418
    519cat "$input" \
    6     | awk -F '\t' '$2 == "pd" && $3 == "MU" && $4 != "google" { print $0 }'
     20    | awk -F '\t' '$2 == "pd" && $3 == "MU" && $4 != "google" { print $0 }' \
     21    > "$output"
    722
    8 #  grep MU | grep -v google
     23echo "... Done"
     24echo ""
     25
Note: See TracChangeset for help on using the changeset viewer.