Changeset 37317
- Timestamp:
- 2023-02-13T15:18:02+13:00 (15 months ago)
- Location:
- gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare
- Files:
-
- 3 added
- 15 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/02-EXPLODE-SPARQLRESULTS-TO-IMPORT--SMALL.sh
r35842 r37317 1 1 #!/bin/bash 2 2 3 ./02-EXPLODE-SPARQLRESULTS-TO-IMPORT.sh local--countries-in-esc-by-year-just-2015--with-errata.json 3 4 source ./_opt_set_YEAR.bash 5 6 source ./_02_explode_shared.bash 7 8 9 f="local--countries-in-esc-by-year-just-$YEAR--with-errata.json" 10 11 12 # ./02-EXPLODE-SPARQLRESULTS-TO-IMPORT.sh $json_file 13 14 15 echo " errata-lod/$f -> $tmp_cache/sparqlresults-$f" 16 /bin/cp "errata-lod/$f" "$tmp_cache/sparqlresults-$f" 17 18 19 explode_metadata_database.pl \ 20 -collectdir "$full_collectdir" \ 21 -collection $collection \ 22 -plugin_options "-metadata_merge_on_concat_fields Country,Year -OIDtype assigned -OIDmetadata Identifier" \ 23 -plugin JSONSPARQLResultPlugin \ 24 $tmp_cache/sparqlresults-$f 25 26 explode_status=$? 27 28 if [ $explode_status != 0 ] ; then 29 echo "Error encountered when exploding:" 1>&2 30 echo " $tmp_cache/sparqlresults-$f" 1>&2 31 exit 1 32 fi 33 34 35 echo "" 36 37 add_in_year_dir=../import/sparqlresults-add-in-$YEAR 38 39 if [ -d $add_in_year_dir ] ; then 40 echo "Removing previous $add_in_year_dir" 41 /bin/rm -rf $add_in_year_dir 42 fi 43 44 echo "Creating empty $add_in_year_dir:" 45 echo " $add_in_year_dir" 46 mkdir $add_in_year_dir 47 48 echo "Moving in exploded results:" 49 echo " $tmp_cache/sparqlresults-*/ -> $add_in_year_dir/." 50 /bin/mv $tmp_cache/sparqlresults-*/* $add_in_year_dir/. 51 52 echo "" -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/02-EXPLODE-SPARQLRESULTS-TO-IMPORT.sh
r35980 r37317 1 1 #!/bin/bash 2 2 3 tmp_cache="tmp-cache"3 # tmp_cache="tmp-cache" 4 4 5 cwd=`pwd`6 cwd_without_prepare=${cwd%/*}7 collection=${cwd_without_prepare##*/}5 # cwd=`pwd` 6 # cwd_without_prepare=${cwd%/*} 7 # collection=${cwd_without_prepare##*/} 8 8 9 full_collectdir=${cwd_without_prepare%/*}9 # full_collectdir=${cwd_without_prepare%/*} 10 10 11 echo ""11 # echo "" 12 12 13 if [ "x$GSDL3SRCHOME" = "x" ] ; then14 echo "Environment variable GSDL3SRCHOME not set." 1>&215 echo "Have you sourced ./gs3-setup.sh?" 1>&216 exit 117 fi13 # if [ "x$GSDL3SRCHOME" = "x" ] ; then 14 # echo "Environment variable GSDL3SRCHOME not set." 1>&2 15 # echo "Have you sourced ./gs3-setup.sh?" 1>&2 16 # exit 1 17 # fi 18 18 19 if [ ! -d $tmp_cache ] ; then 20 echo "Making temporary directory:" 21 echo " $tmp_cache" 22 mkdir $tmp_cache 23 else 24 echo "Removing existing content from:" 25 echo " $tmp_cache" 26 /bin/rm -rf $tmp_cache/* 27 fi 19 # if [ ! -d $tmp_cache ] ; then 20 # echo "Making temporary directory:" 21 # echo " $tmp_cache" 22 # mkdir $tmp_cache 23 # else 24 # echo "Removing existing content from:" 25 # echo " $tmp_cache" 26 # /bin/rm -rf $tmp_cache/* 27 # fi 28 29 30 source ./_02_explode_shared.bash 31 28 32 29 33 echo "Copying to '$tmp_cache' then exploding:" … … 76 80 77 81 78 #f="local--countries-in-esc-by-year-after-1956--with-errata.json"79 #echo " errata-lod/$f -> $tmp_cache/sparqlresults-$f"80 #/bin/cp "errata-lod/$f" "$tmp_cache/sparqlresults-$f"81 82 #explode_metadata_database.pl \83 # -collectdir $GSDL3SRCHOME/web/sites/eurovision-lod/collect \84 # -collection $collection \85 # -plugin_options "-metadata_merge_on_concat_fields Country,Year -OIDtype assigned -OIDmetadata Identifier" \86 # -plugin JSONSPARQLResultPlugin \87 # $tmp_cache/sparqlresults-$f88 #89 #if [ $? 
!= 0 ] ; then90 # echo "Error encountered when exploding:" 1>&291 # echo " $tmp_cache/sparqlresults-$f" 1>&292 # exit 193 #fi94 95 82 echo "" 96 83 echo "Regenerating sparqlresults-* files in '../import' (from exploded results in '$tmp_cache')" -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/03-GEN-VOTING-METADATA.sh
r37281 r37317 15 15 fi 16 16 17 if [ $esc_cutoff_endyear != "202 1" ] ; then17 if [ $esc_cutoff_endyear != "2022" ] ; then 18 18 echo "Warning: This script makes use of a XSL spreadsheet that contains voting data up" >&2 19 echo" to 202 1(inclusive)" >&219 echo" to 2022 (inclusive)" >&2 20 20 echo "The esc_cutoff_endyear is currently set to: $esc_cutoff_endyear" >&2 21 21 echo "" >&2 … … 32 32 33 33 34 if [ ! -f "$prep_dir/eurovision_song_contest_1975_202 1.xlsx" ] ; then34 if [ ! -f "$prep_dir/eurovision_song_contest_1975_2022.xlsx" ] ; then 35 35 echo "Unzipping $prep_dir/archive.zip:" 36 36 … … 63 63 $prep_dir/xlsx-fromcountry-jsonmetadata.py \ 64 64 --votingtype "J" \ 65 $prep_dir/eurovision_song_contest_1975_202 1.xlsx \65 $prep_dir/eurovision_song_contest_1975_2022.xlsx \ 66 66 $prep_dir/metadata-votes/metadata-votes-fromcountry-jury.json 67 67 … … 70 70 $prep_dir/xlsx-fromcountry-jsonmetadata.py \ 71 71 --votingtype "T" \ 72 $prep_dir/eurovision_song_contest_1975_202 1.xlsx \72 $prep_dir/eurovision_song_contest_1975_2022.xlsx \ 73 73 $prep_dir/metadata-votes/metadata-votes-fromcountry-tele.json 74 74 … … 77 77 $prep_dir/xlsx-fromcountry-jsonmetadata.py \ 78 78 --votingtype "JT" \ 79 $prep_dir/eurovision_song_contest_1975_202 1.xlsx \79 $prep_dir/eurovision_song_contest_1975_2022.xlsx \ 80 80 $prep_dir/metadata-votes/metadata-votes-fromcountry-comb.json 81 81 82 82 if [ $? = 0 ] ; then 83 83 $prep_dir/xlsx-tocountry-jsonmetadata.py \ 84 $prep_dir/eurovision_song_contest_1975_202 1.xlsx \84 $prep_dir/eurovision_song_contest_1975_2022.xlsx \ 85 85 $prep_dir/metadata-votes-tocountry.json $prep_dir/collectionConfig--gsf-headMetaTags.xml 86 86 -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/04-COPY-VOTING-METADATA-TO-IMPORT--SMALL.sh
r35842 r37317 1 1 #!/bin/bash 2 2 3 4 source ./_opt_set_YEAR.bash 3 5 4 6 echo "" … … 16 18 echo "Copying:" 17 19 18 echo " voting-excel/metadata-votes/* 2015* ../import/fromcountry-metadata-votes/."20 echo " voting-excel/metadata-votes/*$YEAR* ../import/fromcountry-metadata-votes/." 19 21 /bin/rm -rf "../import/fromcountry-metadata-votes" 20 22 mkdir "../import/fromcountry-metadata-votes" 21 /bin/cp -r voting-excel/metadata-votes/*2015* ../import/fromcountry-metadata-votes/. 23 /bin/cp -r voting-excel/metadata-votes/*$YEAR* ../import/fromcountry-metadata-votes/. 24 25 26 echo " voting-excel/metadata-votes/metadata-votes-fromcountry-*.json ../import/fromcountry-metadata-votes/." 27 /bin/cp voting-excel/metadata-votes/metadata-votes-fromcountry-*.json ../import/fromcountry-metadata-votes/. 22 28 23 29 echo "" -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/05-PARSE-ADDITIONAL-METADATA-FROM-WIKIPEDIA--SMALL.sh
r35842 r37317 1 1 #!/bin/bash 2 2 3 ./05-PARSE-ADDITIONAL-METADATA-FROM-WIKIPEDIA.sh 2015 2015 3 source ./_opt_set_YEAR.bash 4 5 ./05-PARSE-ADDITIONAL-METADATA-FROM-WIKIPEDIA.sh $YEAR $YEAR 6 -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/06-COPY-PARSED-ADDITIONAL-METADATA-TO-IMPORT.sh
r35962 r37317 63 63 #fi 64 64 65 66 forced2022_dir=../import/sparqlresults-local--countries-in-esc-by-year-after-1956--with-errata.00000101 67 68 #echo "****" 69 #echo "**** Commenting out the section of the script that does a 'force' 2022" 70 #echo "****" 71 72 ls errata-categories/metadata-esc-year/*2022*.nul >/dev/null 2>&1 73 has_2022_nul_files_status=$? 74 75 if [ $has_2022_nul_files_status = 0 ] ; then 76 echo "" 77 echo "Copying (forcing 2022 files into 00000101 import area):" 78 echo " errata-categories/metadata-esc-year/*2022.nul -> $forced2022_dir/." 79 /bin/cp "errata-categories/metadata-esc-year/"*2022.nul "$forced2022_dir/." 80 fi 81 65 82 echo "" 66 83 -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/07b-GENERATE-ESSENTIA-FEATURES-DATA--SMALL.sh
r35961 r37317 1 1 #!/bin/bash 2 2 3 if [ $# = 0 ] ; then 4 year=2015 5 else 6 year=$1 7 fi 3 source ./_opt_set_YEAR.bash 8 4 9 /bin/rm -f "votes-$year.csv" 10 /bin/rm -f "contestants-$year.csv" 5 #if [ $# = 0 ] ; then 6 # year=2015 7 #else 8 # year=$1 9 #fi 11 10 12 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh "scrape_contestants" --start $year --end $year 13 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh "download_audio" --start $year --end $year contestants-$year.csv 14 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh "compute_audio_features" --start $year --end $year 11 /bin/rm -f "votes-$YEAR.csv" 12 /bin/rm -f "contestants-$YEAR.csv" 13 14 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh "scrape_contestants" --start $YEAR --end $YEAR 15 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh "download_audio" --start $YEAR --end $YEAR contestants-$YEAR.csv 16 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh "compute_audio_features" --start $YEAR --end $YEAR 15 17 16 18 -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT--SMALL.sh
r35989 r37317 1 1 #!/bin/bash 2 2 3 if [ $# = 0 ] ; then 4 ./08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT.py --startyear 2015 --endyear 2015 5 else 6 ./08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT.py $* 7 fi 3 source ./_opt_set_YEAR.bash 4 5 ./08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT.sh --startyear $YEAR --endyear $YEAR -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT_support.py
r36015 r37317 133 133 for d in os.listdir(import_dir): 134 134 full_d = os.path.join(import_dir, d) 135 if os.path.isdir(full_d) and re.search(r"^(sparqlresults-local--countries-in-esc-by-year-.*)|( missing-cat-countries)|(inaugural-year)$",d):135 if os.path.isdir(full_d) and re.search(r"^(sparqlresults-local--countries-in-esc-by-year-.*)|(sparqlresults-add-in-.*)|(missing-cat-countries)|(inaugural-year)$",d): 136 136 entrant_nul_dirs.append(full_d) 137 137 … … 256 256 src_csv_files.append(f"contestants-{start_year}-to-{esc_cutoff_endyear}.csv") 257 257 else: 258 src_csv_files.append(f"contestants-{start_year}-to-{end_year}.csv") 258 if start_year == end_year: 259 src_csv_files.append(f"contestants-{start_year}.csv") 260 else: 261 src_csv_files.append(f"contestants-{start_year}-to-{end_year}.csv") 259 262 260 263 … … 264 267 dst_csv_dirs += sparql_result_dirs 265 268 269 addin_result_dirs = glob(os.path.join(import_dir,"sparqlresults-add-in-*/")) 270 dst_csv_dirs += addin_result_dirs 271 266 272 267 273 print() -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/DBPEDIA-LOD-SPARQL-QUERY.sh
r35004 r37317 23 23 source ./_lod_utils.bash 24 24 25 sparql_url="https://dbpedia.demo.openlinksw.com/sparql/"26 #sparql_url="https://dbpedia.org/sparql"25 #sparql_url="https://dbpedia.demo.openlinksw.com/sparql/" 26 sparql_url="https://dbpedia.org/sparql" 27 27 28 28 graph_arg="default-graph-uri=http%3A%2F%2Fdbpedia.org" -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/LOCAL-LOD-PUT-GRAPH-TTL.sh
r35881 r37317 1 1 #!/bin/bash 2 2 3 graph =${1:-eurovision-errata}3 graph_name=${1:-eurovision-errata} 4 4 input_ttl_file=${2:-errata-lod/eurovision-errata.ttl} 5 5 … … 14 14 15 15 echo "Ingesting locally into graph:" 16 echo " $graph "16 echo " $graph_name" 17 17 echo "" 18 18 echo "The Turtle TTL file:" … … 21 21 echo "" 22 22 23 s-put http://localhost:3030/greenstone/data "$graph" "$input_ttl_file"23 #s-put http://localhost:3030/greenstone/data "$graph_name" "$input_ttl_file" 24 24 25 gs-triplestore-add3 "$graph_name" "$input_ttl_file" 26 -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/LOCAL-LOD-QUERY.sh
r35882 r37317 8 8 9 9 echo "" 10 echo "Sending:" 10 #echo "Sending:" 11 echo "Sending SPARQL query:" 11 12 echo " $input_sparql_query_file" 12 echo ""13 echo "To local SPARQL endpoint:"14 echo " http://localhost:3030/greenstone/query"13 #echo "" 14 #echo "To local SPARQL endpoint:" 15 #echo " http://localhost:3030/greenstone/query" 15 16 echo "" 16 17 echo "Saving output JSON resultset as:" … … 20 21 # s-query --service http://localhost:3030/greenstone/query 'SELECT * {?s ?p ?o}' 21 22 22 s-query --service http://localhost:3030/greenstone/query \ 23 "$sparql_query" > "$output_json_queryresult_file" 23 #s-query --service http://localhost:3030/greenstone/query \ 24 # "$sparql_query" > "$output_json_queryresult_file" 25 26 27 #echo gs-triplestore-query3 28 #echo "#----" 29 #echo "$sparql_query" 30 #echo "#----" 31 gs-triplestore-query3 "$sparql_query" > "$output_json_queryresult_file" -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/README-ADD-A-YEAR.txt
r37302 r37317 1 2 ### 3 # A. Generate the base SPARQL resultset 4 ### 5 6 # Reset the import directory to the last version completely processed 7 8 /bin/mv import import.CHECKPT 9 tar xvzf import.tar.gz 10 11 # Now set about generating the new data for the year you're adding in (e.g. 2022) 12 13 cd prepare 14 15 /bin/mv errata-lod/local--countries-in-esc-by-year-just-2022--with-errata.json errata-lod/local--countries-in-esc-by-year-just-2022--with-errata.json.CHECKPT 16 ./01-DOWNLOAD-ESC-LOD-DATA--SMALL.sh 2022 17 18 # This runs a SPARQL query and generates a fresh version of: 19 20 errata-lod/local--countries-in-esc-by-year-just-2022--with-errata.json 21 22 # If running the script hangs, this is a sign that the entries in: 23 24 errata-lod/eurovision-errata.ttl 25 26 # do not perfectly align with the non-URI (i.e., string) ?artists and ?entrants. 27 # Accessing this through the install DL collection, run and then review the following 28 # and update accordingly. 29 30 ./09-GEN-PROBLEM-LOD-LISTS.sh 31 32 problem-lod-lists/dbpedia-problem-entrants.html 33 problem-lod-lists/dbpedia-problem-songs.html 34 35 # Now rerun ./01-DOWNLOAD-ESC-LOD-DATA--SMALL.sh 2022 36 37 # Next run: 38 39 ./02-EXPLODE-SPARQLRESULTS-TO-IMPORT--SMALL.sh 40 41 ### 42 # B. Expand the FromCountry voting data 43 ### 44 45 # Updates on Eurovision voting data, to date, have been posted on: 46 47 https://data.world/datagraver/eurovision-song-contest-scores-1975-2019/workspace/file 48 49 # (Google sign-in required) 50 51 # No need to muck around with just cutting out the year on this one, the 52 # regular full scripts can be used. 
53 54 # Update the *cut-off* *year* in the "03" script and then run: 55 56 ./03-GEN-VOTING-METADATA.sh 57 ./04-COPY-VOTING-METADATA-TO-IMPORT.sh 58 59 # Take note of the instructions the "03" and "04" scripts produce: 60 61 # ./03 62 # Saving <gsf:headMetaTags /> block as: voting-excel/collectionConfig--gsf-headMetaTags.xml 63 # To be placed verbatim inside of '<display><format>' section of collectionConfig.xml: 64 65 # ./04 66 # Copying: 67 # voting-excel/collectionConfig--gsf-headMetaTags.xml ../etc/. 1 68 2 69 3 rm -f errata-lod/local--countries-in-esc-by-year-just-2015--with-errata.json 4 ./01-DOWNLOAD-ESC-LOD-DATA--SMALL.sh 2015 5 70 # and update your collectionConfig.xml file if not previously done. 6 71 7 72 73 ### 74 # C. Topup from Wikipedia (missing-cats) 75 ### 8 76 9 77 78 # It was found that Wikipedia sometimes listed countries in its category 79 # pages that were not captured/parsed out in the DBpedia version. 80 81 # In terms of topping up the import directory with an additional year 82 # things get a bit messy at this point. 83 84 85 # Run the following script targeted at your top-up year: 86 87 ./05-PARSE-ADDITIONAL-METADATA-FROM-WIKIPEDIA--SMALL.sh 2022 88 89 # This generates: 90 91 errata-categories/metadata-esc-year/metadata_esc.json 92 errata-categories/missing-cat-countries/metadata.json 93 94 # with just the entries for the add-in year. 95 96 97 # Next run: 98 99 ./06-COPY-PARSED-ADDITIONAL-METADATA-TO-IMPORT--SMALL.sh 2022 100 101 102 # This script changes tactics from how the 'process everything' scripts work. 
103 # From here on in the top instructions, all the new content is placed in: 104 105 ../import/sparqlresults-add-in-2022 106 107 10 108 rm essentia-audio-features/cache-eurovisionworld/esc-2022-* 11 109 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh scrape_contestants --start 2022 --end 2022 110 12 111 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh download_audio --start 2022 --end 2022 contestants-2022.csv 13 112 ./07b-GENERATE-ESSENTIA-FEATURES-DATA.sh compute_audio_features --start 2022 --end 2022 14 113 15 114 115 ./08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT--SMALL.sh 2022 116 117 118 119 120 # A substitute for 06 is 121 122 /bin/cp -i errata-categories/metadata-esc-year/*2022.nul ../import/sparqlresults-local--countries-in-esc-by-year-after-1956--with-errata.00000101/. 123 124 125 126 127 128 129 Append the content about 2022 in: 130 errata-categories/missing-cat-countries/metadata.json 131 132 Into: 133 ../import/missing-cat-countries/metadata.json 134 135 /bin/cp -i errata-categories/metadata-esc-year/*2022.nul ../import/missing-cat-countries/. 136 -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/UPLOAD-TTL-EUROVISION-ERRATA-GRAPH.sh
r35939 r37317 5 5 echo "" \ 6 6 && echo "====" \ 7 && echo "" \ 7 8 && echo "Resetting SPARQL Graph 'eurovision-errata' to be empty" \ 8 9 && ./LOCAL-LOD-RESET--ERRATA-ALL.sh \ 9 10 && echo "====" \ 11 && echo "" \ 12 && echo "====" \ 13 && echo "----" \ 14 && echo "Uploading full set of song + entrant + dbr:artist triples for 1956" \ 15 && ./LOCAL-LOD-PUT-GRAPH-TTL.sh eurovision-errata "$prep_dir/eurovision-errata1956.ttl" \ 16 && echo "----" \ 17 && echo "" \ 18 && echo "----" \ 19 && echo "Uploading errata for song and entrant strings" \ 10 20 && ./LOCAL-LOD-PUT-GRAPH-TTL.sh eurovision-errata "$prep_dir/eurovision-errata.ttl" \ 21 && echo "----" \ 11 22 && echo "====" \ 12 23 && echo "" -
gs3-installations/eurovision-lod/trunk/sites/eurovision/collect/eurovision/prepare/errata-categories/escwikipedia.py
r36003 r37317 51 51 52 52 header_cols = table_header.find_all("th"); 53 54 #print("html_tablerows_to_hashmap(): <th> header_cols") 55 #print(repr(header_cols)) 56 53 57 for header in header_cols: 54 header_label = header.contents[0].strip() 58 #print("html_tablerows_to_hashmap(): header") 59 #print(repr(header)) 60 61 # header_label = header.contents[0].strip() 62 header_label = header.contents[0].string.strip() 63 if (header_label == "R/O"): 64 # Change Running Order to Draw 65 header_label = "Draw" 55 66 if (header_label == "Language(s)"): 56 67 header_label = "Language" … … 302 313 table_rows = results_table.find_all("tr"); 303 314 print(" Number of rows in Results table = " + str(len(table_rows))) 315 #print("----") 316 #print(repr(table_rows)) 317 #print("----") 304 318 305 319 header_to_vals = html_tablerows_to_hashmap(table_rows)
Note:
See TracChangeset
for help on using the changeset viewer.