- Timestamp:
- 2021-05-24T13:53:40+12:00 (3 years ago)
- Location:
- main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/03-GEN-VOTING-METADATA.sh
r35070 r35185
 21  21
 22  22
 23     - if [ ! -f "$prep_dir/eurovision_song_contest_1975_2019.xlsx" ] ; then
     23 + if [ ! -f "$prep_dir/eurovision_song_contest_1975_2021.xlsx" ] ; then
 24  24   echo "Unzipping $prep_dir/archive.zip:"
 25  25
  …   …
 50  50   $prep_dir/xlsx-fromcountry-jsonmetadata.py \
 51  51   --votingtype "J" \
 52     - $prep_dir/eurovision_song_contest_1975_2019.xlsx \
     52 + $prep_dir/eurovision_song_contest_1975_2021.xlsx \
 53  53   $prep_dir/metadata-votes/metadata-votes-fromcountry-jury.json
 54  54
  …   …
 57  57   $prep_dir/xlsx-fromcountry-jsonmetadata.py \
 58  58   --votingtype "T" \
 59     - $prep_dir/eurovision_song_contest_1975_2019.xlsx \
     59 + $prep_dir/eurovision_song_contest_1975_2021.xlsx \
 60  60   $prep_dir/metadata-votes/metadata-votes-fromcountry-tele.json
 61  61
  …   …
 64  64   $prep_dir/xlsx-fromcountry-jsonmetadata.py \
 65  65   --votingtype "JT" \
 66     - $prep_dir/eurovision_song_contest_1975_2019.xlsx \
     66 + $prep_dir/eurovision_song_contest_1975_2021.xlsx \
 67  67   $prep_dir/metadata-votes/metadata-votes-fromcountry-comb.json
 68  68
 69  69   if [ $? = 0 ] ; then
 70     - $prep_dir/xlsx-tocountry-jsonmetadata.py $prep_dir/eurovision_song_contest_1975_2019.xlsx $prep_dir/metadata-votes-tocountry.json
     70 + $prep_dir/xlsx-tocountry-jsonmetadata.py $prep_dir/eurovision_song_contest_1975_2021.xlsx $prep_dir/metadata-votes-tocountry.json
 71  71
 72  72   if [ $? != 0 ] ; then
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/05-PARSE-ADDITIONAL-METADATA-FROM-WIKIPEDIA.sh
r35013 r35185
 26  26   $prep_dir/esc-wikipedia-download-and-process-votes.py \
 27  27   --startyear 1956 \
 28     - --endyear 2019\
     28 + --endyear 2021 \
 29  29   --cachedir $prep_dir/cache-wikipedia \
 30  30   $prep_dir/metadata-esc-year/metadata_esc.json
  …   …
 41  41   $prep_dir//esc-wikipedia-download-and-detect-missing-cat-entries.py \
 42  42   --startyear 1956 \
 43     - --endyear 2019\
     43 + --endyear 2021 \
 44  44   --cachedir $prep_dir/cache-wikipedia \
 45  45   --queryfile.sparql "$prep_dir/dbpedia--countries-missing-from-esc-category-in-the-year.sparql" \
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/06-COPY-PARSED-ADDITIONAL-METADATA-TO-IMPORT.sh
r35013 r35185
 39  39   /bin/cp "errata-categories/missing-cat-countries/"* "$miscat_dir/."
 40  40
     41 +
     42 + forced2021_dir=../import/sparqlresults-local--countries-in-esc-by-year-after-1956--with-errata.00000001
     43 +
     44 + echo ""
     45 + echo "Copying (forcing 2021 files into 00000001 import area):"
     46 + echo " errata-categories/metadata-esc-year/*2021.nul -> $forced2021_dir/."
     47 + /bin/cp "errata-categories/metadata-esc-year/"*2021.nul "$forced2021_dir/."
     48 +
 41  49   echo ""
 42  50
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py
r35032 r35185
146 146
147 147
148     - final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
    148 + #final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
    149 + final_result_ids = ['Final', 'Results' ]
149 150   semifinal_result_ids = [ 'Semi-final']
150 151   semifinal_pair_result_ids = [ 'Semi-final_1', 'Semi-final_2' ]
  …   …
153 154
154 155   for year in range(start_year, end_year+1):
    156 + if (year == 2020):
    157 + print("******")
    158 + print("* Skipping year 2020, as no contest held that year")
    159 + print("******")
    160 + continue
    161 +
155 162   print("==========")
156 163
  …   …
158 165   article_final_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,final_result_ids, stop_at_first=True)
159 166   article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_result_ids, stop_at_first=True)
    167 +
160 168   if (len(article_semifinal_country_year_recs.keys()) == 0):
161 169   article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_pair_result_ids, stop_at_first=False)
  …   …
174 182   ####
175 183
176     - for cat_country in category_countries.keys():
177     - if (cat_country in article_countries):
178     - del article_countries[cat_country]
179     - else:
180     - print("Country listed in Category page that did not reach Final/Semifinal: " + cat_country)
181     -
    184 + if (year < 2021):
    185 +
    186 + for cat_country in category_countries.keys():
    187 + if (cat_country in article_countries):
    188 + del article_countries[cat_country]
    189 + else:
    190 + print("Country listed in Category page that did not reach Final/Semifinal: " + cat_country)
    191 + else:
    192 + print("******")
    193 + print("* Skipping country in article check for 2021, as those entries no in DBpedia (at time of coding)")
    194 + print("******")
    195 +
182 196   # Missing countries are the ones left in 'article_countries'
183 197   if (len(article_countries.keys()) == 0):
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py
r35105 r35185
161 161   all_country_year_recs = []
162 162
163     - final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
    163 + # final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
    164 + final_result_ids = [ 'Final', 'Results' ]
164 165
165 166   for year in range(start_year, end_year+1):
    167 + if (year == 2020):
    168 + print("******")
    169 + print("* Skipping year 2020, as no contest held that year")
    170 + print("******")
    171 + continue
    172 +
166 173   article_year_html = escwikipedia.retrieve_article_page(year)
167 174   country_year_recs = escwikipedia.process_article_page(article_year_html,year,final_result_ids,
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py
r35128 r35185
 65  65
 66  66   for y in range(1, len(table_rows)):
 67     - tds = table_rows[y].find_all("td");
     67 + # print("*** table row [" + str(y) + "]: " + repr(table_rows[y]));
     68 +
     69 + tds = table_rows[y].find_all(["th","td"]);
     70 + # print("**** tds = " + repr(tds));
 68  71   for x in range(0,len(tds)):
 69     - val = tds[x]
     72 + val = tds[x]
 70  73   header_label = headers[x]
 71  74   header_to_vals[header_label].append(val)
  …   …
137 140
138 141   def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs,extra_metadata):
139     -
    142 + # print("**** convert_cols_to_country_year_recs: year = " + str(year));
    143 + # print("**** header_to_vals = " + repr(header_to_vals));
    144 +
140 145   a_href_re = re.compile(r"^.*" + str(year) + r"$")
141 146
  …   …
143 148
144 149   for country_tag in header_to_vals.get("Country"):
    150 + # print("**** country tag = " + repr(country_tag));
145 151   country = country_tag.find("a",href=a_href_re).string
146 152   # print("**** country = " + country)
  …   …
279 285   results_text_span = esc_year_soup.find("span",id=fr_id)
280 286   if (results_text_span is not None):
281     - print(" Found Final Results heading with id: " + fr_id);
    287 + print(" Found Final/Semi-Final Results heading with id: " + fr_id);
282 288   results_heading = results_text_span.parent
283 289   results_heading_list.append(results_heading)
  …   …
290 296   for result_heading in results_heading_list:
291 297
292     - results_table = results_heading.findNext('table')
293     - table_rows = results_table.find_all('tr');
    298 + results_table = results_heading.findNext("table")
    299 + table_rows = results_table.find_all("tr");
294 300   print(" Number of rows in Results table = " + str(len(table_rows)))
295 301
Note: See TracChangeset for help on using the changeset viewer.