Changeset 34980
- Timestamp:
- 2021-03-28T15:17:24+13:00 (3 years ago)
- Location:
- main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py
r34979 r34980 11 11 12 12 import escwikipedia 13 13 import util 14 14 15 15 def gs_fileset_country_in_year(country_year_name, country_year_rec, nul_output_dir_name): … … 106 106 107 107 # json_output_filename = getattr(args,'output-file.json') 108 109 110 # escwikipedia.cat_test()108 final_result_ids = [ 'Grand_final', 'Final', 'Results' ] 109 semifinal_result_ids = [ 'Semi-final'] 110 semifinal_pair_result_ids = [ 'Semi-final_1', 'Semi-final_2' ] 111 111 112 112 for year in range(start_year, end_year+1): 113 113 print("==========") 114 114 115 esc_article_country_year_recs = escwikipedia.process_esc_article_page(year) 116 115 article_country_year_html = escwikipedia.retrieve_article_page(year) 116 article_final_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,final_result_ids, stop_at_first=True) 117 article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_result_ids, stop_at_first=True) 118 if (len(article_semifinal_country_year_recs.keys()) == 0): 119 article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_pair_result_ids, stop_at_first=False) 120 article_country_year_recs = util.merge_dicts(article_final_country_year_recs,article_semifinal_country_year_recs) 117 121 article_countries = {} 118 for country_year in esc_article_country_year_recs.keys(): 119 ##print("**** country_year = '" + country_year + "'") 122 for country_year in article_country_year_recs.keys(): 120 123 article_country = re.search(r"(^.*)\d{4}(?:\w+)?",country_year).group(1) 121 124 122 # print("article country = " + article_country)123 125 article_countries[article_country] = 1 126 127 category_countries = escwikipedia.process_category_page(year) 124 128 125 print("*** article_countries = " + str(article_countries.keys())); 126 127 category_countries = esc_category_countries = escwikipedia.process_category_page(year) 128 #print("*** 
category_countries = " + str(category_countries.keys())); 129 130 131 129 #### 132 130 # Work out list "article" - "category" 133 131 #### 134 132 135 133 for cat_country in category_countries.keys(): … … 137 135 del article_countries[cat_country] 138 136 else: 139 print(" **** Warning: Failed to find category country '" + cat_country + "' in article_countries")137 print("Country listed in Category page that did not reach Final/Semifinal: " + cat_country) 140 138 141 139 # Missing countries are the ones left in 'article_countries' -
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py
r34979 r34980 133 133 134 134 all_country_year_recs = [] 135 135 136 final_result_ids = [ 'Grand_final', 'Final', 'Results' ] 137 136 138 for year in range(start_year, end_year+1): 137 country_year_recs = escwikipedia.process_esc_article_page(year) 139 article_year_html = escwikipedia.retrieve_article_page(year) 140 country_year_recs = escwikipedia.process_article_page(article_year_html,year,final_result_ids, 141 stop_at_first=True) 138 142 139 143 all_country_year_recs.append(country_year_recs) -
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py
r34979 r34980 135 135 136 136 137 def convert_cols_to_country_year_recs(header_to_vals,year): 138 139 country_year_recs = {} 137 def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs): 140 138 141 139 a_href_re = re.compile(r"^.*" + str(year) + r"$") 142 140 143 141 country_list = [] 144 142 … … 148 146 country_list.append(country) 149 147 148 # Only need to worry about a country double up occurring in this country_list 149 # OK for there to be a double up between 'accumulated' and country_list, as 150 # this is just a sign of a country that was in a semi-final progressing to 151 # the final 152 153 country_year_recs = {} 154 150 155 for i in range(0,len(country_list)): 151 156 country = country_list[i] … … 214 219 country_year_recs[country_year] = this_country_year_rec 215 220 216 return country_year_recs 221 # Top up the accumulated version with what has been added into country_year_recs 222 223 for country_year in country_year_recs.keys(): 224 accumulated_country_year_recs[country_year] = country_year_recs[country_year] 225 226 return accumulated_country_year_recs 217 227 218 228 … … 227 237 metadata_val = country_rec.get(metadata_key) 228 238 print(" " + metadata_key + " = " + repr(metadata_val)) 229 230 def process_esc_article_page(year):239 240 def retrieve_article_page(year): 231 241 esc_wiki_page = "Eurovision_Song_Contest_" + str(year) 232 242 esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html" 233 243 esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file) 234 244 235 esc_ year_html = ""245 esc_article_year_html = "" 236 246 if not(os.path.exists(esc_wiki_page_filename)): 237 247 print("Retrieving Wikipedia page '" + esc_wiki_page + "'") 238 esc_ year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)239 esc_ year_html = esc_year_wp.html()248 esc_article_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True) 249 esc_article_year_html = 
esc_article_year_wp.html() 240 250 241 251 print(" Saving page to cache") 242 write_text_file(esc_wiki_page_filename,esc_ year_html)252 write_text_file(esc_wiki_page_filename,esc_article_year_html) 243 253 else: 244 254 print("Retrieving " + esc_wiki_page_file + " from cache") 245 esc_year_html = read_text_file(esc_wiki_page_filename) 246 247 248 esc_year_soup = bs4.BeautifulSoup(esc_year_html, 'html.parser') 249 250 final_result_ids = [ 'Grand_final', 'Final', 'Results' ] 251 252 results_heading = None 253 for fr_id in final_result_ids: 254 255 esc_article_year_html = read_text_file(esc_wiki_page_filename) 256 257 258 return esc_article_year_html 259 260 def process_article_page(esc_article_year_html,year,result_ids, stop_at_first): 261 262 country_year_recs = {} 263 264 esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser') 265 266 #result_ids = [ 'Grand_final', 'Final', 'Results' ] 267 268 results_heading_list = [] 269 for fr_id in result_ids: 270 271 # Manually deal with an exception, where "Final" turns up in a side-bar infobox 272 # before the actual Results section in the main page 255 273 if ((year == 1996) and (fr_id == "Final")): 256 274 continue … … 260 278 print(" Found Final Results heading with id: " + fr_id); 261 279 results_heading = results_text_span.parent 280 results_heading_list.append(results_heading) 262 281 # print("**** parent tag: " + results_heading.name); 263 break 264 265 # print (results_heading) 266 267 results_table = results_heading.findNext('table') 268 table_rows = results_table.find_all('tr'); 269 print(" " + esc_wiki_page_file + ": number of rows in Results table = " + str(len(table_rows))) 270 271 header_to_vals = html_tablerows_to_hashmap(table_rows) 272 273 country_year_recs = convert_cols_to_country_year_recs(header_to_vals,year) 282 if (stop_at_first): 283 break 284 285 # print(repr(results_heading)) 286 287 for result_heading in results_heading_list: 288 289 results_table = results_heading.findNext('table') 
290 table_rows = results_table.find_all('tr'); 291 print(" Number of rows in Results table = " + str(len(table_rows))) 292 293 header_to_vals = html_tablerows_to_hashmap(table_rows) 294 295 convert_cols_to_country_year_recs(header_to_vals,year,country_year_recs) 274 296 275 297 print("==========")
Note:
See TracChangeset
for help on using the changeset viewer.