Changeset 34980


Ignore:
Timestamp:
2021-03-28T15:17:24+13:00 (3 years ago)
Author:
davidb
Message:

Changes after testing

Location:
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py

    r34979 r34980  
    1111
    1212import escwikipedia
    13 
     13import util
    1414
    1515def gs_fileset_country_in_year(country_year_name, country_year_rec, nul_output_dir_name):
     
    106106   
    107107    # json_output_filename = getattr(args,'output-file.json')
    108 
    109 
    110     # escwikipedia.cat_test()
     108    final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     109    semifinal_result_ids = [ 'Semi-final']
     110    semifinal_pair_result_ids = [ 'Semi-final_1', 'Semi-final_2' ]
    111111   
    112112    for year in range(start_year, end_year+1):
    113113        print("==========")
    114114
    115         esc_article_country_year_recs = escwikipedia.process_esc_article_page(year)
    116 
     115        article_country_year_html = escwikipedia.retrieve_article_page(year)
     116        article_final_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,final_result_ids, stop_at_first=True)
     117        article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_result_ids, stop_at_first=True)
     118        if (len(article_semifinal_country_year_recs.keys()) == 0):
     119            article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_pair_result_ids, stop_at_first=False)
     120        article_country_year_recs = util.merge_dicts(article_final_country_year_recs,article_semifinal_country_year_recs)
    117121        article_countries = {}
    118         for country_year in esc_article_country_year_recs.keys():
    119             ##print("**** country_year = '" + country_year + "'")
     122        for country_year in article_country_year_recs.keys():
    120123            article_country = re.search(r"(^.*)\d{4}(?:\w+)?",country_year).group(1)
    121124           
    122             # print("article country = " + article_country)
    123125            article_countries[article_country] = 1
     126               
     127        category_countries = escwikipedia.process_category_page(year)
    124128
    125         print("*** article_countries = " + str(article_countries.keys()));
    126            
    127         category_countries = esc_category_countries = escwikipedia.process_category_page(year)
    128         #print("*** category_countries = " + str(category_countries.keys()));
    129 
    130        
    131        
     129        ####
    132130        # Work out list "article" - "category"
    133 
     131        ####
    134132
    135133        for cat_country in category_countries.keys():
     
    137135                del article_countries[cat_country]
    138136            else:
    139                 print("**** Warning: Failed to find category country '" + cat_country + "' in article_countries")
     137                print("Country listed in Category page that did not reach Fina/Semifinal: " + cat_country)
    140138               
    141139        # Missing countries are the ones left in 'article_countries'
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py

    r34979 r34980  
    133133
    134134    all_country_year_recs = []
    135        
     135
     136    final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     137   
    136138    for year in range(start_year, end_year+1):
    137         country_year_recs = escwikipedia.process_esc_article_page(year)
     139        article_year_html = escwikipedia.retrieve_article_page(year)
     140        country_year_recs = escwikipedia.process_article_page(article_year_html,year,final_result_ids,
     141                                                                      stop_at_first=True)
    138142
    139143        all_country_year_recs.append(country_year_recs)
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py

    r34979 r34980  
    135135
    136136
    137 def convert_cols_to_country_year_recs(header_to_vals,year):
    138 
    139     country_year_recs = {}
     137def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs):
    140138
    141139    a_href_re = re.compile(r"^.*" + str(year) + r"$")
    142 
     140   
    143141    country_list = []
    144142   
     
    148146        country_list.append(country)
    149147
     148    # Only need to worry about a country double up occurring in this country_list
     149    # OK for there to be a double up between 'accumulated' and country_list, as
     150    # this is just a sign of a country that was in a semi-final progressing to
     151    # the final
     152   
     153    country_year_recs = {}
     154   
    150155    for i in range(0,len(country_list)):
    151156        country = country_list[i]
     
    214219        country_year_recs[country_year] = this_country_year_rec
    215220
    216     return country_year_recs
     221    # Top up the accumulated version with what has been added into country_year_recs
     222   
     223    for country_year in country_year_recs.keys():
     224        accumulated_country_year_recs[country_year] = country_year_recs[country_year]
     225       
     226    return accumulated_country_year_recs
    217227
    218228
     
    227237            metadata_val = country_rec.get(metadata_key)
    228238            print("  " + metadata_key + " = " + repr(metadata_val))
    229        
    230 def process_esc_article_page(year):
     239
     240def retrieve_article_page(year):
    231241    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    232242    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    233243    esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file)
    234244
    235     esc_year_html = ""
     245    esc_article_year_html = ""
    236246    if not(os.path.exists(esc_wiki_page_filename)):
    237247        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
    238         esc_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
    239         esc_year_html = esc_year_wp.html()
     248        esc_article_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
     249        esc_article_year_html = esc_article_year_wp.html()
    240250
    241251        print("  Saving page to cache")
    242         write_text_file(esc_wiki_page_filename,esc_year_html)
     252        write_text_file(esc_wiki_page_filename,esc_article_year_html)
    243253    else:
    244254        print("Retrieving " + esc_wiki_page_file + " from cache")
    245         esc_year_html = read_text_file(esc_wiki_page_filename)
    246 
    247        
    248     esc_year_soup = bs4.BeautifulSoup(esc_year_html, 'html.parser')
    249    
    250     final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
    251 
    252     results_heading = None
    253     for fr_id in final_result_ids:
    254 
     255        esc_article_year_html = read_text_file(esc_wiki_page_filename)
     256
     257       
     258    return esc_article_year_html
     259
     260def process_article_page(esc_article_year_html,year,result_ids, stop_at_first):
     261
     262    country_year_recs = {}
     263   
     264    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')
     265   
     266    #result_ids = [ 'Grand_final', 'Final', 'Results' ]
     267   
     268    results_heading_list = []
     269    for fr_id in result_ids:
     270
     271        # Manually deal with an exception, where "Final" turns up in a side-bar infobox
     272        # before the actual Results section in the main page       
    255273        if ((year == 1996) and (fr_id == "Final")):
    256274            continue
     
    260278            print("  Found Final Results heading with id: " + fr_id);
    261279            results_heading = results_text_span.parent
     280            results_heading_list.append(results_heading)
    262281            # print("**** parent tag: " + results_heading.name);
    263             break
    264 
    265     # print (results_heading)
    266 
    267     results_table = results_heading.findNext('table')
    268     table_rows = results_table.find_all('tr');
    269     print("  " + esc_wiki_page_file + ": number of rows in Results table = " + str(len(table_rows)))
    270 
    271     header_to_vals = html_tablerows_to_hashmap(table_rows)
    272 
    273     country_year_recs = convert_cols_to_country_year_recs(header_to_vals,year)
     282            if (stop_at_first):
     283                break
     284
     285    # print(repr(results_heading))
     286
     287    for result_heading in results_heading_list:
     288       
     289        results_table = results_heading.findNext('table')
     290        table_rows = results_table.find_all('tr');
     291        print("  Number of rows in Results table = " + str(len(table_rows)))
     292
     293        header_to_vals = html_tablerows_to_hashmap(table_rows)
     294
     295        convert_cols_to_country_year_recs(header_to_vals,year,country_year_recs)
    274296
    275297    print("==========")
Note: See TracChangeset for help on using the changeset viewer.