Context Navigation

← Previous Changeset
Next Changeset →

Changeset 34980

Timestamp:

2021-03-28T15:17:24+13:00 (3 years ago)

Author:

davidb

Message:

Changes after testing

Location:

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories

Files:

1 added
3 edited

esc-wikipedia-download-and-detect-missing-cat-entries.py (modified) (3 diffs)
esc-wikipedia-download-and-process-votes.py (modified) (1 diff)
escwikipedia.py (modified) (5 diffs)
util.py (added)

Legend:

Unmodified
Added
Removed

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py

-              r34979
+              r34980
 import escwikipedia
+import util
 def gs_fileset_country_in_year(country_year_name, country_year_rec, nul_output_dir_name):
 …
     # json_output_filename = getattr(args,'output-file.json')
     # escwikipedia.cat_test()
+    final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
+    semifinal_result_ids = [ 'Semi-final']
+    semifinal_pair_result_ids = [ 'Semi-final_1', 'Semi-final_2' ]
     for year in range(start_year, end_year+1):
         print("==========")
+        esc_article_country_year_recs = escwikipedia.process_esc_article_page(year)
+        article_country_year_html = escwikipedia.retrieve_article_page(year)
+        article_final_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,final_result_ids, stop_at_first=True)
+        article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_result_ids, stop_at_first=True)
+        if (len(article_semifinal_country_year_recs.keys()) == 0):
+            article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_pair_result_ids, stop_at_first=False)
+        article_country_year_recs = util.merge_dicts(article_final_country_year_recs,article_semifinal_country_year_recs)
         article_countries = {}
+        for country_year in esc_article_country_year_recs.keys():
+            ##print("**** country_year = '" + country_year + "'")
+        for country_year in article_country_year_recs.keys():
             article_country = re.search(r"(^.*)\d{4}(?:\w+)?",country_year).group(1)
-            # print("article country = " + article_country)
             article_countries[article_country] = 1
+        category_countries = escwikipedia.process_category_page(year)
+        print("*** article_countries = " + str(article_countries.keys()));
+        category_countries = esc_category_countries = escwikipedia.process_category_page(year)
+        #print("*** category_countries = " + str(category_countries.keys()));
+        ####
         # Work out list "article" - "category"
+        ####
         for cat_country in category_countries.keys():
 …
                 del article_countries[cat_country]
             else:
                 print("**** Warning: Failed to find category country '" + cat_country + "' in article_countries")
+                print("Country listed in Category page that did not reach Fina/Semifinal: " + cat_country)
         # Missing countries are the ones left in 'article_countries'

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py

-              r34979
+              r34980
     all_country_year_recs = []
+    final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     for year in range(start_year, end_year+1):
+        country_year_recs = escwikipedia.process_esc_article_page(year)
+        article_year_html = escwikipedia.retrieve_article_page(year)
+        country_year_recs = escwikipedia.process_article_page(article_year_html,year,final_result_ids,
+                                                                      stop_at_first=True)
         all_country_year_recs.append(country_year_recs)

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py

-              r34979
+              r34980
+def convert_cols_to_country_year_recs(header_to_vals,year):
+    country_year_recs = {}
+def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs):
     a_href_re = re.compile(r"^.*" + str(year) + r"$")
     country_list = []
 …
         country_list.append(country)
+    # Only need to worry about a country double-up occurring in this country_list.
+    # OK for there to be a double-up between 'accumulated' and country_list, as
+    # this is just a sign of a country that was in a semi-final progressing to
+    # the final
+    country_year_recs = {}
     for i in range(0,len(country_list)):
         country = country_list[i]
 …
         country_year_recs[country_year] = this_country_year_rec
+    return country_year_recs
+    # Top up the accumulated version with what has been added into country_year_recs
+    for country_year in country_year_recs.keys():
+        accumulated_country_year_recs[country_year] = country_year_recs[country_year]
+    return accumulated_country_year_recs
 …
             metadata_val = country_rec.get(metadata_key)
             print("  " + metadata_key + " = " + repr(metadata_val))
 def process_esc_article_page(year):
+def retrieve_article_page(year):
     esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
     esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
     esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file)
     esc_year_html = ""
+    esc_article_year_html = ""
     if not(os.path.exists(esc_wiki_page_filename)):
         print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
         esc_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
         esc_year_html = esc_year_wp.html()
+        esc_article_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
+        esc_article_year_html = esc_article_year_wp.html()
         print("  Saving page to cache")
         write_text_file(esc_wiki_page_filename,esc_year_html)
+        write_text_file(esc_wiki_page_filename,esc_article_year_html)
     else:
         print("Retrieving " + esc_wiki_page_file + " from cache")
+        esc_year_html = read_text_file(esc_wiki_page_filename)
+    esc_year_soup = bs4.BeautifulSoup(esc_year_html, 'html.parser')
+    final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
+    results_heading = None
+    for fr_id in final_result_ids:
+        esc_article_year_html = read_text_file(esc_wiki_page_filename)
+    return esc_article_year_html
+def process_article_page(esc_article_year_html,year,result_ids, stop_at_first):
+    country_year_recs = {}
+    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')
+    #result_ids = [ 'Grand_final', 'Final', 'Results' ]
+    results_heading_list = []
+    for fr_id in result_ids:
+        # Manually deal with an exception, where "Final" turns up in a side-bar infobox
+        # before the actual Results section in the main page
         if ((year == 1996) and (fr_id == "Final")):
             continue
 …
             print("  Found Final Results heading with id: " + fr_id);
             results_heading = results_text_span.parent
+            results_heading_list.append(results_heading)
             # print("**** parent tag: " + results_heading.name);
+            break
+    # print (results_heading)
+    results_table = results_heading.findNext('table')
+    table_rows = results_table.find_all('tr');
+    print("  " + esc_wiki_page_file + ": number of rows in Results table = " + str(len(table_rows)))
+    header_to_vals = html_tablerows_to_hashmap(table_rows)
+    country_year_recs = convert_cols_to_country_year_recs(header_to_vals,year)
+            if (stop_at_first):
+                break
+    # print(repr(results_heading))
+    for result_heading in results_heading_list:
+        results_table = results_heading.findNext('table')
+        table_rows = results_table.find_all('tr');
+        print("  Number of rows in Results table = " + str(len(table_rows)))
+        header_to_vals = html_tablerows_to_hashmap(table_rows)
+        convert_cols_to_country_year_recs(header_to_vals,year,country_year_recs)
     print("==========")

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 34980

Legend:

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py

main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py

Download in other formats: