# TODO
# Check to see if Song/Title coming back with quotes around it
# In addition to Languages->Language, split of "," ??
# (e.g., 2005 has "Language" but entries like English, Spanish)

from __future__ import print_function

import argparse
import json
import os
import re

import bs4
import requests
import wikipedia

DEBUG = False

# Module-level cache directory; set via init_cache() before any retrieval call.
cache_dir_ = None


def init_cache(cache_dir):
    """Record the on-disk cache directory, creating it if it does not exist."""
    global cache_dir_
    cache_dir_ = cache_dir
    if not os.path.exists(cache_dir_):
        print("Making cache directory: " + cache_dir_)
        os.mkdir(cache_dir_)


def read_text_file(input_filename):
    """Return the full contents of a text file."""
    # 'with' guarantees the handle is closed even if read() raises
    with open(input_filename, "r") as f:
        return f.read()


def write_text_file(output_filename, text):
    """Write 'text' to a file, replacing any existing content."""
    with open(output_filename, "w") as f:
        f.write(text)


def html_tablerows_to_hashmap(table_rows):
    """Convert a results <table>'s rows into {header_label: [cell_tag, ...]}.

    The first row is assumed to be the header row (<th> cells).  Header
    labels are normalized across years ("Language(s)" -> "Language",
    "Song" -> "Title").  Cell values are kept as bs4 tags, not strings,
    so callers can drill into <a>/<span>/<img> children later.
    """
    header_to_vals = {}
    headers = []
    for header in table_rows[0].find_all("th"):
        header_label = header.contents[0].strip()
        # Normalize header variants so downstream keys are consistent
        if header_label == "Language(s)":
            header_label = "Language"
        if header_label == "Song":
            header_label = "Title"
        headers.append(header_label)
        header_to_vals[header_label] = []
    print(" Headers = " + ",".join(header_to_vals.keys()))
    for row in table_rows[1:]:
        for header_label, cell in zip(headers, row.find_all("td")):
            header_to_vals[header_label].append(cell)
    return header_to_vals


def convert_cols_to_country_year_recsOLD(header_to_vals, year):
    """Legacy version of convert_cols_to_country_year_recs (kept for reference).

    Builds {country+year: metadata_dict} from the column map.  Superseded by
    convert_cols_to_country_year_recs, which also accumulates across tables
    and handles 1956's double entries.
    """
    country_year_recs = {}
    a_href_re = re.compile(r"^.*" + str(year) + r"$")
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a", href=a_href_re).string
        country_year = country + str(year)
        country_year_recs[country_year] = {"Country": country, "Year": year}
    for key in header_to_vals.keys():
        if key == "Country":
            continue
        vals = header_to_vals.get(key)
        for i in range(0, len(vals)):
            country_tag = header_to_vals.get("Country")[i]
            country_flag_img = country_tag.find("img")
            country = country_tag.find("a", href=a_href_re).string
            country_year = country + str(year)
            val = vals[i]
            # These columns usually wrap their text in a link; prefer it
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val
            # Take the first child that contains non-whitespace text
            for inner_val in val.contents:
                if inner_val.string and re.search(r"[^\s]", inner_val.string):
                    val = inner_val
                    break
            val = val.string.strip()
            country_year_recs[country_year][key] = val
    return country_year_recs


def convert_cols_to_country_year_recs(header_to_vals, year,
                                      accumulated_country_year_recs,
                                      extra_metadata):
    """Fold one results table's columns into accumulated_country_year_recs.

    Each record is keyed by country+year and carries Country, Year, FlagImg,
    plus one entry per table column (with "Points" renamed "VoteGrandTotal").
    extra_metadata, if given, seeds every record.  Returns (and mutates)
    accumulated_country_year_recs.
    """
    a_href_re = re.compile(r"^.*" + str(year) + r"$")
    country_list = []
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a", href=a_href_re).string
        country_list.append(country)
    # Only need to worry about a country double up occurring in this
    # country_list.  OK for there to be a double up between 'accumulated'
    # and country_list, as this is just a sign of a country that was in a
    # semi-final progressing to the final
    country_year_recs = {}
    for i in range(0, len(country_list)):
        country = country_list[i]
        country_year = country + str(year)
        this_country_year_rec = {}
        if extra_metadata is not None:
            # copy() so each record gets its own dict, not a shared one
            this_country_year_rec = extra_metadata.copy()
        this_country_year_rec["Country"] = country
        this_country_year_rec["Year"] = year
        for key in header_to_vals.keys():
            val = header_to_vals.get(key)[i]
            if key == "Country":
                country_flag_img = val.find("img")
                this_country_year_rec["FlagImg"] = str(country_flag_img)
                continue
            # These columns usually wrap their text in a link; prefer it
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val
            elif key == "Points":
                key = "VoteGrandTotal"
            # Take the first child that contains non-whitespace text
            for inner_val in val.contents:
                if inner_val.string and re.search(r"[^\s]", inner_val.string):
                    val = inner_val
                    break
            val = val.string.strip()
            this_country_year_rec[key] = val
        if country_year in country_year_recs:
            # 1956, where countries had 2 entries!  Disambiguate both the
            # existing record and this one by suffixing the key with the
            # first 3 non-space chars of each entry's Title.
            country_year_rec = country_year_recs[country_year]
            del country_year_recs[country_year]
            country_year_title = country_year_rec.get("Title")
            country_year_suffix = re.sub(r"\s+", "", country_year_title)
            new_country_year_key = country_year + country_year_suffix[0:3]
            country_year_recs[new_country_year_key] = country_year_rec
            this_country_year_title = this_country_year_rec.get("Title")
            this_country_year_suffix = re.sub(r"\s+", "", this_country_year_title)
            country_year = country_year + this_country_year_suffix[0:3]
        country_year_recs[country_year] = this_country_year_rec
    # Top up the accumulated version with what has been added into country_year_recs
    for country_year in country_year_recs.keys():
        accumulated_country_year_recs[country_year] = country_year_recs[country_year]
    return accumulated_country_year_recs


def debug_output_country_year_recs(country_year_recs):
    """Pretty-print every record's metadata for debugging."""
    for country_name in country_year_recs.keys():
        country_rec = country_year_recs.get(country_name)
        print("[" + country_name + "]")
        for metadata_key in country_rec.keys():
            metadata_val = country_rec.get(metadata_key)
            print(" " + metadata_key + " = " + repr(metadata_val))


def retrieve_article_page(year):
    """Return the HTML of the ESC article page for 'year', using the cache.

    On a cache miss the page is fetched via the wikipedia package and the
    HTML is saved into cache_dir_ for next time.
    """
    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ", "_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_, esc_wiki_page_file)
    esc_article_year_html = ""
    if not os.path.exists(esc_wiki_page_filename):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        esc_article_year_wp = wikipedia.page(esc_wiki_page, auto_suggest=False,
                                             preload=True, redirect=True)
        esc_article_year_html = esc_article_year_wp.html()
        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename, esc_article_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_article_year_html = read_text_file(esc_wiki_page_filename)
    return esc_article_year_html


def process_article_page(esc_article_year_html, year, result_ids,
                         stop_at_first, extra_metadata=None):
    """Extract country/year records from an ESC article page.

    result_ids lists the heading span ids to look for (e.g. 'Grand_final',
    'Final', 'Results'); the table following each matched heading is parsed.
    If stop_at_first is true, only the first matching section is used.
    Also splices the year's logo <img> into every record as "YearLogoImg".
    """
    country_year_recs = {}
    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')
    results_heading_list = []
    for fr_id in result_ids:
        # Manually deal with an exception, where "Final" turns up in a
        # side-bar infobox before the actual Results section in the main page
        if (year == 1996) and (fr_id == "Final"):
            continue
        results_text_span = esc_year_soup.find("span", id=fr_id)
        if results_text_span is not None:
            print(" Found Final Results heading with id: " + fr_id)
            results_heading_list.append(results_text_span.parent)
            if stop_at_first:
                break
    for result_heading in results_heading_list:
        # BUGFIX: was 'results_heading.findNext(...)' — a stale variable from
        # the search loop above — so every iteration re-processed the same
        # table.  Use the loop variable so each heading gets its own table.
        results_table = result_heading.findNext('table')
        table_rows = results_table.find_all('tr')
        print(" Number of rows in Results table = " + str(len(table_rows)))
        header_to_vals = html_tablerows_to_hashmap(table_rows)
        convert_cols_to_country_year_recs(header_to_vals, year,
                                          country_year_recs, extra_metadata)
        print("==========")
    ## debug_output_country_year_recs(country_year_recs)

    # Splice in logo image for that year into each country_year_rec.
    # NOTE: Wikipedia has changed some of its infobox CSS over time.  For
    # newer downloads the selector is probably:
    #   esc_year_soup.select("table.infobox td.infobox-image img")
    # The selector below matches the older markup these cached pages use.
    infobox_imgs = esc_year_soup.select("table.infobox tr td a.image img")
    if len(infobox_imgs) == 0:
        print("****")
        print("****!!! No ESC Logo image found!!!")
        print("****")
    else:
        # Some pages include additional image graphics, such as a map showing
        # country entrants => Want the first one
        infobox_logo_img = infobox_imgs[0]
        for country_year_key in country_year_recs.keys():
            country_year_rec = country_year_recs.get(country_year_key)
            country_year_rec["YearLogoImg"] = str(infobox_logo_img)
    return country_year_recs


def process_category_page(year):
    """Return {country_name: 1} for every country in the year's ESC category.

    Scrapes the "Category:Countries in the Eurovision Song Contest <year>"
    page (fetched with requests, since wikipedia.page() does not handle
    Category: titles well) and matches links of the form
    /wiki/<Country>_in_the_Eurovision_Song_Contest_<year>.
    """
    category_countries = {}
    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ", "_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_, esc_wiki_page_file)
    esc_cat_year_html = ""
    if not os.path.exists(esc_wiki_page_filename):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        response = requests.get("https://en.wikipedia.org/wiki/" + esc_wiki_page)
        esc_cat_year_html = response.text
        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename, esc_cat_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_cat_year_html = read_text_file(esc_wiki_page_filename)
    esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser')
    a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_"
                           + str(year) + r"$")
    for a_tag in esc_cat_year_soup.find_all("a", href=a_href_re):
        href = a_tag.get("href")
        country = re.search(a_href_re, href).group(1)
        country = country.replace("_", " ")
        category_countries[country] = 1
    return category_countries