[34978] | 1 |
|
---|
[34979] | 2 | # TODO
|
---|
| 3 | # Check to see if Song/Title coming back with quotes around it
|
---|
| 4 | # In addition to Languages->Language, split on "," ??
|
---|
| 5 | # (e.g., 2005 has "Language" but entries like English, Spanish)
|
---|
| 6 |
|
---|
| 7 |
|
---|
| 8 |
|
---|
[34978] | 9 | from __future__ import print_function
|
---|
| 10 |
|
---|
| 11 | import json
|
---|
| 12 | import os
|
---|
| 13 | import re
|
---|
[34979] | 14 | import requests
|
---|
[34978] | 15 |
|
---|
| 16 | import argparse
|
---|
| 17 | import bs4
|
---|
| 18 | import wikipedia
|
---|
| 19 |
|
---|
# Module-wide debug flag.
# NOTE(review): not referenced anywhere in the visible code — confirm before relying on it.
DEBUG=False


# Directory used to cache downloaded Wikipedia pages; assigned by init_cache().
cache_dir_=None
|
---|
[34978] | 24 |
|
---|
def init_cache(cache_dir):
    """Set the module-level cache directory, creating it on disk if needed.

    Uses os.makedirs (rather than os.mkdir) so a nested cache path such as
    "out/esc/cache" also works when the parent directories do not yet exist.
    """
    global cache_dir_
    cache_dir_ = cache_dir
    if not os.path.exists(cache_dir_):
        print("Making cache directory: " + cache_dir_)
        # makedirs creates any missing intermediate directories as well
        os.makedirs(cache_dir_)
[34978] | 31 |
|
---|
| 32 |
|
---|
def read_text_file(input_filename):
    """Return the entire contents of *input_filename* as a string.

    Opens the file with an explicit UTF-8 encoding and a context manager:
    cached Wikipedia pages contain non-ASCII artist/title names, which the
    platform's default locale encoding (e.g. cp1252) may fail to decode,
    and the ``with`` block guarantees the handle is closed on error.
    """
    with open(input_filename, "r", encoding="utf-8") as f:
        return f.read()
| 39 |
|
---|
def write_text_file(output_filename, text):
    """Write *text* to *output_filename*, replacing any existing content.

    UTF-8 is used explicitly so pages containing non-ASCII characters
    round-trip with read_text_file() on every platform; the context
    manager ensures the file is flushed and closed even on error.
    """
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
| 44 |
|
---|
def html_tablerows_to_hashmap(table_rows):
    """Turn the rows of an HTML results table into a column-oriented map.

    The first row is treated as the header row (``<th>`` cells); every later
    row contributes one ``<td>`` cell tag to the list stored under the header
    label at the same column index.

    Returns {header_label: [cell_tag, ...]}.
    """
    header_to_vals = {}
    headers = []

    # A couple of header labels vary between contest years; normalise them.
    label_renames = {"Language(s)": "Language", "Song": "Title"}

    for th_cell in table_rows[0].find_all("th"):
        label = th_cell.contents[0].strip()
        label = label_renames.get(label, label)
        headers.append(label)
        header_to_vals[label] = []

    print(" Headers = " + ",".join(header_to_vals.keys()))

    # Body rows: file each cell under the header for its column position.
    for body_row in table_rows[1:]:
        for col, cell in enumerate(body_row.find_all("td")):
            header_to_vals[headers[col]].append(cell)

    return header_to_vals
| 74 |
|
---|
def convert_cols_to_country_year_recsOLD(header_to_vals,year):
    """Build {country+year: metadata-dict} records from table columns.

    NOTE(review): superseded by convert_cols_to_country_year_recs() below,
    which adds accumulation across tables, extra_metadata support, flag
    images, the Points->VoteGrandTotal rename, and duplicate-country (1956)
    handling.  Kept for reference; candidate for deletion.
    """

    country_year_recs = {}

    # Country cell links end with the contest year,
    # e.g. ".../France_in_the_Eurovision_Song_Contest_2005"
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    # First pass: seed one record per country row.
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        # print("**** country = " + country)
        country_year = country+str(year)

        country_year_recs[country_year] = { "Country": country, "Year": year }


    # Second pass: fill in the remaining columns for each country row.
    for key in header_to_vals.keys():
        if (key == "Country"):
            continue

        vals = header_to_vals.get(key)

        for l in range(0,len(vals)):
            country_tag = header_to_vals.get("Country")[l]
            country_flag_img = country_tag.find("img")
            country = country_tag.find("a",href=a_href_re).string
            country_year = country+str(year)

            val = vals[l]

            # Prefer the linked (<a>) form of the value where one exists;
            # "Place" cells carry their value inside a <span> instead.
            if key == "Artist":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Title":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Language":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if (span_val is not None):
                    val = span_val

            # Drill down to the first child node with non-whitespace text.
            for inner_val in val.contents:
                if (inner_val.string and re.search("[^\s]",inner_val.string)):
                    val = inner_val
                    break

            val = val.string.strip()

            #print("country = " + country);
            #print("key = " + key);
            #print("*** storing: " + country + "[" + key + "] = " + val)

            country_year_recs[country_year][key] = val

    return country_year_recs
| 134 |
|
---|
| 135 |
|
---|
| 136 |
|
---|
| 137 |
|
---|
def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs,extra_metadata):
    """Convert column-oriented results-table data into per-country records.

    header_to_vals: {header_label: [bs4 cell tag, ...]} as produced by
        html_tablerows_to_hashmap().
    year: contest year (int); used both in record keys and in the regex that
        picks out the country link.
    accumulated_country_year_recs: dict of records built up over multiple
        tables (e.g. semi-finals plus final); updated in place and returned.
    extra_metadata: optional dict whose entries are copied into every record.

    Records are keyed "<Country><year>"; when the same country appears twice
    in one table (1956!) both keys get a title-derived suffix.
    """
    # Country cell links end with the contest year,
    # e.g. ".../France_in_the_Eurovision_Song_Contest_2005"
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    country_list = []

    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        country_list.append(country)

    # Only need to worry about a country double-up occurring in this country_list.
    # OK for there to be a double-up between 'accumulated' and country_list, as
    # this is just a sign of a country that was in a semi-final progressing to
    # the final.

    country_year_recs = {}

    for i, country in enumerate(country_list):
        country_year = country + str(year)

        this_country_year_rec = {}
        if (extra_metadata is not None):
            this_country_year_rec = extra_metadata.copy()

        this_country_year_rec["Country"] = country
        this_country_year_rec["Year"] = year

        for key in header_to_vals.keys():
            val = header_to_vals.get(key)[i]

            if (key == "Country"):
                # Keep the flag-image markup alongside the country name
                country_flag_img = val.find("img")
                this_country_year_rec["FlagImg"] = str(country_flag_img)
                continue

            # These three columns behave identically: prefer the linked form.
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Place":
                # "Place" cells carry their value inside a <span>
                span_val = val.find("span")
                if (span_val is not None):
                    val = span_val
            elif key == "Points":
                key = "VoteGrandTotal"

            # Drill down to the first child node with non-whitespace text.
            # (raw string: "[^\s]" un-raw would be an invalid escape sequence)
            for inner_val in val.contents:
                if (inner_val.string and re.search(r"[^\s]", inner_val.string)):
                    val = inner_val
                    break

            val = val.string.strip()

            this_country_year_rec[key] = val

        if (country_year in country_year_recs):
            # 1956, where countries had 2 entries!
            # Disambiguate both records by appending the first three characters
            # of each entry's whitespace-stripped title to the key.
            country_year_rec = country_year_recs[country_year]
            del country_year_recs[country_year]

            country_year_title = country_year_rec.get("Title")
            country_year_suffix = re.sub(r"\s+","",country_year_title)
            new_country_year_key = country_year + country_year_suffix[0:3]
            country_year_recs[new_country_year_key] = country_year_rec

            this_country_year_title = this_country_year_rec.get("Title")
            this_country_year_suffix = re.sub(r"\s+","",this_country_year_title)
            country_year = country_year + this_country_year_suffix[0:3]

        country_year_recs[country_year] = this_country_year_rec

    # Top up the accumulated version with what has been added into country_year_recs
    accumulated_country_year_recs.update(country_year_recs)

    return accumulated_country_year_recs
[34978] | 230 |
|
---|
| 231 |
|
---|
def debug_output_country_year_recs(country_year_recs):
    """Dump every country-year record and its metadata fields to stdout.

    Debugging aid only; prints "[<key>]" followed by one "field = value"
    line per metadata entry.
    """
    for country_name, country_rec in country_year_recs.items():

        print("[" + country_name + "]")

        for metadata_key, metadata_val in country_rec.items():
            print(" " + metadata_key + " = " + repr(metadata_val))
[34980] | 242 |
|
---|
def retrieve_article_page(year):
    """Return the HTML of the "Eurovision Song Contest <year>" Wikipedia
    article, downloading it on first use and serving the on-disk cache
    (under the module-level cache_dir_) on every later call.
    """
    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    # Cache hit: read the previously saved page straight off disk.
    if os.path.exists(esc_wiki_page_filename):
        print("Retrieving " + esc_wiki_page_file + " from cache")
        return read_text_file(esc_wiki_page_filename)

    # Cache miss: fetch via the wikipedia package, then save for next time.
    print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
    esc_article_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
    esc_article_year_html = esc_article_year_wp.html()

    print(" Saving page to cache")
    write_text_file(esc_wiki_page_filename,esc_article_year_html)

    return esc_article_year_html
| 262 |
|
---|
def process_article_page(esc_article_year_html,year,result_ids, stop_at_first, extra_metadata=None):
    """Extract country entry records from one contest year's article HTML.

    esc_article_year_html: full HTML of the Wikipedia article.
    year: contest year (int).
    result_ids: candidate ids for the results-section heading span
        (e.g. 'Grand_final', 'Final', 'Results').
    stop_at_first: when True, only the first matching heading's table is used.
    extra_metadata: optional dict merged into every record.

    Returns {country+year[+title-prefix]: record-dict}, each record also
    carrying the year's contest logo markup under "YearLogoImg" when found.
    """
    country_year_recs = {}

    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')

    # Locate the heading element(s) that precede the results table(s).
    results_heading_list = []
    for fr_id in result_ids:

        # Manually deal with an exception, where "Final" turns up in a side-bar infobox
        # before the actual Results section in the main page
        if ((year == 1996) and (fr_id == "Final")):
            continue

        results_text_span = esc_year_soup.find("span",id=fr_id)
        if (results_text_span is not None):
            print(" Found Final Results heading with id: " + fr_id)
            results_heading_list.append(results_text_span.parent)
            if (stop_at_first):
                break

    for result_heading in results_heading_list:
        # BUGFIX: this body previously read the leftover 'results_heading'
        # variable from the search loop above, so every iteration re-processed
        # the LAST heading found; it now uses the loop variable.
        results_table = result_heading.findNext('table')
        table_rows = results_table.find_all('tr')
        print(" Number of rows in Results table = " + str(len(table_rows)))

        header_to_vals = html_tablerows_to_hashmap(table_rows)

        convert_cols_to_country_year_recs(header_to_vals,year,country_year_recs, extra_metadata)

        print("==========")

    # Splice the contest logo for the year into each country_year_rec.
    # Wikipedia has changed its infobox CSS over time, so try the older
    # markup first and fall back to the newer 'infobox-image' form.
    infobox_imgs = esc_year_soup.select("table.infobox tr td a.image img")
    if (len(infobox_imgs) == 0):
        infobox_imgs = esc_year_soup.select("table.infobox td.infobox-image img")

    if (len(infobox_imgs) == 0):
        print("****")
        print("****!!! No ESC Logo image found!!!")
        print("****")
    else:
        # Some pages include additional image graphics, such as a map showing
        # country entrants => want the first one (the logo)
        infobox_logo_img = infobox_imgs[0]

        for country_year_key in country_year_recs.keys():
            country_year_rec = country_year_recs.get(country_year_key)

            country_year_rec["YearLogoImg"] = str(infobox_logo_img)

    return country_year_recs
| 344 |
|
---|
| 345 |
|
---|
def process_category_page(year):
    """Return {country_name: 1} for every country linked from the year's
    "Category:Countries in the Eurovision Song Contest <year>" page.

    The category page HTML is cached on disk (under cache_dir_) the same way
    as article pages; country names are recovered from hrefs of the form
    "/wiki/<Country>_in_the_Eurovision_Song_Contest_<year>".
    """
    category_countries = {}

    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    esc_cat_year_html = ""
    if not(os.path.exists(esc_wiki_page_filename)):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        # NOTE(review): the original commented-out code tried wikipedia.page()
        # here and fell back to fetching the raw HTML directly with requests.
        response = requests.get("https://en.wikipedia.org/wiki/"+esc_wiki_page)
        # Fail fast on HTTP errors rather than caching an error page as if it
        # were the category listing.
        response.raise_for_status()
        esc_cat_year_html = response.text

        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename,esc_cat_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_cat_year_html = read_text_file(esc_wiki_page_filename)

    esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser')

    # Links of interest look like
    # "/wiki/France_in_the_Eurovision_Song_Contest_2005"
    a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_" + str(year) + r"$")

    esc_cat_a_tags = esc_cat_year_soup.find_all("a",href=a_href_re)

    for a_tag in esc_cat_a_tags:
        href = a_tag.get("href")
        country = re.search(a_href_re,href).group(1)
        country = country.replace("_"," ")

        category_countries[country] = 1

    return category_countries
| 387 |
|
---|
| 388 |
|
---|