[34978] | 1 |
|
---|
[34979] | 2 | # TODO
|
---|
| 3 | # Check to see if Song/Title coming back with quotes around it
|
---|
| 4 | # In addition to Languages->Language, split on "," ??
|
---|
| 5 | # (e.g., 2005 has "Language" but entries like English, Spanish)
|
---|
| 6 |
|
---|
| 7 |
|
---|
| 8 |
|
---|
[34978] | 9 | from __future__ import print_function
|
---|
| 10 |
|
---|
| 11 | import json
|
---|
| 12 | import os
|
---|
| 13 | import re
|
---|
[34979] | 14 | import requests
|
---|
[34978] | 15 |
|
---|
| 16 | import argparse
|
---|
| 17 | import bs4
|
---|
| 18 | import wikipedia
|
---|
| 19 |
|
---|
# Module-wide debug flag.
# NOTE(review): not referenced anywhere in the visible code — confirm before relying on it.
DEBUG=False


# Directory used to cache downloaded Wikipedia pages; assigned by init_cache().
cache_dir_=None
|
---|
[34978] | 24 |
|
---|
def init_cache(cache_dir):
    """Set the module-level cache directory, creating it on disk if needed.

    Uses os.makedirs (rather than os.mkdir) so a nested cache path such as
    "out/esc/cache" also works when the parent directories do not yet exist.
    """
    global cache_dir_
    cache_dir_ = cache_dir
    if not os.path.exists(cache_dir_):
        print("Making cache directory: " + cache_dir_)
        # makedirs creates any missing intermediate directories as well
        os.makedirs(cache_dir_)
[34978] | 31 |
|
---|
| 32 |
|
---|
def read_text_file(input_filename):
    """Return the entire contents of *input_filename* as a string.

    Opens the file with an explicit UTF-8 encoding and a context manager:
    cached Wikipedia pages contain non-ASCII artist/title names, which the
    platform's default locale encoding (e.g. cp1252) may fail to decode,
    and the ``with`` block guarantees the handle is closed on error.
    """
    with open(input_filename, "r", encoding="utf-8") as f:
        return f.read()
| 39 |
|
---|
def write_text_file(output_filename, text):
    """Write *text* to *output_filename*, replacing any existing content.

    UTF-8 is used explicitly so pages containing non-ASCII characters
    round-trip with read_text_file() on every platform; the context
    manager ensures the file is flushed and closed even on error.
    """
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
| 44 |
|
---|
def html_tablerows_to_hashmap(table_rows):
    """Turn the rows of an HTML results table into a column-oriented map.

    The first row is treated as the header row (``<th>`` cells); every later
    row contributes one ``<td>`` cell tag to the list stored under the header
    label at the same column index.

    Returns {header_label: [cell_tag, ...]}.
    """
    header_to_vals = {}
    headers = []

    # A couple of header labels vary between contest years; normalise them.
    label_renames = {"Language(s)": "Language", "Song": "Title"}

    for th_cell in table_rows[0].find_all("th"):
        label = th_cell.contents[0].strip()
        label = label_renames.get(label, label)
        headers.append(label)
        header_to_vals[label] = []

    print(" Headers = " + ",".join(header_to_vals.keys()))

    # Body rows: file each cell under the header for its column position.
    for body_row in table_rows[1:]:
        for col, cell in enumerate(body_row.find_all("td")):
            header_to_vals[headers[col]].append(cell)

    return header_to_vals
| 74 |
|
---|
def convert_cols_to_country_year_recsOLD(header_to_vals,year):
    """Build {country+year: metadata-dict} records from table columns.

    NOTE(review): superseded by convert_cols_to_country_year_recs() below,
    which adds accumulation across tables, extra_metadata support, flag
    images, the Points->VoteGrandTotal rename, and duplicate-country (1956)
    handling.  Kept for reference; candidate for deletion.
    """

    country_year_recs = {}

    # Country cell links end with the contest year,
    # e.g. ".../France_in_the_Eurovision_Song_Contest_2005"
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    # First pass: seed one record per country row.
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        # print("**** country = " + country)
        country_year = country+str(year)

        country_year_recs[country_year] = { "Country": country, "Year": year }


    # Second pass: fill in the remaining columns for each country row.
    for key in header_to_vals.keys():
        if (key == "Country"):
            continue

        vals = header_to_vals.get(key)

        for l in range(0,len(vals)):
            country_tag = header_to_vals.get("Country")[l]
            country_flag_img = country_tag.find("img")
            country = country_tag.find("a",href=a_href_re).string
            country_year = country+str(year)

            val = vals[l]

            # Prefer the linked (<a>) form of the value where one exists;
            # "Place" cells carry their value inside a <span> instead.
            if key == "Artist":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Title":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Language":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if (span_val is not None):
                    val = span_val

            # Drill down to the first child node with non-whitespace text.
            for inner_val in val.contents:
                if (inner_val.string and re.search("[^\s]",inner_val.string)):
                    val = inner_val
                    break

            val = val.string.strip()

            #print("country = " + country);
            #print("key = " + key);
            #print("*** storing: " + country + "[" + key + "] = " + val)

            country_year_recs[country_year][key] = val

    return country_year_recs
| 134 |
|
---|
| 135 |
|
---|
| 136 |
|
---|
| 137 |
|
---|
def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs,extra_metadata):
    """Convert column-oriented results-table data into per-country records.

    header_to_vals: {header_label: [bs4 cell tag, ...]} as produced by
        html_tablerows_to_hashmap().
    year: contest year (int); used both in record keys and in the regex that
        picks out the country link.
    accumulated_country_year_recs: dict of records built up over multiple
        tables (e.g. semi-finals plus final); updated in place and returned.
    extra_metadata: optional dict whose entries are copied into every record.

    Records are keyed "<Country><year>"; when the same country appears twice
    in one table (1956!) both keys get a title-derived suffix.
    """
    # Country cell links end with the contest year,
    # e.g. ".../France_in_the_Eurovision_Song_Contest_2005"
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    country_list = []

    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        country_list.append(country)

    # Only need to worry about a country double-up occurring in this country_list.
    # OK for there to be a double-up between 'accumulated' and country_list, as
    # this is just a sign of a country that was in a semi-final progressing to
    # the final.

    country_year_recs = {}

    for i, country in enumerate(country_list):
        country_year = country + str(year)

        this_country_year_rec = {}
        if (extra_metadata is not None):
            this_country_year_rec = extra_metadata.copy()

        this_country_year_rec["Country"] = country
        this_country_year_rec["Year"] = year

        for key in header_to_vals.keys():
            val = header_to_vals.get(key)[i]

            if (key == "Country"):
                # Keep the flag-image markup alongside the country name
                country_flag_img = val.find("img")
                this_country_year_rec["FlagImg"] = str(country_flag_img)
                continue

            # These three columns behave identically: prefer the linked form.
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Place":
                # "Place" cells carry their value inside a <span>
                span_val = val.find("span")
                if (span_val is not None):
                    val = span_val
            elif key == "Points":
                key = "VoteGrandTotal"

            # Drill down to the first child node with non-whitespace text.
            # (raw string: "[^\s]" un-raw would be an invalid escape sequence)
            for inner_val in val.contents:
                if (inner_val.string and re.search(r"[^\s]", inner_val.string)):
                    val = inner_val
                    break

            val = val.string.strip()

            this_country_year_rec[key] = val

        if (country_year in country_year_recs):
            # 1956, where countries had 2 entries!
            # Disambiguate both records by appending the first three characters
            # of each entry's whitespace-stripped title to the key.
            country_year_rec = country_year_recs[country_year]
            del country_year_recs[country_year]

            country_year_title = country_year_rec.get("Title")
            country_year_suffix = re.sub(r"\s+","",country_year_title)
            new_country_year_key = country_year + country_year_suffix[0:3]
            country_year_recs[new_country_year_key] = country_year_rec

            this_country_year_title = this_country_year_rec.get("Title")
            this_country_year_suffix = re.sub(r"\s+","",this_country_year_title)
            country_year = country_year + this_country_year_suffix[0:3]

        country_year_recs[country_year] = this_country_year_rec

    # Top up the accumulated version with what has been added into country_year_recs
    accumulated_country_year_recs.update(country_year_recs)

    return accumulated_country_year_recs
[34978] | 230 |
|
---|
| 231 |
|
---|
def debug_output_country_year_recs(country_year_recs):
    """Dump every country-year record and its metadata fields to stdout.

    Debugging aid only; prints "[<key>]" followed by one "field = value"
    line per metadata entry.
    """
    for country_name, country_rec in country_year_recs.items():

        print("[" + country_name + "]")

        for metadata_key, metadata_val in country_rec.items():
            print(" " + metadata_key + " = " + repr(metadata_val))
[34980] | 242 |
|
---|
def retrieve_article_page(year):
    """Return the HTML of the "Eurovision Song Contest <year>" Wikipedia
    article, downloading it on first use and serving the on-disk cache
    (under the module-level cache_dir_) on every later call.
    """
    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    # Cache hit: read the previously saved page straight off disk.
    if os.path.exists(esc_wiki_page_filename):
        print("Retrieving " + esc_wiki_page_file + " from cache")
        return read_text_file(esc_wiki_page_filename)

    # Cache miss: fetch via the wikipedia package, then save for next time.
    print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
    esc_article_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
    esc_article_year_html = esc_article_year_wp.html()

    print(" Saving page to cache")
    write_text_file(esc_wiki_page_filename,esc_article_year_html)

    return esc_article_year_html
| 262 |
|
---|
def process_article_page(esc_article_year_html,year,result_ids, stop_at_first, extra_metadata=None):
    """Extract country entry records from one contest year's article HTML.

    esc_article_year_html: full HTML of the Wikipedia article.
    year: contest year (int).
    result_ids: candidate ids for the results-section heading span
        (e.g. 'Grand_final', 'Final', 'Results').
    stop_at_first: when True, only the first matching heading's table is used.
    extra_metadata: optional dict merged into every record.

    Returns {country+year[+title-prefix]: record-dict}, each record also
    carrying the year's contest logo markup under "YearLogoImg" when found.
    """
    country_year_recs = {}

    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')

    # Locate the heading element(s) that precede the results table(s).
    results_heading_list = []
    for fr_id in result_ids:

        # Manually deal with an exception, where "Final" turns up in a side-bar infobox
        # before the actual Results section in the main page
        if ((year == 1996) and (fr_id == "Final")):
            continue

        results_text_span = esc_year_soup.find("span",id=fr_id)
        if (results_text_span is not None):
            print(" Found Final Results heading with id: " + fr_id)
            results_heading_list.append(results_text_span.parent)
            if (stop_at_first):
                break

    for result_heading in results_heading_list:
        # BUGFIX: this body previously read the leftover 'results_heading'
        # variable from the search loop above, so every iteration re-processed
        # the LAST heading found; it now uses the loop variable.
        results_table = result_heading.findNext('table')
        table_rows = results_table.find_all('tr')
        print(" Number of rows in Results table = " + str(len(table_rows)))

        header_to_vals = html_tablerows_to_hashmap(table_rows)

        convert_cols_to_country_year_recs(header_to_vals,year,country_year_recs, extra_metadata)

        print("==========")

    # Splice the contest logo for the year into each country_year_rec.
    # Wikipedia has changed its infobox CSS over time, so try the older
    # markup first and fall back to the newer 'infobox-image' form.
    infobox_imgs = esc_year_soup.select("table.infobox tr td a.image img")
    if (len(infobox_imgs) == 0):
        infobox_imgs = esc_year_soup.select("table.infobox td.infobox-image img")

    if (len(infobox_imgs) == 0):
        print("****")
        print("****!!! No ESC Logo image found!!!")
        print("****")
    else:
        # Some pages include additional image graphics, such as a map showing
        # country entrants => want the first one (the logo)
        infobox_logo_img = infobox_imgs[0]

        for country_year_key in country_year_recs.keys():
            country_year_rec = country_year_recs.get(country_year_key)

            country_year_rec["YearLogoImg"] = str(infobox_logo_img)

    return country_year_recs
| 344 |
|
---|
| 345 |
|
---|
def process_category_page(year):
    """Return {country_name: 1} for every country linked from the year's
    "Category:Countries in the Eurovision Song Contest <year>" page.

    The category page HTML is cached on disk (under cache_dir_) the same way
    as article pages; country names are recovered from hrefs of the form
    "/wiki/<Country>_in_the_Eurovision_Song_Contest_<year>".
    """
    category_countries = {}

    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    esc_cat_year_html = ""
    if not(os.path.exists(esc_wiki_page_filename)):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        # NOTE(review): the original commented-out code tried wikipedia.page()
        # here and fell back to fetching the raw HTML directly with requests.
        response = requests.get("https://en.wikipedia.org/wiki/"+esc_wiki_page)
        # Fail fast on HTTP errors rather than caching an error page as if it
        # were the category listing.
        response.raise_for_status()
        esc_cat_year_html = response.text

        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename,esc_cat_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_cat_year_html = read_text_file(esc_wiki_page_filename)

    esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser')

    # Links of interest look like
    # "/wiki/France_in_the_Eurovision_Song_Contest_2005"
    a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_" + str(year) + r"$")

    esc_cat_a_tags = esc_cat_year_soup.find_all("a",href=a_href_re)

    for a_tag in esc_cat_a_tags:
        href = a_tag.get("href")
        country = re.search(a_href_re,href).group(1)
        country = country.replace("_"," ")

        category_countries[country] = 1

    return category_countries
| 387 |
|
---|
| 388 |
|
---|