source: main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py@ 35117

Last change on this file since 35117 was 35117, checked in by davidb, 3 years ago

Fix for why extracting flag was always picking the last entry in the table

File size: 11.3 KB
Line 
1
# TODO
# Check to see if Song/Title is coming back with quotes around it
# In addition to renaming Languages -> Language, should values be split on ","?
# (e.g., 2005 has "Language" but entries like "English, Spanish")
6
7
8
9from __future__ import print_function
10
11import json
12import os
13import re
14import requests
15
16import argparse
17import bs4
18import wikipedia
19
# Module-wide debug flag (not currently consulted anywhere in this file).
DEBUG=False


# Path to the on-disk page-cache directory; set by init_cache() before use.
cache_dir_=None
24
def init_cache(cache_dir):
    """Record *cache_dir* as the module-level cache location, creating it on demand."""
    global cache_dir_
    cache_dir_ = cache_dir
    if os.path.exists(cache_dir_):
        return
    print("Making cache directory: " + cache_dir_)
    os.mkdir(cache_dir_)
31
32
def read_text_file(input_filename):
    """Return the entire contents of *input_filename* as a string.

    Uses a context manager so the handle is closed even if read() raises,
    and pins the encoding to UTF-8 (Wikipedia pages are UTF-8) instead of
    relying on the platform default locale encoding.
    """
    with open(input_filename, "r", encoding="utf-8") as f:
        return f.read()
39
def write_text_file(output_filename, text):
    """Write *text* to *output_filename*, replacing any existing file.

    Uses a context manager so the handle is closed even if write() raises,
    and pins the encoding to UTF-8 so cached pages round-trip the same way
    on every platform.
    """
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
44
def html_tablerows_to_hashmap(table_rows):
    """Turn an HTML results table (list of <tr> bs4 tags) into a column map.

    The first row supplies the headers; "Language(s)" and "Song" are
    normalised to "Language" and "Title".  Returns a dict mapping each
    header label to the list of <td> cells in that column, one per data row.
    """
    # Header-label normalisation applied to the raw <th> text.
    rename = {"Language(s)": "Language", "Song": "Title"}

    headers = []
    header_to_vals = {}
    for th in table_rows[0].find_all("th"):
        label = th.contents[0].strip()
        label = rename.get(label, label)
        headers.append(label)
        header_to_vals[label] = []

    print(" Headers = " + ",".join(header_to_vals.keys()))

    # Remaining rows are data rows: file each cell under its column header.
    for row in table_rows[1:]:
        for col, cell in enumerate(row.find_all("td")):
            header_to_vals[headers[col]].append(cell)

    return header_to_vals
74
# NOTE(review): superseded by convert_cols_to_country_year_recs() below;
# kept for reference only.  Unlike the replacement it re-derives the country
# for every column/row pair and does not handle 1956-style double entries.
def convert_cols_to_country_year_recsOLD(header_to_vals,year):
    # Build one metadata record per country for the given year, keyed by
    # country-name + year (e.g. "Norway1986").

    country_year_recs = {}

    # Country cells link to pages whose URL ends with the year.
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    # First pass: seed a record for every country in the "Country" column.
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        # print("**** country = " + country)
        country_year = country+str(year)

        country_year_recs[country_year] = { "Country": country, "Year": year }


    # Second pass: fill in every other column, row by row.
    for key in header_to_vals.keys():
        if (key == "Country"):
            continue

        vals = header_to_vals.get(key)

        for l in range(0,len(vals)):
            country_tag = header_to_vals.get("Country")[l]
            # NOTE(review): country_flag_img is computed but never used here.
            country_flag_img = country_tag.find("img")
            country = country_tag.find("a",href=a_href_re).string
            country_year = country+str(year)

            val = vals[l]

            # For linked/wrapped cells, prefer the inner <a>/<span> element.
            if key == "Artist":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Title":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Language":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if (span_val is not None):
                    val = span_val

            # Drill down to the first child holding non-whitespace text.
            for inner_val in val.contents:
                if (inner_val.string and re.search("[^\s]",inner_val.string)):
                    val = inner_val
                    break

            val = val.string.strip()

            #print("country = " + country);
            #print("key = " + key);
            #print("*** storing: " + country + "[" + key + "] = " + val)

            country_year_recs[country_year][key] = val

    return country_year_recs
134
135
136
137
def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs,extra_metadata):
    """Convert the column map from html_tablerows_to_hashmap() into records.

    header_to_vals: dict of header label -> list of <td> bs4 tags, one per row.
    year: contest year (int); combined with the country name to key records.
    accumulated_country_year_recs: dict updated in place with the new records
        and also returned.  A country appearing both here and in the
        accumulated dict is simply overwritten (a semi-finalist progressing
        to the final).
    extra_metadata: optional dict of extra fields copied into every record.

    The "Points" column is stored as "VoteGrandTotal"; the flag <img> markup
    from the Country cell is stored as "FlagImg".
    """

    # Country cells link to pages whose URL ends with the year.
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    country_list = []
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        country_list.append(country)

    # Only need to worry about a country double up occurring in this
    # country_list.  OK for there to be a double up between 'accumulated' and
    # country_list, as this is just a sign of a country that was in a
    # semi-final progressing to the final.

    country_year_recs = {}

    for i, country in enumerate(country_list):
        country_year = country + str(year)

        this_country_year_rec = {}
        if extra_metadata is not None:
            this_country_year_rec = extra_metadata.copy()

        this_country_year_rec["Country"] = country
        this_country_year_rec["Year"] = year

        for key in header_to_vals.keys():
            val = header_to_vals.get(key)[i]

            if key == "Country":
                # Keep the flag <img> markup rather than the cell text.
                country_flag_img = val.find("img")
                this_country_year_rec["FlagImg"] = str(country_flag_img)
                continue

            # These three columns wrap their value in a link; the Place
            # column wraps it in a <span>.
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val
            elif key == "Points":
                key = "VoteGrandTotal"

            # Drill down to the first child holding non-whitespace text.
            # (Raw string fixes the invalid "\s" escape in the original.)
            for inner_val in val.contents:
                if inner_val.string and re.search(r"\S", inner_val.string):
                    val = inner_val
                    break

            val = val.string.strip()

            this_country_year_rec[key] = val

        if country_year in country_year_recs:
            # 1956, where countries had 2 entries!  Disambiguate both records
            # by appending the first 3 letters of each song title to the key.

            country_year_rec = country_year_recs.pop(country_year)

            country_year_title = country_year_rec.get("Title")
            country_year_suffix = re.sub(r"\s+", "", country_year_title)
            country_year_recs[country_year + country_year_suffix[0:3]] = country_year_rec

            this_country_year_title = this_country_year_rec.get("Title")
            this_country_year_suffix = re.sub(r"\s+", "", this_country_year_title)
            country_year = country_year + this_country_year_suffix[0:3]

        country_year_recs[country_year] = this_country_year_rec

    # Top up the accumulated version with what has been added into country_year_recs
    for country_year in country_year_recs.keys():
        accumulated_country_year_recs[country_year] = country_year_recs[country_year]

    return accumulated_country_year_recs
230
231
def debug_output_country_year_recs(country_year_recs):
    """Pretty-print every record in *country_year_recs* for debugging."""
    for country_name, country_rec in country_year_recs.items():
        print("[" + country_name + "]")
        for metadata_key, metadata_val in country_rec.items():
            print(" " + metadata_key + " = " + repr(metadata_val))
242
def retrieve_article_page(year):
    """Return the HTML of the ESC Wikipedia article for *year*, via the cache.

    On a cache miss the page is fetched with the wikipedia package and the
    HTML is saved into cache_dir_ (which init_cache() must have set up).
    """
    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    # Cache hit: just read the saved HTML back.
    if os.path.exists(esc_wiki_page_filename):
        print("Retrieving " + esc_wiki_page_file + " from cache")
        return read_text_file(esc_wiki_page_filename)

    # Cache miss: fetch the article and save its HTML for next time.
    print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
    esc_article_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
    esc_article_year_html = esc_article_year_wp.html()

    print(" Saving page to cache")
    write_text_file(esc_wiki_page_filename,esc_article_year_html)

    return esc_article_year_html
262
def process_article_page(esc_article_year_html,year,result_ids, stop_at_first, extra_metadata=None):
    """Extract country/year records from the results table(s) of an ESC article.

    esc_article_year_html: full HTML of the year's Wikipedia article.
    year: contest year (int).
    result_ids: candidate <span> ids marking a results heading
        (e.g. 'Grand_final', 'Final', 'Results'), tried in order.
    stop_at_first: when true, stop after the first heading that matches.
    extra_metadata: optional dict of extra fields merged into every record.

    Returns a dict mapping country+year keys to metadata records.
    """

    country_year_recs = {}

    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')

    results_heading_list = []
    for fr_id in result_ids:

        # Manually deal with an exception, where "Final" turns up in a side-bar
        # infobox before the actual Results section in the main page
        if (year == 1996) and (fr_id == "Final"):
            continue

        results_text_span = esc_year_soup.find("span",id=fr_id)
        if results_text_span is not None:
            print(" Found Final Results heading with id: " + fr_id)
            results_heading_list.append(results_text_span.parent)
            if stop_at_first:
                break

    for result_heading in results_heading_list:

        # BUG FIX: this previously read 'results_heading' (the *last* heading
        # found above) instead of the loop variable, so when several headings
        # matched, the same table was processed repeatedly.
        results_table = result_heading.findNext('table')
        table_rows = results_table.find_all('tr')
        print(" Number of rows in Results table = " + str(len(table_rows)))

        header_to_vals = html_tablerows_to_hashmap(table_rows)

        convert_cols_to_country_year_recs(header_to_vals,year,country_year_recs, extra_metadata)

        print("==========")

    ## debug_output_country_year_recs(country_year_recs)

    return country_year_recs
305
306
def process_category_page(year):
    """Return the countries in the 'Countries in the ESC <year>' category page.

    Fetches (and caches) the Wikipedia category page, then scrapes the links
    of the form "/wiki/<Country>_in_the_Eurovision_Song_Contest_<year>".
    Returns a dict whose keys are country names (spaces, not underscores);
    every value is 1, i.e. the dict is used as a set.
    """

    category_countries = {}

    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    if not os.path.exists(esc_wiki_page_filename):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        response = requests.get("https://en.wikipedia.org/wiki/"+esc_wiki_page)
        # Fail loudly on HTTP errors rather than caching an error page as if
        # it were the category listing.
        response.raise_for_status()
        esc_cat_year_html = response.text

        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename,esc_cat_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_cat_year_html = read_text_file(esc_wiki_page_filename)

    esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser')

    # Capture the country part of each category link; excluding '/' and ':'
    # skips namespaced and nested links.
    a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_" + str(year) + r"$")

    for a_tag in esc_cat_year_soup.find_all("a",href=a_href_re):
        href = a_tag.get("href")
        country = re.search(a_href_re,href).group(1)
        country = country.replace("_"," ")

        category_countries[country] = 1

    return category_countries
347
348
349
Note: See TracBrowser for help on using the repository browser.