source: main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py@ 35117

Last change on this file since 35117 was 35117, checked in by davidb, 3 years ago

Fix for why extracting flag was always picking the last entry in the table

File size: 11.3 KB
Line 
1
# TODO
# Check to see if Song/Title is coming back with quotes around it
# In addition to renaming Languages -> Language, should values be split on ","?
# (e.g., 2005 has "Language" but entries like "English, Spanish")
6
7
8
9from __future__ import print_function
10
11import json
12import os
13import re
14import requests
15
16import argparse
17import bs4
18import wikipedia
19
# Module-wide debug flag (not currently consulted anywhere in this file).
DEBUG=False


# Path to the on-disk page-cache directory; set by init_cache() before use.
cache_dir_=None
24
def init_cache(cache_dir):
    """Record *cache_dir* as the module-level cache location, creating it on demand."""
    global cache_dir_
    cache_dir_ = cache_dir
    if os.path.exists(cache_dir_):
        return
    print("Making cache directory: " + cache_dir_)
    os.mkdir(cache_dir_)
31
32
def read_text_file(input_filename):
    """Return the entire contents of *input_filename* as a string.

    Uses a context manager so the handle is closed even if read() raises,
    and pins the encoding to UTF-8 (Wikipedia pages are UTF-8) instead of
    relying on the platform default locale encoding.
    """
    with open(input_filename, "r", encoding="utf-8") as f:
        return f.read()
39
def write_text_file(output_filename, text):
    """Write *text* to *output_filename*, replacing any existing file.

    Uses a context manager so the handle is closed even if write() raises,
    and pins the encoding to UTF-8 so cached pages round-trip the same way
    on every platform.
    """
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
44
def html_tablerows_to_hashmap(table_rows):
    """Turn an HTML results table (list of <tr> bs4 tags) into a column map.

    The first row supplies the headers; "Language(s)" and "Song" are
    normalised to "Language" and "Title".  Returns a dict mapping each
    header label to the list of <td> cells in that column, one per data row.
    """
    # Header-label normalisation applied to the raw <th> text.
    rename = {"Language(s)": "Language", "Song": "Title"}

    headers = []
    header_to_vals = {}
    for th in table_rows[0].find_all("th"):
        label = th.contents[0].strip()
        label = rename.get(label, label)
        headers.append(label)
        header_to_vals[label] = []

    print(" Headers = " + ",".join(header_to_vals.keys()))

    # Remaining rows are data rows: file each cell under its column header.
    for row in table_rows[1:]:
        for col, cell in enumerate(row.find_all("td")):
            header_to_vals[headers[col]].append(cell)

    return header_to_vals
74
# NOTE(review): superseded by convert_cols_to_country_year_recs() below;
# kept for reference only.  Unlike the replacement it re-derives the country
# for every column/row pair and does not handle 1956-style double entries.
def convert_cols_to_country_year_recsOLD(header_to_vals,year):
    # Build one metadata record per country for the given year, keyed by
    # country-name + year (e.g. "Norway1986").

    country_year_recs = {}

    # Country cells link to pages whose URL ends with the year.
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    # First pass: seed a record for every country in the "Country" column.
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        # print("**** country = " + country)
        country_year = country+str(year)

        country_year_recs[country_year] = { "Country": country, "Year": year }


    # Second pass: fill in every other column, row by row.
    for key in header_to_vals.keys():
        if (key == "Country"):
            continue

        vals = header_to_vals.get(key)

        for l in range(0,len(vals)):
            country_tag = header_to_vals.get("Country")[l]
            # NOTE(review): country_flag_img is computed but never used here.
            country_flag_img = country_tag.find("img")
            country = country_tag.find("a",href=a_href_re).string
            country_year = country+str(year)

            val = vals[l]

            # For linked/wrapped cells, prefer the inner <a>/<span> element.
            if key == "Artist":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Title":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Language":
                a_val = val.find("a")
                if (a_val is not None):
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if (span_val is not None):
                    val = span_val

            # Drill down to the first child holding non-whitespace text.
            for inner_val in val.contents:
                if (inner_val.string and re.search("[^\s]",inner_val.string)):
                    val = inner_val
                    break

            val = val.string.strip()

            #print("country = " + country);
            #print("key = " + key);
            #print("*** storing: " + country + "[" + key + "] = " + val)

            country_year_recs[country_year][key] = val

    return country_year_recs
134
135
136
137
def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs,extra_metadata):
    """Convert the column map from html_tablerows_to_hashmap() into records.

    header_to_vals: dict of header label -> list of <td> bs4 tags, one per row.
    year: contest year (int); combined with the country name to key records.
    accumulated_country_year_recs: dict updated in place with the new records
        and also returned.  A country appearing both here and in the
        accumulated dict is simply overwritten (a semi-finalist progressing
        to the final).
    extra_metadata: optional dict of extra fields copied into every record.

    The "Points" column is stored as "VoteGrandTotal"; the flag <img> markup
    from the Country cell is stored as "FlagImg".
    """

    # Country cells link to pages whose URL ends with the year.
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    country_list = []
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a",href=a_href_re).string
        country_list.append(country)

    # Only need to worry about a country double up occurring in this
    # country_list.  OK for there to be a double up between 'accumulated' and
    # country_list, as this is just a sign of a country that was in a
    # semi-final progressing to the final.

    country_year_recs = {}

    for i, country in enumerate(country_list):
        country_year = country + str(year)

        this_country_year_rec = {}
        if extra_metadata is not None:
            this_country_year_rec = extra_metadata.copy()

        this_country_year_rec["Country"] = country
        this_country_year_rec["Year"] = year

        for key in header_to_vals.keys():
            val = header_to_vals.get(key)[i]

            if key == "Country":
                # Keep the flag <img> markup rather than the cell text.
                country_flag_img = val.find("img")
                this_country_year_rec["FlagImg"] = str(country_flag_img)
                continue

            # These three columns wrap their value in a link; the Place
            # column wraps it in a <span>.
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val
            elif key == "Points":
                key = "VoteGrandTotal"

            # Drill down to the first child holding non-whitespace text.
            # (Raw string fixes the invalid "\s" escape in the original.)
            for inner_val in val.contents:
                if inner_val.string and re.search(r"\S", inner_val.string):
                    val = inner_val
                    break

            val = val.string.strip()

            this_country_year_rec[key] = val

        if country_year in country_year_recs:
            # 1956, where countries had 2 entries!  Disambiguate both records
            # by appending the first 3 letters of each song title to the key.

            country_year_rec = country_year_recs.pop(country_year)

            country_year_title = country_year_rec.get("Title")
            country_year_suffix = re.sub(r"\s+", "", country_year_title)
            country_year_recs[country_year + country_year_suffix[0:3]] = country_year_rec

            this_country_year_title = this_country_year_rec.get("Title")
            this_country_year_suffix = re.sub(r"\s+", "", this_country_year_title)
            country_year = country_year + this_country_year_suffix[0:3]

        country_year_recs[country_year] = this_country_year_rec

    # Top up the accumulated version with what has been added into country_year_recs
    for country_year in country_year_recs.keys():
        accumulated_country_year_recs[country_year] = country_year_recs[country_year]

    return accumulated_country_year_recs
230
231
def debug_output_country_year_recs(country_year_recs):
    """Pretty-print every record in *country_year_recs* for debugging."""
    for country_name, country_rec in country_year_recs.items():
        print("[" + country_name + "]")
        for metadata_key, metadata_val in country_rec.items():
            print(" " + metadata_key + " = " + repr(metadata_val))
242
def retrieve_article_page(year):
    """Return the HTML of the ESC Wikipedia article for *year*, via the cache.

    On a cache miss the page is fetched with the wikipedia package and the
    HTML is saved into cache_dir_ (which init_cache() must have set up).
    """
    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    # Cache hit: just read the saved HTML back.
    if os.path.exists(esc_wiki_page_filename):
        print("Retrieving " + esc_wiki_page_file + " from cache")
        return read_text_file(esc_wiki_page_filename)

    # Cache miss: fetch the article and save its HTML for next time.
    print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
    esc_article_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
    esc_article_year_html = esc_article_year_wp.html()

    print(" Saving page to cache")
    write_text_file(esc_wiki_page_filename,esc_article_year_html)

    return esc_article_year_html
262
def process_article_page(esc_article_year_html,year,result_ids, stop_at_first, extra_metadata=None):
    """Extract country/year records from the results table(s) of an ESC article.

    esc_article_year_html: full HTML of the year's Wikipedia article.
    year: contest year (int).
    result_ids: candidate <span> ids marking a results heading
        (e.g. 'Grand_final', 'Final', 'Results'), tried in order.
    stop_at_first: when true, stop after the first heading that matches.
    extra_metadata: optional dict of extra fields merged into every record.

    Returns a dict mapping country+year keys to metadata records.
    """

    country_year_recs = {}

    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')

    results_heading_list = []
    for fr_id in result_ids:

        # Manually deal with an exception, where "Final" turns up in a side-bar
        # infobox before the actual Results section in the main page
        if (year == 1996) and (fr_id == "Final"):
            continue

        results_text_span = esc_year_soup.find("span",id=fr_id)
        if results_text_span is not None:
            print(" Found Final Results heading with id: " + fr_id)
            results_heading_list.append(results_text_span.parent)
            if stop_at_first:
                break

    for result_heading in results_heading_list:

        # BUG FIX: this previously read 'results_heading' (the *last* heading
        # found above) instead of the loop variable, so when several headings
        # matched, the same table was processed repeatedly.
        results_table = result_heading.findNext('table')
        table_rows = results_table.find_all('tr')
        print(" Number of rows in Results table = " + str(len(table_rows)))

        header_to_vals = html_tablerows_to_hashmap(table_rows)

        convert_cols_to_country_year_recs(header_to_vals,year,country_year_recs, extra_metadata)

        print("==========")

    ## debug_output_country_year_recs(country_year_recs)

    return country_year_recs
305
306
def process_category_page(year):
    """Return the countries in the 'Countries in the ESC <year>' category page.

    Fetches (and caches) the Wikipedia category page, then scrapes the links
    of the form "/wiki/<Country>_in_the_Eurovision_Song_Contest_<year>".
    Returns a dict whose keys are country names (spaces, not underscores);
    every value is 1, i.e. the dict is used as a set.
    """

    category_countries = {}

    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)

    if not os.path.exists(esc_wiki_page_filename):
        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
        response = requests.get("https://en.wikipedia.org/wiki/"+esc_wiki_page)
        # Fail loudly on HTTP errors rather than caching an error page as if
        # it were the category listing.
        response.raise_for_status()
        esc_cat_year_html = response.text

        print(" Saving page to cache")
        write_text_file(esc_wiki_page_filename,esc_cat_year_html)
    else:
        print("Retrieving " + esc_wiki_page_file + " from cache")
        esc_cat_year_html = read_text_file(esc_wiki_page_filename)

    esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser')

    # Capture the country part of each category link; excluding '/' and ':'
    # skips namespaced and nested links.
    a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_" + str(year) + r"$")

    for a_tag in esc_cat_year_soup.find_all("a",href=a_href_re):
        href = a_tag.get("href")
        country = re.search(a_href_re,href).group(1)
        country = country.replace("_"," ")

        category_countries[country] = 1

    return category_countries
347
348
349
Note: See TracBrowser for help on using the repository browser.