1 |
|
---|
2 | # TODO
|
---|
3 | # Check to see if Song/Title coming back with quotes around it
|
---|
4 | # In addition to Languages->Language, split on "," ??
|
---|
5 | # (e.g., 2005 has "Language" but entries like English, Spanish)
|
---|
6 |
|
---|
7 |
|
---|
8 |
|
---|
9 | from __future__ import print_function
|
---|
10 |
|
---|
11 | import json
|
---|
12 | import os
|
---|
13 | import re
|
---|
14 | import requests
|
---|
15 |
|
---|
16 | import argparse
|
---|
17 | import bs4
|
---|
18 | import wikipedia
|
---|
19 |
|
---|
# Enable extra diagnostic output when True.
DEBUG=False


# Module-wide directory where downloaded Wikipedia pages are cached.
# Set by init_cache(); must be initialised before the retrieve/process
# functions below are called.
cache_dir_=None
|
---|
24 |
|
---|
def init_cache(cache_dir):
    """Remember *cache_dir* in the module-global cache_dir_ and create it.

    Args:
        cache_dir: Directory used to cache downloaded Wikipedia pages.
    """
    global cache_dir_
    cache_dir_ = cache_dir
    if not os.path.exists(cache_dir_):
        print("Making cache directory: " + cache_dir_)
        # makedirs(exist_ok=True) avoids the race between the exists()
        # check and creation, and also creates intermediate directories.
        os.makedirs(cache_dir_, exist_ok=True)
---|
31 |
|
---|
32 |
|
---|
def read_text_file(input_filename):
    """Return the entire contents of *input_filename* as a string."""
    # Context manager guarantees the handle is closed even if read() raises.
    with open(input_filename, "r") as f:
        return f.read()
|
---|
39 |
|
---|
def write_text_file(output_filename, text):
    """Write *text* to *output_filename*, replacing any existing contents."""
    # Context manager guarantees the handle is flushed and closed on error.
    with open(output_filename, "w") as f:
        f.write(text)
|
---|
44 |
|
---|
def html_tablerows_to_hashmap(table_rows):
    """Map each table header label to the list of <td> tags in its column.

    The first row supplies the headers; "Language(s)" and "Song" are
    normalised to "Language" and "Title" so year-to-year variations in
    the Wikipedia tables collapse onto one schema.
    """
    # Canonicalise header labels that vary between contest years.
    rename = {"Language(s)": "Language", "Song": "Title"}

    headers = []
    header_to_vals = {}
    for th in table_rows[0].find_all("th"):
        label = th.contents[0].strip()
        label = rename.get(label, label)
        headers.append(label)
        header_to_vals[label] = []

    print(" Headers = " + ",".join(header_to_vals.keys()))

    # Remaining rows: distribute each cell tag to its column's list.
    for row in table_rows[1:]:
        for pos, cell in enumerate(row.find_all("td")):
            header_to_vals[headers[pos]].append(cell)

    return header_to_vals
|
---|
74 |
|
---|
def convert_cols_to_country_year_recsOLD(header_to_vals,year):
    """Legacy version: build {country+year: metadata dict} from table columns.

    Superseded by convert_cols_to_country_year_recs(), which additionally
    accumulates across multiple tables and handles duplicate countries.

    Args:
        header_to_vals: dict of header label -> list of bs4 <td> tags,
            as produced by html_tablerows_to_hashmap().
        year: contest year (int); the per-country links end in this year.

    Returns:
        dict keyed on country name + year, each value a dict of
        {"Country", "Year", <other table columns>} with string values.
    """
    country_year_recs = {}

    # Country links all end in the year, e.g. ".../Ireland_..._1995"
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a", href=a_href_re).string
        country_year = country + str(year)
        country_year_recs[country_year] = {"Country": country, "Year": year}

    for key in header_to_vals.keys():
        if key == "Country":
            continue

        vals = header_to_vals.get(key)

        for l in range(0, len(vals)):
            country_tag = header_to_vals.get("Country")[l]
            country = country_tag.find("a", href=a_href_re).string
            country_year = country + str(year)

            val = vals[l]

            # Prefer the text of a nested tag when one is present.
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val

            # Descend to the first child holding non-whitespace text.
            # r"\S" (raw string) replaces the invalid "\s" escape sequence.
            for inner_val in val.contents:
                if inner_val.string and re.search(r"\S", inner_val.string):
                    val = inner_val
                    break

            val = val.string.strip()

            country_year_recs[country_year][key] = val

    return country_year_recs
|
---|
134 |
|
---|
135 |
|
---|
136 |
|
---|
137 |
|
---|
def convert_cols_to_country_year_recs(header_to_vals,year,accumulated_country_year_recs,extra_metadata):
    """Convert table columns into per-country records and accumulate them.

    Args:
        header_to_vals: dict of header label -> list of bs4 <td> tags,
            as produced by html_tablerows_to_hashmap().
        year: contest year (int); per-country links end in this year.
        accumulated_country_year_recs: dict updated in place with the
            records built from this table (semi-finals and the final from
            the same year accumulate into the same dict).
        extra_metadata: optional dict copied into every record, or None.

    Returns:
        accumulated_country_year_recs (also mutated in place).
    """
    a_href_re = re.compile(r"^.*" + str(year) + r"$")

    country_list = []
    for country_tag in header_to_vals.get("Country"):
        country = country_tag.find("a", href=a_href_re).string
        country_list.append(country)

    # Only need to worry about a country double-up occurring in this
    # country_list.  OK for there to be a double-up between 'accumulated'
    # and country_list, as this is just a sign of a country that was in a
    # semi-final progressing to the final.
    country_year_recs = {}

    for i in range(0, len(country_list)):
        country = country_list[i]
        country_year = country + str(year)

        this_country_year_rec = {}
        if extra_metadata is not None:
            this_country_year_rec = extra_metadata.copy()

        this_country_year_rec["Country"] = country
        this_country_year_rec["Year"] = year

        for key in header_to_vals.keys():
            val = header_to_vals.get(key)[i]

            if key == "Country":
                # Keep the flag <img> HTML alongside the country name.
                country_flag_img = val.find("img")
                this_country_year_rec["FlagImg"] = str(country_flag_img)
                continue

            # Prefer the text of a nested tag when one is present.
            if key in ("Artist", "Title", "Language"):
                a_val = val.find("a")
                if a_val is not None:
                    val = a_val
            elif key == "Place":
                span_val = val.find("span")
                if span_val is not None:
                    val = span_val
            elif key == "Points":
                key = "VoteGrandTotal"

            # Descend to the first child holding non-whitespace text.
            # r"\S" (raw string) replaces the invalid "\s" escape sequence.
            for inner_val in val.contents:
                if inner_val.string and re.search(r"\S", inner_val.string):
                    val = inner_val
                    break

            val = val.string.strip()

            this_country_year_rec[key] = val

        if country_year in country_year_recs:
            # 1956, where countries had 2 entries!  Disambiguate both the
            # existing and the new record by appending the first three
            # non-whitespace characters of each song title to the key.
            country_year_rec = country_year_recs[country_year]
            del country_year_recs[country_year]

            country_year_title = country_year_rec.get("Title")
            country_year_suffix = re.sub(r"\s+", "", country_year_title)
            new_country_year_key = country_year + country_year_suffix[0:3]
            country_year_recs[new_country_year_key] = country_year_rec

            this_country_year_title = this_country_year_rec.get("Title")
            this_country_year_suffix = re.sub(r"\s+", "", this_country_year_title)
            country_year = country_year + this_country_year_suffix[0:3]

        country_year_recs[country_year] = this_country_year_rec

    # Top up the accumulated version with what has been added into
    # country_year_recs
    for country_year in country_year_recs.keys():
        accumulated_country_year_recs[country_year] = country_year_recs[country_year]

    return accumulated_country_year_recs
|
---|
230 |
|
---|
231 |
|
---|
def debug_output_country_year_recs(country_year_recs):
    """Print every record key and its metadata fields, for debugging."""
    for country_name, country_rec in country_year_recs.items():
        print("[" + country_name + "]")
        for metadata_key, metadata_val in country_rec.items():
            print(" " + metadata_key + " = " + repr(metadata_val))
|
---|
242 |
|
---|
def retrieve_article_page(year):
    """Return the HTML of the 'Eurovision Song Contest <year>' Wikipedia page.

    On a cache miss the page is downloaded via the wikipedia package and
    saved under cache_dir_; otherwise the cached copy is served.
    """
    page_title = "Eurovision_Song_Contest_" + str(year)
    cache_file = page_title.replace(" ", "_") + ".html"
    cache_path = os.path.join(cache_dir_, cache_file)

    if os.path.exists(cache_path):
        print("Retrieving " + cache_file + " from cache")
        return read_text_file(cache_path)

    print("Retrieving Wikipedia page '" + page_title + "'")
    page = wikipedia.page(page_title, auto_suggest=False, preload=True, redirect=True)
    page_html = page.html()

    print(" Saving page to cache")
    write_text_file(cache_path, page_html)

    return page_html
|
---|
262 |
|
---|
def process_article_page(esc_article_year_html,year,result_ids, stop_at_first, extra_metadata=None):
    """Extract per-country result records from an ESC article page.

    Args:
        esc_article_year_html: full HTML of the year's Wikipedia article.
        year: contest year (int).
        result_ids: span ids marking candidate results headings
            (e.g. 'Grand_final', 'Final', 'Results').
        stop_at_first: when True, only the first matching heading is used.
        extra_metadata: optional dict merged into every record.

    Returns:
        dict of country+year -> record dict, accumulated over all the
        results tables found.
    """
    country_year_recs = {}

    esc_year_soup = bs4.BeautifulSoup(esc_article_year_html, 'html.parser')

    results_heading_list = []
    for fr_id in result_ids:

        # Manually deal with an exception, where "Final" turns up in a side-bar infobox
        # before the actual Results section in the main page
        if (year == 1996) and (fr_id == "Final"):
            continue

        results_text_span = esc_year_soup.find("span", id=fr_id)
        if results_text_span is not None:
            print(" Found Final Results heading with id: " + fr_id)
            results_heading_list.append(results_text_span.parent)
            if stop_at_first:
                break

    for result_heading in results_heading_list:

        # BUG FIX: previously this used 'results_heading' (the leftover
        # variable from the search loop above), so every iteration scraped
        # the table after the LAST heading found rather than each heading's
        # own table.  Use the loop variable instead.
        results_table = result_heading.findNext('table')
        table_rows = results_table.find_all('tr')
        print(" Number of rows in Results table = " + str(len(table_rows)))

        header_to_vals = html_tablerows_to_hashmap(table_rows)

        convert_cols_to_country_year_recs(header_to_vals, year, country_year_recs, extra_metadata)

        print("==========")

    return country_year_recs
|
---|
305 |
|
---|
306 |
|
---|
def process_category_page(year):
    """Scrape the 'Countries in the Eurovision Song Contest <year>' category.

    Returns a dict whose keys are the participating country names (values
    are the placeholder 1), derived from the category page's links.
    """
    page_title = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    cache_file = page_title.replace(" ", "_") + ".html"
    cache_path = os.path.join(cache_dir_, cache_file)

    if os.path.exists(cache_path):
        print("Retrieving " + cache_file + " from cache")
        page_html = read_text_file(cache_path)
    else:
        print("Retrieving Wikipedia page '" + page_title + "'")
        # Fetch the raw HTML directly with requests (rather than the
        # wikipedia package) for Category: pages.
        response = requests.get("https://en.wikipedia.org/wiki/" + page_title)
        page_html = response.text

        print(" Saving page to cache")
        write_text_file(cache_path, page_html)

    soup = bs4.BeautifulSoup(page_html, 'html.parser')

    # Links of the form /wiki/<Country>_in_the_Eurovision_Song_Contest_<year>
    link_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_" + str(year) + r"$")

    category_countries = {}
    for a_tag in soup.find_all("a", href=link_re):
        href = a_tag.get("href")
        country = re.search(link_re, href).group(1)
        category_countries[country.replace("_", " ")] = 1

    return category_countries
|
---|
347 |
|
---|
348 |
|
---|
349 |
|
---|