Changeset 34978
- Timestamp: 2021-03-27T22:16:25+13:00 (3 years ago)
- Location: main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories
- Files: 1 added, 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py
r34977 r34978 1 1 #!/usr/bin/env python 2 2 3 from __future__ import print_function 3 ### TODO 4 ### Grab image for competition year out of infobox??? 4 5 6 import json 5 7 import os 6 8 import re 7 9 8 import bs4 9 import wikipedia 10 import argparse 11 12 import escwikipedia 10 13 11 14 12 cache_dir="cache-wikipedia" 15 def gs_fileset_country_in_year(country_year_name, country_year_rec, nul_output_dir_name): 16 17 # Looking to build data-structure (for output as JSON) in the form 18 # { "FileSet": 19 # [ 20 # { "FileName": "France1991\.nul" }, 21 # { "Description": 22 # { 23 # "Metadata": 24 # [ 25 # { "name": "Place", "content": 12 }, 26 # { "name": "Title", "content": "...." }, 27 # ... 28 # ] 29 # } 30 # } 31 # ] 32 # } 33 34 metadata_array = [] 35 36 country_year_id = re.sub(r'\s+', '', country_year_name) 37 38 metadata_array.append({"name": "Identifier", "content": country_year_id}) 39 40 for metadata_key in country_year_rec.keys(): 41 metadata_val = country_year_rec.get(metadata_key) 42 metadata_array.append({"name": metadata_key, "content": metadata_val}) 13 43 14 44 15 def read_text_file(input_filename): 16 f = open(input_filename, "r") 17 text = f.read() 18 f.close() 45 # id encodes country and year, 46 filename_id = country_year_id + "\\.nul" 19 47 20 return text48 nul_filename = os.path.join(nul_output_dir_name,country_year_id+".nul"); 21 49 22 def write_text_file(output_filename, text): 23 f = open(output_filename, "w") 24 f.write(text) 25 f.close() 50 print(" Creating: " + nul_filename) 51 52 with open(nul_filename, 'w') as outfile: 53 outfile.write("") 54 55 fileset = { 56 "FileSet" : [ 57 { "FileName": filename_id, }, 58 { "Description" : { "Metadata" : metadata_array } } 59 ] 60 } 61 62 return fileset 26 63 27 def html_tablerows_to_hashmap(table_rows):28 64 29 table_header = table_rows[0]30 65 31 header_to_vals = {} 32 headers = [] 66 def gs_directory_metadata(all_country_year_array_of_recs, json_output_filename): 67 68 # Express the grouped 
from-country voting data 69 # in the Greenstone JSON metadata format: 70 71 # { "DirectoryMetadata": 72 # [ 73 # { "FileSet": 74 # [ 75 # { "FileName": "France1991\.nul" }, 76 # { "Description": 77 # { 78 # "Metadata": 79 # [ 80 # { "name": "Place", "content": "12" }, 81 # ... 82 # ] 83 # } 84 # } 85 # ] 86 # } 87 # ... 88 # ] 89 # } 90 91 nul_output_dir_name = os.path.dirname(json_output_filename) 92 93 directory_metadata = [] 94 95 for all_in_given_year_recs in all_country_year_array_of_recs: 33 96 34 header_cols = table_header.find_all("th"); 35 for header in header_cols: 36 header_label = header.contents[0].strip() 37 if (header_label == "Language(s)"): 38 header_label = "Language" 39 40 headers.append(header_label) 41 header_to_vals[header_label] = [] 97 for country_year_name in all_in_given_year_recs.keys(): 98 country_year_rec = all_in_given_year_recs.get(country_year_name) 42 99 43 print(" Headers = " + ",".join(header_to_vals.keys())) 44 45 for y in range(1, len(table_rows)): 46 tds = table_rows[y].find_all("td"); 47 for x in range(0,len(tds)): 48 val = tds[x] 49 header_label = headers[x] 50 header_to_vals[header_label].append(val) 51 52 return header_to_vals 100 fileset = gs_fileset_country_in_year(country_year_name, country_year_rec, nul_output_dir_name) 101 directory_metadata.append(fileset) 102 53 103 54 def convert_cols_to_country_year_recs(header_to_vals,year): 104 greenstone_metadata_json = { "DirectoryMetadata": directory_metadata } 55 105 56 country_year_recs = {}106 return greenstone_metadata_json 57 107 58 a_href_re = re.compile(r"^.*" + str(year) + r"$")59 60 for country_tag in header_to_vals.get("Country"):61 # print("@@@@@ td = " + repr(country_tag));62 63 #country = country_tag.find("a",href=re.compile(r"^.*" + str(year) + r"$")).string64 country = country_tag.find("a",href=a_href_re).string65 # print("**** country = " + country)66 country_year = country+str(year)67 68 country_year_recs[country_year] = {}69 70 for key in 
header_to_vals.keys():71 if (key == "Country"):72 continue73 108 74 vals = header_to_vals.get(key)75 76 for l in range(0,len(vals)):77 country_tag = header_to_vals.get("Country")[l]78 country_flag_img = country_tag.find("img")79 #country = country_tag.find("a",href=re.compile(r"^.*" + str(year) + r"$")).string80 country = country_tag.find("a",href=a_href_re).string81 country_year = country+str(year)82 83 val = vals[l]84 85 if key == "Artist":86 a_val = val.find("a")87 if (a_val is not None):88 val = a_val89 elif key == "Song":90 a_val = val.find("a")91 if (a_val is not None):92 val = a_val93 elif key == "Language":94 a_val = val.find("a")95 if (a_val is not None):96 val = a_val97 elif key == "Place":98 span_val = val.find("span")99 if (span_val is not None):100 val = span_val101 109 102 for inner_val in val.contents: 103 if (inner_val.string and re.search("[^\s]",inner_val.string)): 104 val = inner_val 105 break 106 107 val = val.string.strip() 110 def save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename): 108 111 109 #print("country = " + country); 110 #print("key = " + key); 111 #print("*** storing: " + country + "[" + key + "] = " + val) 112 113 country_year_recs[country_year][key] = val 114 115 return country_year_recs 116 117 def output_country_year_recs(country_year_recs): 118 119 for country_name in country_year_recs.keys(): 120 country_rec = country_year_recs.get(country_name) 121 122 print("[" + country_name + "]") 123 124 for metadata_key in country_rec.keys(): 125 metadata_val = country_rec.get(metadata_key) 126 print(" " + metadata_key + " = " + repr(metadata_val)) 127 128 def process_category_page(year): 129 esc_wiki_page = "Eurovision_Song_Contest_" + str(year) 130 esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html" 131 esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file) 132 133 esc_year_html = "" 134 if not(os.path.exists(esc_wiki_page_filename)): 135 print("Retrieving Wikipedia page '" + esc_wiki_page 
+ "'") 136 esc_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True) 137 esc_year_html = esc_year_wp.html() 138 139 print(" Saving page to cache") 140 write_text_file(esc_wiki_page_filename,esc_year_html) 141 else: 142 print("Retrieving " + esc_wiki_page_file + " from cache") 143 esc_year_html = read_text_file(esc_wiki_page_filename) 144 145 146 esc_year_soup = bs4.BeautifulSoup(esc_year_html, 'html.parser') 147 148 final_result_ids = [ 'Grand_final', 'Final', 'Results' ] 149 150 results_heading = None 151 for fr_id in final_result_ids: 152 153 if ((year == 1996) and (fr_id == "Final")): 154 continue 155 156 results_text_span = esc_year_soup.find("span",id=fr_id) 157 if (results_text_span is not None): 158 print(" Found Final Results heading with id: " + fr_id); 159 results_heading = results_text_span.parent 160 # print("**** parent tag: " + results_heading.name); 161 break 162 163 # print (results_heading) 164 165 results_table = results_heading.findNext('table') 166 table_rows = results_table.find_all('tr'); 167 print(" " + esc_wiki_page_file + ": number of rows in Results table = " + str(len(table_rows))) 168 169 header_to_vals = html_tablerows_to_hashmap(table_rows) 170 171 country_year_recs = convert_cols_to_country_year_recs(header_to_vals,year) 172 173 print() 174 print("==========") 175 176 output_country_year_recs(country_year_recs) 112 with open(json_output_filename, 'w') as outfile: 113 json.dump(greenstone_metadata_json, outfile, indent=2) 177 114 178 115 … … 182 119 # https://en.wikipedia.org/wiki/Category:Countries_in_the_Eurovision_Song_Contest_1957 183 120 184 if not(os.path.exists(cache_dir)):185 print("Making cache directory: " + cache_dir)186 os.mkdir(cache_dir)187 188 for year in range(1956, 2020):189 process_category_page(year)121 parser = argparse.ArgumentParser() 122 parser.add_argument('--startyear', type=int, default=1956) 123 parser.add_argument('--endyear', type=int, default=1999) 124 
parser.add_argument('output-file.json', nargs="?", default="metadata-esc-year/metadata-esc.json"); 125 126 args = parser.parse_args() 190 127 128 start_year = getattr(args,'startyear') 129 end_year = getattr(args,'endyear') 191 130 192 # for row in table_rows: 193 # print("*** " + repr(row)); 131 json_output_filename = getattr(args,'output-file.json') 132 133 134 all_country_year_recs = [] 194 135 195 # for sibling in results_heading.previous_siblings:196 # print(repr(sibling))136 for year in range(start_year, end_year+1): 137 country_year_recs = escwikipedia.process_category_page(year) 197 138 139 all_country_year_recs.append(country_year_recs) 140 141 print() 142 print("Generating Greenstone JSON metadata for:") 143 greenstone_metadata_json = gs_directory_metadata(all_country_year_recs,json_output_filename) 144 145 print() 146 print("Saving output as: " + json_output_filename) 147 save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename) 148 print() 149 198 150 199 151 200 201 202 #td_tag.findNext('td')203 204 # results_table = results_heading.findNext('table')205 # print(repr(results_table))206 207 208 #ny = wikipedia.page("New York")209 #ny.html()210 #soup = BeautifulSoup("<html>a web page</html>", 'html.parser')211 #id_soup.p['id']
Note: See TracChangeset for help on using the changeset viewer.