- Timestamp:
- 2021-03-28T00:14:39+13:00 (3 years ago)
- Location:
- main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories
- Files:
-
- 1 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py
r34978 r34979 121 121 parser = argparse.ArgumentParser() 122 122 parser.add_argument('--startyear', type=int, default=1956) 123 parser.add_argument('--endyear', type=int, default= 1999)123 parser.add_argument('--endyear', type=int, default=2019) 124 124 parser.add_argument('output-file.json', nargs="?", default="metadata-esc-year/metadata-esc.json"); 125 125 … … 135 135 136 136 for year in range(start_year, end_year+1): 137 country_year_recs = escwikipedia.process_ category_page(year)137 country_year_recs = escwikipedia.process_esc_article_page(year) 138 138 139 139 all_country_year_recs.append(country_year_recs) -
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py
r34978 r34979 1 2 # TODO 3 # Check to see if Song/Title coming back with quotes around it 4 # In addition to Languages->Language, split of "," ?? 5 # (e.g., 2005 has "Language" but entries like English, Spanish) 6 7 1 8 2 9 from __future__ import print_function … … 5 12 import os 6 13 import re 14 import requests 7 15 8 16 import argparse 9 17 import bs4 10 18 import wikipedia 19 20 import wikipediaapi 11 21 12 22 DEBUG=False … … 218 228 print(" " + metadata_key + " = " + repr(metadata_val)) 219 229 220 def process_ category_page(year):230 def process_esc_article_page(year): 221 231 esc_wiki_page = "Eurovision_Song_Contest_" + str(year) 222 232 esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html" … … 263 273 country_year_recs = convert_cols_to_country_year_recs(header_to_vals,year) 264 274 265 print()266 275 print("==========") 267 276 … … 271 280 272 281 273 274 282 def process_category_page(year): 283 284 category_countries = {} 285 286 esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year) 287 esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html" 288 esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file) 289 290 esc_cat_year_html = "" 291 if not(os.path.exists(esc_wiki_page_filename)): 292 print("Retrieving Wikipedia page '" + esc_wiki_page + "'") 293 ##esc_cat_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True) 294 #esc_cat_year_wp = wikipedia.page(esc_wiki_page) 295 #esc_cat_year_html = esc_cat_year_wp.html() 296 297 response = requests.get("https://en.wikipedia.org/wiki/"+esc_wiki_page) 298 esc_cat_year_html = response.text 299 300 print(" Saving page to cache") 301 write_text_file(esc_wiki_page_filename,esc_cat_year_html) 302 else: 303 print("Retrieving " + esc_wiki_page_file + " from cache") 304 esc_cat_year_html = read_text_file(esc_wiki_page_filename) 305 306 307 esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser') 308 # 
print(repr(esc_cat_year_soup.body)) 309 310 a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_" + str(year) + r"$") 311 312 esc_cat_a_tags = esc_cat_year_soup.find_all("a",href=a_href_re) 313 314 for a_tag in esc_cat_a_tags: 315 href = a_tag.get("href") 316 country = re.search(a_href_re,href).group(1) 317 country = country.replace("_"," ") 318 319 category_countries[country] = 1 320 321 return category_countries 322 323 324 325 def cat_test(): 326 327 wiki_wiki = wikipediaapi.Wikipedia("en",extract_format=wikipediaapi.ExtractFormat.HTML) 328 329 cat_title = "Category:Countries_in_the_Eurovision_Song_Contest_1956" 330 cat = wiki_wiki.page(cat_title) 331 332 print("**** html = " + cat.text) 333 334 335 print("Category members: " + cat_title) 336 for p in cat.categorymembers.values(): 337 if p.namespace == wikipediaapi.Namespace.CATEGORY: 338 # it is category, so you have to make decision 339 # if you want to fetch also text from pages that belong 340 # to this category 341 print("CatNS") 342 print(p) 343 elif p.namespace == wikipediaapi.Namespace.MAIN: 344 # it is page => we can get text 345 print("MAIN") 346 print(p) 347 print(p.text)
Note: See TracChangeset for help on using the changeset viewer.