Changeset 34979 for main


Ignore:
Timestamp:
2021-03-28T00:14:39+13:00 (3 years ago)
Author:
davidb
Message:

Initial version; code tidy up in other areas

Location:
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py

    r34978 r34979  
    121121    parser = argparse.ArgumentParser()
    122122    parser.add_argument('--startyear', type=int, default=1956)
    123     parser.add_argument('--endyear',   type=int, default=1999)
     123    parser.add_argument('--endyear',   type=int, default=2019)
    124124    parser.add_argument('output-file.json', nargs="?", default="metadata-esc-year/metadata-esc.json");
    125125   
     
    135135       
    136136    for year in range(start_year, end_year+1):
    137         country_year_recs = escwikipedia.process_category_page(year)
     137        country_year_recs = escwikipedia.process_esc_article_page(year)
    138138
    139139        all_country_year_recs.append(country_year_recs)
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py

    r34978 r34979  
     1
     2# TODO
     3# Check to see if Song/Title coming back with quotes around it
     4# In addition to Languages->Language, split on "," ??
     5#   (e.g., 2005 has "Language" but entries like English, Spanish)
     6
     7
    18
    29from __future__ import print_function
     
    512import os
    613import re
     14import requests
    715
    816import argparse
    917import bs4
    1018import wikipedia
     19
     20import wikipediaapi
    1121
    1222DEBUG=False
     
    218228            print("  " + metadata_key + " = " + repr(metadata_val))
    219229       
    220 def process_category_page(year):
     230def process_esc_article_page(year):
    221231    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    222232    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
     
    263273    country_year_recs = convert_cols_to_country_year_recs(header_to_vals,year)
    264274
    265     print()
    266275    print("==========")
    267276
     
    271280
    272281
    273 
    274 
     282def process_category_page(year):
     283
     284    category_countries = {}
     285   
     286    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
     287    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
     288    esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file)
     289
     290    esc_cat_year_html = ""
     291    if not(os.path.exists(esc_wiki_page_filename)):
     292        print("Retrieving Wikipedia page '" + esc_wiki_page + "'")
     293        ##esc_cat_year_wp = wikipedia.page(esc_wiki_page,auto_suggest=False,preload=True,redirect=True)
     294        #esc_cat_year_wp = wikipedia.page(esc_wiki_page)
     295        #esc_cat_year_html = esc_cat_year_wp.html()
     296
     297        response = requests.get("https://en.wikipedia.org/wiki/"+esc_wiki_page)
     298        esc_cat_year_html = response.text
     299       
     300        print("  Saving page to cache")
     301        write_text_file(esc_wiki_page_filename,esc_cat_year_html)
     302    else:
     303        print("Retrieving " + esc_wiki_page_file + " from cache")
     304        esc_cat_year_html = read_text_file(esc_wiki_page_filename)
     305
     306       
     307    esc_cat_year_soup = bs4.BeautifulSoup(esc_cat_year_html, 'html.parser')
     308    # print(repr(esc_cat_year_soup.body))
     309         
     310    a_href_re = re.compile(r"^/wiki/([^/:]+)_in_the_Eurovision_Song_Contest_" + str(year) + r"$")
     311
     312    esc_cat_a_tags = esc_cat_year_soup.find_all("a",href=a_href_re)
     313   
     314    for a_tag in esc_cat_a_tags:
     315        href = a_tag.get("href")
     316        country = re.search(a_href_re,href).group(1)
     317        country = country.replace("_"," ")
     318       
     319        category_countries[country] = 1
     320           
     321    return category_countries
     322
     323
     324
     325def cat_test():
     326
     327    wiki_wiki = wikipediaapi.Wikipedia("en",extract_format=wikipediaapi.ExtractFormat.HTML)
     328
     329    cat_title = "Category:Countries_in_the_Eurovision_Song_Contest_1956"
     330    cat = wiki_wiki.page(cat_title)
     331
     332    print("**** html = " + cat.text)
     333   
     334   
     335    print("Category members: " + cat_title)
     336    for p in cat.categorymembers.values():
     337        if p.namespace == wikipediaapi.Namespace.CATEGORY:
     338            # it is category, so you have to make decision
     339            # if you want to fetch also text from pages that belong
     340            # to this category
     341            print("CatNS")
     342            print(p)
     343        elif p.namespace == wikipediaapi.Namespace.MAIN:
     344            # it is page => we can get text
     345            print("MAIN")
     346            print(p)
     347            print(p.text)
Note: See TracChangeset for help on using the changeset viewer.