Changeset 34990


Ignore:
Timestamp:
2021-04-04T13:07:02+12:00 (3 years ago)
Author:
davidb
Message:

Refactoring in response to developing top-level PREPARE script

Location:
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py

    r34980 r34990  
    77import os
    88import re
     9# import shutil
    910
    1011import argparse
     
    9899    parser.add_argument('--startyear', type=int, default=1956)
    99100    parser.add_argument('--endyear',   type=int, default=2019)
    100     # parser.add_argument('output-file.json', nargs="?", default="metadata-esc-year/metadata-esc.json");
     101    parser.add_argument('--cachedir', default="cache-wikipedia")
     102    parser.add_argument('output-file.sparql', nargs="?", default="dbpedia--countries-missing-from-esc-category-in-the-year.sparql");
    101103   
    102104    args = parser.parse_args()
     
    104106    start_year = getattr(args,'startyear')
    105107    end_year   = getattr(args,'endyear')
     108
     109    cache_dir = getattr(args,'cachedir')
     110    escwikipedia.init_cache(cache_dir)
    106111   
    107     # json_output_filename = getattr(args,'output-file.json')
     112    sparql_values_output_filename  = getattr(args,'output-file.sparql')
     113    sparql_values_template_filename = sparql_values_output_filename + ".in"
     114
    108115    final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
    109116    semifinal_result_ids = [ 'Semi-final']
    110117    semifinal_pair_result_ids = [ 'Semi-final_1', 'Semi-final_2' ]
     118
     119    missing_year_countries = {}
    111120   
    112121    for year in range(start_year, end_year+1):
     
    135144                del article_countries[cat_country]
    136145            else:
    137                 print("Country listed in Category page that did not reach Fina/Semifinal: " + cat_country)
     146                print("Country listed in Category page that did not reach Final/Semifinal: " + cat_country)
    138147               
    139148        # Missing countries are the ones left in 'article_countries'
     
    146155            print("++++++++++")
    147156
     157            missing_year_countries[year] = article_countries
     158
    148159        print("==========")
    149160        print()
    150        
     161
     162    print()
     163
     164    missing_uri_lines = []
     165   
     166    for year in missing_year_countries.keys():
     167        year_str = str(year)
     168        for country in missing_year_countries[year]:
     169            # Example:
     170            #  dbr:Cyprus_in_the_Eurovision_Song_Contest_2010
     171           
     172            country_wikisafe = country.replace(" ","_")
     173           
     174            missing_uri = "      (dbr:" + country_wikisafe + "_in_the_Eurovision_Song_Contest_" + year_str+")"
     175           
     176            missing_uri_lines.append(missing_uri)
     177           
     178            #print("  " + missing_uri)
     179
     180    missing_uri_lines_text = "\n".join(missing_uri_lines)
     181    util.write_text_file_from_template(sparql_values_template_filename,"**missing-country-year-uris**",missing_uri_lines_text,sparql_values_output_filename)
     182
     183#    problem_category_in_year_filename = "../problem-lod-lists/dbpedia-problem-category-in-year.sparql"   
     184#    shutil.copyfile(sparql_values_output_filename,problem_category_in_year_filename)
     185                   
     186    print()
     187
    151188
    152189       
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py

    r34980 r34990  
    8989    #  }
    9090
     91    print("**** json_output_filename = " + json_output_filename)
     92   
    9193    nul_output_dir_name = os.path.dirname(json_output_filename)
    9294   
     
    122124    parser.add_argument('--startyear', type=int, default=1956)
    123125    parser.add_argument('--endyear',   type=int, default=2019)
     126    parser.add_argument('--cachedir', default="cache-wikipedia")
    124127    parser.add_argument('output-file.json', nargs="?", default="metadata-esc-year/metadata-esc.json");
    125128   
     
    128131    start_year = getattr(args,'startyear')
    129132    end_year   = getattr(args,'endyear')
    130    
     133
     134    cache_dir = getattr(args,'cachedir')
     135    escwikipedia.init_cache(cache_dir)
     136
    131137    json_output_filename = getattr(args,'output-file.json')
    132138
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py

    r34980 r34990  
    1818import wikipedia
    1919
    20 import wikipediaapi
    21 
    2220DEBUG=False
    2321
    2422
    25 cache_dir="cache-wikipedia"
    26 
    27 # Global Init
    28 if not(os.path.exists(cache_dir)):
    29     print("Making cache directory: " + cache_dir)
    30     os.mkdir(cache_dir)
     23cache_dir_=None
     24
     25def init_cache(cache_dir):
     26    global cache_dir_
     27    cache_dir_ = cache_dir
     28    if not(os.path.exists(cache_dir_)):
     29        print("Making cache directory: " + cache_dir_)
     30        os.mkdir(cache_dir_)
    3131   
    3232
     
    241241    esc_wiki_page = "Eurovision_Song_Contest_" + str(year)
    242242    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    243     esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file)
     243    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)
    244244
    245245    esc_article_year_html = ""
     
    308308    esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year)
    309309    esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html"
    310     esc_wiki_page_filename = os.path.join(cache_dir,esc_wiki_page_file)
     310    esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file)
    311311
    312312    esc_cat_year_html = ""
     
    345345
    346346
    347 def cat_test():
    348 
    349     wiki_wiki = wikipediaapi.Wikipedia("en",extract_format=wikipediaapi.ExtractFormat.HTML)
    350 
    351     cat_title = "Category:Countries_in_the_Eurovision_Song_Contest_1956"
    352     cat = wiki_wiki.page(cat_title)
    353 
    354     print("**** html = " + cat.text)
    355    
    356    
    357     print("Category members: " + cat_title)
    358     for p in cat.categorymembers.values():
    359         if p.namespace == wikipediaapi.Namespace.CATEGORY:
    360             # it is category, so you have to make decision
    361             # if you want to fetch also text from pages that belong
    362             # to this category
    363             print("CatNS")
    364             print(p)
    365         elif p.namespace == wikipediaapi.Namespace.MAIN:
    366             # it is page => we can get text
    367             print("MAIN")
    368             print(p)
    369             print(p.text)
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/util.py

    r34980 r34990  
    77    z.update(y)    # modifies z with y's keys and values & returns None
    88    return z
     9
     10
     11def write_text_file_from_template(input_template_filename,marker,text,output_filename):
     12
     13    fin = open(input_template_filename, "r")
     14    lines = fin.readlines()
     15    fin.close()
     16
     17    fout = open(output_filename, "w")
     18    for line in lines:
     19        line = line.replace(marker,text)
     20        fout.write(line)
     21
     22    fout.close()
Note: See TracChangeset for help on using the changeset viewer.