Changeset 35016


Ignore:
Timestamp: 2021-04-04T20:24:02+12:00 (3 years ago)
Author: davidb
Message: Changes in response to testing

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py

    r34990 r35016  
    55
    66
     7import json
    78import os
    89import re
     
    4849
    4950
    50 def gs_directory_metadata(all_country_year_array_of_recs, json_output_filename):
     51def gs_directory_metadata(missing_year_countries, json_output_filename):
    5152
    5253
     
    5556    directory_metadata = []
    5657
    57     for all_in_given_year_recs in all_country_year_array_of_recs:
    58        
    59         for country_year_name in all_in_given_year_recs.keys():
    60             country_year_rec = all_in_given_year_recs.get(country_year_name)
    61 
     58    for year in missing_year_countries.keys():
     59        year_str = str(year)
     60       
     61        for country in missing_year_countries[year]:
     62
     63            country_year_name = country + year_str
     64            country_year_rec = missing_year_countries[year][country]
     65           
     66            # dig out metadata rec based on year and country
     67           
    6268            fileset = gs_fileset_country_in_year(country_year_name, country_year_rec, nul_output_dir_name)
    6369            directory_metadata.append(fileset)
     
    7076
    7177
    72 #def save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename):
    73 #
    74 #    with open(json_output_filename, 'w') as outfile:
    75 #        json.dump(greenstone_metadata_json, outfile, indent=2)
    76    
     78def save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename):
     79
     80    with open(json_output_filename, 'w') as outfile:
     81        json.dump(greenstone_metadata_json, outfile, indent=2)
     82   
     83
     84def generate_sparql_query(sparql_values_output_filename,missing_year_countries):
     85
     86    sparql_values_template_filename = sparql_values_output_filename + ".in"
     87
     88    missing_uri_lines = []
     89   
     90    for year in missing_year_countries.keys():
     91        year_str = str(year)
     92        for country in missing_year_countries[year]:
     93            # Example:
     94            #  dbr:Cyprus_in_the_Eurovision_Song_Contest_2010
     95           
     96            country_wikisafe = country.replace(" ","_")
     97           
     98            missing_uri = "      (dbr:" + country_wikisafe + "_in_the_Eurovision_Song_Contest_" + year_str+")"
     99           
     100            missing_uri_lines.append(missing_uri)
     101           
     102           
     103    missing_uri_lines_text = "\n".join(missing_uri_lines)
     104    util.write_text_file_from_template(sparql_values_template_filename,"**missing-country-year-uris**",missing_uri_lines_text,sparql_values_output_filename)
     105
     106#    problem_category_in_year_filename = "../problem-lod-lists/dbpedia-problem-category-in-year.sparql"   
     107#    shutil.copyfile(sparql_values_output_filename,problem_category_in_year_filename)
     108                   
     109
     110
    77111   
    78112if __name__ == "__main__":
     
    100134    parser.add_argument('--endyear',   type=int, default=2019)
    101135    parser.add_argument('--cachedir', default="cache-wikipedia")
    102     parser.add_argument('output-file.sparql', nargs="?", default="dbpedia--countries-missing-from-esc-category-in-the-year.sparql");
     136    parser.add_argument('--queryfile.sparql', default=None)
     137    parser.add_argument('output-file.json', nargs="?", default="metadata.json");
    103138   
    104139    args = parser.parse_args()
     
    110145    escwikipedia.init_cache(cache_dir)
    111146   
    112     sparql_values_output_filename  = getattr(args,'output-file.sparql')
    113     sparql_values_template_filename = sparql_values_output_filename + ".in"
     147    sparql_values_output_filename  = getattr(args,'queryfile.sparql')
     148
     149    json_output_filename  = getattr(args,'output-file.json')
     150
    114151
    115152    final_result_ids = [ 'Grand_final', 'Final', 'Results' ]
     
    128165            article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_pair_result_ids, stop_at_first=False)
    129166        article_country_year_recs = util.merge_dicts(article_final_country_year_recs,article_semifinal_country_year_recs)
     167
    130168        article_countries = {}
    131169        for country_year in article_country_year_recs.keys():
    132170            article_country = re.search(r"(^.*)\d{4}(?:\w+)?",country_year).group(1)
    133171           
    134             article_countries[article_country] = 1
     172            article_countries[article_country] = article_country_year_recs[country_year]
    135173               
    136174        category_countries = escwikipedia.process_category_page(year)
     
    162200    print()
    163201
    164     missing_uri_lines = []
    165    
    166     for year in missing_year_countries.keys():
    167         year_str = str(year)
    168         for country in missing_year_countries[year]:
    169             # Example:
    170             #  dbr:Cyprus_in_the_Eurovision_Song_Contest_2010
    171            
    172             country_wikisafe = country.replace(" ","_")
    173            
    174             missing_uri = "      (dbr:" + country_wikisafe + "_in_the_Eurovision_Song_Contest_" + year_str+")"
    175            
    176             missing_uri_lines.append(missing_uri)
    177            
    178             #print("  " + missing_uri)
    179 
    180     missing_uri_lines_text = "\n".join(missing_uri_lines)
    181     util.write_text_file_from_template(sparql_values_template_filename,"**missing-country-year-uris**",missing_uri_lines_text,sparql_values_output_filename)
    182 
    183 #    problem_category_in_year_filename = "../problem-lod-lists/dbpedia-problem-category-in-year.sparql"   
    184 #    shutil.copyfile(sparql_values_output_filename,problem_category_in_year_filename)
    185                    
    186     print()
    187 
    188 
    189        
    190    
    191 
     202    greenstone_metadata_json = gs_directory_metadata(missing_year_countries,json_output_filename)
     203    save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename)
     204   
     205    if (sparql_values_output_filename is not None):
     206        generate_sparql_query(sparql_values_output_filename,missing_year_countries)   
     207        print()
Note: See TracChangeset for help on using the changeset viewer.