Changeset 35016
- Timestamp: 2021-04-04T20:24:02+12:00 (3 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py
r34990 r35016 5 5 6 6 7 import json 7 8 import os 8 9 import re … … 48 49 49 50 50 def gs_directory_metadata( all_country_year_array_of_recs, json_output_filename):51 def gs_directory_metadata(missing_year_countries, json_output_filename): 51 52 52 53 … … 55 56 directory_metadata = [] 56 57 57 for all_in_given_year_recs in all_country_year_array_of_recs: 58 59 for country_year_name in all_in_given_year_recs.keys(): 60 country_year_rec = all_in_given_year_recs.get(country_year_name) 61 58 for year in missing_year_countries.keys(): 59 year_str = str(year) 60 61 for country in missing_year_countries[year]: 62 63 country_year_name = country + year_str 64 country_year_rec = missing_year_countries[year][country] 65 66 # dig out metadata rec based on year and country 67 62 68 fileset = gs_fileset_country_in_year(country_year_name, country_year_rec, nul_output_dir_name) 63 69 directory_metadata.append(fileset) … … 70 76 71 77 72 #def save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename): 73 # 74 # with open(json_output_filename, 'w') as outfile: 75 # json.dump(greenstone_metadata_json, outfile, indent=2) 76 78 def save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename): 79 80 with open(json_output_filename, 'w') as outfile: 81 json.dump(greenstone_metadata_json, outfile, indent=2) 82 83 84 def generate_sparql_query(sparql_values_output_filename,missing_year_countries): 85 86 sparql_values_template_filename = sparql_values_output_filename + ".in" 87 88 missing_uri_lines = [] 89 90 for year in missing_year_countries.keys(): 91 year_str = str(year) 92 for country in missing_year_countries[year]: 93 # Example: 94 # dbr:Cyprus_in_the_Eurovision_Song_Contest_2010 95 96 country_wikisafe = country.replace(" ","_") 97 98 missing_uri = " (dbr:" + country_wikisafe + "_in_the_Eurovision_Song_Contest_" + year_str+")" 99 100 missing_uri_lines.append(missing_uri) 101 102 103 missing_uri_lines_text = "\n".join(missing_uri_lines) 104 
util.write_text_file_from_template(sparql_values_template_filename,"**missing-country-year-uris**",missing_uri_lines_text,sparql_values_output_filename) 105 106 # problem_category_in_year_filename = "../problem-lod-lists/dbpedia-problem-category-in-year.sparql" 107 # shutil.copyfile(sparql_values_output_filename,problem_category_in_year_filename) 108 109 110 77 111 78 112 if __name__ == "__main__": … … 100 134 parser.add_argument('--endyear', type=int, default=2019) 101 135 parser.add_argument('--cachedir', default="cache-wikipedia") 102 parser.add_argument('output-file.sparql', nargs="?", default="dbpedia--countries-missing-from-esc-category-in-the-year.sparql"); 136 parser.add_argument('--queryfile.sparql', default=None) 137 parser.add_argument('output-file.json', nargs="?", default="metadata.json"); 103 138 104 139 args = parser.parse_args() … … 110 145 escwikipedia.init_cache(cache_dir) 111 146 112 sparql_values_output_filename = getattr(args,'output-file.sparql') 113 sparql_values_template_filename = sparql_values_output_filename + ".in" 147 sparql_values_output_filename = getattr(args,'queryfile.sparql') 148 149 json_output_filename = getattr(args,'output-file.json') 150 114 151 115 152 final_result_ids = [ 'Grand_final', 'Final', 'Results' ] … … 128 165 article_semifinal_country_year_recs = escwikipedia.process_article_page(article_country_year_html,year,semifinal_pair_result_ids, stop_at_first=False) 129 166 article_country_year_recs = util.merge_dicts(article_final_country_year_recs,article_semifinal_country_year_recs) 167 130 168 article_countries = {} 131 169 for country_year in article_country_year_recs.keys(): 132 170 article_country = re.search(r"(^.*)\d{4}(?:\w+)?",country_year).group(1) 133 171 134 article_countries[article_country] = 1172 article_countries[article_country] = article_country_year_recs[country_year] 135 173 136 174 category_countries = escwikipedia.process_category_page(year) … … 162 200 print() 163 201 164 missing_uri_lines = [] 165 
166 for year in missing_year_countries.keys(): 167 year_str = str(year) 168 for country in missing_year_countries[year]: 169 # Example: 170 # dbr:Cyprus_in_the_Eurovision_Song_Contest_2010 171 172 country_wikisafe = country.replace(" ","_") 173 174 missing_uri = " (dbr:" + country_wikisafe + "_in_the_Eurovision_Song_Contest_" + year_str+")" 175 176 missing_uri_lines.append(missing_uri) 177 178 #print(" " + missing_uri) 179 180 missing_uri_lines_text = "\n".join(missing_uri_lines) 181 util.write_text_file_from_template(sparql_values_template_filename,"**missing-country-year-uris**",missing_uri_lines_text,sparql_values_output_filename) 182 183 # problem_category_in_year_filename = "../problem-lod-lists/dbpedia-problem-category-in-year.sparql" 184 # shutil.copyfile(sparql_values_output_filename,problem_category_in_year_filename) 185 186 print() 187 188 189 190 191 202 greenstone_metadata_json = gs_directory_metadata(missing_year_countries,json_output_filename) 203 save_greenstone_json_metadata(greenstone_metadata_json,json_output_filename) 204 205 if (sparql_values_output_filename is not None): 206 generate_sparql_query(sparql_values_output_filename,missing_year_countries) 207 print()
Note: See TracChangeset for help on using the changeset viewer.