Changeset 34990
- Timestamp:
- 2021-04-04T13:07:02+12:00 (3 years ago)
- Location:
- main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-detect-missing-cat-entries.py
r34980 r34990 7 7 import os 8 8 import re 9 # import shutil 9 10 10 11 import argparse … … 98 99 parser.add_argument('--startyear', type=int, default=1956) 99 100 parser.add_argument('--endyear', type=int, default=2019) 100 # parser.add_argument('output-file.json', nargs="?", default="metadata-esc-year/metadata-esc.json"); 101 parser.add_argument('--cachedir', default="cache-wikipedia") 102 parser.add_argument('output-file.sparql', nargs="?", default="dbpedia--countries-missing-from-esc-category-in-the-year.sparql"); 101 103 102 104 args = parser.parse_args() … … 104 106 start_year = getattr(args,'startyear') 105 107 end_year = getattr(args,'endyear') 108 109 cache_dir = getattr(args,'cachedir') 110 escwikipedia.init_cache(cache_dir) 106 111 107 # json_output_filename = getattr(args,'output-file.json') 112 sparql_values_output_filename = getattr(args,'output-file.sparql') 113 sparql_values_template_filename = sparql_values_output_filename + ".in" 114 108 115 final_result_ids = [ 'Grand_final', 'Final', 'Results' ] 109 116 semifinal_result_ids = [ 'Semi-final'] 110 117 semifinal_pair_result_ids = [ 'Semi-final_1', 'Semi-final_2' ] 118 119 missing_year_countries = {} 111 120 112 121 for year in range(start_year, end_year+1): … … 135 144 del article_countries[cat_country] 136 145 else: 137 print("Country listed in Category page that did not reach Fina /Semifinal: " + cat_country)146 print("Country listed in Category page that did not reach Final/Semifinal: " + cat_country) 138 147 139 148 # Missing countries are the ones left in 'article_countries' … … 146 155 print("++++++++++") 147 156 157 missing_year_countries[year] = article_countries 158 148 159 print("==========") 149 160 print() 150 161 162 print() 163 164 missing_uri_lines = [] 165 166 for year in missing_year_countries.keys(): 167 year_str = str(year) 168 for country in missing_year_countries[year]: 169 # Example: 170 # dbr:Cyprus_in_the_Eurovision_Song_Contest_2010 171 172 country_wikisafe = 
country.replace(" ","_") 173 174 missing_uri = " (dbr:" + country_wikisafe + "_in_the_Eurovision_Song_Contest_" + year_str+")" 175 176 missing_uri_lines.append(missing_uri) 177 178 #print(" " + missing_uri) 179 180 missing_uri_lines_text = "\n".join(missing_uri_lines) 181 util.write_text_file_from_template(sparql_values_template_filename,"**missing-country-year-uris**",missing_uri_lines_text,sparql_values_output_filename) 182 183 # problem_category_in_year_filename = "../problem-lod-lists/dbpedia-problem-category-in-year.sparql" 184 # shutil.copyfile(sparql_values_output_filename,problem_category_in_year_filename) 185 186 print() 187 151 188 152 189 -
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/esc-wikipedia-download-and-process-votes.py
r34980 r34990 89 89 # } 90 90 91 print("**** json_output_filename = " + json_output_filename) 92 91 93 nul_output_dir_name = os.path.dirname(json_output_filename) 92 94 … … 122 124 parser.add_argument('--startyear', type=int, default=1956) 123 125 parser.add_argument('--endyear', type=int, default=2019) 126 parser.add_argument('--cachedir', default="cache-wikipedia") 124 127 parser.add_argument('output-file.json', nargs="?", default="metadata-esc-year/metadata-esc.json"); 125 128 … … 128 131 start_year = getattr(args,'startyear') 129 132 end_year = getattr(args,'endyear') 130 133 134 cache_dir = getattr(args,'cachedir') 135 escwikipedia.init_cache(cache_dir) 136 131 137 json_output_filename = getattr(args,'output-file.json') 132 138 -
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/escwikipedia.py
r34980 r34990 18 18 import wikipedia 19 19 20 import wikipediaapi21 22 20 DEBUG=False 23 21 24 22 25 cache_dir="cache-wikipedia" 26 27 # Global Init 28 if not(os.path.exists(cache_dir)): 29 print("Making cache directory: " + cache_dir) 30 os.mkdir(cache_dir) 23 cache_dir_=None 24 25 def init_cache(cache_dir): 26 global cache_dir_ 27 cache_dir_ = cache_dir 28 if not(os.path.exists(cache_dir_)): 29 print("Making cache directory: " + cache_dir_) 30 os.mkdir(cache_dir_) 31 31 32 32 … … 241 241 esc_wiki_page = "Eurovision_Song_Contest_" + str(year) 242 242 esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html" 243 esc_wiki_page_filename = os.path.join(cache_dir ,esc_wiki_page_file)243 esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file) 244 244 245 245 esc_article_year_html = "" … … 308 308 esc_wiki_page = "Category:Countries_in_the_Eurovision_Song_Contest_" + str(year) 309 309 esc_wiki_page_file = esc_wiki_page.replace(" ","_") + ".html" 310 esc_wiki_page_filename = os.path.join(cache_dir ,esc_wiki_page_file)310 esc_wiki_page_filename = os.path.join(cache_dir_,esc_wiki_page_file) 311 311 312 312 esc_cat_year_html = "" … … 345 345 346 346 347 def cat_test():348 349 wiki_wiki = wikipediaapi.Wikipedia("en",extract_format=wikipediaapi.ExtractFormat.HTML)350 351 cat_title = "Category:Countries_in_the_Eurovision_Song_Contest_1956"352 cat = wiki_wiki.page(cat_title)353 354 print("**** html = " + cat.text)355 356 357 print("Category members: " + cat_title)358 for p in cat.categorymembers.values():359 if p.namespace == wikipediaapi.Namespace.CATEGORY:360 # it is category, so you have to make decision361 # if you want to fetch also text from pages that belong362 # to this category363 print("CatNS")364 print(p)365 elif p.namespace == wikipediaapi.Namespace.MAIN:366 # it is page => we can get text367 print("MAIN")368 print(p)369 print(p.text) -
main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/errata-categories/util.py
def write_text_file_from_template(input_template_filename, marker, text, output_filename, encoding=None):
    """Write a copy of a template file with *marker* replaced by *text*.

    Every occurrence of ``marker`` on each line of the template is
    substituted with ``text``; all other content is copied unchanged.

    Args:
        input_template_filename: Path of the template file to read.
        marker: Literal placeholder string to search for in the template.
        text: Replacement string written in place of each ``marker``.
        output_filename: Path of the file to create (or overwrite).
        encoding: Optional text encoding passed to ``open()`` for both
            files.  The default ``None`` keeps the platform default.

    Raises:
        OSError: If the template cannot be read or the output cannot be
            written (e.g. ``FileNotFoundError`` for a missing template).
    """
    # Read the whole template before the output is opened: a missing or
    # unreadable template then never truncates an existing output file, and
    # the two filenames may refer to the same file.
    with open(input_template_filename, "r", encoding=encoding) as fin:
        lines = fin.readlines()

    # 'with' guarantees the handle is closed even if a write fails.
    with open(output_filename, "w", encoding=encoding) as fout:
        fout.writelines(line.replace(marker, text) for line in lines)
Note:
See TracChangeset
for help on using the changeset viewer.