source: main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/08b-COPY-CSV-AND-AUDIO-FEATURES-TO-IMPORT_support.py@ 35990

Last change on this file since 35990 was 35990, checked in by davidb, 2 years ago

Changes after testing

  • Property svn:executable set to *
File size: 8.4 KB
Line 
1#!/usr/bin/env python3
2
3import argparse
4
5import os
6import re
7import shutil
8
9from glob import glob
10
# Directory layout, all relative to the directory the script is run from.
# prep_dir holds the prepared data; audio_dir is its "audio" sub-directory;
# import_dir is the sibling "import" directory one level up.
prep_dir = "essentia-audio-features"
audio_dir = os.path.join(prep_dir, "audio")
import_dir = os.path.join(os.pardir, "import")
15
16
# Command-line interface: two optional integer year bounds.
parser = argparse.ArgumentParser(
    description="Copy Essentia generated audio features into Greenstone3 collection's '../import' directory",
)
parser.add_argument(
    "--startyear",
    type=int,
    default=None,
    help="Start year range of the Eurovision Song Contest",
)
parser.add_argument(
    "--endyear",
    type=int,
    default=None,
    help="End year range of the Eurovision Song Contest",
)
args = parser.parse_args()

# argparse has already applied type=int, so each bound is either an int or
# None (option not given on the command line).
start_year = args.startyear
end_year = args.endyear
26
27
# Every sub-directory of audio_dir is collected as a candidate year.
years = []
for entry in os.listdir(audio_dir):
    if os.path.isdir(os.path.join(audio_dir, entry)):
        years.append(entry)

# Directory names are strings, so this is a lexicographic sort.
sorted_years = sorted(years)

# Keep only the years inside the optional range. A bound that is None places
# no restriction on that side; a year equal to a bound is kept.
opt_filtered_sorted_years = [
    y
    for y in sorted_years
    if (start_year is None or int(y) >= start_year)
    and (end_year is None or int(y) <= end_year)
]
43
44
print("Reading in Audio-Feature JSON files by year")

# Each entry is os.path.join(<year>, <file name>), i.e. relative to audio_dir.
all_af_json_files = []

json_suffix_re = re.compile(r"\.json$")

for year in opt_filtered_sorted_years:
    if year == "2020":
        print("====")
        print("Skipping year 2020 as no contest was held that year")
        print("====")
        continue

    audio_year_dir = os.path.join(audio_dir, year)

    # Only the JSON files directly inside the year directory are collected.
    all_af_json_files.extend(
        os.path.join(year, name)
        for name in os.listdir(audio_year_dir)
        if json_suffix_re.search(name)
    )
65
66
67#print("json files = " + repr(all_af_json_files))
68
69
# One dict per audio-feature file whose name could be parsed, with the keys
# "year", "country", "title" and "artist".
entrant_recs = []

# Expected shape: "<4-digit year>/<country>_<title>_<artist>", where none of
# the three fields contains an underscore. The whole file name is matched, so
# the last field still carries the ".json" extension.
# NOTE(review): the pattern hard-codes "/" as the separator, so it relies on
# os.path.join having produced forward slashes - confirm if portability matters.
entrant_re = re.compile(r"^(\d{4})/([^_]+)_([^_]+)_([^_]+)$")

for year_and_entrant in all_af_json_files:
    match = entrant_re.search(year_and_entrant)

    if match is None:
        # Report names that do not fit the pattern and carry on.
        print("**** No match found for: '" + year_and_entrant +"'")
        continue

    year, country, title, artist = match.groups()
    entrant_recs.append(
        {"year": year, "country": country, "title": title, "artist": artist}
    )
93
94
95#
96# af = audio-features
97#
#
# af = audio-features
#
# all_af_nul_files: the ".nul" file name derived for every entrant record.
# src_af_lookup:    derived ".nul" file name -> path of its source JSON file.
all_af_nul_files = []
src_af_lookup = {}

for rec in entrant_recs:
    # Path of the source file, rebuilt from the unmodified record fields.
    src_path = os.path.join(
        audio_dir,
        rec["year"],
        "_".join((rec["country"], rec["title"], rec["artist"])),
    )

    # Tweak the values so they align with those used in SPARQL resultsset
    country_label = rec["country"].replace("&", "and").replace(" ", "")

    year_label = rec["year"]
    if year_label == "1956":
        # 1956 names get an extra suffix: the first three characters of the
        # space-stripped title, upper-cased.
        # (presumably to tell apart several entries per country that year -
        # confirm against the SPARQL result set)
        year_label += rec["title"].lower().replace(" ", "")[0:3].upper()

    nul_name = country_label + year_label + ".nul"

    all_af_nul_files.append(nul_name)
    src_af_lookup[nul_name] = src_path
125 # print("**## storing nul_file = " + nul_file + " => " + full_year_entrant);
126
127# print(", ".join(all_af_nul_files))
128
129
# Directories under import_dir that hold the per-entrant ".nul" files.
entrant_nul_dirs = []

# The whole directory name must be one of the three alternatives. The
# alternation is grouped and matched with fullmatch() because "|" binds more
# loosely than "^" and "$": in an ungrouped "^(a)|(b)|(c)$" the anchors only
# apply to the first and last alternative, so the middle name would match
# anywhere inside a longer directory name.
entrant_dir_re = re.compile(
    r"(?:sparqlresults-local--countries-in-esc-by-year-.*"
    r"|missing-cat-countries"
    r"|inaugural-year)"
)

for d in os.listdir(import_dir):
    full_d = os.path.join(import_dir, d)
    if os.path.isdir(full_d) and entrant_dir_re.fullmatch(d):
        entrant_nul_dirs.append(full_d)
137
138
139#print("Entrant Nul dirs:")
140#print(repr(entrant_nul_dirs))
141
# ".nul" file name -> list of every full path under the entrant directories
# where a file of that name exists (the same name can occur in several
# directories).
entrant_nul_files_lookup = {}

nul_suffix_re = re.compile(r"\.nul$")

for nul_dir in entrant_nul_dirs:
    for name in os.listdir(nul_dir):
        if not nul_suffix_re.search(name):
            continue

        entrant_nul_files_lookup.setdefault(name, []).append(
            os.path.join(nul_dir, name)
        )
155
156#print()
157#print("Lookup dict:")
158#print(repr(entrant_nul_files_lookup))
159
160
161
# The country of Macedonia renamed itself to North Macedonia in response
# to a long running naming debate with the region of Macedonia in Greece.
#
# Audio-Feature files sometimes refer to North Macedonia in places
# where the SPARQL results give the country as Macedonia.
#
# => Pass through the audio-feature data changing 'NorthMacedoniaYYYY.nul' to
#    'MacedoniaYYYY.nul' wherever there is no entrant_nul_files_lookup entry
#    for the 'NorthMacedonia' name but there is one for the 'Macedonia' name.
#
# Two fixed renames are applied in the same pass. Only names that have no
# entry in entrant_nul_files_lookup are ever touched.

print()
print("Running errata to better align Audio Feature derived .nul files with Sparql Resultset")

# Derived name -> name to use instead.
known_errata = {
    "Germany1956Imw.nul": "Germany1956ImW.nul",
    "Yugoslavia1992.nul": "FRYugoslavia1992.nul",
}

for i, af_nul_file in enumerate(all_af_nul_files):
    if af_nul_file in entrant_nul_files_lookup:
        continue

    af_nul_file_errata = known_errata.get(af_nul_file)

    if af_nul_file_errata is None and af_nul_file.startswith("NorthMacedonia"):
        # Only rename when the 'Macedonia' spelling really exists in 'import'.
        candidate = af_nul_file.replace("North", "")
        if candidate in entrant_nul_files_lookup:
            af_nul_file_errata = candidate

    if af_nul_file_errata is None:
        continue

    print(" Errata: Fixing " + af_nul_file + " => " + af_nul_file_errata)

    # Move the source-path entry to the corrected name and patch the list
    # in place so later passes see the corrected name.
    src_af_lookup[af_nul_file_errata] = src_af_lookup.pop(af_nul_file)
    all_af_nul_files[i] = af_nul_file_errata
198
199
print("Matching Audio-Feature files to nul files in 'import':")


print()
print("The following files have Audio Features, but did not match in to a .nul file in the 'import' diretory")

# Names that have at least one counterpart in 'import', in their original order.
matched_af_nul_files = [
    name for name in all_af_nul_files if name in entrant_nul_files_lookup
]

# Everything else is only reported; nothing is copied for it.
for name in all_af_nul_files:
    if name not in entrant_nul_files_lookup:
        print(" " + name)
213
214
print()
print("Copying the following Essentia audio feature JSON files into .nul matched positions in 'import'")

# Each matched source file is copied twice next to every matching .nul file:
# once as "...-Essentia.json" and once as "...-AssocEssentia.json". The
# destination name is the .nul path with ".nul" replaced by the suffix.
for af_nul_file in matched_af_nul_files:
    for import_file in entrant_nul_files_lookup[af_nul_file]:
        src_json_file = src_af_lookup[af_nul_file]

        for suffix in ("-Essentia.json", "-AssocEssentia.json"):
            dst_file = import_file.replace(".nul", suffix)

            print(" Copying: " + src_json_file + " => " + dst_file)
            shutil.copy(src_json_file, dst_file)
231
232
# Sort out contestant CSV files to copy into ../import area
#
# The file names depend on which of the two year bounds were given:
#   neither bound -> "contestants-1956" plus "contestants-1957-to-<cutoff>"
#   only end      -> "contestants-1956-to-<end>"
#   only start    -> "contestants-<start>-to-<cutoff>"
#   both          -> "contestants-<start>-to-<end>"
# <cutoff> comes from the environment variable esc_cutoff_endyear; a missing
# variable raises KeyError in the two branches that need it.
#
# NOTE(review): the ".cvs" extension is kept exactly as found - confirm it
# matches the files on disk, given that the comments speak of CSV files.

src_csv_files = []

if start_year is None and end_year is None:
    esc_cutoff_endyear = os.environ["esc_cutoff_endyear"]

    src_csv_files.append("contestants-1956.cvs")
    src_csv_files.append(f"contestants-1957-to-{esc_cutoff_endyear}.cvs")

elif start_year is None:
    src_csv_files.append(f"contestants-1956-to-{end_year}.cvs")

elif end_year is None:
    # The cutoff year has to be read in this branch as well: it used to be
    # assigned only in the first branch, which made this one fail with a
    # NameError.
    esc_cutoff_endyear = os.environ["esc_cutoff_endyear"]

    src_csv_files.append(f"contestants-{start_year}-to-{esc_cutoff_endyear}.cvs")

else:
    src_csv_files.append(f"contestants-{start_year}-to-{end_year}.cvs")
252
253
# Destination directories for the contestant CSV files: the two fixed
# directories first, followed by every SPARQL result directory found by the
# glob (the trailing "/" in the pattern restricts the glob to directories).
sparql_result_dirs = glob(
    os.path.join(import_dir, "sparqlresults-local--countries-in-esc-by-year-*/")
)

dst_csv_dirs = [
    os.path.join(import_dir, fixed_name)
    for fixed_name in ("inaugural-year", "missing-cat-countries")
]
dst_csv_dirs.extend(sparql_result_dirs)
258
259
print()
print("Copying the following Contestants CSV files into 'import'")

# Every selected contestant file is copied from prep_dir into each of the
# destination directories, keeping its file name.
for dst_dir in dst_csv_dirs:
    for src_csv_file in src_csv_files:

        src_csv_filename = os.path.join(prep_dir, src_csv_file)
        dst_csv_filename = os.path.join(dst_dir, src_csv_file)

        print(" Copying: " + src_csv_filename + " => " + dst_csv_filename)
        # The destination must be dst_csv_filename; the previously used name
        # "dst_filename" was never defined and raised a NameError.
        shutil.copy(src_csv_filename, dst_csv_filename)
271
272
Note: See TracBrowser for help on using the repository browser.