#!/usr/bin/env python3
"""Match Essentia audio-feature JSON files to '.nul' entries under '../import'.

The script lists the per-year directories below
'essentia-audio-features/audio', derives a '<Country><Year>.nul' name for
each audio-feature JSON file found there, and pairs those names with the
'.nul' files present in the collection's import directories.
"""

import argparse

import os
import re
import shutil

from glob import glob

# Root of the prepared Essentia data, relative to the current working directory
prep_dir = "essentia-audio-features"

# audio_dir is scanned for one sub-directory per contest year;
# import_dir is where the collection's '.nul' files are looked up.
audio_dir = os.path.join(prep_dir, "audio")
import_dir = os.path.join("..", "import")


parser = argparse.ArgumentParser(
    description="Copy Essentia generated audio features into Greenstone3 collection's '../import' directory"
)
for option, help_text in (
    ("--startyear", "Start year range of the Eurovision Song Contest"),
    ("--endyear", "End year range of the Eurovision Song Contest"),
):
    parser.add_argument(option, type=int, default=None, help=help_text)
args = parser.parse_args()

# argparse has already converted both options via type=int; an option that
# was not given stays None, which the rest of the script reads as "no bound".
start_year = args.startyear
end_year = args.endyear

# Every sub-directory of audio_dir is treated as one contest year.
years = [d for d in os.listdir(audio_dir) if os.path.isdir(os.path.join(audio_dir, d))]

sorted_years = sorted(years)

# Keep only the years inside the optional [start_year, end_year] range.
# Both bounds are inclusive; a bound of None leaves that side open.
range_requested = (start_year is not None) or (end_year is not None)

opt_filtered_sorted_years = []

for y in sorted_years:

    if range_requested:
        try:
            year_num = int(y)
        except ValueError:
            # A directory whose name is not a number cannot lie inside a
            # year range: skip it instead of aborting with a ValueError.
            print("Skipping non-year directory: '" + y + "'")
            continue

        if (start_year is not None) and year_num < start_year:
            continue

        if (end_year is not None) and year_num > end_year:
            continue

    # With no range requested every directory passes through unchanged.
    opt_filtered_sorted_years.append(y)

print("Reading in Audio-Feature JSON files by year")

# Each entry is os.path.join(<year dir>, <file name>), i.e. relative to audio_dir
all_af_json_files = []

json_suffix_re = re.compile(r"\.json$")

for contest_year in opt_filtered_sorted_years:

    if contest_year == "2020":
        print("====")
        print("Skipping year 2020 as no contest was held that year")
        print("====")
        continue

    year_dir = os.path.join(audio_dir, contest_year)

    # Collect every '.json' entry of this year's directory, in listing order
    all_af_json_files.extend(
        os.path.join(contest_year, entry)
        for entry in os.listdir(year_dir)
        if json_suffix_re.search(entry)
    )

# One record per audio-feature file whose relative path has the shape
# '<4-digit year>/<country>_<title>_<artist>'.
entrant_recs = []

# Each of the three name fields must be free of underscores, so a file name
# with more (or fewer) than two underscores is reported as "No match".
# The last group runs to the end of the string, which means the "artist"
# value still carries the file extension; the path reconstruction later in
# the script depends on that.
entrant_re = re.compile(r"^(\d{4})/([^_]+)_([^_]+)_([^_]+)$")

for year_and_entrant in all_af_json_files:

    # The relative paths are produced with os.path.join, so normalise the
    # platform separator to the '/' the pattern expects (a no-op on POSIX).
    match = entrant_re.search(year_and_entrant.replace(os.sep, "/"))

    if match:
        year, country, title, artist = match.groups()

        entrant_rec = {"year": year, "country": country, "title": title, "artist": artist}

        entrant_recs.append(entrant_rec)

    else:
        print("**** No match found for: '" + year_and_entrant +"'")

#
# af = audio-features
#
# all_af_nul_files : the '.nul' name derived for each entrant record, in order
# src_af_lookup    : derived '.nul' name -> path of the source file it came from
#
all_af_nul_files = []
src_af_lookup = {}

for entrant_rec in entrant_recs:
    year, country, title, artist = (
        entrant_rec[key] for key in ("year", "country", "title", "artist")
    )

    # Rebuild the on-disk path from the field values before they are tweaked
    full_year_entrant = os.path.join(audio_dir, year, "_".join((country, title, artist)))

    # Tweak the values so they align with those used in the SPARQL result set
    country = country.replace("&", "and").replace(" ", "")

    if year == "1956":
        # For 1956 the first three characters of the space-stripped title,
        # upper-cased, are appended to the year part of the name.
        # NOTE(review): this produces e.g. 'IMW', whereas the errata pass
        # further down tests for 'Germany1956Imw.nul' -- confirm which
        # casing is intended.
        year += title.lower().replace(" ", "")[0:3].upper()

    nul_file = "".join((country, year, ".nul"))

    all_af_nul_files.append(nul_file)
    src_af_lookup[nul_file] = full_year_entrant

# Directories directly under import_dir that are searched for '.nul' files.
#
# The three alternatives are wrapped in one group so that '^' and '$' anchor
# all of them.  Written as '^(a)|(b)|(c)$', the '^' applied only to the first
# alternative and the '$' only to the last, so a name that merely contained
# 'missing-cat-countries' or merely ended in 'inaugural-year' was accepted.
entrant_dir_re = re.compile(
    r"^(?:sparqlresults-local--countries-in-esc-by-year-.*"
    r"|missing-cat-countries"
    r"|inaugural-year)$"
)

entrant_nul_dirs = []

for d in os.listdir(import_dir):
    full_d = os.path.join(import_dir, d)
    if os.path.isdir(full_d) and entrant_dir_re.search(d):
        entrant_nul_dirs.append(full_d)

# '.nul' file name -> list of every full path under which that name occurs
# (the same name may be present in more than one of the directories)
entrant_nul_files_lookup = {}

for d in entrant_nul_dirs:

    for f in os.listdir(d):
        if re.search(r"\.nul$", f):
            nul_file = f
            full_nul_file = os.path.join(d, f)

            entrant_nul_files_lookup.setdefault(nul_file, []).append(full_nul_file)

# The country of Macedonia renamed itself to North Macedonia in response
# to a long running naming debate with the region of Macedonia in Greece
#
# Audio-Feature files sometimes refer to North Macedonia in places
# where the SPARQL results give the country as Macedonia
#
# => Pass through the audio-feature data changing 'NorthMacedoniaYYYY.nul' to
#    'MacedoniaYYYY.nul' wherever entrant_nul_files_lookup has no entry for
#    the 'NorthMacedonia' name but does have one for the 'Macedonia' name
#
# Two further one-off renames (Germany 1956, Yugoslavia 1992) are applied
# whenever the derived name is not found, without checking that the
# replacement name exists.

print()
print("Running errata to better align Audio Feature derived .nul files with Sparql Resultset")
for i, af_nul_file in enumerate(all_af_nul_files):

    if af_nul_file in entrant_nul_files_lookup:
        # Already lines up with a .nul file in 'import'; nothing to fix
        continue

    af_nul_file_errata = None

    if af_nul_file == "Germany1956Imw.nul":
        af_nul_file_errata = "Germany1956ImW.nul"

    elif af_nul_file == "Yugoslavia1992.nul":
        af_nul_file_errata = "FRYugoslavia1992.nul"

    elif re.search(r"^NorthMacedonia", af_nul_file):
        af_nul_file_check = af_nul_file.replace("North", "")
        if af_nul_file_check in entrant_nul_files_lookup:
            af_nul_file_errata = af_nul_file_check

    if af_nul_file_errata is None:
        continue

    print(" Errata: Fixing " + af_nul_file + " => " + af_nul_file_errata)

    # Re-key the source path under the corrected name.  The same derived name
    # can occur more than once in all_af_nul_files; after the first occurrence
    # the old key is already gone, so only move it while it is still present
    # instead of failing with a KeyError.
    if af_nul_file in src_af_lookup:
        src_af_lookup[af_nul_file_errata] = src_af_lookup.pop(af_nul_file)

    all_af_nul_files[i] = af_nul_file_errata

print("Matching Audio-Feature files to nul files in 'import':")


print()
print("The following files have Audio Features, but did not match in to a .nul file in the 'import' diretory")

# Derived names that have at least one '.nul' counterpart under 'import'
matched_af_nul_files = [
    candidate for candidate in all_af_nul_files if candidate in entrant_nul_files_lookup
]

# Report the remainder, keeping the order of all_af_nul_files
for candidate in all_af_nul_files:
    if candidate not in entrant_nul_files_lookup:
        print(" " + candidate)


print()
print("Copying the following Essentia audio feature JSON files into .nul matched positions in 'import'")

# Each matched '.nul' path gets two destination names next to it:
# '<name>-Essentia.json' and '<name>-AssocEssentia.json'.
# The shutil.copy call is commented out here, so this pass only prints the
# source => destination pairs and does not write any file.
for af_nul_file in matched_af_nul_files:
    for import_file in entrant_nul_files_lookup[af_nul_file]:
        src_json_file = src_af_lookup[af_nul_file]

        for dst_suffix in ("-Essentia.json", "-AssocEssentia.json"):
            dst_file = import_file.replace(".nul", dst_suffix)

            print(" Copying: " + src_json_file + " => " + dst_file)
            # shutil.copy(src_json_file,dst_file)

# Sort out contestant CSV files to copy into ../import area

src_csv_filenames = []

if end_year is None:
    # Both open-ended cases below run up to the cutoff year taken from the
    # environment.  It used to be read only inside the "no range at all"
    # branch, so giving --startyear without --endyear hit a NameError.
    # A missing environment variable still raises KeyError, as before.
    esc_cutoff_endyear = os.environ["esc_cutoff_endyear"]

# NOTE(review): the names below end in ".cvs", while the example command at
# the end of this section uses ".csv" -- confirm which extension the files
# on disk really have.
if start_year is None and end_year is None:
    # No range given: 1956 on its own, then 1957 up to the cutoff year
    src_csv_filenames.append("contestants-1956.cvs")
    src_csv_filenames.append(f"contestants-1957-to-{esc_cutoff_endyear}.cvs")

elif start_year is None:
    src_csv_filenames.append(f"contestants-1956-to-{end_year}.cvs")

elif end_year is None:
    src_csv_filenames.append(f"contestants-{start_year}-to-{esc_cutoff_endyear}.cvs")

else:
    src_csv_filenames.append(f"contestants-{start_year}-to-{end_year}.cvs")


# Destination directories: the two fixed ones plus every SPARQL result
# directory found under import_dir (the trailing '/' in the glob pattern
# restricts the matches to directories).
dst_csv_dirs = [os.path.join(import_dir, "inaugural-year"), os.path.join(import_dir, "missing-cat-countries")]

sparql_result_dirs = glob(os.path.join(import_dir, "sparqlresults-local--countries-in-esc-by-year-*/"))

dst_csv_dirs += sparql_result_dirs

print("****** src_csv_filename = " + repr(src_csv_filenames))

print("****** dst_csv_dirs = " + repr(dst_csv_dirs))


# Example of the intended copy:
# cp contestants-2015.csv ../../import/sparqlresults-local--countries-in-esc-by-year-just-2015--with-errata/metadata-contestants-2015.csv