source: main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/prepare/voting-excel/xlsx-to-jsonmetadata.py@ 34876

Last change on this file since 34876 was 34876, checked in by davidb, 3 years ago

Code updated to generate list of GS3 metadata tags needed for voting country metadata to appear in document view

  • Property svn:executable set to *
File size: 7.1 KB
Line 
1#!/usr/bin/env python
2
3from __future__ import print_function
4
5import os
6import re
7import sys
8import json
9
10import argparse
11import openpyxl
12
13import xlsxutil
14
# Module-level registries of the (whitespace-stripped) names of countries
# seen casting jury votes and televotes respectively.  They are filled in
# as fileset records are built and read back by the script section to
# print one <gsf:metadata> tag per country.  Only the keys matter; the
# stored value is always 1.
jury_from_countries, tele_from_countries = {}, {}
17
def eprint(*args, **kwargs):
    """Print to standard error.

    Takes the same positional and keyword arguments as ``print``; the
    destination stream is always ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
20
def fileset_voting_for_esc_country_in_year(data_hashmap):
    """Return a one-element list holding a single, empty fileset record.

    ``data_hashmap`` is accepted but not used.  The Greenstone format
    allows an array of fileset entries, which is why the lone record is
    wrapped in a list.

    NOTE(review): a second function with this exact name is defined later
    in this file; that later definition replaces this one when the module
    is loaded, so this version is never the one that gets called.
    """
    return [{}]
30
31
def filter_finalist_votes(entry):
    """Return True when ``entry`` is a non-duplicate vote cast in a final.

    ``entry`` is a row hashmap.  A row qualifies when its
    '(semi-) final' value is exactly "f" and it has no 'Duplicate' value
    (the key is absent or maps to None).
    """
    is_final = entry.get('(semi-) final') == "f"
    # Identity test: None is a singleton, so 'is' is the correct check.
    not_duplicate = entry.get('Duplicate') is None
    return is_final and not_duplicate
34
def sortkey_year_to_country(entry):
    """Build the sort key for a vote row: its year followed by its recipient.

    The 'Year' value is converted with ``str`` and the 'To country' value
    is appended to it unchanged, e.g. ``"1975Belgium"``.
    """
    year = entry.get('Year')
    to_country = entry.get('To country')
    return str(year) + to_country
37
38
def create_to_country_voting_groups(data_hashmap_array):
    """Group the votes cast in finals by receiving country and year.

    ``data_hashmap_array`` is a list of row hashmaps.  Rows are first
    reduced to non-duplicate finals votes (``filter_finalist_votes``),
    then sorted with ``sortkey_year_to_country`` so that all the votes a
    country received in a given year sit next to each other.

    Returns a list of groups; each group is a list of row hashmaps that
    share the same 'Year' and 'To country'.  Returns an empty list when no
    row survives the filter.
    """

    # Example values for header-names
    #   (semi-) final:      f, sf
    #   Jury or Televoting: J, T
    #   Year:               1975
    #   To country:         Belgium
    #   From country:       Belgium
    #   Edition:            1975f, 1975sf

    # Filter down to just the voting results concerning finals
    finals = [row for row in data_hashmap_array if filter_finalist_votes(row)]

    # Sort so entries are grouped by the country receiving the votes in a
    # given year
    finals.sort(key=sortkey_year_to_country)

    # Build array of country groups.
    # A country grouping includes all the votes that country received that
    # year.  The boundary test uses both year and country: comparing the
    # country alone would merge two groups if the last country of one year
    # were also the first country of the next.
    country_groups = []
    prev_group_key = None

    for row in finals:
        group_key = (row.get('Year'), row.get('To country'))

        if not country_groups or group_key != prev_group_key:
            # moving on to a new country group
            country_groups.append([])

        country_groups[-1].append(row)
        prev_group_key = group_key

    return country_groups
89
90
def fileset_voting_for_esc_country_in_year(to_country_year_votes):
    """Build the Greenstone FileSet record for one country's votes in one year.

    ``to_country_year_votes`` is a non-empty list of row hashmaps that all
    describe votes received by the same country in the same year; the
    'To country' and 'Year' of the first row are used to name the record.

    Side effects: every voting country seen is recorded in the module-level
    ``jury_from_countries`` / ``tele_from_countries`` dicts, and a warning
    is written to stderr for any vote type other than "J" or "T".

    Returns a dict in the form (for output as JSON):

        { "FileSet":
          [
            { "FileName": "France1991\\.nul" },
            { "Description":
              {
                "Metadata":
                [
                  { "name": "Germany-J", "content": 12 },  # J = Jury Vote
                  { "name": "Germany-T", "content": 6 },   # T = Televote (if present)
                  ...
                ]
              }
            }
          ]
        }

    When any jury (or tele) votes are present, a trailing "JuryVotesJSON"
    (or "TeleVotesJSON") entry is appended whose content is the JSON-encoded
    list of the corresponding metadata names.
    """

    metadata_array = []

    jury_metadata_vals = []
    tele_metadata_vals = []

    for vote in to_country_year_votes:
        from_country = vote.get('From country')
        vote_type = vote.get('Jury or Televoting')
        points = vote.get('Points')

        # Metadata names must not contain whitespace, e.g. "UnitedKingdom-J"
        id_from_country = re.sub(r'\s+', '', from_country)
        metadata_name = id_from_country + "-" + vote_type

        # The per-vote record is emitted whatever the vote type is
        metadata_array.append({"name": metadata_name, "content": points})

        if vote_type == "J":
            jury_metadata_vals.append(metadata_name)
            jury_from_countries[id_from_country] = 1
        elif vote_type == "T":
            tele_metadata_vals.append(metadata_name)
            tele_from_countries[id_from_country] = 1
        else:
            eprint("Warning: Unrecognized voting type: " + vote_type)

    if jury_metadata_vals:
        metadata_array.append({"name": "JuryVotesJSON", "content": json.dumps(jury_metadata_vals)})
    if tele_metadata_vals:
        metadata_array.append({"name": "TeleVotesJSON", "content": json.dumps(tele_metadata_vals)})

    first_vote = to_country_year_votes[0]
    id_to_country = re.sub(r'\s+', '', first_vote.get('To country'))
    record_id = id_to_country + str(first_vote.get('Year'))

    # The file name is used as a pattern, hence the escaped '.'
    filename_id = record_id + "\\.nul"

    fileset = {
        "FileSet": [
            {"FileName": filename_id},
            {"Description": {"Metadata": metadata_array}},
        ]
    }

    return fileset
159
160
if __name__ == "__main__":
    # Script entry point: read the voting spreadsheet, group the finals
    # votes by receiving country and year, and write them out as Greenstone
    # JSON directory metadata.

    parser = argparse.ArgumentParser()
    parser.add_argument('input-file.xlsx')
    parser.add_argument('output-file.json', nargs='?')
    parser.add_argument('--sheetname')

    args = parser.parse_args()

    # The positional names contain '-' and '.', so they cannot be read with
    # plain attribute syntax
    excel_input_filename = getattr(args, 'input-file.xlsx')
    json_output_filename = getattr(args, 'output-file.json')
    sheetname = getattr(args, 'sheetname')

    if json_output_filename is None:
        # Default: same path as the input, with the extension swapped
        json_output_filename = os.path.splitext(excel_input_filename)[0] + '.json'

    worksheet = xlsxutil.load_xslx_sheet(excel_input_filename, sheetname)

    data_hashmap_array = xlsxutil.convert_worksheet_to_hashmaps(worksheet)

    print("Number of data rows: " + str(len(data_hashmap_array)))

    to_country_year_voting_groups = create_to_country_voting_groups(data_hashmap_array)

    # Debug output
    #
    # print(to_country_year_voting_groups)

    # Next step is to express the grouped by-country voting data
    # in the Greenstone JSON metadata format:

    # { "DirectoryMetadata":
    #   [
    #     { "FileSet":
    #       [
    #         { "FileName": "France1991\.nul" },
    #         { "Description":
    #           {
    #             "Metadata":
    #             [
    #               { "name": "Germany-J", "content": "12" },  # J = Jury Vote
    #               ...
    #             ]
    #           }
    #         }
    #       ]
    #     }
    #     ...
    #   ]
    # }

    directory_metadata = []

    print("Creating Greenstone JSON voting metadata for:")
    for to_country_year_votes in to_country_year_voting_groups:

        fileset = fileset_voting_for_esc_country_in_year(to_country_year_votes)
        directory_metadata.append(fileset)

        fileset_parts = fileset.get('FileSet')
        filename_id = fileset_parts[0].get('FileName')
        # Counts every entry in the record's Metadata list
        num_countries_voting_data = len(fileset_parts[1].get('Description').get('Metadata'))

        print(" " + filename_id.ljust(28) + ": " + str(num_countries_voting_data) + " votes")

    # List the GS3 metadata tags needed for the voting-country metadata,
    # one per country seen casting a jury vote, then one per televote country
    print("")
    for from_country in sorted(jury_from_countries):
        print("<gsf:metadata name=\"" + from_country + "-J\" />")

    print("")
    for from_country in sorted(tele_from_countries):
        print("<gsf:metadata name=\"" + from_country + "-T\" />")

    greenstone_metadata_json = {"DirectoryMetadata": directory_metadata}

    with open(json_output_filename, 'w') as outfile:
        json.dump(greenstone_metadata_json, outfile, indent=2)
Note: See TracBrowser for help on using the repository browser.