source: main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/prepare/xls-to-csv.py@ 33024

Last change on this file since 33024 was 33024, checked in by davidb, 5 years ago

Tweak so Title in spreadsheet maps to dc.Title, so as not to conflict with the automatically set Title from TextPlugin, when the SecondaryPlugin runs on the extracted text file

File size: 6.2 KB
RevLine 
[32807]1import xlrd
2import csv
3import os
4
[32813]5## worksheet_name = "Archaeological reports"
[32807]6
[32813]7
8# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
9def format_excelval(book, type, value, wanttupledate):
10 """ Clean up the incoming excel data """
11 ## Data Type Codes:
12 ## EMPTY 0
13 ## TEXT 1 a Unicode string
14 ## NUMBER 2 float
15 ## DATE 3 float
16 ## BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
17 ## ERROR 5
18 returnrow = []
19 if type == 2: # TEXT
20 if value == int(value): value = int(value)
21 elif type == 3: # NUMBER
22 datetuple = xlrd.xldate_as_tuple(value, book.datemode)
23 value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
24 elif type == 5: # ERROR
25 value = xlrd.error_text_from_code[value]
26 return value
27
28
[33018]29letter_to_folder = {
30 'a': "A - B",
31 'b': "A - B",
32 'c': "C - D",
33 'd': "C - D",
34 'e': "E - F",
35 'f': "E - F",
36 'g': "G - H",
37 'h': "G - H",
38 'i': "I - J",
39 'j': "I - J",
40 'k': "K - L",
41 'l': "K - L",
42 'm': "M - N",
43 'n': "M - N",
44 'o': "O - P",
45 'p': "O - P",
46 'q': "Q - R",
47 'r': "Q - R",
48 's': "S - T",
49 't': "S - T",
50 'u': "U - V",
51 'v': "U - V",
52 'w': "W - Z",
53 'x': "W - Z",
54 'y': "W - Z",
55 'z': "W - Z"
56}
57
58def id_to_relative_pdf_file(id):
59 first_char = id[0:1]
60 first_char_lower = first_char.lower()
61
62 file_tail = letter_to_folder[first_char_lower]
63 file_root = "Reports " + file_tail
64
65 pdf_file = os.path.join("Digital-Library-PDFs",file_root,id+".pdf")
66
67 return pdf_file
68
69
[32807]70def excel_to_bound_pdf_csv(excel_filename):
[32813]71 workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
72 ## worksheet = workbook.sheet_by_name(worksheet_name)
73 worksheet = workbook.sheet_by_index(0)
74
[32807]75 excel_filename_split = os.path.split(excel_filename)
76 excel_dirname = excel_filename_split[0]
77 excel_file = excel_filename_split[1]
78 excel_file_splitext = os.path.splitext(excel_file)
79 excel_file_root = excel_file_splitext[0];
80
81 pdfbound_filename = os.path.join(excel_dirname,"PDF-BOUND "+excel_file_root+".csv")
82 unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv")
83
84 # print "Worksheet: " + worksheet_name
85 csv_pdfbound_ofile = open(pdfbound_filename, 'wb')
86 #csv_unbound_ofile = open('{}.csv'.format(excel_file_root), 'wb')
87 csv_unbound_ofile = open(unbound_filename, 'wb')
88
89 pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL)
90 unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
91
[32820]92 # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
93 # 2. Write out Header line as Greenstone friendly Metadata terms
94 # => remove sp spaces, change "Report Identifier" to hnz.Identifier
[32807]95 # 3. Process the rest of the file, checking if accompanying
96 # PDF file present or not
97
98 num_rows = worksheet.nrows
99
[32820]100 # 1, Skip lines until Header encountered
[32807]101 row_i = 0
102 found_header = False
103 while row_i<num_rows:
[32808]104 first_cell = worksheet.cell_value(row_i,0);
[32807]105 if first_cell == "Report Identifier":
106 found_header = True
107 break
108 print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
109 row_i = row_i + 1
110
[32820]111 # 2. Process Header into Greenstone friendly metadata terms
[32807]112 if found_header:
113 entry_utf8_row = []
114 for entry in worksheet.row_values(row_i):
[32813]115 if entry == "Report Identifier":
116 entry = "hnz.Identifier"
117
[33024]118 if entry == "Title":
119 entry = "dc.Title"
120
[32807]121 entry_utf8 = unicode(entry).encode("utf-8")
122 metadata_entry_utf8 = entry_utf8.replace(" ", "")
123
124 entry_utf8_row.append(metadata_entry_utf8)
125 unbound_wr.writerow(entry_utf8_row)
126
[32808]127 # Add in 'Filename' as first column in pdfbound CSV file
128 entry_utf8_row.insert(0, "Filename")
129 pdfbound_wr.writerow(entry_utf8_row)
130
131
[32807]132 row_i = row_i + 1
133 while row_i<num_rows:
[32808]134 num_cols = worksheet.row_len(row_i)
[32820]135 formatted_utf8_row = []
[32808]136
137 found_pdf = False
138
139 for col_j in range(num_cols):
[32813]140 cell = worksheet.cell(row_i,col_j)
[32808]141
[32813]142 cell_type = worksheet.cell_type(row_i,col_j)
143 cell_value = worksheet.cell_value(row_i,col_j)
144
[32820]145 formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
[32813]146
[32808]147 if col_j == 0:
148 # Check to see if companion PDF file present
[33018]149 # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf")
150 id = formatted_cell.replace(" ","")
151
152 pdf_file = id_to_relative_pdf_file(id)
153 pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
154 PDF_file = pdf_file_root + ".PDF"
155
156 pdf_filename = os.path.join(excel_dirname,pdf_file)
157 PDF_filename = os.path.join(excel_dirname,PDF_file)
158
[32808]159 if os.path.exists(pdf_filename):
160 found_pdf = True
[32820]161 formatted_utf8_row.insert(0, pdf_file)
[33018]162 elif os.path.exists(PDF_filename):
163 found_pdf = True
164 formatted_utf8_row.insert(0, PDF_file)
165 else:
166 print "Unbound id: '" + id + "'"
167
168 if isinstance(formatted_cell, basestring):
169 # Remove any trailing whitespace.
170 # Newline at end particular harmful for a entry in the CSV file
171 formatted_cell = formatted_cell.rstrip();
172
[32820]173 formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8")
174 formatted_utf8_row.append(formatted_cell_utf8)
[32808]175 if found_pdf:
[32820]176 pdfbound_wr.writerow(formatted_utf8_row)
[32808]177 else:
[32820]178 unbound_wr.writerow(formatted_utf8_row)
[32807]179
180 row_i = row_i + 1
181 else:
182 print "Failed to find metadata label 'Report Identifier' in column 0"
183
184
185 csv_pdfbound_ofile.close()
186 csv_unbound_ofile.close()
187
188if __name__ == "__main__":
189 excel_to_bound_pdf_csv(os.sys.argv[1])
190
Note: See TracBrowser for help on using the repository browser.