root/main/trunk/model-sites-dev/heritage-nz/collect/reports-2019/prepare/xls-to-csv.py @ 33024

Revision 33024, 6.2 KB (checked in by davidb, 8 months ago)

Tweak so Title in spreadsheet maps to dc.Title, so as not to conflict with the automatically set Title from TextPlugin?, when the SecondaryPlugin? runs on the extracted text file

Line 
1import xlrd
2import csv
3import os
4
5## worksheet_name = "Archaeological reports"
6
7
8# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
9def format_excelval(book, type, value, wanttupledate):
10    """ Clean up the incoming excel data """
11    ##  Data Type Codes:
12    ##  EMPTY   0
13    ##  TEXT    1 a Unicode string
14    ##  NUMBER  2 float
15    ##  DATE    3 float
16    ##  BOOLEAN 4 int; 1 means TRUE, 0 means FALSE
17    ##  ERROR   5
18    returnrow = []
19    if   type == 2: # TEXT
20        if value == int(value): value = int(value)
21    elif type == 3: # NUMBER
22        datetuple = xlrd.xldate_as_tuple(value, book.datemode)
23        value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
24    elif type == 5: # ERROR
25        value = xlrd.error_text_from_code[value]
26    return value
27
28
29letter_to_folder = {
30  'a': "A - B",
31  'b': "A - B",
32  'c': "C - D",
33  'd': "C - D",
34  'e': "E - F",
35  'f': "E - F",
36  'g': "G - H",
37  'h': "G - H",
38  'i': "I - J",
39  'j': "I - J",
40  'k': "K - L",
41  'l': "K - L",
42  'm': "M - N",
43  'n': "M - N",
44  'o': "O - P",
45  'p': "O - P",
46  'q': "Q - R",
47  'r': "Q - R",
48  's': "S - T",
49  't': "S - T",
50  'u': "U - V",
51  'v': "U - V",
52  'w': "W - Z",
53  'x': "W - Z",
54  'y': "W - Z",
55  'z': "W - Z"
56}
57
58def id_to_relative_pdf_file(id):
59    first_char = id[0:1]
60    first_char_lower = first_char.lower()
61
62    file_tail = letter_to_folder[first_char_lower]
63    file_root = "Reports " + file_tail
64
65    pdf_file = os.path.join("Digital-Library-PDFs",file_root,id+".pdf")
66
67    return pdf_file
68
69
70def excel_to_bound_pdf_csv(excel_filename):
71    workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
72    ## worksheet = workbook.sheet_by_name(worksheet_name)
73    worksheet = workbook.sheet_by_index(0)
74
75    excel_filename_split = os.path.split(excel_filename)
76    excel_dirname = excel_filename_split[0]
77    excel_file = excel_filename_split[1]
78    excel_file_splitext = os.path.splitext(excel_file)
79    excel_file_root = excel_file_splitext[0];
80
81    pdfbound_filename = os.path.join(excel_dirname,"PDF-BOUND "+excel_file_root+".csv")
82    unbound_filename  = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv")
83
84    # print "Worksheet: " + worksheet_name
85    csv_pdfbound_ofile = open(pdfbound_filename, 'wb')
86    #csv_unbound_ofile  = open('{}.csv'.format(excel_file_root), 'wb')
87    csv_unbound_ofile  = open(unbound_filename, 'wb')
88
89    pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL)
90    unbound_wr  = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL)
91
92    # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")
93    # 2. Write out Header line as Greenstone friendly Metadata terms
94    #      => remove sp spaces, change "Report Identifier" to hnz.Identifier
95    # 3. Process the rest of the file, checking if accompanying
96    #     PDF file present or not
97
98    num_rows = worksheet.nrows
99
100    # 1, Skip lines until Header encountered
101    row_i = 0
102    found_header = False
103    while row_i<num_rows:
104        first_cell = worksheet.cell_value(row_i,0);
105        if first_cell == "Report Identifier":
106            found_header = True
107            break
108        print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i)
109        row_i = row_i + 1
110
111    # 2. Process Header into Greenstone friendly metadata terms     
112    if found_header:
113        entry_utf8_row = []
114        for entry in worksheet.row_values(row_i):
115            if entry == "Report Identifier":
116                entry = "hnz.Identifier"
117
118            if entry == "Title":
119                entry = "dc.Title"
120
121            entry_utf8 = unicode(entry).encode("utf-8")
122            metadata_entry_utf8 = entry_utf8.replace(" ", "")
123
124            entry_utf8_row.append(metadata_entry_utf8)
125        unbound_wr.writerow(entry_utf8_row)
126
127        # Add in 'Filename' as first column in pdfbound CSV file
128        entry_utf8_row.insert(0, "Filename")
129        pdfbound_wr.writerow(entry_utf8_row)
130
131
132        row_i = row_i + 1
133        while row_i<num_rows:
134            num_cols = worksheet.row_len(row_i)
135            formatted_utf8_row = []
136
137            found_pdf = False
138
139            for col_j in range(num_cols):
140                cell = worksheet.cell(row_i,col_j)
141
142                cell_type  = worksheet.cell_type(row_i,col_j)
143                cell_value = worksheet.cell_value(row_i,col_j)
144
145                formatted_cell = format_excelval(workbook,cell_type,cell_value,False)
146
147                if col_j == 0:
148                    # Check to see if companion PDF file present
149                    # pdf_filename = os.path.join(excel_dirname,"pdfs",formatted_cell+".pdf")
150                    id = formatted_cell.replace(" ","")
151
152                    pdf_file = id_to_relative_pdf_file(id)
153                    pdf_file_root, pdf_ext = os.path.splitext(pdf_file)
154                    PDF_file = pdf_file_root + ".PDF"
155
156                    pdf_filename = os.path.join(excel_dirname,pdf_file)
157                    PDF_filename = os.path.join(excel_dirname,PDF_file)
158
159                    if os.path.exists(pdf_filename):
160                        found_pdf = True
161                        formatted_utf8_row.insert(0, pdf_file)
162                    elif os.path.exists(PDF_filename):
163                        found_pdf = True
164                        formatted_utf8_row.insert(0, PDF_file)
165                    else:
166                        print "Unbound id: '" + id + "'"
167
168                if isinstance(formatted_cell, basestring):
169                    # Remove any trailing whitespace. 
170                    # Newline at end particular harmful for a entry in the CSV file
171                    formatted_cell = formatted_cell.rstrip();
172
173                formatted_cell_utf8 = unicode(formatted_cell).encode("utf-8")
174                formatted_utf8_row.append(formatted_cell_utf8)
175            if found_pdf:
176                pdfbound_wr.writerow(formatted_utf8_row)
177            else:
178                unbound_wr.writerow(formatted_utf8_row)
179
180            row_i = row_i + 1
181    else:
182        print "Failed to find metadata label 'Report Identifier' in column 0"
183
184
185    csv_pdfbound_ofile.close()
186    csv_unbound_ofile.close()
187
188if __name__ == "__main__":
189    excel_to_bound_pdf_csv(os.sys.argv[1])
190
Note: See TracBrowser for help on using the browser.