Changeset 33191
- Timestamp:
- 2019-06-22T10:33:10+12:00 (5 years ago)
- Location:
- main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-csv.py
r33188 r33191 4 4 5 5 import xlsutil 6 7 ## worksheet_name = "Archaeological reports"8 6 9 7 letter_to_folder = { … … 48 46 49 47 48 # **** Written, but never used 49 # **** Col numbers refer to older version of spreadsheet 50 # 51 # There is now a mapping of header-name to index position which would be 52 # a better way to do thing 53 50 54 def fixup_cell(col_num,cell_str): 51 55 # col == 1 => Author … … 74 78 75 79 def excel_to_bound_pdf_csv(excel_filename): 76 #workbook = xlrd.open_workbook(excel_filename, formatting_info=True)77 80 workbook = xlrd.open_workbook(excel_filename) 78 ## worksheet = workbook.sheet_by_name(worksheet_name) 79 worksheet = workbook.sheet_by_index(0) 81 82 worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos) 83 ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name) 80 84 81 85 excel_filename_split = os.path.split(excel_filename) … … 88 92 unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv") 89 93 90 # print "Worksheet: " + worksheet_name91 94 csv_pdfbound_ofile = open(pdfbound_filename, 'wb') 92 #csv_unbound_ofile = open('{}.csv'.format(excel_file_root), 'wb')93 95 csv_unbound_ofile = open(unbound_filename, 'wb') 94 96 95 97 pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL) 96 98 unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL) 97 98 # # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier")99 # # 2. Write out Header line as Greenstone friendly Metadata terms100 # # => remove sp spaces, change "Report Identifier" to hnz.Identifier101 # # 3. Process the rest of the file, checking if accompanying102 # # PDF file present or not103 104 99 105 100 # 1. 
For header-line, build up hashmap of header-names to column number … … 110 105 # specified in 'sanitized_headers' 111 106 112 113 107 num_rows = worksheet.nrows 114 108 115 # 1, Skip lines until Header encountered 116 # row_i = 0 117 # found_header = False 118 # while row_i<num_rows: 119 # first_cell = worksheet.cell_value(row_i,0); 120 # if first_cell == "Report Identifier": 121 # found_header = True 122 # break 123 # print "Skipping row {} as not yet encountered 'Report Identifier' metadata label in column 0".format(row_i) 124 # row_i = row_i + 1 125 109 # 1. Get header-line hashmap of header-names to column numbers 126 110 header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 127 111 128 # 2. Process Header into Greenstone friendly metadata terms129 112 if header_names_mapping is None: 130 113 print "Failed to find metadata label 'Report Identifier' in column 0" 131 114 exit() 132 115 else: 116 # 2. Process Header into Greenstone friendly metadata terms 117 133 118 entry_utf8_row = [] 134 119 row_i = 0; … … 164 149 pdfbound_wr.writerow(entry_utf8_row) 165 150 166 # 3. Process the rest of the file ...151 # 3. Process the rest of the file (metadata values) ... 167 152 168 153 row_i = row_i + 1 … … 179 164 col_j = header_names_mapping[header_name] 180 165 181 # cell = worksheet.cell(row_i,col_j)182 183 # cell_type = worksheet.cell_type(row_i,col_j)184 166 cell_value = worksheet.cell_value(row_i,col_j) 185 167 186 # formatted_cell_value = format_excelval(workbook,cell_type,cell_value,False)187 168 formatted_cell_value = xlsutil.format_if_int(cell_value) 188 169 189 # if col_j == 0:190 170 if header_name == "Report Identifier": 191 171 # Check to see if companion PDF file present -
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xls-to-sanitized-csv.py
r33188 r33191 5 5 import xlsutil 6 6 7 ## worksheet_name = "Archaeological reports"8 9 10 7 def excel_to_sanitized_csv(excel_filename): 11 8 workbook = xlrd.open_workbook(excel_filename) 12 # worksheet = workbook.sheet_by_name(worksheet_name) 13 worksheet = workbook.sheet_by_index(0) 9 10 worksheet = workbook.sheet_by_index(xlsutil.worksheet_index_pos) 11 ## worksheet = workbook.sheet_by_name(xlsutil.worksheet_name) 14 12 15 13 excel_filename_split = os.path.split(excel_filename) … … 21 19 sanitized_csv_filename = os.path.join(excel_dirname,excel_file_root+".csv") 22 20 23 # print "Worksheet: " + worksheet_name24 21 sanitized_csv_ofile = open(sanitized_csv_filename, 'wb') 25 22 … … 31 28 num_rows = worksheet.nrows 32 29 33 # row_i = 034 # num_header_cols = worksheet.row_len(row_i)35 36 30 # 1. get header-map 37 31 header_names_mapping = xlsutil.getHeaderMappingToColNum(worksheet) 38 32 39 # found_header = False40 # for col_j in range(num_header_cols):41 # header_cell_value = worksheet.cell_value(row_i,col_j)42 # if header_cell_value == "Report Identifier":43 # found_header = True44 #45 # header_names_mapping[header_cell_value] = col_j46 #47 # if not(found_header):48 33 if header_names_mapping is None: 49 34 print "Failed to find \"Report Identifier\" header in spreadsheet (Row 0 or Sheet 0)" … … 65 50 cell_value = worksheet.cell_value(row_i,header_col_j) 66 51 67 68 69 52 cell_value_utf8 = unicode(cell_value).encode("utf-8") 70 53 -
main/trunk/model-sites-dev/heritage-nz/collect/pdf-reports/prepare/xlsutil.py
r33188 r33191 1 1 import xlrd 2 3 # Currently access worksheet by index num 4 # Alternative is to do this by worksheet name 5 worksheet_index_pos = 0 6 ## worksheet_name = "Archaeological reports" 2 7 3 8 sanitized_headers = [ "Report Identifier", "Author", "Title", "Produced By", "Date", "HNZPT Region", "File No",
Note: See TracChangeset for help on using the changeset viewer.