import csv
import os

import xlrd

## worksheet_name = "Archaeological reports"


# https://code.activestate.com/recipes/546518-simple-conversion-of-excel-files-into-csv-and-yaml/download/1/
def format_excelval(book, type, value, wanttupledate):
    """Clean up a raw cell value read from an Excel sheet through xlrd.

    Cell type codes (these match the xlrd ``XL_CELL_*`` constants):

        EMPTY    0
        TEXT     1  a Unicode string
        NUMBER   2  float
        DATE     3  float
        BOOLEAN  4  int; 1 means TRUE, 0 means FALSE
        ERROR    5

    Parameters:
        book: workbook the cell belongs to; only ``book.datemode`` is read,
            and only when decoding DATE cells.
        type: cell type code from the table above.  (The name shadows the
            builtin, but is kept so keyword callers continue to work.)
        value: the raw cell value.
        wanttupledate: when true, DATE cells are returned as the tuple
            produced by ``xlrd.xldate_as_tuple``; otherwise that tuple is
            passed through ``tupledate_to_isodate``.

    Returns:
        NUMBER cells holding a whole number are returned as ``int``;
        DATE cells are returned as described above; ERROR cells are
        returned as the text looked up in ``xlrd.error_text_from_code``.
        Every other value is returned unchanged.
    """
    if type == xlrd.XL_CELL_NUMBER:
        # Excel stores every number as a float; drop the ".0" on whole numbers.
        if value == int(value):
            value = int(value)
    elif type == xlrd.XL_CELL_DATE:
        datetuple = xlrd.xldate_as_tuple(value, book.datemode)
        # NOTE(review): tupledate_to_isodate is not defined in the visible
        # part of this file - confirm it exists elsewhere in the module.
        value = datetuple if wanttupledate else tupledate_to_isodate(datetuple)
    elif type == xlrd.XL_CELL_ERROR:
        value = xlrd.error_text_from_code[value]

    return value


# Maps the lower-cased first letter of a report id to the letter-range
# suffix of the "Reports ..." folder used by id_to_relative_pdf_file().
letter_to_folder = {
    'a': "A - B",
    'b': "A - B",
    'c': "C - D",
    'd': "C - D",
    'e': "E - F",
    'f': "E - F",
    'g': "G - H",
    'h': "G - H",
    'i': "I - J",
    'j': "I - J",
    'k': "K - L",
    'l': "K - L",
    'm': "M - N",
    'n': "M - N",
    'o': "O - P",
    'p': "O - P",
    'q': "Q - R",
    'r': "Q - R",
    's': "S - T",
    't': "S - T",
    'u': "U - V",
    'v': "U - V",
    'w': "W - Z",
    'x': "W - Z",
    'y': "W - Z",
    'z': "W - Z"
}


def id_to_relative_pdf_file(id):
    """Return the relative path of the PDF file for report identifier ``id``.

    The path has the form
    ``Digital-Library-PDFs/Reports <letter range>/<id>.pdf`` where the
    letter range is looked up in ``letter_to_folder`` from the first
    character of ``id``, case-insensitively.

    Raises:
        KeyError: if ``id`` is empty or its first character has no entry
            in ``letter_to_folder`` (i.e. it is not one of a-z / A-Z).
    """
    # Slice instead of indexing so that an empty id fails in the table
    # lookup (KeyError) in the same way as any other unmapped first character.
    folder_suffix = letter_to_folder[id[0:1].lower()]
    return os.path.join("Digital-Library-PDFs", "Reports " + folder_suffix, id + ".pdf")


def excel_to_bound_pdf_csv(excel_filename):

    workbook = xlrd.open_workbook(excel_filename, formatting_info=True)
    ## worksheet = workbook.sheet_by_name(worksheet_name)
    worksheet = workbook.sheet_by_index(0)

    excel_filename_split = os.path.split(excel_filename)
    excel_dirname = excel_filename_split[0]
    excel_file = excel_filename_split[1]

    excel_file_splitext = os.path.splitext(excel_file)
    excel_file_root = excel_file_splitext[0];

    pdfbound_filename = os.path.join(excel_dirname,"PDF-BOUND "+excel_file_root+".csv")
    unbound_filename = os.path.join(excel_dirname,"UNBOUND "+excel_file_root+".csv")

    # print "Worksheet: "
+ worksheet_name csv_pdfbound_ofile = open(pdfbound_filename, 'wb') #csv_unbound_ofile = open('{}.csv'.format(excel_file_root), 'wb') csv_unbound_ofile = open(unbound_filename, 'wb') pdfbound_wr = csv.writer(csv_pdfbound_ofile, quoting=csv.QUOTE_ALL) unbound_wr = csv.writer(csv_unbound_ofile, quoting=csv.QUOTE_ALL) # 1. Skip lines until Header encountered (as defined by encountering "Report Identifier") # 2. Write out Header line as Greenstone friendly Metadata terms # => remove sp spaces, change "Report Identifier" to hnz.Identifier # 3. Process the rest of the file, checking if accompanying # PDF file present or not num_rows = worksheet.nrows # 1, Skip lines until Header encountered row_i = 0 found_header = False while row_i