Changeset 38701
- Timestamp:
- 2024-02-06T13:17:33+13:00 (4 months ago)
- Location:
- gs3-installations/intermuse/trunk/sites/intermuse/collect/ephemeral-heterogeneous/prepare
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
gs3-installations/intermuse/trunk/sites/intermuse/collect/ephemeral-heterogeneous/prepare/pagedimage-fs-generate-metadata-files.py
r38692 r38701 11 11 if __name__ == "__main__": 12 12 13 toplevel_dir="Digitised-materials-ready-for-digital-archive"14 output_ filename = None13 input_toplevel_dir="Digitised-materials-ready-for-digital-archive" 14 output_toplevel_dir="Digitised-materials-ready-for-digital-archive--additional" 15 15 16 16 if len(sys.argv) > 1: 17 toplevel_dir = sys.argv[1]17 input_toplevel_dir = sys.argv[1] 18 18 19 #if len(sys.argv) > 2:20 # output_filename= sys.argv[2]19 if len(sys.argv) > 2: 20 output_toplevel_dir = sys.argv[2] 21 21 22 fs_structure_dirs = pagedimagefs.determine_fs_structure( toplevel_dir)22 fs_structure_dirs = pagedimagefs.determine_fs_structure(input_toplevel_dir) 23 23 24 24 pagedimagemd.add_directory_metadata(fs_structure_dirs) … … 27 27 pagedimagefs.print_fs_structure(fs_structure_dirs) 28 28 29 pagedimagemd.output_directory_metadata(fs_structure_dirs )29 pagedimagemd.output_directory_metadata(fs_structure_dirs,input_toplevel_dir,output_toplevel_dir) 30 30 31 # if output_filename == None:32 # print(needs_item_file_recs)33 # #pagedimagefs.output_pdf_files(all_pdfs)34 # else:35 # x=136 # #pagedimagefs.save_pdf_list(all_pdfs,output_filename)37 31 38 32 -
gs3-installations/intermuse/trunk/sites/intermuse/collect/ephemeral-heterogeneous/prepare/pagedimagemd.py
r38693 r38701 8 8 'RCM BMS chapters programmes and congress' : {}, 9 9 'YorkïŒHMS BMS Borthwick IIIF/BMS' : { 10 "type" : "Programme",11 "publisher" : "The British Music Society of York",12 " regex": r'^BMS_\d+.*\.pdf$'10 "type" : "Programme", 11 "publisher" : "The British Music Society of York", 12 "file-regex" : r'^BMS_\d+.*\.pdf$' 13 13 }, 14 14 'YorkïŒHMS BMS Borthwick IIIF/HMS' : { … … 58 58 return matching_md_rules; 59 59 60 def add_pdf_filenames_metadata(fs_structure_dirs,full_dirname,dir_match_type): 61 fs_structure_this_dir = fs_structure_dirs[full_dirname] 62 63 pdf_filenames = fs_structure_this_dir["pdf-filenames"] 64 65 66 for pdf_filename in pdf_filenames: 67 metadata_description_list = [] 68 69 title_from_filename = re.sub(r"\.\w+$","",pdf_filename) 70 title_from_filename = re.sub(r"_"," ",title_from_filename,flags=re.IGNORECASE) 71 72 metadata_description_list.append( 73 f"<Metadata mode=\"override\" name=\"im.Title\">{title_from_filename}</Metadata>" 74 ) 75 metadata_description_list.append( 76 f"<Metadata mode=\"override\" name=\"im.{dir_match_type}Title\">{title_from_filename}</Metadata>" 77 ) 78 79 pdf_filename_re = "^"+re.sub(r"\.","\\.",pdf_filename)+"$" 80 81 metadata_fileset = { "FileSet": { "FileName": f"{pdf_filename_re}", "Descriptions": metadata_description_list } } 82 83 fs_structure_this_dir["metadata-filesets"].append(metadata_fileset) 84 85 60 86 def add_directory_metadata(fs_structure_dirs): 61 87 … … 98 124 ) 99 125 100 is_programme = "true" if (type == " programme") else "false"126 is_programme = "true" if (type == "Programme") else "false" 101 127 102 128 metadata_description_list.append( … … 114 140 fs_structure_this_dir["metadata-filesets"].append(metadata_fileset) 115 141 142 add_pdf_filenames_metadata(fs_structure_dirs,full_dirname,type) 143 116 144 # debugging 117 145 # print(f"{matching_md_dir_rules} matches {full_dirname}") 118 146 119 147 120 def output_directory_metadata(fs_structure_dirs): 121 122 for full_dirname in 
sorted(fs_structure_dirs.keys()): 123 124 fs_structure_this_dir = fs_structure_dirs[full_dirname] 148 def output_directory_metadata(fs_structure_dirs, input_toplevel_dirname,output_toplevel_dirname): 149 150 print("Writing out added metadata-filesets as metadata.xml files to:") 151 print(f" {output_toplevel_dirname}") 152 print("----") 153 print("Processing directories:") 154 155 for input_full_dirname in sorted(fs_structure_dirs.keys()): 156 157 output_full_dirname = re.sub(r"^"+input_toplevel_dirname+os.sep,output_toplevel_dirname+os.sep,input_full_dirname) 158 print(f" {output_full_dirname}") 159 160 fs_structure_this_dir = fs_structure_dirs[input_full_dirname] 125 161 126 162 if len(fs_structure_this_dir["metadata-filesets"]) > 0: 127 163 164 fileset_lines = "" 165 128 166 for metadata_fileset in fs_structure_this_dir["metadata-filesets"]: 129 167 … … 133 171 for description in metadata_fileset["FileSet"]["Descriptions"]: 134 172 description_lines += f" {description}\n" 135 136 # Outer metadata.xml in the form 137 138 # <?xml version="1.0" encoding="UTF-8"?> 139 # <!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd"> 140 # <DirectoryMetadata> 141 # 142 # <FileSet> 143 # <FileName>...</FileName> 144 # <Description> 145 # # ... 
146 # </Description> 147 # </FileSet> 148 # 149 # </DirectoryMetadata> 150 151 152 metadata_xml = f"""<?xml version="1.0" encoding="UTF-8"?> 153 <!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd"> 154 <DirectoryMetadata> 173 174 175 fileset_lines += f""" 155 176 156 177 <FileSet> … … 159 180 {description_lines} 160 181 </Description> 161 <FileSet> 162 182 </FileSet> 183 """ 184 185 fileset_lines += "\n" 186 187 # Outer metadata.xml in the form 188 189 # <?xml version="1.0" encoding="UTF-8"?> 190 # <!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd"> 191 # <DirectoryMetadata> 192 # 193 # <FileSet> 194 # <FileName>...</FileName> 195 # <Description> 196 # # ... 197 # </Description> 198 # </FileSet> 199 # 200 # </DirectoryMetadata> 201 202 203 metadata_xml = f"""<?xml version="1.0" encoding="UTF-8"?> 204 <!DOCTYPE DirectoryMetadata SYSTEM "http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd"> 205 <DirectoryMetadata> 206 {fileset_lines} 163 207 </DirectoryMetadata>""" 164 165 print("----") 166 print(metadata_xml) 167 print("----") 208 209 output_full_metadata_filename = os.path.join(output_full_dirname,"metadata.xml") 210 211 os.makedirs(output_full_dirname, exist_ok=True ) 212 213 with open(output_full_metadata_filename, "w", encoding="utf-8") as fout: 214 fout.write(metadata_xml) 215 fout.write("\n") 168 216 169 217
Note: See TracChangeset for help on using the changeset viewer.