source: other-projects/metadata-encoding/py/comparisonTest/sanitise.py@ 38473

Last change on this file since 38473 was 38473, checked in by jc550, 6 months ago

add script to get dois of mathml titles

File size: 1.6 KB
Line 
import argparse
import logging
import re
import sys
2
# Functions to make title consistent between platforms

# Face-markup tag names whose opening and closing tags are stripped from
# titles by clear_face_markup().
faceMarkupTags = [
    "b",
    "i",
    "u",
    "ovl",
    "sup",
    "sub",
    "scp",
    "tt",
]

# Substring that marks a tag as MathML; any matched tag containing it is
# removed by clear_math_ml_tags().
mathMLTag = "mml"

# Matches an opening or closing tag that contains no space characters, so
# tags carrying attributes (which need a space) are not matched.
# Raw string: the pattern contains a backslash, and "\/" is not a valid
# string escape, which triggers a warning on current Python versions.
tagRegex = r"[<]\/?[^ ]+?[>]"
18
def clear_face_markup(title):
    """Return *title* with every face-markup tag removed.

    For each name in ``faceMarkupTags`` both the opening form ``<name>`` and
    the closing form ``</name>`` are deleted by plain string replacement.
    Only these exact, attribute-free spellings are removed; the text between
    the tags is kept.

    The cleaned title is reported through ``logging`` at DEBUG level rather
    than printed, so callers that write their own results to stdout do not
    get diagnostic lines mixed into that output.
    """
    cleaned = title
    for tag_name in faceMarkupTags:
        # Remove the opening tag, then the matching closing tag.
        cleaned = cleaned.replace("<" + tag_name + ">", "")
        cleaned = cleaned.replace("</" + tag_name + ">", "")

    logging.getLogger(__name__).debug("regex done: %s", cleaned)
    return cleaned
30
def clear_math_ml_tags(title):
    """Return *title* with every MathML tag removed.

    All substrings of *title* matching ``tagRegex`` are collected first.
    Each match whose text contains ``mathMLTag`` is then deleted from the
    title wherever that exact text occurs. Matches that do not contain
    ``mathMLTag`` are left in place, as is the text between tags.
    """
    # Scan the original title once; removals below work on the running copy.
    mathml_tags = [
        tag for tag in re.findall(tagRegex, title) if mathMLTag in tag
    ]

    cleaned = title
    for tag in mathml_tags:
        cleaned = cleaned.replace(tag, "")
    return cleaned
39
def clear_tags(title):
    """Return *title* with face-markup tags and MathML tags removed.

    Face-markup tags are stripped first via clear_face_markup(); the result
    is then passed through clear_math_ml_tags().
    """
    return clear_math_ml_tags(clear_face_markup(title))
43
def add_subtitles():
    """Return a fixed placeholder string.

    The function takes no arguments and always returns the same literal.
    """
    return "Title with added subtitles"
46
# Command-line interface for checking things
def main():
    """Parse command-line arguments and print the tag-stripped title.

    Accepts a required positional ``title`` and an optional
    ``-s``/``--subtitle``. The parsed argument namespace is printed first,
    followed by the result of clear_tags() applied to the title. The
    subtitle is parsed but not otherwise used here.
    """
    parser = argparse.ArgumentParser(
        prog="sanitise",
        description="sanitises title input for consistent output",
    )
    parser.add_argument("title", help="Title of the article")
    parser.add_argument("-s", "--subtitle", help="Subtitle (if one provided)")
    args = parser.parse_args()

    # Echo the parsed arguments so the raw input can be compared by eye
    # with the sanitised title printed below.
    print(args)

    sanitised_title = clear_tags(args.title)

    print(sanitised_title)
62
63
# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.